In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from scipy.sparse import hstack
from scipy.sparse import vstack

In [14]:
def num_feature_normalization(train, test):
    feature_min = pd.concat([train, test], axis=0).min()
    feature_max = pd.concat([train, test], axis=0).max()
    feature_train_norm = (train - feature_min) / (feature_max - feature_min)
    feature_test_norm = (test - feature_min) / (feature_max - feature_min)
    return feature_train_norm, feature_test_norm



In [23]:
cleaned_auto = pd.read_csv('data/cleaned_auto_labeled.csv')
cleaned_hand = pd.read_csv('data/cleaned_hand_labeled.csv')
cleaned_unlabeled = pd.read_csv('data/cleaned_unlabeled.csv')

cleaned_auto['Text'].fillna('', inplace=True)
cleaned_hand['Text'].fillna('', inplace=True)

# text feature
X_train, Y_train = cleaned_auto['Text'], cleaned_auto['Sentiment']
X_test, Y_test = cleaned_hand['Text'], cleaned_hand['Sentiment']
# num_comments feature
C_train, C_test = cleaned_auto['Num_Comments'], cleaned_hand['Num_Comments']
# score feature
S_train, S_test = cleaned_auto['Score'], cleaned_hand['Score']

C_train_norm, C_test_norm = num_feature_normalization(C_train, C_test)
S_train_norm, S_test_norm = num_feature_normalization(S_train, S_test)


In [24]:
#linear regression
vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

X_train_lr, X_val_lr, C_train_lr, C_val_lr, S_train_lr, S_val_lr, Y_train_lr, Y_val_lr = train_test_split(X_train_vect, C_train_norm, S_train_norm, Y_train, test_size=0.2, random_state=42)

In [25]:
# 1 feature
# lr_model = LinearRegression()

alpha = 10
lr_model = Ridge(alpha=alpha)
lr_model.fit(X_train_lr, Y_train_lr)

In [26]:
# MSE and accuracy on validation set
lr_model_val = lr_model.predict(X_val_lr)

result_val_dis = np.where(lr_model_val >= 0.05, 1, np.where(lr_model_val <= -0.05, -1, 0))
Y_val_dis = np.where(Y_val_lr >= 0.05, 1, np.where(Y_val_lr <= -0.05, -1, 0))

val_mse = mean_squared_error(Y_val_dis, result_val_dis)
print(f'Mean Squared Error on Validation Set: {val_mse}')

accuracy = np.sum(result_val_dis == Y_val_dis) / len(Y_val_dis)
print(f'Accuracy test: {accuracy:.2%}')

# MSE and accuracy on test set
lr_model_test = lr_model.predict(X_test_vect)
result_test_dis = np.where(lr_model_test >= 0.05, 1, np.where(lr_model_test <= -0.05, -1, 0))
test_mse = mean_squared_error(Y_test, result_test_dis)
print(f'Mean Squared Error on Test Set: {test_mse}')

accuracy1 = np.sum(result_test_dis == Y_test) / len(Y_test)
print(f'Accuracy test: {accuracy1:.2%}')


Mean Squared Error on Validation Set: 0.5088059928055269
Accuracy test: 77.11%
Mean Squared Error on Test Set: 0.752112676056338
Accuracy test: 65.35%


In [27]:
# 3 features
X_train_ft_lr = hstack([X_train_lr, C_train_lr.values.reshape(-1, 1), S_train_lr.values.reshape(-1, 1)])
X_test_ft_lr = hstack([X_test_vect, C_test_norm.values.reshape(-1, 1), S_test_norm.values.reshape(-1, 1)])
X_val_ft_lr = hstack([X_val_lr, C_val_lr.values.reshape(-1, 1), S_val_lr.values.reshape(-1, 1)])

In [28]:
#lr_model_ft = LinearRegression()

alpha = 10
lr_model_ft = Ridge(alpha=alpha)
lr_model_ft.fit(X_train_ft_lr, Y_train_lr)

In [29]:
# MSE and accuracy on validation set
lr_model_ft_val = lr_model_ft.predict(X_val_ft_lr)
val_ft_mse = mean_squared_error(Y_val_lr, lr_model_ft_val)
print(f'Mean Squared Error on Validation Set for 3 Features: {val_ft_mse}')

result_val_ft_dis = np.where(lr_model_ft_val >= 0.05, 1, np.where(lr_model_ft_val <= -0.05, -1, 0))
accuracy3 = np.sum(result_val_ft_dis == Y_val_dis) / len(Y_val_dis)
print(f'Accuracy test: {accuracy3:.2%}')

# MSE and accuracy on test set
lr_model_ft_test = lr_model_ft.predict(X_test_ft_lr)
result_test_ft_dis = np.where(lr_model_ft_test >= 0.05, 1, np.where(lr_model_ft_test <= -0.05, -1, 0))

test_mse = mean_squared_error(Y_test, result_test_ft_dis)
print(f'Mean Squared Error on Test Set for 3 Features: {test_mse}')

accuracy4 = np.sum(result_test_ft_dis == Y_test) / len(Y_test)
print(f'Accuracy: {accuracy4:.2%}')


Mean Squared Error on Validation Set for 3 Features: 0.13071243403872654
Accuracy test: 77.12%
Mean Squared Error on Test Set for 3 Features: 0.7492957746478873
Accuracy: 65.63%
