In [21]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [22]:
data = Path('data/vader_emolex.csv')
df = pd.read_csv(data)

In [23]:
bins = [0, 2.5, 3.5, 5]
labels = ['Negative', 'Neutral', 'Positive']
df['bins'] = pd.cut(df['stars'], bins=bins, labels=labels)
n = 50000
subset = df.groupby('bins').head(n)

In [24]:
subset.set_index('review_id', inplace=True)
df = subset.drop(columns=['text'])

In [25]:
df.head()

Unnamed: 0_level_0,stars,compound_sentiment,positive,neutral,negative,anger,anticipation,disgust,fear,joy,negative.1,positive.1,sadness,surprise,trust,word_count,bins
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
zurS64w23RAXHPToR0C39Q,4,0.9906,0.195,0.779,0.026,0.0,8.0,1.0,3.0,8.0,0.0,14.0,1.0,8.0,9.0,123.0,Positive
1hAS--WgusxuLwhHND12qw,5,0.9882,0.36,0.64,0.0,0.0,2.0,0.0,0.0,5.0,0.0,6.0,1.0,1.0,2.0,33.0,Positive
3NwwrhNzJ6H1P-5vbsvyMQ,5,0.9723,0.177,0.787,0.036,0.0,3.0,0.0,1.0,6.0,2.0,8.0,0.0,1.0,2.0,63.0,Positive
bHCNw775nDiJXQGxERp5VQ,5,0.9979,0.163,0.81,0.027,1.0,6.0,1.0,3.0,9.0,6.0,20.0,1.0,4.0,8.0,231.0,Positive
kc1aIPCo1I4E_17KuIkY5Q,2,-0.7723,0.091,0.796,0.113,2.0,2.0,2.0,3.0,5.0,3.0,9.0,2.0,1.0,4.0,101.0,Negative


In [26]:
test = df.sample(frac=0.5, random_state=1)
y = test["bins"]
X = test.drop(columns=["bins"])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape

(56250, 16)

In [27]:
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)

In [28]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',max_iter=30000,random_state=1)
classifier

In [29]:
lr_model = classifier.fit(X_train, y_train)

In [30]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 1.0


In [31]:
#Generate training predictions
training_predictions = lr_model.predict(X_train)
testing_predictions = classifier.predict(X_test)

In [32]:
from sklearn.metrics import classification_report
training_report = classification_report(y_train, training_predictions)
print(training_report)

              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00     18783
     Neutral       1.00      1.00      1.00     18787
    Positive       1.00      1.00      1.00     18680

    accuracy                           1.00     56250
   macro avg       1.00      1.00      1.00     56250
weighted avg       1.00      1.00      1.00     56250



In [33]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)
print(testing_report)

              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00      6211
     Neutral       1.00      1.00      1.00      6289
    Positive       1.00      1.00      1.00      6250

    accuracy                           1.00     18750
   macro avg       1.00      1.00      1.00     18750
weighted avg       1.00      1.00      1.00     18750



In [34]:
from joblib import dump
dump(classifier, 'model_ls.joblib')

['model_ls.joblib']