#### Model builder for sentiment classification

In [1]:
"""
Steps:
1. Convert the embeddings into the expected format and convert the classes to int: 0 - negative, 1 - positive
2. Train SVM, LR, XGBoost and Gaussian Naive Bayes models and combine them in a soft-voting ensemble
"""
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
import numpy as np
import pickle


In [2]:
# Read the reviews CSV; each row carries the review text, its sentiment and a
# stringified embedding. The train/test split is done in a later cell.
TRAINING_CSV = "training_reviews_with_embeddings.csv"
df = pd.read_csv(TRAINING_CSV)
df.head()


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,review_id,review,sentiment,n_tokens,embeddings
0,0,0,2,I thought this was a wonderful way to spend ti...,positive,208,"[-0.0022488185204565525, -0.02475237101316452,..."
1,1,1,3,Basically there's a family where a little boy ...,negative,174,"[-0.015456135384738445, -0.026828765869140625,..."
2,2,2,7,"This show was an amazing, fresh & innovative i...",negative,213,"[-0.00163047865498811, -0.04207943007349968, 0..."
3,3,3,8,Encouraged by the positive comments about this...,negative,160,"[-0.025794534012675285, -0.015925776213407516,..."
4,4,4,10,Phil the Alien is one of those quirky films wh...,negative,125,"[0.013276934623718262, -0.04165113344788551, -..."


In [3]:
# Turn the stringified embedding lists read from the CSV into numpy vectors.
import ast

import numpy as np

# Drop rows that have no embedding. The explicit .copy() guarantees df1 is an
# independent frame, so the column assignment below never writes into a view
# of `df` (and does not trigger SettingWithCopyWarning).
df1 = df.dropna(subset=['embeddings']).copy()  # chain .sample(8000) here for a quick run

# ast.literal_eval only accepts Python literals (lists, numbers, ...), so unlike
# eval() it cannot execute arbitrary code that might be present in the CSV.
df1["embeddings"] = df1["embeddings"].apply(ast.literal_eval).apply(np.array)

In [4]:
# Spread every embedding vector over one column per dimension: embed_0 ... embed_1535.
EMBEDDING_DIM = 1536
embedding_columns = [f"embed_{i}" for i in range(EMBEDDING_DIM)]
new_df = pd.DataFrame(df1['embeddings'].to_list(), columns=embedding_columns)
new_df.head()

Unnamed: 0,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,embed_9,...,embed_1526,embed_1527,embed_1528,embed_1529,embed_1530,embed_1531,embed_1532,embed_1533,embed_1534,embed_1535
0,-0.002249,-0.024752,0.024616,-0.024629,-0.013524,0.020135,0.008845,-0.02381,-0.010477,-0.023632,...,0.027443,-0.012212,0.027443,-0.048002,-0.050789,0.017185,0.00501,-0.015696,-0.035025,-0.037894
1,-0.015456,-0.026829,-0.016148,-0.012231,-0.000328,-0.0083,0.016055,-0.043894,-0.017026,-0.019061,...,0.019513,-0.021561,0.001586,-0.020245,-0.044214,0.009205,-0.013208,-0.003631,-0.005364,-0.019766
2,-0.00163,-0.042079,0.015727,-0.001767,-0.003291,0.013766,0.002004,-0.022219,-0.035216,-0.013024,...,0.008188,-0.000527,0.024299,-0.027399,-0.015714,0.02255,-0.014574,0.005359,-0.008009,-0.023411
3,-0.025795,-0.015926,0.001318,-0.032243,0.008146,0.016435,-0.004513,-0.036447,-0.020051,-0.017323,...,0.044723,-0.016239,0.020051,-0.040885,-0.01902,0.011076,0.001376,-0.016043,-0.016122,-0.014999
4,0.013277,-0.041651,-0.003943,-0.033479,-0.020533,-0.005315,0.006549,-0.044121,-0.012049,-0.006034,...,0.021538,-0.001658,0.01763,-0.013137,-0.022799,0.016332,-0.030398,-0.011838,-0.032053,-0.036941


In [5]:
# Integer target: 1 when the sentiment is 'positive', 0 for anything else.
# .to_numpy() keeps the assignment positional: df1 still carries the CSV row
# index left after dropna, whereas new_df has a fresh 0..n-1 index.
is_positive = (df1['sentiment'] == 'positive').to_numpy()
new_df['label'] = is_positive.astype(int)
print(new_df.shape)

# Persist the feature columns together with the label.
new_df.to_csv("final_train_set.csv",index=False)

(26005, 1537)


In [6]:

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
features = new_df.drop(columns='label')
labels = new_df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42
)


In [None]:
# Fit a linear-kernel SVM on the training split. probability=True enables
# predict_proba (presumably for the soft-voting ensemble further down).
# Cross-validation is currently disabled; to re-enable it run:
#   svm_scores = cross_val_score(svm, X_train, y_train, cv=5)
#   print("SVM cross-validation scores:", svm_scores)
svm = SVC(kernel='linear', probability=True)
svm.fit(X_train, y_train)

In [None]:
# Fit a logistic regression (saga solver, C=1) on the training split.
# Cross-validation is currently disabled; to re-enable it run:
#   lr_scores = cross_val_score(lr, X_train, y_train, cv=5)
#   print("LR cross-validation scores:", lr_scores)
lr = LogisticRegression(solver='saga', C=1)
lr.fit(X_train, y_train)

In [None]:

# Gradient-boosted trees on the training split: 200 estimators, depth 7, eta 0.1.
xgb = XGBClassifier(eta=0.1, max_depth=7, n_estimators=200)
xgb.fit(X_train, y_train)


In [9]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, fitted on the training split.
gnb = GaussianNB()
gnb.fit(X=X_train, y=y_train)


In [7]:
# Restore the previously pickled models from disk for inference.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


loaded_xgb = _load_pickle("xgb.pickle")  # XGBoost
loaded_svm = _load_pickle("svm.pickle")  # SVM
loaded_lr = _load_pickle("lr.pickle")    # Logistic Regression

In [10]:


# Soft-voting ensemble over the three unpickled models plus the Gaussian NB
# model; the SVM's vote carries weight 2, every other model weight 1.
# Earlier variant built from the in-notebook models:
#ensemble = VotingClassifier(estimators=[('xgb', xgb), ('svm', svm), ('lr', lr)], voting='soft')
base_models = [
    ('xgb', loaded_xgb),
    ('svm', loaded_svm),
    ('lr', loaded_lr),
    ('gnb', gnb),
]
vote_weights = [1, 2, 1, 1]
ensemble = VotingClassifier(estimators=base_models, voting='soft', weights=vote_weights)

# Fit the ensemble on the training split.
ensemble.fit(X_train, y_train)

'''
# Fine-tune the ensemble by adjusting the voting weights
weights = ensemble.predict_proba(X_test)
weights = np.average(weights, axis=0, weights=y_test)
ensemble.weights = weights

# Train the fine-tuned ensemble on the entire training data
ensemble.fit(X_train, y_train)
'''

'\n# Fine-tune the ensemble by adjusting the voting weights\nweights = ensemble.predict_proba(X_test)\nweights = np.average(weights, axis=0, weights=y_test)\nensemble.weights = weights\n\n# Train the fine-tuned ensemble on the entire training data\nensemble.fit(X_train, y_train)\n'

In [12]:

# Score the held-out split with each individual model and with the ensemble.
evaluated_models = (loaded_xgb, loaded_svm, loaded_lr, gnb, ensemble)
(
    xgb_predictions,
    svm_predictions,
    lr_predictions,
    gnb_predictions,
    ensemble_predictions,
) = [model.predict(X_test) for model in evaluated_models]


In [13]:

# Accuracy of every model on the held-out split.
(
    xgb_accuracy,
    svm_accuracy,
    lr_accuracy,
    gnb_accuracy,
    ensemble_accuracy,
) = [
    accuracy_score(y_test, predictions)
    for predictions in (
        xgb_predictions,
        svm_predictions,
        lr_predictions,
        gnb_predictions,
        ensemble_predictions,
    )
]

# One line per model, in the same order as computed above.
accuracy_lines = (
    ("XGBoost Accuracy:", xgb_accuracy),
    ("SVM Accuracy:", svm_accuracy),
    ("Logistic Regression Accuracy:", lr_accuracy),
    ("Gaussian Naive-Bayes Accuracy:", gnb_accuracy),
    ("Ensemble Accuracy:", ensemble_accuracy),
)
for caption, score in accuracy_lines:
    print(caption, score)

XGBoost Accuracy: 0.9338588732935974
SVM Accuracy: 0.9425110555662373
Logistic Regression Accuracy: 0.9392424533743511
Gaussian Naive-Bayes Accuracy: 0.8621418957892713
Ensemble Accuracy: 0.942703326283407


In [14]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the ensemble on the held-out split.
y_pred = ensemble.predict(X_test)
ensemble_report = classification_report(y_test, y_pred)
print(ensemble_report)

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      3962
           1       0.88      0.89      0.88      1239

    accuracy                           0.94      5201
   macro avg       0.92      0.92      0.92      5201
weighted avg       0.94      0.94      0.94      5201



In [41]:
# SVM pickle: tune an SVM with a grid search and save the best model.

# Single-step pipeline; the grid below addresses that step as 'classifier'.
pipe = Pipeline(steps=[('classifier', SVC())])

# Candidate kernels and regularisation strengths.
svm_search_space = {
    'classifier': [SVC()],
    'classifier__kernel': ['linear', 'rbf'],
    'classifier__C': [1, 10, 100, 1000],
}
param_grid = [svm_search_space]

# 5-fold cross-validated grid search on the training split.
grid_search = GridSearchCV(pipe, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Pickle the best pipeline found by the search.
best_svm_pipeline = grid_search.best_estimator_
with open('best_svm_model.pkl', 'wb') as model_file:
    pickle.dump(best_svm_pipeline, model_file)

# Report the winning configuration and its mean cross-validation score.
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))


Best parameters: {'classifier': SVC(C=10, kernel='linear'), 'classifier__C': 10, 'classifier__kernel': 'linear'}
Best cross-validation score: 0.94


In [None]:
#LR Pickle: tune logistic regression with a grid search and save the best model.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Candidate regularisation strengths and solvers.
param_grid = {'C': [0.1, 1, 10],
              'solver': ['newton-cg', 'liblinear', 'saga']}

# Use a dedicated name for the untrained template. GridSearchCV fits clones of
# the estimator it is given, so the object passed in stays unfitted; binding it
# to `lr` would replace the fitted `lr` model with an unfitted one, and the
# save cell would then write that unfitted model to lr.pickle.
lr_template = LogisticRegression()
grid_search_lr = GridSearchCV(lr_template, param_grid, cv=5)
grid_search_lr.fit(X_train, y_train)

best_lr_model = grid_search_lr.best_estimator_

# Save the best model as a pickle file
import pickle
with open('best_lr_model.pkl', 'wb') as f:
    pickle.dump(best_lr_model, f)


# Show the best parameters and the best score
print("Best parameters: {}".format(grid_search_lr.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search_lr.best_score_))


In [None]:
# XGB pickle: tune XGBoost with a grid search and save the best model.
import pickle

# Search space: learning rate (eta) and tree depth.
# 'n_estimators': [100, 500, 1000] is left out of the search for now.
param_grid = {
    'eta': [0.01, 0.1, 1],
    'max_depth': [3, 5, 7, 9],
}

# 5-fold cross-validated grid search on the training split.
xgb_clf = XGBClassifier()
grid_search_xgb = GridSearchCV(xgb_clf, param_grid, cv=5)
grid_search_xgb.fit(X_train, y_train)

best_xgb_model = grid_search_xgb.best_estimator_

# Pickle the best model found by the search.
with open('best_xgb_model.pkl', 'wb') as model_file:
    pickle.dump(best_xgb_model, model_file)

# Report the winning configuration and its mean cross-validation score.
print("Best parameters: {}".format(grid_search_xgb.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search_xgb.best_score_))

In [14]:
# Save each model as a pickle file
import pickle

# Resolve every model before touching the disk: if one of them is not defined in
# this kernel, the NameError is raised here and no existing pickle is truncated.
models_to_save = (
    ("xgb.pickle", xgb),            # XGBoost
    ("svm.pickle", svm),            # SVM
    ("lr.pickle", lr),              # Logistic Regression
    ("ensemble.pickle", ensemble),  # Ensemble
)

for path, model in models_to_save:
    # Serialise first; opening with "wb" empties the target file immediately,
    # so a pickling error after open() would destroy the previous version.
    payload = pickle.dumps(model)
    with open(path, "wb") as file:
        file.write(payload)



In [15]:
import pickle

# Ensemble only: write the voting classifier to ensemble.pickle.
with open("ensemble.pickle", "wb") as ensemble_file:
    pickle.dump(ensemble, ensemble_file)



: 