# IMDB Model Development, Tuning, and Evaluation

Goal: Achieve the highest possible accuracy when classifying textual reviews as positive or negative.

Models to be developed:
1. Logistic Regression
2. SVM
3. XGBoost


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

##### Reading in Data
The textual data has already been vectorized using Word2Vec, resulting in 500 features representing each review plus 1 target column, `sentiment`.

In [4]:
# Load the pre-vectorized reviews (Word2Vec features plus the `sentiment` target).
df = pd.read_csv(filepath_or_buffer='IMDb_nonstemmed_w2v_500v_data.csv')
# Report (rows, columns), then preview the first five rows as the cell output.
print(df.shape)
df.head(n=5)

(49582, 501)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,sentiment
0,-0.006993,-0.017787,0.010712,0.001786,-0.202235,-0.010649,-0.035199,0.142727,-0.071552,0.089088,...,-0.09854,0.070369,-0.056713,-0.140328,-0.034548,-0.081482,0.029489,0.045794,-0.007057,1
1,-0.000186,-0.085118,0.075544,0.031908,-0.28304,-0.08143,-0.002124,0.162392,-0.035978,0.026303,...,0.012751,0.048073,0.011637,-0.101292,-0.059473,-0.080104,0.049169,0.067283,0.023935,1
2,0.085147,-0.115304,-0.024152,0.015084,-0.285222,-0.051906,0.040571,0.19317,-0.089491,0.084364,...,-0.043263,0.089965,-0.06065,-0.077252,-0.008706,-0.093932,0.025079,0.094748,-0.069459,1
3,0.045757,-0.0518,-0.002757,0.037521,-0.24577,-0.061446,-0.015328,0.127537,-0.015892,0.099974,...,-0.057313,0.01645,-0.008593,-0.031528,-0.036333,-0.091298,-0.032505,0.066214,0.027054,0
4,0.005742,-0.050887,-0.030296,0.004247,-0.251409,-0.1,0.021926,0.206545,-0.04808,0.125729,...,-0.102182,0.06446,-0.025878,-0.123427,-0.081694,-0.110009,0.020667,0.072723,-0.05701,1


#### Splitting into Training and Testing sets
We will be splitting the data into 80% training and 20% testing sets.

The __training set__ will be used for __fitting__.

The __testing set__ will be used for __final model evaluations__.

In [5]:
# Target: the sentiment label of each review.
y = df['sentiment']
# Features: every remaining column once the target has been removed.
X = df.drop(columns=['sentiment'])

# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=34,
)

#### __Note: The following models use the optimal hyperparameters found when training on Word2Vec vectors of size 200.__

### 1. Logistic Regression

In [6]:
# L1-regularised logistic regression fitted with the 'saga' solver.
# 'saga' shuffles the training data while fitting, so random_state is pinned
# to make the reported metrics reproducible on a fresh kernel.
lr_model = LogisticRegression(
    C=1.0,
    max_iter=1000,
    penalty='l1',
    solver='saga',
    random_state=34,  # same seed as the train/test split
)
lr_model.fit(X_train, y_train)

# Evaluate on the held-out test set only.
y_pred = lr_model.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8909    0.8791    0.8849      4921
           1     0.8824    0.8939    0.8881      4996

    accuracy                         0.8866      9917
   macro avg     0.8866    0.8865    0.8865      9917
weighted avg     0.8866    0.8866    0.8865      9917



### 2. SVM


##### 2.1 Initial Modeling

In [9]:
# Linear SVM with an L2 penalty and squared-hinge loss.
# random_state is pinned because LinearSVC shuffles the data when it solves
# the dual problem; the seed has no effect if the primal problem is solved.
svc_model = LinearSVC(
    C=10,
    loss='squared_hinge',
    max_iter=1000,
    penalty='l2',
    random_state=34,  # same seed as the train/test split
)
svc_model.fit(X_train, y_train)

# Evaluate on the held-out test set only.
y_pred = svc_model.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8880    0.8785    0.8832      4921
           1     0.8816    0.8909    0.8862      4996

    accuracy                         0.8847      9917
   macro avg     0.8848    0.8847    0.8847      9917
weighted avg     0.8848    0.8847    0.8847      9917



### 3. XGBoost

In [10]:
# Gradient-boosted trees.
# `n_estimators` is the number of boosting rounds. It was previously passed
# under the misspelled name `ns_estimator`, which is not an XGBoost parameter,
# so the value was never applied as the number of trees.
xgb_model = XGBClassifier(
    learning_rate=0.2,
    max_depth=9,
    n_estimators=100,
)
xgb_model.fit(X_train, y_train)

# Evaluate on the held-out test set only.
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8772    0.8476    0.8621      4921
           1     0.8547    0.8831    0.8687      4996

    accuracy                         0.8655      9917
   macro avg     0.8659    0.8653    0.8654      9917
weighted avg     0.8659    0.8655    0.8654      9917

