In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv('IMDb_nonstemmed_w2v_500v_data.csv')
df.shape


(49582, 501)

In [3]:
X = df.drop('sentiment', axis = 1)
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)
# ensuring we have the same splits for all models
kf = KFold(n_splits=5)

### 1. Random Forest

1.1 Initial Modeling 

In [4]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8589    0.8005    0.8287      5003
           1     0.8100    0.8661    0.8371      4914

    accuracy                         0.8330      9917
   macro avg     0.8345    0.8333    0.8329      9917
weighted avg     0.8347    0.8330    0.8329      9917



1.2 Random Forest tuning using GridSearchCV

In [5]:
# hyper parameters to be tuned
params = [{
    'n_estimators': [50, 100, 200],
    'criterion': ['gini', 'entropy']
}]

In [6]:
rf_GCV = GridSearchCV(RandomForestClassifier(), param_grid=params, cv = kf, verbose=4)
best_rf = rf_GCV.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ...criterion=gini, n_estimators=50;, score=0.820 total time=  26.6s
[CV 2/5] END ...criterion=gini, n_estimators=50;, score=0.821 total time=  26.9s
[CV 3/5] END ...criterion=gini, n_estimators=50;, score=0.822 total time=  26.5s
[CV 4/5] END ...criterion=gini, n_estimators=50;, score=0.820 total time=  26.6s
[CV 5/5] END ...criterion=gini, n_estimators=50;, score=0.814 total time=  26.7s
[CV 1/5] END ..criterion=gini, n_estimators=100;, score=0.833 total time=  53.2s
[CV 2/5] END ..criterion=gini, n_estimators=100;, score=0.823 total time=  54.2s
[CV 3/5] END ..criterion=gini, n_estimators=100;, score=0.830 total time=  57.5s
[CV 4/5] END ..criterion=gini, n_estimators=100;, score=0.837 total time=  56.6s
[CV 5/5] END ..criterion=gini, n_estimators=100;, score=0.825 total time=  54.5s
[CV 1/5] END ..criterion=gini, n_estimators=200;, score=0.838 total time= 1.8min
[CV 2/5] END ..criterion=gini, n_estimators=200;,

Optimal hyperparameters:

In [9]:
print(best_rf.best_params_)

{'criterion': 'entropy', 'n_estimators': 200}


In [10]:
#79 mins to fi
best_rf.best_estimator_

y_pred = best_rf.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8646    0.8119    0.8374      5003
           1     0.8197    0.8706    0.8444      4914

    accuracy                         0.8410      9917
   macro avg     0.8422    0.8412    0.8409      9917
weighted avg     0.8424    0.8410    0.8409      9917



### 2. Decision Tree

2.1 Inital Modeling

In [11]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.6975    0.6876    0.6925      5003
           1     0.6865    0.6964    0.6914      4914

    accuracy                         0.6919      9917
   macro avg     0.6920    0.6920    0.6919      9917
weighted avg     0.6920    0.6919    0.6919      9917



2.2 Decision Tree tuning using GridSearchCV

In [14]:
# hyper parameters to be tuned
params = [{
    'criterion': ['gini', 'entropy'],
    'max_depth': ['None', 50, 100]
}]
dt_GCV = GridSearchCV(DecisionTreeClassifier(), param_grid=params, cv = kf, verbose=4)
best_dt = dt_GCV.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ......criterion=gini, max_depth=None;, score=nan total time=   0.0s
[CV 2/5] END ......criterion=gini, max_depth=None;, score=nan total time=   0.0s
[CV 3/5] END ......criterion=gini, max_depth=None;, score=nan total time=   0.0s
[CV 4/5] END ......criterion=gini, max_depth=None;, score=nan total time=   0.0s
[CV 5/5] END ......criterion=gini, max_depth=None;, score=nan total time=   0.0s
[CV 1/5] END ......criterion=gini, max_depth=50;, score=0.686 total time=  21.8s
[CV 2/5] END ......criterion=gini, max_depth=50;, score=0.683 total time=  26.4s
[CV 3/5] END ......criterion=gini, max_depth=50;, score=0.677 total time=  22.8s
[CV 4/5] END ......criterion=gini, max_depth=50;, score=0.688 total time=  22.0s
[CV 5/5] END ......criterion=gini, max_depth=50;, score=0.687 total time=  21.9s
[CV 1/5] END .....criterion=gini, max_depth=100;, score=0.687 total time=  20.1s
[CV 2/5] END .....criterion=gini, max_depth=100;,

10 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/donnobanmaldonado/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/donnobanmaldonado/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/donnobanmaldonado/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/donnobanmaldonado/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_v

In [15]:
best_dt.best_params_

{'criterion': 'entropy', 'max_depth': 100}

In [16]:
# fit took 19 minutes
best_dt.best_estimator_

y_pred = best_dt.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.7046    0.6980    0.7013      5003
           1     0.6954    0.7021    0.6987      4914

    accuracy                         0.7000      9917
   macro avg     0.7000    0.7000    0.7000      9917
weighted avg     0.7001    0.7000    0.7000      9917



#### Multi-Layer Perceptron

3.1 Initial Modeling

In [17]:
mlpModel = MLPClassifier()
mlpModel.fit(X_train, y_train)



In [None]:
y_pred = mlpModel.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.8922    0.8919    0.8920      5003
           1     0.8900    0.8903    0.8901      4914

    accuracy                         0.8911      9917
   macro avg     0.8911    0.8911    0.8911      9917
weighted avg     0.8911    0.8911    0.8911      9917



3.2 MLPClassifer tuning with GridSearchCV

In [18]:
params = [{
    'hidden_layer_sizes': [(100,), (200,)],
    'activation': ['logistic', 'adam']
}]

mlp_GCV = GridSearchCV(MLPClassifier(), param_grid=params, cv = kf, verbose=4)
best_mlp = mlp_GCV.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END activation=logistic, hidden_layer_sizes=(100,);, score=0.890 total time=  33.7s
[CV 2/5] END activation=logistic, hidden_layer_sizes=(100,);, score=0.884 total time=  35.2s
[CV 3/5] END activation=logistic, hidden_layer_sizes=(100,);, score=0.884 total time=  33.5s
[CV 4/5] END activation=logistic, hidden_layer_sizes=(100,);, score=0.886 total time=  33.0s
[CV 5/5] END activation=logistic, hidden_layer_sizes=(100,);, score=0.882 total time=  45.3s
[CV 1/5] END activation=logistic, hidden_layer_sizes=(200,);, score=0.890 total time= 1.2min
[CV 2/5] END activation=logistic, hidden_layer_sizes=(200,);, score=0.884 total time=  57.9s
[CV 3/5] END activation=logistic, hidden_layer_sizes=(200,);, score=0.886 total time= 1.1min
[CV 4/5] END activation=logistic, hidden_layer_sizes=(200,);, score=0.886 total time=  41.1s
[CV 5/5] END activation=logistic, hidden_layer_sizes=(200,);, score=0.881 total time=  48.7s
[CV 1/5] E

10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/donnobanmaldonado/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/donnobanmaldonado/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/donnobanmaldonado/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/donnobanmaldonado/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_v

In [19]:
best_mlp.best_params_

{'activation': 'logistic', 'hidden_layer_sizes': (200,)}

In [20]:
# fit took 19 minutes
best_mlp.best_estimator_

y_pred = best_mlp.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9135    0.8463    0.8786      5003
           1     0.8544    0.9184    0.8852      4914

    accuracy                         0.8820      9917
   macro avg     0.8839    0.8823    0.8819      9917
weighted avg     0.8842    0.8820    0.8819      9917

