In [2]:
# import libraries
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import datetime

In [3]:
# Q 12 - 1 - Read reviews from the IMDB50000.csv.
reviews = pd.read_csv("IMDB50000.csv")
reviews.head()
reviews.shape

(50000, 2)

In [4]:
import operator

top_features = 1000
skip_words ="the a and of to is in I that this it /><br was />The />This -- (the"

words = dict()
for index, row in reviews.iterrows():
    for review_word in row["review"].split():
        if review_word in skip_words:
            continue
        if review_word not in words:
            words[review_word] = 1
        else:
            words[review_word] = words[review_word]+1
            
top_words = dict(sorted(words.items(), key=operator.itemgetter(1),reverse=True)[:top_features])
#print("Top", top_features, "words in reviews:")
print(top_words)

{'with': 82569, 'for': 80919, 'but': 66282, 'on': 61197, 'movie': 60762, 'are': 56513, 'film': 54277, 'have': 54009, 'not': 52140, 'you': 50697, 'be': 50539, 'by': 42290, 'one': 41334, 'from': 37876, 'who': 37207, 'like': 36028, 'all': 35272, 'they': 35201, 'has': 32420, 'so': 32386, 'just': 32338, 'or': 32221, 'about': 32036, 'her': 29944, 'out': 28679, 'some': 28052, 'very': 25794, 'more': 25412, 'would': 23578, 'what': 23320, 'when': 22749, 'good': 22582, 'if': 22009, 'their': 21975, 'only': 21694, 'It': 21475, 'really': 21322, 'had': 21305, 'up': 21196, 'even': 20979, "it's": 20949, 'can': 20863, 'which': 20703, 'were': 20552, 'see': 20364, 'my': 20356, 'no': 19951, 'than': 19043, 'she': 18980, 'there': 18160, 'been': 17883, 'into': 17515, 'get': 17333, 'will': 16938, 'much': 16827, 'story': 16443, 'because': 16371, 'other': 15745, 'most': 15508, 'time': 15312, 'we': 15191, 'me': 15173, 'make': 14859, 'do': 14775, 'how': 14754, 'could': 14689, 'also': 14607, 'people': 14414, 'its':

In [5]:
print(datetime.datetime.now())
try:
    features_df = pd.read_csv('features_df_new.txt') 
    print('from file')
except:
    print('build feature df')

    features_df = pd.DataFrame(columns=list(top_words))

    for index, row in reviews.iterrows():
        feature_indx = 0
        feature_row = ["0"] * top_features
        for feature in top_words:
            for review_word in row["review"].split():
                if(feature == review_word):
                    feature_row[feature_indx] = "1"
                    break
            feature_indx = feature_indx + 1

        features_df.loc[len(features_df),:] = feature_row

    features_df.to_csv('features_df_new.txt', index=False)

print(datetime.datetime.now())
print(features_df.shape)
features_df.head()

2021-06-22 13:22:46.343783
from file
2021-06-22 13:22:50.692426
(50000, 1000)


Unnamed: 0,with,for,but,on,movie,are,film,have,not,you,...,By,forward,future,portrayed,unique,uses,class,what's,fails,hold
0,1,1,1,1,0,1,0,1,1,1,...,0,0,0,0,0,0,1,0,0,0
1,1,0,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,1,1,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

label_df = reviews[["sentiment"]]

label_df.loc[label_df["sentiment"] == "positive", "sentiment"] = "1"
label_df.loc[label_df["sentiment"] == "negative", "sentiment"] = "0"

features_df["label"] = label_df["sentiment"]
print(features_df.shape)
features_df.head()

(50000, 1001)


Unnamed: 0,with,for,but,on,movie,are,film,have,not,you,...,forward,future,portrayed,unique,uses,class,what's,fails,hold,label
0,1,1,1,1,0,1,0,1,1,1,...,0,0,0,0,0,1,0,0,0,1
1,1,0,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,1,1,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
3,1,1,0,0,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,1,1,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

features_df_rand = shuffle(features_df)
features, labels = features_df_rand.iloc[:,0:-1],features_df_rand.loc[:,['label']]
X_train, X_test, y_train, y_test = train_test_split(features, labels,test_size=.2, random_state=42)
y_train
X_train

Unnamed: 0,with,for,but,on,movie,are,film,have,not,you,...,By,forward,future,portrayed,unique,uses,class,what's,fails,hold
19558,1,1,1,1,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
28406,1,1,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
36206,1,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
26352,1,1,1,1,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4040,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38031,1,0,1,1,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
12517,0,1,0,0,1,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
33725,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
36463,0,0,1,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=500, random_state=12, activation='logistic', max_iter=500)
clf.fit(X_train, y_train.values.ravel())

MLPClassifier(activation='logistic', alpha=1e-05, hidden_layer_sizes=500,
              max_iter=500, random_state=12, solver='lbfgs')

In [9]:
print("Training set score: %f" % clf.score(X_train, y_train))

clf.predict(X_test)
clf.score(X_test, y_test)
print("Test set score: %f" % clf.score(X_test, y_test))

Training set score: 0.999975
Test set score: 0.801100


In [12]:
from sklearn.neural_network import MLPClassifier
clf2 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=300, random_state=12, activation='logistic', max_iter=500)
clf2.fit(X_train, y_train.values.ravel())

print("Training set score: %f" % clf2.score(X_train, y_train))

clf2.predict(X_test)
clf2.score(X_test, y_test)
print("Test set score: %f" % clf2.score(X_test, y_test))

Training set score: 1.000000
Test set score: 0.798000


Reducing the nodes from hidden layer, overfitted the model.
I tried adding layers or nodes to single hidden layer, performance did not increase