In [2]:
# Import libraries
import re
import sys
from hashlib import sha1

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Autograding
import tests_lab4
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# train test split and cross validation
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

##  Sentiment analysis on the IMDB dataset

In [3]:
imdb_df = pd.read_csv("imdb_master.csv", encoding="ISO-8859-1", index_col="Unnamed: 0")
imdb_df = imdb_df.query('label == "neg" | label == "pos"')
train_df = imdb_df.query('type == "train"')
test_df = imdb_df.query('type == "test"')
train_df

Unnamed: 0,type,review,label,file
25000,train,Story of a man who has unnatural feelings for ...,neg,0_3.txt
25001,train,Airport '77 starts as a brand new luxury 747 p...,neg,10000_4.txt
25002,train,This film lacked something I couldn't put my f...,neg,10001_4.txt
25003,train,"Sorry everyone,,, I know this is supposed to b...",neg,10002_1.txt
25004,train,When I was little my parents took me along to ...,neg,10003_1.txt
...,...,...,...,...
49995,train,"Seeing as the vote average was pretty low, and...",pos,9998_9.txt
49996,train,"The plot had some wretched, unbelievable twist...",pos,9999_8.txt
49997,train,I am amazed at how this movie(and most others ...,pos,999_10.txt
49998,train,A Christmas Together actually came before my t...,pos,99_8.txt


In [4]:
X_train, y_train = train_df["review"], train_df["label"]
X_test, y_test = test_df["review"], test_df["label"]

In [5]:
# from pandas_profiling import ProfileReport
# ProfileReport(train_df.query("label == 'pos'"), explorative=True)

negative reviews length: 
Max length 8969, Median length	976.5, Mean length 1303.19936, Min length 52

positive reviews length: 
Max length 13704, Median length 982, Mean length 1347.42648, Min length	70

## Model building and hyperparameter optimization

In [6]:
# Adapted code
def store_cross_val_results(model_name, scores, results_dict):
    """
    Stores mean scores from cross_validate in results_dict for
    the given model model_name.

    Parameters
    ----------
    model_name :
        scikit-learn classification model
    scores : dict
        object return by `cross_validate`
    results_dict: dict
        dictionary to store results

    Returns
    ----------
        None

    """
    results_dict[model_name] = {
        "mean_train_accuracy": "{:0.4f}".format(np.mean(scores["train_score"])),
        "mean_valid_accuracy": "{:0.4f}".format(np.mean(scores["test_score"])),
        "mean_fit_time (s)": "{:0.4f}".format(np.mean(scores["fit_time"])),
        "mean_score_time (s)": "{:0.4f}".format(np.mean(scores["score_time"])),
        "std_train_score": "{:0.4f}".format(scores["train_score"].std()),
        "std_valid_score": "{:0.4f}".format(scores["test_score"].std()),
    }

### Baseline DummyClassifier 

In [7]:
results_dict = {}

In [8]:
dummy = DummyClassifier(strategy="prior")
dummy_pipe = make_pipeline(CountVectorizer(binary=True), DummyClassifier(strategy="most_frequent"))
scores = cross_validate(dummy_pipe, X_train, y_train, return_train_score=True)
store_cross_val_results("dummy", scores, results_dict)

In [9]:
results_dict

{'dummy': {'mean_train_accuracy': '0.5000',
  'mean_valid_accuracy': '0.5000',
  'mean_fit_time (s)': '4.2760',
  'mean_score_time (s)': '1.0344',
  'std_train_score': '0.0000',
  'std_valid_score': '0.0000'}}

### Different classifiers 

In [10]:
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "RBF SVM": SVC(),
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=2000),
}
results_dict = {}

In [11]:
for model_name, model in models.items():
    pipe = Pipeline(steps=[("pca", CountVectorizer(binary = True)), ("classifier", model)])
    scores = cross_validate(pipe, X_train, y_train, cv=2, return_train_score=True, n_jobs=-1)
    store_cross_val_results(model_name, scores, results_dict)

In [12]:
pd.DataFrame(results_dict).T

Unnamed: 0,mean_train_accuracy,mean_valid_accuracy,mean_fit_time (s),mean_score_time (s),std_train_score,std_valid_score
Decision Tree,1.0,0.705,13.8674,2.7347,0.0,0.0014
RBF SVM,0.9738,0.8646,177.082,115.5021,0.0002,0.0018
Naive Bayes,0.9377,0.8088,3.1778,2.6625,0.0058,0.0117
Logistic Regression,0.9994,0.852,5.3049,2.6935,0.0001,0.0004


>The LogReg and RBF SVM are very close, with high validation scores. Then the NB, and followed by DT. 

> The distance between the train and validation scores for RBF are approx 10, for NB is 13, for LogReg is 14, and for DT is 30. So the RBF is actually fitting the best.

> In terms of timing, the NB and LogReg fit very fast, with RBF being around 30x to 50x longer, and DT around 5x longer. 

> The only one that seems to be severely overfitting is the DT, which is expected.

### Hyperparameter optimization 

In [13]:
from sklearn.linear_model import LogisticRegression
pipe_3_3 = Pipeline(steps=[("pca", CountVectorizer(binary = True)), 
                           ("classifier", LogisticRegression(max_iter = 1000))])

In [14]:
param_grid_random = {"classifier__C": 10.0 ** np.arange(-3, 3),
                 "pca__max_features": [3500,7500,1000]}

random_searching = RandomizedSearchCV(pipe_3_3, 
                                      param_distributions=param_grid_random, 
                                      n_jobs=-1, n_iter=10, return_train_score=True, cv=2)
random_searching.fit(X_train, y_train)

RandomizedSearchCV(cv=2,
                   estimator=Pipeline(steps=[('pca',
                                              CountVectorizer(binary=True)),
                                             ('classifier',
                                              LogisticRegression(max_iter=1000))]),
                   n_jobs=-1,
                   param_distributions={'classifier__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                                        'pca__max_features': [3500, 7500,
                                                              1000]},
                   return_train_score=True)

In [15]:
pd.DataFrame(random_searching.cv_results_)[
    [
        "rank_test_score",
        "mean_fit_time",
        "mean_test_score",
        "param_classifier__C",
        "param_pca__max_features"
    ]
].set_index("rank_test_score").sort_index()

Unnamed: 0_level_0,mean_fit_time,mean_test_score,param_classifier__C,param_pca__max_features
rank_test_score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,5.346765,0.86228,0.1,7500
2,4.902316,0.85836,0.01,7500
3,6.288775,0.84536,1.0,7500
4,4.772601,0.84408,0.01,1000
5,7.90556,0.83888,10.0,1000
6,6.741022,0.8384,100.0,1000
7,6.166258,0.83088,0.001,7500
8,5.709755,0.83,0.001,3500
9,5.676651,0.81868,0.001,1000
10,10.915322,0.80696,100.0,3500


In [16]:
print("Best cv score from grid search: %.3f" % random_searching.best_score_)
random_searching.best_params_
# random_searching.best_estimator_

Best cv score from grid search: 0.862


{'pca__max_features': 7500, 'classifier__C': 0.1}

##  Model interpretation

In [17]:
# random_searching.best_estimator_.fit(X_train, y_train)

In [18]:
weights = random_searching.best_estimator_['classifier'].coef_.flatten()
#classifier = logreg
vocab = random_searching.best_estimator_['pca'].get_feature_names()
#pca = countvec
inds = np.argsort(random_searching.best_estimator_['classifier'].coef_.flatten())

negative_words = [vocab[index] for index in inds[:20]]
positive_words = [vocab[index] for index in inds[-20:]]
negative_words_weights = [weights[index] for index in inds[:20]]
positive_words_weights = [weights[index] for index in inds[-20:]]

pd.DataFrame(
    {
        "Negative Words": negative_words, "Negative Word Weights": negative_words_weights,
        "Positive Words": positive_words,"Positive Word Weights": positive_words_weights,
    })

Unnamed: 0,Negative Words,Negative Word Weights,Positive Words,Positive Word Weights
0,worst,-1.530608,hooked,0.621352
1,waste,-1.417391,surprisingly,0.640163
2,awful,-1.184617,perfectly,0.653714
3,poorly,-1.153789,gem,0.66601
4,disappointment,-1.148894,today,0.669176
5,boring,-0.970015,appreciated,0.674908
6,disappointing,-0.934189,noir,0.694941
7,dull,-0.879641,loved,0.695086
8,lacks,-0.849454,funniest,0.698207
9,mess,-0.820419,wonderful,0.703071


> The classification seems correct, with negative words like "worst" as highly negative, while words like "hooked", are positive. 

> This task is best suited for models that handle binary data, with learning from the examples and not the features. So, it would be harder for RBFs or NBs to get most informative features, while DT might actually do better, due to it's binary nature.

## Test score and final evaluation 

In [19]:
pipe_5 = random_searching.best_estimator_

In [20]:
pipe_5.fit(X_train, y_train)
pipe_5.score(X_test, y_test)

0.87756

> Score of 0.877 is better than the previous 0.862.

> The test scores are very similiar, which means we don't have much overfitting or underfitting. I trust these test scores because we also didn't violate any golden rules + we have a large data set. 

In [21]:
reviews = pd.DataFrame({"reviews":X_train,
                       "prediction": pipe_5.predict(X_train),
                       "probabilities": pipe_5.predict_proba(X_train).tolist()})

In [22]:
positive_review_predict = reviews.query("prediction == 'pos'")
positive_review_predict.sort_values('probabilities').iloc[0:5]

Unnamed: 0,reviews,prediction,probabilities
43417,By now you've probably heard a bit about the n...,pos,"[2.4873436643702007e-12, 0.9999999999975127]"
40085,By 1987 Hong Kong had given the world such fil...,pos,"[6.94970037073972e-09, 0.9999999930502996]"
40546,Mukhsin is a beautiful movie about a first lov...,pos,"[2.739420634778611e-08, 0.9999999726057937]"
38084,Romance is in the air and love is in bloom in ...,pos,"[3.8195575480237665e-08, 0.9999999618044245]"
46752,"""Twelve Monkeys"" is odd and disturbing, yet be...",pos,"[7.257617962164176e-08, 0.9999999274238204]"


In [23]:
negative_review_predict = reviews.query("prediction == 'neg'")
negative_review_predict.sort_values('probabilities', ascending = False).iloc[0:5]

Unnamed: 0,reviews,prediction,probabilities
29702,Zombi 3 starts as a group of heavily armed men...,neg,"[0.9999999998966679, 1.0333211105564274e-10]"
30388,"Sexo Cannibal, or Devil Hunter as it's more co...",neg,"[0.9999999982309533, 1.76904665983887e-09]"
35171,Munchies starts in deepest darkest Peru (looks...,neg,"[0.9999999977927764, 2.2072236201023745e-09]"
26780,This is crap....utter crap. I cannot believe a...,neg,"[0.9999999945945717, 5.405428302356882e-09]"
33321,Scarecrow is set in the small American town of...,neg,"[0.9999999877892887, 1.2210711313025467e-08]"
