In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import json

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

from mytools import load_csv, preprocess_n_train, predict_n_evaluate, evaluate

## Build corpus from train.csv file  

In [None]:
traindf = load_csv("train")
traindf.head()

In [6]:
traindf.columns

Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'sentiment'],
      dtype='object')

In [11]:
traindf.isna().sum()

movieid                  0
reviewerName             0
isFrequentReviewer       0
reviewText            6447
sentiment                0
dtype: int64

In [12]:
# Replace missing reviews with a single space so the vectorizers always receive strings.
# The result is assigned back to the column: fillna(inplace=True) on a column selection
# may operate on a temporary copy and leave traindf itself unchanged.
traindf["reviewText"] = traindf["reviewText"].fillna(" ")

In [7]:
# Ratio of POSITIVE to NEGATIVE reviews. Counts are looked up by label: integer keys on a
# string-indexed Series rely on a deprecated positional fallback and on the count order.
sentiment_counts = traindf["sentiment"].value_counts()
pos_neg_ratio = sentiment_counts["POSITIVE"] / sentiment_counts["NEGATIVE"]
pos_neg_ratio

2.014204492842195

In [57]:
text = traindf["reviewText"]
text

0         Henry Selick’s first movie since 2009’s Corali...
1         With a cast that reads like the Vogue Oscar pa...
2         Creed II does not give us anything but another...
3         I know what you're thinking, but this is no Li...
4         Director Fernando Meirelles tells the story wi...
                                ...                        
162753    A top-notch thriller with genuine surprises an...
162754    Some people find Derek Zoolander funny and lik...
162755    This fun, gentle comedy focuses mainly on them...
162756    The film is rescued by a strong third act, but...
162757            A peerless exercise in stimulus response.
Name: reviewText, Length: 162758, dtype: object

## Stop words  

In [58]:
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS

In [120]:
# Load the custom stop-word list; the context manager closes the file handle once parsed.
with open("stop_words.json", "r") as stop_words_file:
    stop_words_all = json.load(stop_words_file)
type(stop_words_all), len(stop_words_all)

(list, 1160)

In [None]:
len(ENGLISH_STOP_WORDS), ENGLISH_STOP_WORDS

In [121]:
def get_tfidfs(text, stop_words=stop_words_all, max_features=10000):
    """Return the TF-IDF document-term matrix for ``text``.

    Parameters
    ----------
    text : iterable of str
        Documents to vectorize; the vocabulary is learned from these same documents.
    stop_words : list of str, 'english' or None
        Forwarded to ``TfidfVectorizer``; defaults to the list held in
        ``stop_words_all``.
    max_features : int or None
        Forwarded to ``TfidfVectorizer`` as the cap on vocabulary size.

    Returns
    -------
    The matrix produced by ``TfidfVectorizer.fit_transform`` for ``text``.
    """
    tvec = TfidfVectorizer(stop_words=stop_words, max_features=max_features)
    # fit_transform learns the vocabulary and builds the matrix in one pass, instead of
    # tokenizing the whole corpus once for fit() and a second time for transform().
    return tvec.fit_transform(text)

In [122]:
text_transformed = get_tfidfs(text)
text_transformed.shape



(162758, 10000)

In [123]:
# Split the review texts by sentiment label, selecting rows and column in a single step.
pos_text = traindf.loc[traindf["sentiment"] == "POSITIVE", "reviewText"]
neg_text = traindf.loc[traindf["sentiment"] == "NEGATIVE", "reviewText"]
pos_text.shape, neg_text.shape

((108761,), (53997,))

In [124]:
pos_text_transformed = get_tfidfs(pos_text)
neg_text_transformed = get_tfidfs(neg_text)
pos_text_transformed.shape, neg_text_transformed.shape

((108761, 10000), (53997, 10000))

In [125]:
def get_vocab(text, stop_words=stop_words_all, max_features=10000):
    """Learn a TF-IDF vocabulary from ``text`` and return its term -> index mapping.

    ``stop_words`` and ``max_features`` are forwarded unchanged to ``TfidfVectorizer``.
    """
    vectorizer_options = {"stop_words": stop_words, "max_features": max_features}
    # fit() returns the fitted vectorizer itself, so the mapping can be read off directly.
    return TfidfVectorizer(**vectorizer_options).fit(text).vocabulary_

In [126]:
pos_vocab = get_vocab(pos_text)
neg_vocab = get_vocab(neg_text)
pos_vocab, neg_vocab

({'henry': 4238,
  'movie': 5884,
  '2009': 59,
  'coraline': 1974,
  'motion': 5864,
  'masterpiece': 5547,
  'creed': 2082,
  'superior': 8696,
  'rocky': 7547,
  'sequel': 7862,
  'wins': 9841,
  'points': 6679,
  'expect': 3176,
  'knockout': 5040,
  'thinking': 8987,
  'limitless': 5258,
  'bradley': 1070,
  'cooper': 1967,
  'lucy': 5372,
  'taps': 8849,
  'brain': 1072,
  'thrills': 9018,
  'skills': 8102,
  'passing': 6411,
  'hour': 4364,
  'director': 2518,
  'fernando': 3413,
  'tells': 8909,
  'story': 8500,
  'urgency': 9490,
  'sharp': 7939,
  'visual': 9623,
  'compositions': 1796,
  'washed': 9707,
  'cinematography': 1566,
  'gangster': 3772,
  'life': 5230,
  'rich': 7462,
  'piece': 6577,
  'storytelling': 8505,
  'feels': 3398,
  'bucks': 1187,
  'heartfelt': 4194,
  'lovely': 5351,
  'performance': 6490,
  'scott': 7765,
  'bit': 900,
  'long': 5317,
  'cartoon': 1350,
  'feature': 3386,
  'sign': 8033,
  'makers': 5442,
  'fell': 3402,
  'love': 5348,
  'ratatouil

In [127]:
len(pos_vocab), len(neg_vocab)

(10000, 10000)

In [128]:
len(pos_vocab.keys() & neg_vocab.keys())


7695

In [129]:
len(pos_vocab.keys() - neg_vocab.keys())

2305

In [130]:
len(neg_vocab.keys() - pos_vocab.keys())

2305

In [None]:
pos_vocab

In [None]:
neg_vocab

## Function for Weighted Class TF-IDF  
Reference: [How to correctly use TF-IDF with imbalanced data](https://www.deepwizai.com/projects/how-to-correctly-use-tf-idf-with-imbalanced-data)  

In [148]:
def get_tfidf_vocab(traindf, stop_words='english', max_features=10000):
    """Build a TF-IDF vocabulary whose size is split between the two sentiment classes.

    Each class gets a share of ``max_features`` proportional to its number of reviews.
    The POSITIVE vocabulary is learned first; the NEGATIVE vocabulary is then learned
    with the POSITIVE terms excluded, so the two sets of terms never overlap.

    Parameters
    ----------
    traindf : pandas.DataFrame
        Needs a ``sentiment`` column with "POSITIVE"/"NEGATIVE" labels and a
        ``reviewText`` column.
    stop_words : 'english', list of str or None
        Stop words removed when learning the vocabulary of *both* classes.
    max_features : int
        Upper bound on the number of terms in the combined vocabulary.

    Returns
    -------
    dict
        Maps every selected term to a unique index, assigned in sorted term order.

    Raises
    ------
    ValueError
        If ``traindf`` contains no POSITIVE or NEGATIVE reviews.
    """
    pos_text = traindf.loc[traindf["sentiment"] == "POSITIVE", "reviewText"]
    neg_text = traindf.loc[traindf["sentiment"] == "NEGATIVE", "reviewText"]

    # Size the shares from the frame that was passed in, not from a notebook-level
    # variable, so the function gives the same answer for any subset of the data.
    n_docs = len(pos_text) + len(neg_text)
    if n_docs == 0:
        raise ValueError("traindf contains no POSITIVE or NEGATIVE reviews")
    n_pos_features = round((len(pos_text) / n_docs) * max_features)
    # Give the remainder to the other class so the two shares always sum to max_features.
    n_neg_features = max_features - n_pos_features

    tvec_pos = TfidfVectorizer(stop_words=stop_words, max_features=n_pos_features)
    tvec_pos.fit(pos_text)
    pos_vocab = tvec_pos.vocabulary_

    # Exclude the POSITIVE terms *in addition to* the caller's stop words. Passing only
    # the positive vocabulary would silently drop the stop-word filter for this class,
    # and stop_words has to be a list rather than a dict.
    base_stop_words = set(tvec_pos.get_stop_words() or ())
    neg_stop_words = sorted(base_stop_words | set(pos_vocab))
    tvec_neg = TfidfVectorizer(stop_words=neg_stop_words, max_features=n_neg_features)
    tvec_neg.fit(neg_text)
    neg_vocab = tvec_neg.vocabulary_

    # Each vectorizer numbers its own terms from 0, so the two index ranges collide.
    # Re-index the union so every term owns a distinct position.
    combined_terms = sorted(set(pos_vocab) | set(neg_vocab))
    vocab_combined = {term: index for index, term in enumerate(combined_terms)}
    return vocab_combined

In [150]:
v = get_tfidf_vocab(traindf)

10000

In [153]:
v.keys()



## N-gram features (unigrams and bigrams)  

In [154]:
# Unigram + bigram TF-IDF over the full corpus. fit_transform tokenizes the reviews once;
# a separate fit() followed by transform() would process every review twice.
tvec_ng = TfidfVectorizer(ngram_range=(1,2))
text_transformed_ng = tvec_ng.fit_transform(text)
text_transformed_ng

<162758x1096578 sparse matrix of type '<class 'numpy.float64'>'
	with 6254759 stored elements in Compressed Sparse Row format>

In [155]:
tvec_ng.vocabulary_

{'henry': 437471,
 'selick': 827930,
 'first': 358192,
 'movie': 623832,
 'since': 848567,
 '2009': 3111,
 'coraline': 219433,
 'his': 444471,
 'fifth': 348234,
 'stop': 889616,
 'motion': 622509,
 'masterpiece': 590100,
 'henry selick': 437520,
 'selick first': 827936,
 'first movie': 358598,
 'movie since': 625894,
 'since 2009': 848602,
 '2009 coraline': 3120,
 'coraline his': 219439,
 'his fifth': 445547,
 'fifth stop': 348269,
 'stop motion': 889727,
 'motion masterpiece': 622561,
 'with': 1070138,
 'cast': 171369,
 'that': 930838,
 'reads': 773659,
 'like': 556788,
 'the': 937221,
 'vogue': 1038539,
 'oscar': 694376,
 'party': 712015,
 'guest': 417259,
 'list': 562510,
 'valentine': 1027738,
 'day': 239927,
 'should': 841765,
 'have': 428815,
 'been': 114863,
 'can': 164745,
 'miss': 609690,
 'cinema': 188133,
 'instead': 489343,
 'of': 661701,
 'standard': 881359,
 'hollywood': 450664,
 'schmaltz': 816995,
 'with cast': 1070927,
 'cast that': 171934,
 'that reads': 935306,
 'rea