# Imports

In [74]:
import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from string import punctuation
from xgboost import XGBClassifier
from time import time
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier

# Helpers

In [2]:
stemmer = nltk.stem.snowball.SnowballStemmer('english')

stopwords = nltk.corpus.stopwords.words('english')

num_re = re.compile(r'\d+')
words_re = re.compile(r'\w+')
html_re = re.compile('<.*?>')

def prepare_en(text: str) -> str:
    tokens = word_tokenize(text.lower())
    tokens = [re.sub(html_re, '', t) for t in tokens]
    tokens = [stemmer.stem(t) for t in tokens]
    tokens = [t for t in tokens if t not in punctuation]
    tokens = [t for t in tokens if t not in stopwords]
    tokens = [t for t in tokens if num_re.match(t) is None]
    tokens = [t for t in tokens if words_re.match(t) is not None]

    return ' '.join(tokens)

# Read data

In [3]:
df = pd.read_csv('./data/imdb.csv')

df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


# Prepare data

In [4]:
df['review_prepared'] = df.review.apply(prepare_en)

df

KeyboardInterrupt: 

In [None]:
df['label'] = df.sentiment == 'positive'

df

In [None]:
df.to_csv('./data/imdb_prepared.csv', index=False)

# Re-load data

In [5]:
df = pd.read_csv('./data/imdb_prepared.csv')

df

Unnamed: 0,review,sentiment,review_prepared,label
0,One of the other reviewers has mentioned that ...,positive,one review mention watch oz episod hook right ...,True
1,A wonderful little production. <br /><br />The...,positive,wonder littl product br br film techniqu veri ...,True
2,I thought this was a wonderful way to spend ti...,positive,thought wonder way spend time hot summer weeke...,True
3,Basically there's a family where a little boy ...,negative,basic famili littl boy jake think zombi closet...,False
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visual stun film...,True
...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,thought movi right good job n't creativ origin...,True
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogu bad act idiot direct anno...,False
49997,I am a Catholic taught in parochial elementary...,negative,cathol taught parochi elementari school nun ta...,False
49998,I'm going to have to disagree with the previou...,negative,go disagre previous comment side maltin one se...,False


# Vectorize (get X and y)

In [6]:
vectorizer = TfidfVectorizer(tokenizer=word_tokenize, min_df=.005)
X = vectorizer.fit_transform(df.review_prepared).toarray()

len(vectorizer.get_feature_names_out())



2586

In [7]:
vectorizer.get_feature_names_out()[:100]

array(['.', 'abandon', 'abil', 'abl', 'abov', 'abrupt', 'absolut',
       'absurd', 'abus', 'academi', 'accent', 'accept', 'access', 'accid',
       'accident', 'accompani', 'accomplish', 'accord', 'account',
       'accur', 'accus', 'achiev', 'across', 'act', 'action', 'activ',
       'actor', 'actress', 'actual', 'ad', 'adam', 'adapt', 'add',
       'addict', 'addit', 'address', 'adequ', 'admir', 'admit', 'adopt',
       'ador', 'adult', 'advanc', 'advantag', 'adventur', 'advertis',
       'advic', 'advis', 'affair', 'affect', 'afford', 'afraid', 'africa',
       'african', 'afternoon', 'afterward', 'again', 'age', 'agent',
       'ago', 'agre', 'ahead', 'ai', 'aid', 'aim', 'air', 'aka', 'al',
       'ala', 'alan', 'albeit', 'albert', 'alcohol', 'alex', 'alic',
       'alien', 'aliv', 'all', 'allen', 'allow', 'almost', 'alon',
       'along', 'alreadi', 'alright', 'also', 'alter', 'altern',
       'although', 'alway', 'amateur', 'amateurish', 'amaz', 'america',
       'american', 'am

In [8]:
y = df.label.astype(int).to_numpy()

In [9]:
y.shape

(50000,)

# Train test split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# XGBoost

In [18]:
bst = XGBClassifier(n_estimators=200, max_depth=10, objective='binary:logistic')

bst.fit(X_train, y_train)

In [19]:
y_pred = bst.predict(X_test)

len(y_test[y_test == y_pred]) / len(y_test) * 100

86.56

## Generalize

In [38]:
def eval_xgboost(min_df:float=0.005, n_estimators:int=20, max_depth:int=5):
    vectorizer = TfidfVectorizer(tokenizer=word_tokenize, min_df=min_df)
    X = vectorizer.fit_transform(df.review_prepared).toarray()
    y = df.label.astype(int).to_numpy()
    n_tokens = len(vectorizer.get_feature_names_out())
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

    bst = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, objective='binary:logistic')
    start = time()
    bst.fit(X_train, y_train)
    time_took = time() - start

    y_pred = bst.predict(X_test)
    accuracy = len(y_test[y_test == y_pred]) / len(y_test) * 100

    return accuracy, n_tokens, time_took, vectorizer, bst

In [46]:
configs = [
    {'min_df': .005, 'n_estimators': 2, 'max_depth': 3},
    {'min_df': .01, 'n_estimators': 2, 'max_depth': 3},
    {'min_df': .003, 'n_estimators': 2, 'max_depth': 3},
    {'min_df': .005, 'n_estimators': 10, 'max_depth': 3},
    {'min_df': .005, 'n_estimators': 2, 'max_depth': 7},
    {'min_df': .005, 'n_estimators': 100, 'max_depth': 3},
    {'min_df': .005, 'n_estimators': 200, 'max_depth': 3},
    {'min_df': .005, 'n_estimators': 100, 'max_depth': 5},
]

for conf in tqdm(configs):
    results = eval_xgboost(**conf)
    conf['accuracy'] = results[0]
    conf['n_tokens'] = results[1]
    conf['time_took'] = results[2]
    conf['vectorizer'] = results[3]
    conf['bst'] = results[4]

100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [05:29<00:00, 41.23s/it]


In [47]:
pd.DataFrame(configs).drop(columns=['vectorizer', 'bst'])

Unnamed: 0,min_df,n_estimators,max_depth,accuracy,n_tokens,time_took
0,0.005,2,3,70.1,2586,2.187343
1,0.01,2,3,69.93,1562,1.698792
2,0.003,2,3,69.78,3705,8.41691
3,0.005,10,3,76.9,2586,3.623488
4,0.005,2,7,75.21,2586,6.412896
5,0.005,100,3,84.52,2586,17.561496
6,0.005,200,3,86.21,2586,32.123636
7,0.005,100,5,85.83,2586,44.349294


In [71]:
configs[6]['bst'].predict(configs[6]['vectorizer'].transform([prepare_en('That was my favorite movie, I like it')]))

array([1])

In [72]:
configs[6]['bst'].predict(configs[6]['vectorizer'].transform([prepare_en("Very bad movie, I have nothing else to say. bad bad bad bad bad bad bad bad")]))

array([1])

In [73]:
configs[5]['bst'].predict(configs[5]['vectorizer'].transform([prepare_en("Very bad movie, I have nothing else to say. bad bad bad bad bad bad bad bad")]))

array([0])

# Random forest

## Poc

In [75]:
vectorizer = TfidfVectorizer(tokenizer=word_tokenize, min_df=.005)
X = vectorizer.fit_transform(df.review_prepared).toarray()
y = df.label.astype(int).to_numpy()
n_tokens = len(vectorizer.get_feature_names_out())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

rf = RandomForestClassifier(max_depth=2)
rf.fit(X_train, y_train)



In [76]:
y_pred = rf.predict(X_test)

len(y_test[y_test == y_pred]) / len(y_test) * 100

79.72

## Generalize

In [80]:
def eval_rf(min_df:float=0.005, max_depth:int=2):
    vectorizer = TfidfVectorizer(tokenizer=word_tokenize, min_df=min_df)
    X = vectorizer.fit_transform(df.review_prepared).toarray()
    y = df.label.astype(int).to_numpy()
    n_tokens = len(vectorizer.get_feature_names_out())
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

    rf = RandomForestClassifier(max_depth=max_depth)
    start = time()
    rf.fit(X_train, y_train)
    time_took = time() - start

    y_pred = rf.predict(X_test)
    accuracy = len(y_test[y_test == y_pred]) / len(y_test) * 100

    return accuracy, n_tokens, time_took, vectorizer, rf

In [85]:
a = [1, 2, 3]
a[-1:]

[3]

In [83]:
configs_rf = [
    {'min_df': .005, 'max_depth': 2},
    {'min_df': .01, 'max_depth': 2},
    {'min_df': .003, 'max_depth': 2},
    {'min_df': .005, 'max_depth': 3},
    {'min_df': .005, 'max_depth': 5},
    {'min_df': .005, 'max_depth': 15},
    {'min_df': .005, 'max_depth': 30},
    {'min_df': .005, 'max_depth': 50},
]

for conf in tqdm(configs_rf):
    results = eval_rf(**conf)
    conf['accuracy'] = results[0]
    conf['n_tokens'] = results[1]
    conf['time_took'] = results[2]
    conf['vectorizer'] = results[3]
    conf['rf'] = results[4]

100%|█████████████████████████████████████████████████████████████████████████████████████| 8/8 [06:18<00:00, 47.28s/it]


In [86]:
configs_rf.append({'min_df': .005, 'max_depth': 100})
conf = configs_rf[-1]
results = eval_rf(**conf)
conf['accuracy'] = results[0]
conf['n_tokens'] = results[1]
conf['time_took'] = results[2]
conf['vectorizer'] = results[3]
conf['rf'] = results[4]



In [87]:
pd.DataFrame(configs_rf).drop(columns=['vectorizer', 'rf'])

Unnamed: 0,min_df,max_depth,accuracy,n_tokens,time_took
0,0.005,2,80.42,2586,5.155904
1,0.01,2,78.54,1562,4.360136
2,0.003,2,78.07,3705,10.467425
3,0.005,3,79.94,2586,7.494113
4,0.005,5,81.68,2586,12.183826
5,0.005,15,82.97,2586,30.854578
6,0.005,30,83.63,2586,49.064405
7,0.005,50,84.59,2586,61.090945
8,0.005,100,84.68,2586,75.474065


In [90]:
configs_rf[8]['rf'].predict(configs_rf[8]['vectorizer'].transform([prepare_en('That was my favorite movie, I like it')]))

array([1])

In [91]:
configs_rf[8]['rf'].predict(configs_rf[8]['vectorizer'].transform([prepare_en("Very bad movie, I have nothing else to say. bad bad bad bad bad bad bad bad")]))

array([0])