In [73]:
import sys
from pathlib import Path
# Put the parent of the current working directory on the import path.
# NOTE(review): presumably this is where the `src` package imported below
# lives - confirm against the repository layout.
PROJECT_DIR = str(Path().resolve().parents[0])
if sys.path.count(PROJECT_DIR) == 0:
    # Only add the entry once, so re-running the cell does not grow sys.path.
    sys.path += [PROJECT_DIR]


from collections import Counter
import pandas as pd
import numpy as np
from sklearn import model_selection, linear_model, metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import naive_bayes
from nltk.tokenize import word_tokenize
    
from src import config

In [74]:
# Seed for the shuffle below. Without it, fold membership (and therefore the
# fitted vocabulary and every downstream metric) changes on each run.
SEED = 42

# Load the training reviews and encode the label as binary:
# "positive" -> 1, anything else -> 0.
df = pd.read_csv(config.INPUT_DIR / config.TRAIN_FILE)
df["sentiment"] = (df["sentiment"] == "positive").astype("int64")

# Placeholder fold id; the loop below overwrites it for every row that
# appears in a validation split.
df["folds"] = -1

# Shuffle the rows once, reproducibly, and rebuild a clean 0..n-1 index so the
# positional indices returned by `kf.split` can be used directly with `.loc`.
df = df.sample(frac=1.0, random_state=SEED).reset_index(drop=True)
features = df["review"]

# Stratified 5-fold split: each row is tagged with the fold in which it is
# part of the validation set.
kf = model_selection.StratifiedKFold(n_splits=5)
for fold, (train_idx, val_idx) in enumerate(kf.split(X=features, y=df["sentiment"])):
    df.loc[val_idx, "folds"] = fold

df

Unnamed: 0,review,sentiment,folds
0,This is the biggest load of crap that I have s...,0,0
1,I love this movie. It is great film that combi...,1,0
2,This movie is about sexual obsession. Bette Da...,1,0
3,From the beginning of the movie I had a feelin...,0,0
4,I was just watching a Forensic Files marathon ...,0,0
...,...,...,...
24995,This film was such a mess I actually reimburse...,0,4
24996,"A very ordinary made-for-tv product, ""Tyson"" a...",0,4
24997,"I didn't really expect much from ""The Night Li...",0,4
24998,I found the storyline in this movie to be very...,1,4


In [75]:
%%time
# Fold held out for validation; the remaining folds are used for training.
fold_id = 0

df_train = df.loc[df["folds"] != fold_id, :].reset_index(drop=True)
df_validation = df.loc[df["folds"] == fold_id, :].reset_index(drop=True)

# Bag-of-words counts using NLTK's tokenizer. `token_pattern=None` is passed
# because the default regex pattern is not used when a tokenizer is supplied.
count_vec = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)

# Learn the vocabulary and vectorize the training reviews in a single pass.
# A separate fit() followed by transform() on the same text would tokenize
# the whole training corpus twice, and tokenization dominates this cell's
# run time.
features_train = count_vec.fit_transform(df_train["review"])
# Validation text is only transformed, so tokens unseen in training are ignored.
features_validation = count_vec.transform(df_validation["review"])

# Note: linear_model.LogisticRegression() can be swapped in here as an
# alternative classifier.
clf = naive_bayes.MultinomialNB()
clf.fit(features_train, df_train["sentiment"])

preds = clf.predict(features_validation)
acc = metrics.accuracy_score(df_validation["sentiment"], preds)
prec = metrics.precision_score(df_validation["sentiment"], preds)

print(f"Accuracy: {np.round(acc,3)}\t Precision: {np.round(prec, 3)}")

Accuracy: 0.835	 Precision: 0.86
CPU times: user 1min 34s, sys: 39.6 ms, total: 1min 35s
Wall time: 1min 35s


In [70]:
# Vectorize a toy sentence with the already-fitted vocabulary and densify it
# so the individual counts can be inspected.
sentence_array = count_vec.transform(
    ["this is the first document. and this this this"]
).toarray()

# Keep only the counts of vocabulary terms that occur in the sentence.
row_idx, col_idx = np.nonzero(sentence_array)
sentence_transformed = sentence_array[row_idx, col_idx]
sentence_transformed

array([1, 1, 1, 1, 1, 1, 4])

In [71]:
# Percentage of the fitted vocabulary that occurs in the example sentence.
n_vocabulary_terms = len(count_vec.vocabulary_)
len(sentence_transformed) / n_vocabulary_terms * 100

0.006970445311877638

In [60]:
# Map a column index of the count matrix back to its token.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older installations.
if hasattr(count_vec, "get_feature_names_out"):
    feature_names = count_vec.get_feature_names_out()
else:
    feature_names = count_vec.get_feature_names()

# NOTE(review): 28774 is a hard-coded column index; which token it maps to
# depends on the vocabulary that `count_vec` was fitted with - re-check the
# index after refitting.
feature_names[28774]

'document'