In [None]:
from collections import OrderedDict
from operator import itemgetter

In [70]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the spaCy English pipeline used below for sentence splitting,
# dependency labels and lemmas. The model package has to be imported in the
# notebook itself; otherwise this cell raises NameError on a fresh kernel.
import en_core_web_sm

nlp = en_core_web_sm.load()

In [None]:
# Load the review table, restricted to the four columns listed in `usecols`.
reviews = pd.read_csv(
    'data/reviews_temp.csv',
    usecols=['review_id', 'text', 'spoiler', 'movie_id'],
)

In [None]:
# Boolean mask selecting the reviews of a single movie.
# NOTE(review): 2488496 is presumably the movie's IMDb title id - confirm.
is_starwars = reviews['movie_id'].eq(2488496)
starwars = reviews.loc[is_starwars]

In [None]:
def _subj_verb_pairs(doc):
    """Return the 'subject|verb' strings found in a parsed spaCy document.

    A token contributes a pair when its dependency label is ``nsubj`` or
    ``nsubjpass`` and its syntactic head is tagged as a verb. The subject is
    the lower-cased token, prefixed by any of its ``compound`` children
    (joined with '-'); the verb is the lower-cased lemma of the head.
    No stop-word filtering is applied, so pronoun subjects are kept.
    """
    pairs = []
    for sentence in doc.sents:
        for token in sentence:
            # Compare the string labels (dep_ / pos_) instead of spaCy symbol
            # constants, so the cell needs no extra names in scope.
            is_subject = token.dep_ in ('nsubj', 'nsubjpass')
            if is_subject and token.head.pos_ == 'VERB':
                compounds = [child.lower_ for child in token.children
                             if child.dep_ == 'compound']
                compounds.append(token.lower_)
                pairs.append('-'.join(compounds) + '|' + token.head.lemma_.lower())
    return pairs


# One comma-separated string of pairs per review, in row order.
subj_verbs = []
for position, text in enumerate(starwars['text']):
    subj_verbs.append(', '.join(_subj_verb_pairs(nlp(text))))
    # Report progress by position: the index labels of the filtered frame
    # are not consecutive, so they make an unreliable progress counter.
    if position % 100 == 0:
        print('Finished review ', position)

# assign() returns a new frame with the column appended (or replaced), so the
# cell can be re-run without failing on an already-existing column.
starwars = starwars.assign(subj_verb=subj_verbs)

In [81]:
# Features are the per-review pair strings; labels are the spoiler flags.
x, y = (starwars[column].tolist() for column in ('subj_verb', 'spoiler'))

In [82]:
# Count each 'subject|verb' pair per review. The custom tokenizer splits the
# stored string on ', ' (the separator it was joined with); min_df=10 drops
# pairs that occur in fewer than 10 reviews.
vec = CountVectorizer(tokenizer=lambda doc: doc.split(', '), min_df=10)

# Fit on the DataFrame column rather than on `x` itself: `x` is rebound to the
# count matrix here, so reading it as input would make the cell fail on re-run.
x = vec.fit_transform(starwars['subj_verb'].tolist())
print(x.shape)

(4160, 1523)


In [83]:
# Column totals of the count matrix: total occurrences of each pair.
sum_words = x.sum(axis=0)

# Pair every vocabulary term with its total via the term's column index.
word_freqs = []
for word, idx in vec.vocabulary_.items():
    word_freqs.append((word, sum_words[0, idx]))

# Most frequent first; show the top ten.
word_freqs.sort(key=itemgetter(1), reverse=True)
word_freqs[:10]

[('it|be', 8899),
 ('i|be', 3275),
 ('that|be', 2639),
 ('this|be', 2619),
 ('i|think', 1913),
 ('he|be', 1801),
 ('movie|be', 1369),
 ('i|see', 1349),
 ('i|have', 1280),
 ('they|be', 1080)]

In [None]:
# Mutual information between each count feature and the spoiler label,
# keyed by feature name.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() (available since 1.0) and fall back to the old
# method only on versions that do not have it.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
info_gain = dict(zip(feature_names, mutual_info_classif(x, y, discrete_features=True)))

In [None]:
# Rank the (feature, score) pairs from highest to lowest score.
sorted(
    info_gain.items(),
    key=lambda name_score: name_score[1],
    reverse=True,
)

In [84]:
# Support vector classifier, scored by 5-fold cross-validated F1.
svc = SVC(gamma=0.3163)
svc_scores = cross_val_score(estimator=svc, X=x, y=y, scoring='f1', cv=5)
print(svc_scores.mean())

0.6743345939022972


In [85]:
# Multinomial naive Bayes on the pair counts, scored by 5-fold
# cross-validated F1.
mnb = MultinomialNB()
mnb_scores = cross_val_score(estimator=mnb, X=x, y=y, scoring='f1', cv=5)
print(mnb_scores.mean())

0.6639206649998106


In [86]:
# Random forest, scored by 5-fold cross-validated F1.
# The forest is randomized (bootstrap samples and feature sub-sampling), so
# fix random_state to make the reported score reproducible across runs.
rfc = RandomForestClassifier(random_state=0)
rfc_scores = cross_val_score(rfc, x, y, scoring='f1', cv=5)
print(np.mean(rfc_scores))

0.6062955257627902


In [87]:
# Logistic regression with default settings, scored by 5-fold
# cross-validated F1.
lr = LogisticRegression()
lr_scores = cross_val_score(estimator=lr, X=x, y=y, scoring='f1', cv=5)
print(lr_scores.mean())

0.6365850705394855


In [88]:
# Single decision tree, scored by 5-fold cross-validated F1.
# Ties between equally good splits are broken randomly, so fix random_state
# to make the reported score reproducible across runs.
dt = DecisionTreeClassifier(random_state=0)
dt_scores = cross_val_score(dt, x, y, scoring='f1', cv=5)
print(np.mean(dt_scores))

0.585621129959496
