In [1]:
from collections import OrderedDict
from operator import itemgetter

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from scipy.sparse import hstack
import spacy
from spacy import displacy
from spacy.symbols import nsubj, nsubjpass, VERB
import en_core_web_sm

In [3]:
# Load the small English spaCy pipeline once; `nlp` is what the extraction
# cell below calls to parse every review text.
nlp = en_core_web_sm.load()

In [4]:
# Read only the columns this notebook uses from the reviews CSV.
review_columns = ['review_id', 'text', 'spoiler', 'movie_id']
reviews = pd.read_csv('data/reviews_temp.csv', usecols=review_columns)

In [5]:
# Restrict the corpus to the reviews of a single title.
# NOTE(review): the hard-coded id is presumably a Star Wars film (variable
# name, plus the rey / kylo-ren features in the recorded output) -- confirm
# against the movie table.
is_starwars = reviews['movie_id'] == 2488496
# .copy() gives `starwars` its own data. A later cell adds a 'subj_verb'
# column to this frame; doing that on a bare boolean-mask slice of `reviews`
# triggers pandas' SettingWithCopyWarning.
starwars = reviews[is_starwars].copy()

In [6]:
def extract_subj_verb_pairs(doc):
    """Return the subject|verb features of one parsed review as one string.

    A token contributes a feature when its dependency label is ``nsubj`` or
    ``nsubjpass`` and its syntactic head is tagged ``VERB``. The feature is
    ``<subject>|<verb lemma>``, lower-cased; any ``compound`` children of the
    subject are prefixed and hyphen-joined (e.g. ``han-solo|die``).
    Stop words are deliberately not filtered out.

    Parameters
    ----------
    doc : spacy.tokens.Doc
        A document produced by the loaded ``nlp`` pipeline.

    Returns
    -------
    str
        The features joined with ``', '``; an empty string when the review
        contains no matching subject/verb pair.
    """
    # NOTE(review): depending on the installed spaCy model version, copular
    # "be" may be tagged AUX rather than VERB, which would drop pairs such as
    # 'it|be' that appear in the recorded output -- confirm.
    pairs = []
    for token in doc:
        if token.dep in (nsubj, nsubjpass) and token.head.pos == VERB:
            subject_parts = [child.lower_ for child in token.children
                             if child.dep_ == 'compound']
            subject_parts.append(token.lower_)
            pairs.append('-'.join(subject_parts) + '|' + token.head.lemma_.lower())
    return ', '.join(pairs)


subj_verbs = []
# A missing review text is parsed as an empty document instead of crashing.
texts = starwars['text'].fillna('')
# nlp.pipe parses the texts in batches, which is considerably faster than
# calling nlp() once per row; documents come back in input order, so they
# can be zipped with the frame's index labels.
for row, doc in zip(starwars.index, nlp.pipe(texts)):
    subj_verbs.append(extract_subj_verb_pairs(doc))
    if row % 100 == 0:
        print('Finished review ', row)
# assign() returns a new frame, so this cell can be re-run safely
# (DataFrame.insert raises if the column already exists) and never writes
# into a slice of `reviews`. The new column lands after the four loaded ones,
# the same position the original insert(loc=4) used.
starwars = starwars.assign(subj_verb=subj_verbs)

Finished review  682900
Finished review  683000
Finished review  683100
Finished review  683200
Finished review  683300
Finished review  683400
Finished review  683500
Finished review  683600
Finished review  683700
Finished review  683800
Finished review  683900
Finished review  684000
Finished review  684100
Finished review  684200
Finished review  684300
Finished review  684400
Finished review  684500
Finished review  684600
Finished review  684700
Finished review  684800
Finished review  684900
Finished review  685000
Finished review  685100
Finished review  685200
Finished review  685300
Finished review  685400
Finished review  685500
Finished review  685600
Finished review  685700
Finished review  685800
Finished review  685900
Finished review  686000
Finished review  686100
Finished review  686200
Finished review  686300
Finished review  686400
Finished review  686500
Finished review  686600
Finished review  686700
Finished review  686800
Finished review  686900


In [7]:
# Pull the model inputs out of the frame as plain Python lists:
# x1 -> subject|verb strings, x2 -> raw review text, y -> spoiler labels.
x1, x2, y = (
    starwars[column].tolist()
    for column in ('subj_verb', 'text', 'spoiler')
)

In [8]:
def split_pairs(joined_pairs):
    """Split one ``'subj|verb, subj|verb'`` string into its pair tokens.

    Empty pieces are dropped: a review without any subject/verb pair is
    stored as ``''``, and ``''.split(', ')`` yields ``['']``, which would
    otherwise be counted as a feature named ``''``.
    """
    return [pair for pair in joined_pairs.split(', ') if pair]


# Bag of subject|verb pairs. token_pattern=None because a custom tokenizer is
# supplied; leaving the default pattern in place only produces an
# "unused parameter" warning on recent scikit-learn versions.
vec1 = CountVectorizer(tokenizer=split_pairs, token_pattern=None, min_df=10)
# Plain unigram bag of words over the review text.
vec2 = CountVectorizer(ngram_range=(1, 1), min_df=10)
# Fit from the frame's columns rather than from x1/x2, which this cell
# overwrites: that keeps the cell re-runnable on its own.
x1 = vec1.fit_transform(starwars['subj_verb'])
x2 = vec2.fit_transform(starwars['text'])
# Pair features first, word features second; cells that index the stacked
# matrix with vec1's vocabulary rely on this order. CSR supports the row
# indexing that the cross-validation cells need.
x = hstack([x1, x2], format='csr')
print(x1.data.nbytes, x2.data.nbytes, x.data.nbytes)
print(x1.shape, x2.shape, x.shape)

(4160, 1523) (4160, 5535) (4160, 7058)


In [9]:
# Total count of every column of the stacked matrix. vec1's vocabulary
# indices can be used on it directly because the x1 block is the left-most
# block passed to hstack, so its columns keep their positions in x.
sum_words = x.sum(axis=0)
word_freqs = [
    (pair, sum_words[0, column])
    for pair, column in vec1.vocabulary_.items()
]
# Most frequent subject|verb pairs first.
word_freqs.sort(key=itemgetter(1), reverse=True)
word_freqs[:10]

[('it|be', 8899),
 ('i|be', 3275),
 ('that|be', 2639),
 ('this|be', 2619),
 ('i|think', 1913),
 ('he|be', 1801),
 ('movie|be', 1369),
 ('i|see', 1349),
 ('i|have', 1280),
 ('they|be', 1080)]

In [10]:
# Mutual information between each subject|verb count feature and the spoiler
# label. Only x1 is scored: the previous version scored every column of the
# stacked matrix x and relied on zip() silently discarding the scores of the
# x2 (word) columns once vec1's feature names ran out.
# get_feature_names() was removed from newer scikit-learn releases, so prefer
# get_feature_names_out() when the installed version provides it.
feature_names = (
    vec1.get_feature_names_out()
    if hasattr(vec1, 'get_feature_names_out')
    else vec1.get_feature_names()
)
pair_scores = mutual_info_classif(x1, y, discrete_features=True)
info_gain = dict(zip(feature_names, pair_scores))

In [11]:
# Show only the most informative pairs; displaying the full ranking dumps
# every (feature, score) tuple into the cell output.
TOP_N_PAIRS = 30
sorted(info_gain.items(), key=itemgetter(1), reverse=True)[:TOP_N_PAIRS]

[('he|be', 0.02266245240077911),
 ('she|be', 0.010415188725271859),
 ('rey|be', 0.009672947434754515),
 ('i|think', 0.009009300365075794),
 ('that|be', 0.00862161013593642),
 ('she|have', 0.008557961629756718),
 ('he|have', 0.008520431538535872),
 ('kylo-ren|be', 0.007768423715969915),
 ('luke|be', 0.007641534386371226),
 ('he|kill', 0.007054852291189418),
 ('it|be', 0.00702416671627247),
 ('we|get', 0.00674886199998001),
 ('who|be', 0.006669911602040687),
 ('death|be', 0.006191319231646236),
 ('we|see', 0.006038656068090057),
 ('that|destroy', 0.005791173227133759),
 ('this|be', 0.005490746556014289),
 ('i|mean', 0.0054172746713829715),
 ('she|know', 0.005170368920837097),
 ('he|get', 0.0050530631734637165),
 ('han|die', 0.004929980757664723),
 ('han-solo|die', 0.004909038214873126),
 ('we|have', 0.004848010151974394),
 ('kylo|be', 0.0047433627561073),
 ('finn|be', 0.004737875678000159),
 ('they|have', 0.004721065749031014),
 ('he|take', 0.004696386522451404),
 ('he|look', 0.004661649

In [17]:
# Support-vector classifier (gamma=0.3) on the stacked features,
# scored as the mean F1 over 5 cross-validation folds.
svc = SVC(gamma=0.3)
svc_scores = cross_val_score(estimator=svc, X=x, y=y, scoring='f1', cv=5)
print(svc_scores.mean())

0.46508621790647087


In [13]:
# Multinomial naive Bayes on the stacked count features;
# the printed value is the mean F1 over 5 cross-validation folds.
mnb = MultinomialNB()
mnb_scores = cross_val_score(mnb, x, y, cv=5, scoring='f1')
mnb_mean_f1 = np.mean(mnb_scores)
print(mnb_mean_f1)

0.7139121418745308


In [14]:
# Random forest on the stacked features, mean F1 over 5 CV folds.
# random_state is fixed because forest construction is randomised; without
# it the printed score changes on every run and cannot be reproduced.
rfc = RandomForestClassifier(random_state=42)
rfc_scores = cross_val_score(rfc, x, y, scoring='f1', cv=5)
print(np.mean(rfc_scores))

0.6736130426911069


In [15]:
# Logistic regression with library-default settings on the stacked features;
# prints the mean F1 over 5 cross-validation folds.
# NOTE(review): if a ConvergenceWarning shows up on these unscaled counts,
# consider raising max_iter -- confirm on the installed scikit-learn version.
lr = LogisticRegression()
lr_scores = cross_val_score(estimator=lr, X=x, y=y, cv=5, scoring='f1')
print(lr_scores.mean())

0.6938698594238304


In [16]:
# Single decision tree on the stacked features, mean F1 over 5 CV folds.
# random_state is fixed so that the tree, and therefore the printed score,
# is the same on every run.
dt = DecisionTreeClassifier(random_state=42)
dt_scores = cross_val_score(dt, x, y, scoring='f1', cv=5)
print(np.mean(dt_scores))

0.6532013974354661


In [21]:
# Bytes held by the stored (non-zero) values of the pair matrix, the word
# matrix and the stacked matrix, printed on one line in that order.
print(*(matrix.data.nbytes for matrix in (x1, x2, x)))

653784 5796960 6450744
