In [151]:
import pandas as pd
import re
from gensim import parsing
import gensim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
import os
import glob

In [143]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        if len(word2vec)>0:
            self.dim=len(word2vec[next(iter(word2vec))])
        else:
            self.dim=0
            
    def fit(self, X, y):
        return self 

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec] 
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [81]:
files = ['../data/raw/review_polarity/txt_sentoken/pos/' + n for n in os.listdir('../data/raw/review_polarity/txt_sentoken/pos')]
pos = pd.concat([pd.read_csv(item, names=['review'], sep='\n') for item in files])
pos['score'] = 1
pos.head()

Unnamed: 0,review,score
0,assume nothing .,1
1,the phrase is perhaps one of the most used of ...,1
2,"the phrase especially goes for oscar novak , a...",1
3,"novak ( matthew perry ) , a shy , clumsy , chi...",1
4,one of these is the job of restoring a popular...,1


In [106]:
files = ['../data/raw/review_polarity/txt_sentoken/neg/' + n for n in os.listdir('../data/raw/review_polarity/txt_sentoken/neg')]
neg = pd.concat([pd.read_csv(item, names=['review'], sep='\n') for item in files])
neg['score'] = 0
neg.head()

Unnamed: 0,review,score
0,bad . bad .,0
1,bad .,0
2,that one word seems to pretty much sums up bey...,0
3,"if that summary isn't enough for you , how abo...",0
4,still haven't got the point ?,0


In [123]:
X = pd.concat([pos, neg])
X['review'] = X['review'].map(parsing.preprocess_string)
model = gensim.models.Word2Vec(X['review'], size=100)
w2v = dict(zip(model.wv.index2word, model.wv.vectors))

In [124]:
reviews = pd.concat([pos, neg])
reviews['review'] = reviews['review'].map(parsing.preprocess_string).map(' '.join)
reviews.head()

Unnamed: 0,review,score
0,assum,1
1,phrase impress rumor hardli,1
2,phrase especi goe oscar novak architect main f...,1
3,novak matthew perri shy clumsi chicago base ar...,1
4,job restor popular build charl newman dylan mc...,1


In [108]:
print ("Null Value Statistics:", '\n \n', reviews.isnull().sum()) ## Sum will tell the total n
reviews.head()

Null Value Statistics: 
 
 review    0
score     0
dtype: int64


Unnamed: 0,review,score
0,assum,1
1,phrase impress rumor hardli,1
2,phrase especi goe oscar novak architect main f...,1
3,novak matthew perri shy clumsi chicago base ar...,1
4,job restor popular build charl newman dylan mc...,1


In [139]:
x_train, x_test, y_train, y_test = train_test_split(X['review'], X['score'], 
                                                    test_size=0.33, random_state=42)
x_train

18    [count, half, dozen, time, blue, screen, painf...
28          [let, greas, forerunn, armada, movi, music]
14    [plot, involv, peopl, dare, spend, night, haun...
9                        [interest, know, custom, come]
18     [develop, connect, charact, given, reason, care]
10    [thing, somewhat, astrai, film, middl, segment...
26                            [rate, scale, low, scale]
15               [end, goe, kind, movi, oscar, perform]
29    [back, zane, david, warner, lovejoi, cal, pers...
46         [civil, action, provid, drama, lover, crave]
0                                             [confess]
31    [shue, talent, bare, seen, josh, brolin, under...
53            [actual, better, equal, aw, remak, haunt]
5     [max, take, like, sheedi, find, protect, anim,...
14    [obsess, lose, job, famili, saniti, loss, comp...
31    [carli, determin, free, husband, clutch, armi,...
4     [wai, spielberg, trivial, awesom, evil, stori,...
4                    [czech, russian, english, s

In [110]:
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)

In [111]:
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
x_train_tfidf[:10]

<10x21922 sparse matrix of type '<class 'numpy.float64'>'
	with 83 stored elements in Compressed Sparse Row format>

In [140]:
model = gensim.models.Word2Vec(x_train, size=100)
w2v = dict(zip(model.wv.index2word, model.wv.vectors))
w2v

{'film': array([ 9.4788486e-01,  3.1524351e-01, -2.8323418e-01,  5.1439035e-01,
        -1.0835768e+00, -5.7160878e-01, -4.5930785e-03, -1.5409761e-04,
         3.9707291e-01,  3.7468842e-01, -5.2753782e-01, -4.1141686e-01,
        -2.6095459e-01,  1.6386500e-01, -4.1401312e-01,  1.4756361e-01,
        -7.9721831e-02,  2.5031292e-01, -9.0875125e-01,  8.8702016e-02,
        -2.2511871e-01, -3.0110320e-01, -2.5482613e-01,  7.6562218e-02,
         1.2189967e-01, -3.6430380e-01, -3.6891437e-01, -3.9784727e-01,
        -8.6350381e-01,  3.4641838e-01,  5.7455713e-01,  5.6030184e-01,
         9.0194598e-02, -4.9524087e-01,  4.0569600e-01,  2.2137353e-01,
         8.7037109e-02, -1.0436343e+00,  1.1860867e-01, -2.4886733e-01,
        -2.4512477e-01,  2.8546649e-01,  3.6181045e-01, -4.1837305e-02,
         1.5177473e-01, -4.4754571e-01, -6.0153908e-01, -5.1176578e-01,
        -3.1549177e-01, -2.2892286e-01,  3.2243821e-01, -6.9365159e-02,
        -2.7962250e-01, -3.1113157e-01,  4.9830726e-01, 

In [149]:
mev = MeanEmbeddingVectorizer(w2v)
x_train_vectorized = mev.transform(x_train)
x_train_vectorized

array([[ 0.83220792,  0.010947  , -0.45913172, ..., -0.16402283,
         0.02499637, -0.37956914],
       [ 0.81938112,  0.08701404, -0.41574901, ..., -0.12378488,
         0.05208975, -0.33101338],
       [ 0.82745987, -0.01532776, -0.51535672, ..., -0.17772393,
        -0.00298375, -0.40242359],
       ...,
       [ 0.79807401, -0.05555562, -0.45642152, ..., -0.11932014,
        -0.08298218, -0.46026319],
       [ 0.82842416, -0.02631896, -0.5836792 , ..., -0.19266126,
        -0.0287671 , -0.42937955],
       [ 0.71944636,  0.00381224, -0.40646175, ..., -0.15800208,
        -0.00632001, -0.36033699]])

In [152]:
nmf = NMF(n_components=100)
x_train_vectorized_nmf = nmf.fit_transform(x_train_vectorized)

ValueError: Negative values in data passed to NMF (input X)

In [153]:
clf = naive_bayes.GaussianNB()
clf.fit(x_train_vectorized, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [155]:
x_new = mev.transform(x_test)
# x_new_tfidf = tfidf_transformer.transform(x_new_counts)

predicted = clf.predict(x_new)

In [156]:
counter  = 0
for doc, category in zip(X_test, predicted):
    print('%r => %s' % (doc, category))
    if(counter == 10):
        break
    counter += 1    

['shadow', 'year', 'girl', 'interrupt', 'likewis', 'follow', 'footstep', 'great', 'masterpiec', 'like', 'cuckoo', 'nest', 'trainspot'] => 1
['level', 'violenc', 'home', 'extrem', 'alex', 'scheme', 'nasti', 'kill'] => 1
['ask'] => 0
['hope', 'propuls', 'stellar', 'perform', 'messeng', 'look', 'like', 'timeless', 'epic', 'make'] => 0
['speak', 'go', 'window', 'scene', 'zellwegg', 'jump', 'window'] => 1
['mother', 'seri', 'setback', 'like', 'dump', 'dream', 'boat', 'dentist', 'bochner', 'met', 'beach', 'wit', 'daughter', 'mimic', 'whini', 'optimist', 'sai', 'try', 'act'] => 1
['appear', 'worker', 'permit', 'home', 'famili'] => 1
['man', 'todai'] => 0
['fight', 'club', 'grow', 'thing', 'control', 'cult', 'statu', 'begin', 'futur', 'soldier', 'prove', 'worth', 'stand', 'outsid', 'tyler', 'hous', 'dai'] => 1
['slide', 'door', 'refreshingli', 'differ', 'year'] => 0
['cap', 'live', 'rendit', 'girl', 'bounci', 'hit', 'wealth', 'laugh', 'merriment', 'sure', 'follow', 'right'] => 1


In [157]:
np.mean(predicted == y_test)

0.5339451259481225