In [1]:
import pandas as pd
import re
from gensim import parsing
import gensim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes, svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics import classification_report
import os
import glob

In [10]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        if len(word2vec)>0:
            self.dim=len(word2vec[next(iter(word2vec))])
        else:
            self.dim=0
            
    def fit(self, X, y):
        return self 

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec] 
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [2]:
files = ['../data/raw/review_polarity/txt_sentoken/pos/' + n for n in os.listdir('../data/raw/review_polarity/txt_sentoken/pos')]
pos = pd.concat([pd.read_csv(item, names=['review'], sep='\n') for item in files])
pos['score'] = 1
pos.head()

Unnamed: 0,review,score
0,assume nothing .,1
1,the phrase is perhaps one of the most used of ...,1
2,"the phrase especially goes for oscar novak , a...",1
3,"novak ( matthew perry ) , a shy , clumsy , chi...",1
4,one of these is the job of restoring a popular...,1


In [3]:
files = ['../data/raw/review_polarity/txt_sentoken/neg/' + n for n in os.listdir('../data/raw/review_polarity/txt_sentoken/neg')]
neg = pd.concat([pd.read_csv(item, names=['review'], sep='\n') for item in files])
neg['score'] = 0
neg.head()

Unnamed: 0,review,score
0,bad . bad .,0
1,bad .,0
2,that one word seems to pretty much sums up bey...,0
3,"if that summary isn't enough for you , how abo...",0
4,still haven't got the point ?,0


In [4]:
X = pd.concat([pos, neg])
X['review'] = X['review'].map(parsing.preprocess_string)
model = gensim.models.Word2Vec(X['review'], size=100)
w2v = dict(zip(model.wv.index2word, model.wv.vectors))

In [5]:
reviews = pd.concat([pos, neg])
reviews['review'] = reviews['review'].map(parsing.preprocess_string).map(' '.join)
reviews.head()

Unnamed: 0,review,score
0,assum,1
1,phrase impress rumor hardli,1
2,phrase especi goe oscar novak architect main f...,1
3,novak matthew perri shy clumsi chicago base ar...,1
4,job restor popular build charl newman dylan mc...,1


In [5]:
print ("Null Value Statistics:", '\n \n', reviews.isnull().sum()) ## Sum will tell the total n
reviews.head()

Null Value Statistics: 
 
 review    0
score     0
dtype: int64


Unnamed: 0,review,score
0,assum,1
1,phrase impress rumor hardli,1
2,phrase especi goe oscar novak architect main f...,1
3,novak matthew perri shy clumsi chicago base ar...,1
4,job restor popular build charl newman dylan mc...,1


In [6]:
x_train, x_test, y_train, y_test = train_test_split(reviews['review'], reviews['score'], 
                                                    test_size=0.33, random_state=42)
x_train

18    count half dozen time blue screen painfulli ob...
28                 let greas forerunn armada movi music
14    plot involv peopl dare spend night haunt hous ...
9                             interest know custom come
18            develop connect charact given reason care
10    thing somewhat astrai film middl segment conce...
26                                 rate scale low scale
15                      end goe kind movi oscar perform
29    back zane david warner lovejoi cal person assi...
46                civil action provid drama lover crave
0                                               confess
31    shue talent bare seen josh brolin underus rest...
53                   actual better equal aw remak haunt
5     max take like sheedi find protect anim cruel t...
14    obsess lose job famili saniti loss compar expe...
31    carli determin free husband clutch armi succe ...
4          wai spielberg trivial awesom evil stori film
4                         czech russian english 

In [8]:
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)

In [7]:
tfidf_vect = TfidfVectorizer()
x_train_tfcounts = tfidf_vect.fit_transform(x_train)

In [9]:
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)

In [24]:
model = gensim.models.Word2Vec(x_train, size=100)
w2v = dict(zip(model.wv.index2word, model.wv.vectors))
w2v

{' ': array([-0.02756337, -0.04307849, -0.07227907, -0.00168923,  0.30623513,
         0.10183889, -0.03415686,  0.16453923, -0.05294365,  0.15894502,
         0.1062631 , -0.03746662, -0.16582747, -0.02075319,  0.00613705,
         0.05674098, -0.21400727,  0.08301394, -0.24215934,  0.04499188,
        -0.10694627, -0.21148835, -0.22981957,  0.0400447 , -0.06803989,
        -0.05631337,  0.07543762, -0.13421564, -0.1905035 , -0.02631425,
        -0.1976282 ,  0.28530148,  0.15207982,  0.05078449,  0.17170477,
        -0.30765215, -0.05062347, -0.0924077 , -0.3940846 ,  0.09348131,
        -0.11173032, -0.07663573, -0.22411093, -0.5548751 ,  0.07742526,
         0.32112288, -0.18629889,  0.29409108,  0.23598044,  0.06060413,
        -0.30986577, -0.24635299, -0.05859535, -0.18675478, -0.02390215,
         0.15303263,  0.01175566,  0.01427303,  0.23127289, -0.13889965,
         0.02469964, -0.13279133,  0.16478877,  0.2372686 ,  0.11304331,
        -0.148419  , -0.0348288 , -0.13273746,

In [25]:
mev = MeanEmbeddingVectorizer(w2v)
x_train_vectorized = mev.transform(x_train)
x_train_vectorized

array([[-4.51642908e-02, -5.90913258e-02, -3.01562492e-02, ...,
         1.72521845e-02,  1.28185563e-02, -7.62414485e-02],
       [-7.95966014e-03, -3.84604745e-02,  3.78949451e-03, ...,
         2.11524740e-02, -1.09174019e-02, -1.11074954e-01],
       [ 6.88383952e-05, -3.83178145e-02,  4.94126184e-03, ...,
         9.50452033e-03,  3.97482589e-02, -6.11308627e-02],
       ...,
       [ 1.23906545e-02, -5.00042550e-02,  7.37306371e-04, ...,
         5.85244969e-02,  6.73879981e-02, -9.67892259e-02],
       [-7.99076557e-02, -2.81998683e-02,  4.04551215e-02, ...,
         1.52592137e-01,  6.83605894e-02, -2.38662958e-01],
       [-1.48885734e-02, -3.62893939e-02,  2.32749134e-02, ...,
         3.70450169e-02,  1.01137981e-01, -1.03964232e-01]])

In [26]:
nmf = NMF(n_components=100)
x_train_vectorized_nmf = nmf.fit_transform(x_train_vectorized)

ValueError: Negative values in data passed to NMF (input X)

In [None]:
clf = naive_bayes.MultinomialNB()
clf.fit(x_train_tfidf, y_train)

In [9]:
# x_new = mev.transform(x_test)

# x_new = count_vect.transform(x_test)
# x_new_tfidf = tfidf_transformer.transform(x_new)

x_test_tfidf = tfidf_vect.transform(x_train)

# predicted = clf.predict(x_new_tfidf) 

In [None]:
counter  = 0
for doc, category in zip(x_new_tfidf, predicted):
    print('%r => %s' % (doc, category))
    if(counter == 10):
        break
    counter += 1    

In [None]:
np.mean(predicted == y_test)

In [17]:
classifier_rbf = svm.SVC(gamma='scale')
classifier_rbf.fit(x_train_tfcounts, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
prediction_rbf = classifier_rbf.predict(x_test_tfidf)

In [15]:
len(prediction_rbf)

43362

In [16]:
len(y_train)

43362

In [52]:
y_train

18    0
28    1
14    0
9     0
18    0
10    1
26    1
15    1
29    1
46    1
0     0
31    0
53    0
5     0
14    1
31    1
4     0
4     1
33    0
6     0
1     1
12    1
18    1
30    1
3     0
26    0
13    1
40    0
7     0
3     1
     ..
23    1
37    0
7     0
19    1
15    1
16    0
49    0
0     0
31    1
21    1
15    0
39    1
25    0
16    0
16    0
68    0
9     1
20    0
20    0
13    1
18    1
12    0
22    1
19    0
16    0
32    0
24    0
35    1
66    1
16    0
Name: score, Length: 43362, dtype: int64

In [53]:
classification_report()

NameError: name 'classification_report' is not defined

In [57]:
x_train_tfidf.toarray()

False