In [1]:
import numpy as np
import pandas as pd
import nltk
import scipy
from scipy.sparse import hstack
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

try:
    from toolz import itertoolz, compose
except:
    !pip install toolz
    from toolz import itertoolz, compose
from toolz.curried import map as cmap, sliding_window, pluck

In [2]:
# Source: StackOverflow

class SkipGramVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer emits space-joined pairs of tokens.

    Only the analyzer is replaced; everything else is inherited from
    ``CountVectorizer``.

    NOTE(review): with a window of 2 and indices ``[0, 1]`` the pairs are
    made of *adjacent* tokens, i.e. no token is actually skipped. Left
    unchanged on purpose - presumably the vectorizers fitted with this class
    rely on exactly this behaviour; confirm before changing it.
    """

    def build_analyzer(self):
        """Return a callable mapping one raw document to its pair features.

        The document goes through ``self.decode``, the preprocessor and the
        tokenizer (in that order); the resulting tokens are handed to
        ``_word_skip_grams`` together with the configured stop words.
        """
        normalise = self.build_preprocessor()
        excluded = self.get_stop_words()
        split_tokens = self.build_tokenizer()

        def analyze(doc):
            words = split_tokens(normalise(self.decode(doc)))
            return self._word_skip_grams(words, excluded)

        return analyze

    def _word_skip_grams(self, tokens, stop_words=None):
        """Turn a token sequence into a lazy iterator of ``"first second"`` strings.

        Args:
            tokens: iterable of token strings.
            stop_words: optional collection of tokens to drop before pairing;
                ``None`` disables the filtering.

        Returns:
            An iterator over the space-joined pairs taken from a sliding
            window of width 2 over the (filtered) tokens.
        """
        # Stop words are removed first, so pairs are formed across the gaps
        # they leave behind.
        if stop_words is not None:
            tokens = [token for token in tokens if token not in stop_words]

        windows = sliding_window(2, tokens)
        # Positions 0 and 1 of each width-2 window, i.e. the whole window.
        pairs = pluck([0, 1], windows)
        return map(' '.join, pairs)

In [3]:
# Parse only the first 10000 rows instead of reading the whole file and
# slicing it afterwards.
reviews_raw = pd.read_csv('reviews_train.csv', nrows=10000)
print(reviews_raw.shape)

# Drop every row that has a missing value in any column; comparing the two
# printed shapes shows how many rows were removed. The raw frame keeps its
# own name so re-running this cell always starts from the same input.
df = reviews_raw.dropna()
print(df.shape)
df.head(15)

(10000, 9)
(10000, 9)


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,summary,unixReviewTime,reviewTime,score
0,A35C43YE9HU9CN,B0064X7B4A,Joan Miller,"[0, 0]",I have decided not to play this game. I can't...,Friends,1396396800,"04 2, 2014",1.0
1,AHFS8CGWWXB5B,B00H1P4V3E,WASH ST. GAMER,"[3, 4]",The Amazon Appstore free app of the day for Ju...,"Amazon Makes This ""Longest Spring Ever"" for Fi...",1402272000,"06 9, 2014",2.0
2,A3EW8OTQ90NVHM,B00CLVW82O,Kindle Customer,"[0, 4]",this game was so mush fun I wish I could play ...,best,1368921600,"05 19, 2013",5.0
3,AJ3GHFJY1IUTD,B007T9WVKM,BrawlMaster4,"[0, 2]","Its pretty fun and very good looking, but you...",Fun Game,1350172800,"10 14, 2012",5.0
4,A3JJGBS4EL603S,B00J206J5E,"K. Wilson ""thesupe""","[0, 0]",good graphics; immersive storyline; hard to st...,great game!,1396915200,"04 8, 2014",5.0
5,A3RL7Y2FJBDHJ0,B006H7TC3Q,hi,"[2, 5]",its very good.u use fotos on ur device and it ...,very good,1337817600,"05 24, 2012",5.0
6,AUHVMC0PURGO8,B006R6VG9K,A.Mccullough,"[0, 0]",the game is very fun and fast paced. It also k...,fun and fast paced,1401926400,"06 5, 2014",5.0
7,A1Z37DUIWXJNLN,B00B63HT8Q,"Julie Quick ""Beach Bum Wannabe""","[0, 0]",great app! A quick look at the weather... not...,A great alternative to the current (stupid) ve...,1377388800,"08 25, 2013",4.0
8,AF7ZE5MRM6CW2,B00BL0I7WG,T.dd,"[0, 0]",So fare I like it haven't had it long enough t...,easy fun,1369612800,"05 27, 2013",5.0
9,A1TTH51E2651BJ,B00GRXA7GG,Joni,"[0, 0]",This classic Mahjong comes with nice graphics ...,Mahjong Premium,1394841600,"03 15, 2014",5.0


In [4]:
import pickle

# Restore the previously saved classifier and the four vectorizers.
# NOTE(review): pickle.load can execute arbitrary code - only load files
# from a trusted source.
# NOTE(review): 'clfasifier.pickle' looks like a misspelling of "classifier";
# it is kept as-is because it has to match the file name on disk.
with open('clfasifier.pickle', 'rb') as classifier_file:
    classifier = pickle.load(classifier_file)

# The second pickle holds a 4-item sequence, unpacked in this order.
# Presumably some of these are SkipGramVectorizer instances, in which case
# that class must already be defined when this cell runs - TODO confirm.
with open('vectorizers.pickle', 'rb') as vectorizers_file:
    loaded_vectorizers = pickle.load(vectorizers_file)

vText, vSummary, ngramText, ngramSummary = loaded_vectorizers

In [5]:
# Build one sparse feature matrix by transforming the review text and the
# summary with each of their vectorizers and stacking the results side by side.
review_texts = df['reviewText']
summaries = df['summary']

# NOTE(review): the block order presumably has to match the column order the
# classifier was trained with - verify against the training notebook.
feature_blocks = [
    vText.transform(review_texts),
    vSummary.transform(summaries),
    ngramText.transform(review_texts),
    ngramSummary.transform(summaries),
]
fullData = hstack(feature_blocks)

In [6]:
# Run the loaded classifier on the stacked feature matrix built above.
prediction = classifier.predict(fullData)

In [7]:
# Fraction of reviews whose predicted value equals the 'score' column.
accuracy_score(y_true=df['score'], y_pred=prediction)

0.6936