In [1]:

import gzip
import itertools
import string
import wordcloud
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import pylab as pl

from collections import Counter
from sklearn import svm
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

%matplotlib inline

In [2]:
wordcloud


<module 'wordcloud' from '/anaconda2/lib/python2.7/site-packages/wordcloud/__init__.pyc'>

In [3]:

def parse_gz(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def convert_to_DF(path):
    i = 0
    df = {}
    for d in parse_gz(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [4]:
sports_outdoors = convert_to_DF('/Users/vikaschhillar/Downloads/reviews_Sports_and_Outdoors_5.json.gz')


In [5]:
print('Dataset size: {:,} words'.format(len(sports_outdoors)))


Dataset size: 296,337 words


In [6]:
sports_outdoors["reviewTime"] = pd.to_datetime(sports_outdoors["reviewTime"])


In [7]:
sports_outdoors = sports_outdoors[['asin', 'summary', 'reviewText', 'overall', 'reviewerID', 'reviewerName', 'helpful', 'reviewTime',
      'unixReviewTime']]

In [8]:
sports_outdoors.head(100)


Unnamed: 0,asin,summary,reviewText,overall,reviewerID,reviewerName,helpful,reviewTime,unixReviewTime
0,1881509818,Woks very good,This came in on time and I am veru happy with ...,5.0,AIXZKN4ACSKI,David Briner,"[0, 0]",2014-01-26,1390694400
1,1881509818,Works as well as the factory tool,I had a factory Glock tool that I was using fo...,5.0,A1L5P841VIO02V,Jason A. Kramer,"[1, 1]",2012-02-02,1328140800
2,1881509818,"It's a punch, that's all.",If you don't have a 3/32 punch or would like t...,4.0,AB2W04NI4OEAD,J. Fernald,"[2, 2]",2012-02-28,1330387200
3,1881509818,It's a punch with a Glock logo.,This works no better than any 3/32 punch you w...,4.0,A148SVSWKTJKU6,"Jusitn A. Watts ""Maverick9614""","[0, 0]",2012-02-05,1328400000
4,1881509818,"Ok,tool does what a regular punch does.",I purchased this thinking maybe I need a speci...,4.0,AAAWJ6LW9WMOO,Material Man,"[0, 0]",2013-04-23,1366675200
5,1881509818,Glock punch tool - needed for your Glock and o...,"Needed this tool to really break down my G22, ...",5.0,A2XX2A4OJCDNLZ,RatherLiveInKeyWest,"[0, 0]",2012-11-02,1351814400
6,1881509818,Great tool,If u don't have it .. Get it. All you need to ...,5.0,A283UOBQRUNM4Q,Thomas Dragon,"[0, 0]",2014-06-10,1402358400
7,2094869245,Bright!,This light will no doubt capture the attention...,4.0,AWG3H90WVZ0Z1,Alec Nelson,"[0, 0]",2013-08-31,1377907200
8,2094869245,Be seen,"Light and laser torch work well, very bright. ...",5.0,A3V52OTJHKIJZX,"A. Saenz Jr. ""Bettering self""","[0, 1]",2013-05-27,1369612800
9,2094869245,Bicycle rear tail light,Does everything it says it will do. I would li...,5.0,A3SZBE5F3UQ9EC,"ChasRat ""ChasRat""","[0, 0]",2013-11-02,1383350400


In [9]:

products = sports_outdoors['overall'].groupby(sports_outdoors['asin']).count()
print("Number of Unique Products in the Sports & Outdoors Category = {}".format(products.count()))

Number of Unique Products in the Sports & Outdoors Category = 18357


In [10]:
sorted_products = products.sort_values(ascending=False)

print("Top 20 Reviewed Products:\n")
print(sorted_products[:20], )
print('Most Reviewed Product, B001HBHNHE - has {} reviews.'.format(products.max()))


Top 20 Reviewed Products:

(asin
B001HBHNHE    1042
B001T7QJ9O     763
B000S5ODN2     647
B0010O748Q     513
B0000C50K3     427
B002ZYRV2E     401
B002OKWHVO     398
B000GCRWCG     393
B001HBHNHY     372
B0035L35A8     359
B004U8CP88     357
B001WJ577O     355
B004TNWD40     349
B006X9DLQM     344
B00178CS4K     343
B006QF3TW4     323
B003NFI092     309
B00200E0HM     307
B001949TKS     298
B000JZ7JM8     293
Name: overall, dtype: int64,)
Most Reviewed Product, B001HBHNHE - has 1042 reviews.


In [11]:
def tokenize(text):
    tokenized = word_tokenize(text)
    no_punc = []
    for review in tokenized:
        line = "".join(char for char in review if char not in string.punctuation)
        no_punc.append(line)
    tokens = lemmatize(no_punc)
    return tokens


def lemmatize(tokens):
    lmtzr = WordNetLemmatizer()
    lemma = [lmtzr.lemmatize(t) for t in tokens]
    return lemma

In [12]:
reviews = sports_outdoors['reviewText']


In [13]:
reviews = reviews.apply(lambda x: tokenize(x))


In [14]:
reviews[:11]


0     [This, came, in, on, time, and, I, am, veru, h...
1     [I, had, a, factory, Glock, tool, that, I, wa,...
2     [If, you, do, nt, have, a, 332, punch, or, wou...
3     [This, work, no, better, than, any, 332, punch...
4     [I, purchased, this, thinking, maybe, I, need,...
5     [Needed, this, tool, to, really, break, down, ...
6     [If, u, do, nt, have, it, , Get, it, , All, yo...
7     [This, light, will, no, doubt, capture, the, a...
8     [Light, and, laser, torch, work, well, , very,...
9     [Does, everything, it, say, it, will, do, , I,...
10    [Very, bright, , I, would, recommend, this, li...
Name: reviewText, dtype: object

In [None]:
cloud = wordcloud.WordCloud(background_color='gray', max_font_size=60, 
                                relative_scaling=1).generate(' '.join(sports_outdoors.reviewText))


In [None]:
sorted_products[9178:9180]


In [None]:
print("Bottom 20 Reviewed Products:\n")
print(sorted_products[18337:])
print('Least Reviewed Product (Sorted), B003Z6HUZE - has {} reviews.'.format(products.min()))

In [None]:
products.mode()


In [None]:
sports_outdoors['reviewText'][:11]


In [None]:
stops = stopwords.words('english')


In [None]:
stops