In [3]:
import gensim
import pandas as pd

# Reading and Exploring the Dataset
The dataset we are using here is a subset of Amazon reviews from the Cell Phones & Accessories category. The data is stored as a JSON file and can be read using pandas.

Link to the Dataset:

<a href='http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz'> http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz</a>

In [6]:
# import gzip
# with gzip.open('./reviews_Cell_Phones_and_Accessories_5.json.gz', 'rb') as gz_file:
#     # Read the contents of the .gz file
#     content = gz_file.read()

# # Write the contents to the output file
# with open('./Cell_Phones_and_Accessories_Reviews.json', 'wb') as output_file:
#     output_file.write(content)

In [7]:
df = pd.read_json("Cell_Phones_and_Accessories_Reviews.json", lines=True)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
40964,A3SGA86CXLS14L,B004TBY6YW,fun cuber,"[0, 0]",I have been looking for good quality single ea...,5,good,1370390400,"06 5, 2013"


In [8]:
df.sample(5)


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
140809,A29EAOP4EI5DAQ,B00AG6NSNO,PharoahzenRob,"[0, 0]",I bought this to bring my old shelf system bac...,4,My old system lives!,1399248000,"05 5, 2014"
59403,A2CRCLLI3R9OUK,B005SKGJX4,Doncheial,"[0, 0]",For the price you can't complain and I only ha...,4,"Great Price, Can't Complain...",1390176000,"01 20, 2014"
55698,AKMQ4GM0YRD89,B005LFXBJG,Adam Turney,"[0, 0]",I bought this for my Motorola RAZR M phone bec...,5,Perfect Charger,1393545600,"02 28, 2014"
50488,A2K97VU88RR9Y4,B005C66O2A,Jake,"[0, 0]",I use the phone all day and it is nice having ...,3,Great value,1354147200,"11 29, 2012"
99418,A32TII5IOQNG1Y,B008D4X4GW,johnathon j johnson,"[0, 0]",It allows for easy case removal and a light gr...,4,Not bad at all.,1388016000,"12 26, 2013"


In [9]:
df.shape

(194439, 9)

# Simple Preprocessing & Tokenization
The first thing to do for any data science task is to clean the data. For NLP, we apply various processing like converting all the words to lower case, trimming spaces, removing punctuations. This is something we will do over here too.

Additionally, we can also remove stop words like 'and', 'or', 'is', 'the', 'a', 'an' and convert words to their root forms like 'running' to 'run'.


In [10]:

review_text = df.reviewText.apply(gensim.utils.simple_preprocess)
review_text[:5]

0    [they, look, good, and, stick, good, just, don...
1    [these, stickers, work, like, the, review, say...
2    [these, are, awesome, and, make, my, phone, lo...
3    [item, arrived, in, great, time, and, was, in,...
4    [awesome, stays, on, and, looks, great, can, b...
Name: reviewText, dtype: object

In [14]:
review_text.iloc[0]

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape',
 'because',
 'was',
 'always',
 'bumping',
 'it',
 'and',
 'siri',
 'kept',
 'popping',
 'up',
 'and',
 'it',
 'was',
 'irritating',
 'just',
 'won',
 'buy',
 'product',
 'like',
 'this',
 'again']

In [13]:
df.reviewText[0]

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

# Training the Word2Vec Model
Train the model for reviews. Use a window of size 10 i.e. 10 words before the present word and 10 words ahead. A sentence with at least 2 words should only be considered, configure this using min_count parameter.

Workers define how many CPU threads to be used.

<b>Initialize the model</b>



In [15]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=4,
)

<b>Build Vocabulary</b>



In [16]:
model.build_vocab(review_text, progress_per=1000)


<b>Train the Word2Vec Model</b>


In [23]:
model.train(review_text[:50000], total_examples=model.corpus_count, epochs=model.epochs)

(16036334, 21792175)

Save the Model
Save the model so that it can be reused in other applications

In [34]:
model.save("./word2vec-amazon-cell-accessories-reviews-short.model")

In [35]:
model.wv.most_similar("bad")

[('terrible', 0.6641182899475098),
 ('shabby', 0.6113999485969543),
 ('good', 0.5968520045280457),
 ('horrible', 0.5547590851783752),
 ('awful', 0.5219196081161499),
 ('poor', 0.5087665319442749),
 ('sad', 0.5069642066955566),
 ('funny', 0.490601509809494),
 ('legit', 0.4825233817100525),
 ('crappy', 0.47718724608421326)]

In [22]:
model.wv.most_similar("mixed")

[('rave', 0.6440388560295105),
 ('positive', 0.6153465509414673),
 ('raving', 0.5430460572242737),
 ('cnet', 0.5405381917953491),
 ('favorable', 0.529464840888977),
 ('complaining', 0.5181417465209961),
 ('pcmag', 0.5136297345161438),
 ('leery', 0.5130190253257751),
 ('posted', 0.5035485625267029),
 ('stories', 0.49264267086982727)]

In [36]:
model.wv.similarity(w1="cheap", w2="inexpensive")

0.59198886

In [37]:
model.wv.similarity(w1="great", w2="good")


0.7403073

In [30]:
model.wv.similarity(w1="mountain", w2="good")


-0.10269565