In [13]:
import os

import numpy as np
from tensorflow import keras
from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot

# Supervised Embedding

In [83]:
# Toy corpus: twenty short restaurant reviews, arranged in alternating runs of
# five positive and five negative examples.
reviews = [
    # positive
    'nice food',
    'amazing restaurant',
    'too good',
    'just loved it!',
    'will go again',
    # negative
    'horrible food',
    'never go there',
    'poor service',
    'poor quality',
    'needs improvement',
    # positive
    'best i had',
    'nothing like it',
    'truly amazing',
    'good service',
    'best food',
    # negative
    'tastes like shit',
    'bad food',
    'worst restaurant',
    'garbage',
    'disgusting place',
]

# Binary labels aligned index-for-index with `reviews` (1 = positive, 0 = negative).
sentiment = np.array([1] * 5 + [0] * 5 + [1] * 5 + [0] * 5)

In [86]:
# Upper bound for the hashed word indices (also the Embedding input dimension).
vocab_size = 300

# Turn each review into a list of integer word indices.
# NOTE: one_hot hashes words (by default with Python's built-in hash), so the
# concrete indices are only stable within a single interpreter session.
encoded_reviews = [one_hot(text, n=vocab_size) for text in reviews]
encoded_reviews

[[248, 247],
 [102, 285],
 [269, 98],
 [207, 230, 88],
 [40, 281, 140],
 [82, 247],
 [272, 281, 56],
 [153, 296],
 [153, 179],
 [288, 216],
 [87, 23, 72],
 [32, 204, 88],
 [80, 102],
 [98, 296],
 [87, 247],
 [269, 204, 53],
 [254, 247],
 [45, 285],
 [251],
 [197, 71]]

In [1]:
# Maximum sequence length used for padding

In [87]:
def find_max_length(r):
    """Return the length of the longest sequence in ``r``.

    Args:
        r: An iterable of sized items (for example a list of token-index lists).

    Returns:
        The largest ``len(item)`` found, or 0 when ``r`` contains no items.
        The empty case used to raise ``ValueError`` from ``max()``.
    """
    # A generator avoids building a throwaway list; ``default`` covers empty input.
    return max((len(seq) for seq in r), default=0)

In [88]:
# Pad every encoded review with trailing zeros up to the longest review so the
# batch forms a rectangular integer array.
max_length = find_max_length(encoded_reviews)
padded_reviews = pad_sequences(
    sequences=encoded_reviews,
    maxlen=max_length,
    padding='post',
)
padded_reviews

array([[248, 247,   0],
       [102, 285,   0],
       [269,  98,   0],
       [207, 230,  88],
       [ 40, 281, 140],
       [ 82, 247,   0],
       [272, 281,  56],
       [153, 296,   0],
       [153, 179,   0],
       [288, 216,   0],
       [ 87,  23,  72],
       [ 32, 204,  88],
       [ 80, 102,   0],
       [ 98, 296,   0],
       [ 87, 247,   0],
       [269, 204,  53],
       [254, 247,   0],
       [ 45, 285,   0],
       [251,   0,   0],
       [197,  71,   0]])

In [89]:
# Number of dimensions learned for each word index.
embedded_vector_size = 4

# Small classifier: embedding lookup -> flatten -> single sigmoid unit.
model = models.Sequential()
# One trainable vector of size embedded_vector_size per index below vocab_size.
model.add(layers.Embedding(vocab_size, embedded_vector_size, input_length=max_length, name='embedding'))
# Concatenate the per-token vectors of a review into one feature vector.
model.add(layers.Flatten())
# Single probability output for the binary sentiment label.
model.add(layers.Dense(1, activation='sigmoid'))

In [90]:
# Conventional feature / target aliases for the padded reviews and their labels.
X, y = padded_reviews, sentiment

In [91]:
# Configure training for binary classification, then print the layer summary.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 4)              1200      
                                                                 
 flatten_4 (Flatten)         (None, 12)                0         
                                                                 
 dense_4 (Dense)             (None, 1)                 13        
                                                                 
Total params: 1,213
Trainable params: 1,213
Non-trainable params: 0
_________________________________________________________________


In [92]:
# Train for 50 epochs on the whole toy dataset; verbose=0 silences the per-epoch log.
model.fit(X, y, epochs=50, verbose=0)

<keras.callbacks.History at 0x10da8bbd9a0>

In [93]:
# Score the model on the same X, y it was fitted on, so these are training-set
# loss and accuracy rather than a held-out estimate.
loss, accuracy = model.evaluate(X, y)



In [94]:
# Pull the learned embedding matrix out of the layer named 'embedding';
# its first dimension has one row per vocabulary index.
weights = model.get_layer('embedding').get_weights()[0]
len(weights)

300

In [100]:
# Embedding vector learned for the word 'horrible'.
# The row index is recomputed with one_hot instead of being hard-coded (it was
# 82 in the recorded run): one_hot hashes with Python's built-in hash by
# default, which is salted per interpreter session, so a fixed number points at
# a different word after a kernel restart.
weights[one_hot('horrible', vocab_size)[0]]

array([ 0.01060714,  0.06238991,  0.09704973, -0.04147163], dtype=float32)

In [99]:
# Embedding vector learned for the word 'garbage'.
# The row index is recomputed with one_hot instead of being hard-coded (it was
# 251 in the recorded run): one_hot hashes with Python's built-in hash by
# default, which is salted per interpreter session, so a fixed number points at
# a different word after a kernel restart.
weights[one_hot('garbage', vocab_size)[0]]

array([ 0.02720933,  0.01039754,  0.07972359, -0.0585075 ], dtype=float32)

# Word2Vec

In [14]:
import gensim
import pandas as pd

In [3]:
# Load the musical-instrument reviews CSV from the local data folder and
# preview the first rows.
df = pd.read_csv('./data/Musical_instruments_reviews.csv')
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [4]:
# Drop every row that has a missing value in any column (modifies df in place).
# NOTE(review): the cells visible here only use reviewText afterwards;
# dropna(subset=['reviewText']) would keep reviews whose other fields are
# missing — confirm before changing, since it alters the row count.
df.dropna(inplace=True)

In [5]:
# Rows and columns remaining after the incomplete rows were dropped.
df.shape

(10227, 9)

In [6]:
# Tokenise each review text into a list of lowercase word tokens.
# NOTE: this rebinds `reviews`, which earlier held the toy list of strings.
tokenize = gensim.utils.simple_preprocess
reviews = df['reviewText'].apply(tokenize)
reviews.head()

0    [not, much, to, write, about, here, but, it, d...
1    [the, product, does, exactly, as, it, should, ...
2    [the, primary, job, of, this, device, is, to, ...
3    [nice, windscreen, protects, my, mxl, mic, and...
4    [this, pop, filter, is, great, it, looks, and,...
Name: reviewText, dtype: object

In [7]:
# Show the raw text of the row labelled 0 for comparison with its tokenised form.
# NOTE(review): this is a label lookup, so it relies on row 0 having survived dropna.
df.reviewText[0]

"Not much to write about here, but it does exactly what it's supposed to. filters out the pop sounds. now my recordings are much more crisp. it is one of the lowest prices pop filters on amazon so might as well buy it, they honestly work the same despite their pricing,"

In [8]:
# Untrained Word2Vec model; the corpus is supplied later via build_vocab/train.
#   window=5    -> context window size around each target word
#   min_count=2 -> ignore words that occur fewer than two times
#   workers=2   -> number of training threads
# NOTE: this rebinds `model`, which previously held the Keras classifier.
model = gensim.models.Word2Vec(window=5, min_count=2, workers=2)

In [9]:
# Scan the tokenised reviews to build the model vocabulary before training;
# progress_per sets the progress-reporting interval.
model.build_vocab(reviews, progress_per=1000)

In [10]:
# Number of training epochs configured on the model (5 in the recorded output).
model.epochs

5

In [11]:
# Train on the tokenised reviews, reusing the corpus size stored on the model
# and its configured epoch count.
model.train(reviews, total_examples=model.corpus_count, epochs=model.epochs)

(3217765, 4344135)

In [129]:
# Persist the trained model. Saving fails when the target folder is missing
# (for example on a fresh checkout), so create it first.
os.makedirs('./word2vec', exist_ok=True)
model.save('./word2vec/amazon-product-reviews-model')

In [12]:
# List the nearest neighbours of 'bad' in the learned vector space.
# NOTE(review): the recorded neighbours ('big', 'why', 'guess', ...) are not
# sentiment synonyms — presumably the corpus is too small for clean semantics.
model.wv.most_similar('bad')

[('big', 0.8147752285003662),
 ('why', 0.706118643283844),
 ('guess', 0.6888792514801025),
 ('description', 0.6804072260856628),
 ('saying', 0.6750804781913757),
 ('fair', 0.6745425462722778),
 ('considering', 0.674339234828949),
 ('complaining', 0.6702887415885925),
 ('huge', 0.668746829032898),
 ('star', 0.6680545210838318)]