In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Embedding,Flatten

import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("/content/drive/MyDrive/alexa_reviews.csv")
df.head()

Unnamed: 0,verified_reviews,feedback
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


In [4]:
# Feature and Target
X = df["verified_reviews"]
y = df["feedback"]

In [5]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=1)

In [6]:
# Tokenization
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

tokenizer.index_word

{1: 'the',
 2: 'i',
 3: 'to',
 4: 'it',
 5: 'and',
 6: 'a',
 7: 'my',
 8: 'is',
 9: 'for',
 10: 'love',
 11: 'echo',
 12: 'this',
 13: 'with',
 14: 'have',
 15: 'of',
 16: 'great',
 17: 'in',
 18: 'that',
 19: 'on',
 20: 'but',
 21: 'alexa',
 22: 'so',
 23: 'you',
 24: 'music',
 25: 'as',
 26: 'not',
 27: 'use',
 28: 'like',
 29: 'up',
 30: 'can',
 31: 'was',
 32: 'we',
 33: 'works',
 34: 'be',
 35: 'all',
 36: 'very',
 37: 'easy',
 38: 'just',
 39: 'sound',
 40: 'set',
 41: 'more',
 42: 'one',
 43: 'good',
 44: 'me',
 45: 'do',
 46: 'are',
 47: 'when',
 48: 'dot',
 49: 'product',
 50: 'an',
 51: 'or',
 52: 'amazon',
 53: 'if',
 54: 'get',
 55: 'speaker',
 56: 'device',
 57: 'has',
 58: 'home',
 59: '34',
 60: "it's",
 61: 'still',
 62: 'had',
 63: 'from',
 64: 'play',
 65: 'really',
 66: 'at',
 67: 'she',
 68: 'will',
 69: 'would',
 70: 'prime',
 71: 'am',
 72: 'what',
 73: 'other',
 74: 'our',
 75: 'time',
 76: 'out',
 77: 'also',
 78: 'smart',
 79: 'than',
 80: 'no',
 81: 'much',
 8

In [7]:
vocab_length = len(tokenizer.index_word)
vocab_length

3632

In [8]:
# text to sequence
train_sequence = tokenizer.texts_to_sequences(X_train)

In [9]:
# document length
doc_length = []
for doc in train_sequence:
    doc_length.append(len(doc))
max(doc_length)

555

In [10]:
np.quantile(doc_length,0.99)

151.96000000000004

In [11]:
max_len = 51
train_matrix = sequence.pad_sequences(train_sequence,maxlen=max_len)
train_matrix

array([[   0,    0,    0, ..., 1234,   19,   44],
       [   0,    0,    0, ...,  765,    6, 1521],
       [   0,    0,    0, ...,    4,   30,   45],
       ...,
       [   0,    0,    0, ...,  315,   16,   49],
       [   0,    0,    0, ...,    0,  525,  570],
       [   0,    0,    0, ...,    1,   11,  141]], dtype=int32)

In [12]:
# test data
test_sequence = tokenizer.texts_to_sequences(X_test)
test_matrix = sequence.pad_sequences(test_sequence,maxlen=max_len)
test_matrix

array([[  0,   0,   0, ..., 655,  67,   8],
       [249,   1, 132, ...,   4,  50, 976],
       [  0,   0,   2, ...,   7, 611, 134],
       ...,
       [  0,   0,   0, ..., 427,  11,  48],
       [  0,   0,   0, ...,  10,   1,  48],
       [  0,   0,   0, ..., 482,  15,  24]], dtype=int32)

In [13]:
## Neural Network
model = Sequential()
model.add(Embedding(input_dim = vocab_length+1, # Input vector length for each token
                    output_dim = 50, # Output vector length for each token
                    input_length=max_len, # Document length
                    mask_zero=True)) # 0 is not a token
model.add(Flatten())
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 51, 50)            181650    
                                                                 
 flatten (Flatten)           (None, 2550)              0         
                                                                 
Total params: 181,650
Trainable params: 181,650
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.add(Dense(64,activation="tanh"))
model.add(Dense(64,activation="tanh"))
model.add(Dense(1,activation="sigmoid"))

In [15]:
model.compile(optimizer="adam",loss="binary_crossentropy")
model.fit(train_matrix,y_train,epochs=5,batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fd015f64350>

In [16]:
y_pred = model.predict(test_matrix)

In [17]:
y_pred = np.where(y_pred >= 0.5,1,0)

In [18]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.56      0.41      0.47        73
           1       0.95      0.97      0.96       872

    accuracy                           0.93       945
   macro avg       0.75      0.69      0.72       945
weighted avg       0.92      0.93      0.92       945

