# 03-Amazon-Reviews

## Data loading and basic preprocessing

In [1]:
# load the data
import pandas as pd
file = open('amazon_reviews.txt', 'r') 

In [2]:
data = [line for line in file]
data[0]

'__label__2 Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"\n'

In [3]:
len(data)

10000

In [4]:
y = [0 if x.split(' ')[0] == '__label__1' else 1 for x in data]
sentences = [x.split(' ', 1)[1][:-1].lower() for x in data]

In [5]:
sentences[:10]

['great cd: my lovely pat has one of the great voices of her generation. i have listened to this cd for years and i still love it. when i\'m in a good mood it makes me feel better. a bad mood just evaporates like sugar in the rain. this cd just oozes life. vocals are jusat stuunning and lyrics just kill. one of life\'s hidden gems. this is a desert isle cd in my book. why she never made it big is just beyond me. everytime i play this, no matter black, white, young, old, male, female everybody says one thing "who was that singing ?"',
 "one of the best game music soundtracks - for a game i didn't really play: despite the fact that i have only played a small portion of the game, the music i heard (plus the connection to chrono trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. there is an incredible mix of fun, epic, and emotional songs. those sad and beautiful tracks i especially like, as there's not too many of those kinds of s

## Data tokenization

In [6]:
from tensorflow.keras.preprocessing import text, sequence

max_features = 10000

# tokenization
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(sentences)
tokenized_sentences = tokenizer.texts_to_sequences(sentences)


In [7]:
import numpy as np

lengths = [len(seq) for seq in tokenized_sentences]
print('min length:', np.min(lengths))
print('max length:', np.max(lengths))
print('mean length:', np.mean(lengths))
print('median length:', np.median(lengths))

min length: 10
max length: 205
mean length: 77.2038
median length: 69.0


In [19]:
# padding
maxlen = 64

X = sequence.pad_sequences(tokenized_sentences, maxlen=maxlen)

In [20]:
len(X), len(y)

(10000, 10000)

In [49]:
X.shape

(10000, 64)

## Split the dataset

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

## Build the model

In [46]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Dropout

def my_RNN():

    model = Sequential()
    model.add(Embedding(input_dim=max_features, output_dim=32, input_length=maxlen))
    model.add(SimpleRNN(units=32, return_sequences=True))
    model.add(SimpleRNN(units=32, return_sequences=False))
    model.add(Dense(units=1, activation='sigmoid'))

    return model

In [47]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import TensorBoard

model = my_RNN()

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Define now our callbacks
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True)]

model.fit(x=X_train, y=y_train, validation_data=(X_test, y_test), epochs=50, batch_size=64, callbacks=callbacks)

Train on 9000 samples, validate on 1000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50


<tensorflow.python.keras.callbacks.History at 0x7f0f9d4f0b00>

In [48]:
from sklearn.metrics import accuracy_score

print('accuracy on train with NN:', accuracy_score(model.predict(X_train).round(), y_train))
print('accuracy on test with NN:', accuracy_score(model.predict(X_test).round(), y_test))

accuracy on train with NN: 1.0
accuracy on test with NN: 0.799
