# Amazon Reviews Analysis - Neural Networks
---
<b>By David Penny<b>

In [1]:
# Let's start by importing our (very) favorite libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

<b> Read in the clean data <b>
    
Let's start off by reading in three versions of the data to use as comparision.
    
The first is much larger as it has not been downsampled.

In [14]:
df_final = pd.read_csv('data/df_final.csv')

In [15]:
display(df_final.head(1))
print(df_final.shape)

Unnamed: 0,doc_id,falsified,rating,verified_purchase,product_category,product_id,product_title,review_title,review_text_x,sentiment,...,mean_word_len,num_chars,num_punctuations,num_scentences_in_text,flesch_ease,flesch_kincaid_grade,automated_readability_index,overall_readability_index,total_sentiment,average_review_sentiment
0,1,1,4,0,PC,B00008NG7N,"Targus PAUK10U Ultra Mini USB Keypad, Black",useful,"When least you think so, this product will sav...",1,...,4.086957,116,3,2,102.1,1.9,3.6,5.0,6.0,3.0


(20809, 24)


In [16]:
df_numerical = pd.read_csv('data/df_numerical.csv').drop(columns='Unnamed: 0')

In [17]:
display(df_numerical.head(1))
print(df_numerical.shape)

Unnamed: 0,falsified,rating,verified_purchase,sentiment,num_words_in_text,num_stopwords,num_words_in_text_no_stop,num_unique_words,mean_word_len,num_chars,num_punctuations,num_scentences_in_text,flesch_ease,flesch_kincaid_grade,automated_readability_index,overall_readability_index,total_sentiment,average_review_sentiment
0,1,4,0,1,23,11,12,21,4.086957,116,3,2,102.1,1.9,3.6,5.0,6.0,3.0


(19989, 18)


In [18]:
df_downsampled = pd.read_csv('data/df_downsampled.csv').drop(columns='Unnamed: 0')

In [19]:
display(df_downsampled.head(1))
print(df_downsampled.shape)

Unnamed: 0,falsified,verified_purchase,sentiment,num_punctuations,flesch_kincaid_grade,overall_readability_index,total_sentiment,average_review_sentiment
0,1,1,1,5,10.5,11.0,16.0,8.0


(9104, 8)


In [6]:
df_downsampled.columns

Index(['falsified', 'verified_purchase', 'sentiment', 'num_punctuations',
       'flesch_kincaid_grade', 'overall_readability_index', 'total_sentiment',
       'average_review_sentiment'],
      dtype='object')

In [20]:
# Here we choose which version of the df we will use
#dataframe = df_final
#dataframe = df_numerical
dataframe = df_downsampled

<b> Dummy Classifier <b>

Let's get a baseline to see how a dummy classifier would score.

In [21]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Separate input features and target
y = dataframe['falsified']
X = dataframe.drop(columns='falsified', axis=1)

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

# DummyClassifier to predict only target 0
dummy = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)

# checking unique labels
print('Unique predicted labels: ', (np.unique(dummy_pred)))

# checking accuracy
print('Test score: ', accuracy_score(y_test, dummy_pred))

Unique predicted labels:  [1]
Test score:  0.4995606326889279


### 1. Basic NN Model

In [22]:
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

Let's start off by testing a very basic NN model to get another baseline for our accuracy potential.

/// CAUTION /// This takes a long time to run!

In [26]:
y = dataframe['falsified']
X = dataframe.drop(columns='falsified')

print(y.shape)
print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

NN_model = MLPClassifier(hidden_layer_sizes=(17,128,128,128,128,2), solver='lbfgs', max_iter=10000)
NN_model.fit(X,y);

print(f'The test accuracy is: {NN_model.score(X,y):0.3f}')

(9104,)
(9104, 7)
The test accuracy is: 0.482


### 2. Tensorflow

In [28]:
import tensorflow as tf
import datetime
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [10]:
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

5826 train examples
1457 validation examples
1821 test examples


In [11]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('falsified')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

In [12]:
# Setting up our data sets
batch_size = 5 # A small batch sized is used for demonstration purposes
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [13]:
# Describing our data sets
for feature_batch, label_batch in train_ds.take(1):
  print('Every feature:', list(feature_batch.keys()))
  print('A batch of ages:', feature_batch['total_sentiment'])
  print('A batch of targets:', label_batch )

Every feature: ['verified_purchase', 'sentiment', 'num_punctuations', 'flesch_kincaid_grade', 'overall_readability_index', 'total_sentiment', 'average_review_sentiment']
A batch of ages: tf.Tensor([ 9.  4. 16.  9. 13.], shape=(5,), dtype=float64)
A batch of targets: tf.Tensor([1 1 0 1 0], shape=(5,), dtype=int64)


In [14]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('falsified')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

In [15]:
batch_size = 5 # A small batch sized is used for demonstration purposes
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [16]:
for feature_batch, label_batch in train_ds.take(1):
  print('Every feature:', list(feature_batch.keys()))
  print('A batch of ages:', feature_batch['total_sentiment'])
  print('A batch of targets:', label_batch )

Every feature: ['verified_purchase', 'sentiment', 'num_punctuations', 'flesch_kincaid_grade', 'overall_readability_index', 'total_sentiment', 'average_review_sentiment']
A batch of ages: tf.Tensor([14. 15. 12. 10. 22.], shape=(5,), dtype=float64)
A batch of targets: tf.Tensor([1 1 0 1 1], shape=(5,), dtype=int64)


In [17]:
# We will use this batch to demonstrate several types of feature columns
example_batch = next(iter(train_ds))[0]

In [25]:
feature_columns = []

# DOWNSAMPLED numeric cols
for header in ['verified_purchase', 'sentiment', 'num_punctuations',
       'flesch_kincaid_grade', 'overall_readability_index', 'total_sentiment',
       'average_review_sentiment']:
  feature_columns.append(feature_column.numeric_column(header))

In [26]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [27]:
batch_size = 50
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [55]:
# Clear any logs from previous runs
!rm -rf ./logs/ 

<b> Here's where the magic happens: <b>

In [61]:
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(4, activation='sigmoid'),
  layers.Dense(128, activation='sigmoid'),
  layers.Dense(128, activation='sigmoid'),
  layers.Dense(128, activation='sigmoid'),
  layers.Dense(1)
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

model.fit(train_ds,
          validation_data=val_ds,
          callbacks=[tensorboard_callback],
          epochs=25)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x14da080b8>

In [62]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.8215266466140747


<b> Summary <b>
    
It's not bad! But I was hoping that we would score much higher than 82%.

In [58]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [300]:
%tensorboard --logdir logs/fit

ERROR: Failed to launch TensorBoard (exited with 1).
Contents of stderr:
Traceback (most recent call last):
  File "/Users/davepenny/opt/anaconda3/envs/deeplearning/bin/tensorboard", line 8, in <module>
    sys.exit(run_main())
  File "/Users/davepenny/opt/anaconda3/envs/deeplearning/lib/python3.6/site-packages/tensorboard/main.py", line 65, in run_main
    default.get_plugins() + default.get_dynamic_plugins(),
  File "/Users/davepenny/opt/anaconda3/envs/deeplearning/lib/python3.6/site-packages/tensorboard/default.py", line 125, in get_dynamic_plugins
    "tensorboard_plugins"
  File "/Users/davepenny/opt/anaconda3/envs/deeplearning/lib/python3.6/site-packages/tensorboard/default.py", line 124, in <listcomp>
    for entry_point in pkg_resources.iter_entry_points(
  File "/Users/davepenny/opt/anaconda3/envs/deeplearning/lib/python3.6/site-packages/pkg_resources/__init__.py", line 2460, in load
    self.require(*args, **kwargs)
  File "/Users/davepenny/opt/anaconda3/envs/deeplearning/lib