# Research Question

We'll be using a Neural Network model and Natural Language Processing to perform sentinment analysis on reviews. Our goal is to build a model that can predict if a sentence is positive or negative.

In [1]:
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

# Data Preparation

The raw data must be modified from it's raw form in order to be used in out Neural Network.

## Import Data

We are using three differet data source files, each contains 500 sentences labeled positive or negative. The sentances are sourced from Amazon reviews, IMDB reviews, and Yelp reviews. 

We import each file in its ``.txt`` form and convert them into dataframes.

In [2]:
with open('data/amazon_cells_labelled.txt') as f:
    reader = csv.reader(f, delimiter='\t')
    amazon = list(reader)

amazon_df = pd.DataFrame(amazon, columns=['sentence', 'label'])
amazon_df.head()

Unnamed: 0,sentence,label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [3]:
with open('data/imdb_labelled.txt') as f:
    reader = csv.reader(f, delimiter='\t')
    imdb = list(reader)
    
imdb_df = pd.DataFrame(imdb, columns=['sentence', 'label'])
imdb_df.head()

Unnamed: 0,sentence,label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [4]:
with open('data/yelp_labelled.txt') as f:
    reader = csv.reader(f, delimiter='\t')
    yelp = list(reader)

yelp_df = pd.DataFrame(yelp, columns=['sentence', 'label'])
yelp_df.head()

Unnamed: 0,sentence,label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


Now we concat the three dataframes into a single one.

In [5]:
df = pd.concat([amazon_df, imdb_df, yelp_df])

df['label'] = df['label'].astype(int)
df.head()

Unnamed: 0,sentence,label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [6]:
# chapter 16, page 536
def preprocess(X_batch, y_batch):
    X_batch = tf.strings.regex_replace(X_batch, rb"<br\s*/?>", b" ")
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b" ")
    X_batch = tf.strings.split(X_batch)
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch

X, y = preprocess(df['sentence'], df['label'])

In [7]:
from collections import Counter
vocabulary = Counter()
for text in X:
    vocabulary.update(list(text.numpy()))

print(vocabulary.most_common(5))
print('Total unique words: ', len(vocabulary))

[(b'<pad>', 3528719), (b'the', 1480), (b'and', 1110), (b'I', 864), (b'a', 851)]
Total unique words:  5927


In [8]:
vocab_size = int(len(vocabulary) / 2)
truncated_vocab = [w for w, c in vocabulary.most_common()[:vocab_size]]
print('New Vocab Size: ', len(truncated_vocab))

New Vocab Size:  2963


In [26]:
words = tf.constant(truncated_vocab)
word_ids = tf.range(len(truncated_vocab), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)


def encode_words(X_batch, y_batch):
    X_, y_ = preprocess(X_batch, y_batch)
    return table.lookup(X_), y_

X_train, X_test, y_train, y_test = train_test_split(df['sentence'], df['label'], test_size=0.2, random_state=42)
X_train, y_train = encode_words(X_train, y_train)
X_test, y_test = encode_words(X_test, y_test)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
embed_size = 128
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), batch_size=32)

Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 

## Exploratory Data Analysis

We want to explore the dataset more before we built our model. In particular, we want to know more about the length and word count of our sentences.

In [None]:
# count characters and words
df['chars'] = df['sentence'].apply(len)
df['words'] = df['sentence'].apply(lambda x: len(x.split()))
df.head()

In [None]:
plt.hist(df['words'], bins=50)
plt.xlim(0, 100)
plt.show()

avg_words = np.mean(df['words'])
print('Average words: ', avg_words)
print('Max words: ', np.max(df['words']))
print('Min words: ', np.min(df['words']))

### Vocabulary

In [None]:
count_vectorizer = CountVectorizer()
word_vector = count_vectorizer.fit(df['sentence'])
words = word_vector.get_feature_names_out()
word_counts = pd.DataFrame(dict(features= words, count = np.sum(word_vector.transform(df['sentence']).toarray(),axis=0)))
print('Total number of unique words:', len(word_counts))

In [None]:
word_counts.sort_values('count', ascending=False)[0:10]

## Tokenization

Now we tokenize the sentences using keras preprocessing. Then we can look at the lengths of the sentances.

In [None]:
# Tokenize training and testdata
tok = keras.preprocessing.text.Tokenizer()
tok.fit_on_texts(df['sentence'])
X = tok.texts_to_sequences(df['sentence'])

" ".join(map(str,X[0]))

## Padding

As shown above the average sentence length is 13 words. We want our input sentences to all be the same length, so we'll pad all our sentances to be 13 words long. 

In [None]:
# Pad sequences so each is the length of 13, the average
X= keras.preprocessing.sequence.pad_sequences(X, padding='post', maxlen=avg_words)

reverse_word_map = dict(map(reversed, tok.word_index.items()))

' '.join(reverse_word_map[i] for i in X[0] if i!=0) # exclude 0 due to padding

# Network Architecture



In [None]:
vocab_size = len(tok.word_index) + 1

# Create Model and layers
# model = keras.Sequential([
#     keras.layers.Embedding(vocab_size, 64),
#     keras.layers.GlobalAveragePooling1D(),
#     keras.layers.Dense(16, activation=tf.nn.relu),
#     keras.layers.Dropout(0.1),
#     keras.layers.Dense(1, activation=tf.nn.sigmoid)
# ])

model = keras.Sequential([
    keras.layers.Embedding(vocab_size, 64),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid)
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.3, random_state=42)
#Fit model
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test), batch_size=32)

In [None]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Validation Accuracy:  {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
history_dict = history.history

acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)


plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
plt.clf()  

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()