<h1>Sentiment Analysis</h1>
<hr/>

<p>Using the IMDB dataset of movie reviews, predict whether a review is positive or negative.

The dataset can be downloaded from <a href='http://ai.stanford.edu/%7Eamaas/data/sentiment/' target='_blank'>here</a>.</p>

In [None]:
import os
import re
import nltk
import pickle
import gensim
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [None]:
import warnings
# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

<h2>Cleaning and tokenizing helper functions</h2>

In [None]:
# Contraction table: one entry per line, the contraction first and its
# expansion after the first run of whitespace.
replacements = """aren't    are not
can't   cannot
couldn't    could not
didn't  did not
doesn't does not
don't   do not
hadn't  had not
hasn't  has not
haven't have not
he'd    he would
he'll   he will
he's    he is
i'd i would
i'll    i will
i'm i am
i've    i have
isn't   is not
it's    it is
let's   let us
mustn't must not
shan't  shall not
she'd   she would
she'll  she will
she's   she is
shouldn't   should not
that's  that is
there's there is
they'd  they would
they'll they will
they're they are
they've they have
we'd    we would
we're   we are
we've   we have
weren't were not
what'll what will
what're what are
what's  what is
what've what have
where's where is
who'd   who would
who'll  who will
who're  who are
who's   who is
who've  who have
won't   will not
wouldn't    would not
you'd   you would
you'll  you will
you're  you are
you've  you have"""

# expand words with apostrophe
# Each entry becomes a [contraction, expansion] pair. Splitting on the whole
# whitespace run (\s+) keeps the column padding out of the expansion text.
splitted = [
    re.split(r'\s+', line, maxsplit=1)
    for line in replacements.split('\n')
]

# load stopwords
# One stopword per line in stopwords.txt, lower-cased on load. They are kept in
# a set so the per-token membership test during cleaning is a constant-time
# lookup rather than a scan over a list.
stopwords = set()
with open('stopwords.txt', 'r') as f:
    stopwords = set(f.read().lower().split('\n'))

def expand_sent(sent):
    """Replace every contraction listed in ``splitted`` with its expansion.

    Each entry of ``splitted`` is used as (regex pattern, replacement) and the
    entries are applied one after another, in table order.

    Args:
        sent: The text to expand. Matching is case-sensitive.

    Returns:
        The text after all substitutions have been applied.
    """
    expanded = sent
    for entry in splitted:
        pattern, replacement = entry[0], entry[1]
        expanded = re.sub(pattern, replacement, expanded)
    return expanded

def remove_stopwords(tokens):
    """Return a new list with the tokens that are not in ``stopwords``.

    Args:
        tokens: Iterable of word tokens.

    Returns:
        A list of the kept tokens, in their original order.
    """
    return [token for token in tokens if token not in stopwords]

def clean_text(sent):
    """Normalise a raw review into lower-case text without punctuation.

    Steps, in order: lower-case the text and expand contractions with
    ``expand_sent``; drop the possessive ``'s``; turn hyphens into spaces;
    blank out punctuation, newlines and HTML tags; squeeze runs of a repeated
    character down to two; collapse repeated spaces and strip both ends.

    Args:
        sent: The raw text to clean.

    Returns:
        The cleaned string.
    """
    sent = expand_sent(sent.lower())
    # removes the possessive 's, eg: amy's will become amy. The \b restricts
    # this to an 's that ends a word, so an opening quote in front of a word
    # starting with "s" ('scary') no longer loses its first letter.
    sent = re.sub(r"'s\b", "", sent)
    # replaces the hyphen in hyphen-joined words with a space
    sent = sent.replace('-', ' ')
    # removes punctuation, extra characters and html tags
    sent = re.sub(r'[\'"!@:.,?#*\n()]|(<.*?>)', " ", sent)
    # shortens any run of 2 or more identical characters to exactly 2
    # NOTE(review): this applies to every character, digits included
    # (eg: 1000 becomes 100) - confirm that is intended
    sent = re.sub(r'(.)\1+', r'\1\1', sent)
    # collapses runs of spaces into a single space
    sent = re.sub(r' +', " ", sent)
    return sent.strip()

<h2>Load and create the dataset</h2>

In [None]:
# load the dataset
dataset_path = './IMDB_dataset/aclimdb'
reviews = []   # one list of cleaned tokens per review
labels = []    # 1 for a review read from pos/, 0 for one read from neg/
max_len = -1   # token count of the longest review seen so far

# Visit train/ then test/, and inside each pos/ before neg/, so the order of
# `reviews` and `labels` is the same as with two separate loops.
for split_name in ['train', 'test']:
    for label_dir, label in [('pos', 1), ('neg', 0)]:
        label_path = os.path.join(dataset_path, split_name, label_dir)
        for file_name in tqdm(os.listdir(label_path)):
            # Decode explicitly instead of relying on the platform default
            # encoding, which is not UTF-8 everywhere.
            # NOTE(review): assumes the review files are UTF-8 - confirm
            with open(os.path.join(label_path, file_name), 'r', encoding='utf-8') as f:
                text = remove_stopwords(nltk.word_tokenize(clean_text(f.read())))
            max_len = max(max_len, len(text))
            reviews.append(text)
            labels.append(label)

<h3>Load pretrained embeddings</h3>

<p>Download the pretrained word2vec model from <a href='https://wikipedia2vec.github.io/wikipedia2vec/pretrained/' target='_blank'>here</a></p>

In [None]:
# load the embedding
# word2vec.txt is read in the text word2vec format (binary=False);
# limit=500000 caps how many word vectors are read from the file.
embedding = gensim.models.KeyedVectors.load_word2vec_format('word2vec.txt', binary=False, limit=500000)

# report how many words were actually loaded
print('Vocab size:', len(embedding.wv.vocab))

In [None]:
# uncomment below code to save the limited embeddings to new file
# embedding.save_word2vec_format('small_word2vec.txt', binary=False)

In [None]:
def word2idx(word):
    """Return the index stored for ``word`` in ``embedding.wv.vocab``.

    The lookup fails for a word that is not in the loaded vocabulary
    (presumably with KeyError, if the vocabulary is dict-like).
    """
    vocab_entry = embedding.wv.vocab[word]
    return vocab_entry.index

def idx2word(idx):
    """Return the word stored at position ``idx`` of ``embedding.wv.index2word``."""
    index_to_word = embedding.wv.index2word
    return index_to_word[idx]

<h2>Prepare the data for the ML model</h2>

In [None]:
# pad the reviews to same length
# X starts as all zeros, so every position after a review's last kept token
# stays 0 and acts as padding.
X = np.zeros([len(reviews), max_len], dtype=np.int32)
y = np.array(labels, dtype=np.int32)

for i, review in enumerate(tqdm(reviews)):
    t = 0  # next free column in row i
    for word in review:
        try:
            X[i, t] = word2idx(word)
        except KeyError:
            # The word is not in the pretrained vocabulary: skip it instead of
            # aborting the whole loop, and leave no gap in the row.
            continue
        t += 1

In [None]:
# shuffle the samples, then hold out 20% of them as the test set
# (no random_state is passed, so the split is not pinned to a seed here)
splits = train_test_split(X, y, test_size=0.2, shuffle=True)
X_train, X_test, y_train, y_test = splits

print(f"Train set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

<h2>Create a model</h2>

In [None]:
import tensorflow as tf
# As the last expression of the cell, this displays the installed TensorFlow version.
tf.__version__

In [None]:
def plot_graphs(history, metric):
    """Plot a metric and its ``val_`` counterpart against the epoch number.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch values (here, the value returned by ``model.fit``).
        metric: Name of the metric to plot; ``'val_' + metric`` is plotted
            on the same axes.
    """
    val_metric = 'val_'+metric
    curves = history.history
    plt.plot(curves[metric])
    plt.plot(curves[val_metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])
    plt.show()

In [None]:
# load the embedding weights and get other values
# NOTE(review): `syn0` is presumably the matrix of word vectors, one row per
# vocabulary word - confirm against the installed gensim version
embedding_weights = embedding.wv.syn0
# the two dimensions of the matrix are used to size the Embedding layer of the model
vocab_size, embedding_size = embedding_weights.shape

In [None]:
# define the model, loss and optimizer, and print the summary
model = tf.keras.Sequential([
    # token indices -> word vectors, initialised from the pretrained weights
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_size, weights=[embedding_weights]),
    # first recurrent layer returns the full sequence so a second one can be stacked on it
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100)),
    tf.keras.layers.Dense(64, activation='tanh'),
    tf.keras.layers.Dropout(0.25),
    # Sigmoid turns the single output into a probability in [0, 1]. The loss
    # below is configured with from_logits=False and the accuracy metric
    # thresholds at 0.5, so both need a probability rather than a raw logit.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=tf.keras.optimizers.Adam(1e-3),
              metrics=['accuracy'])

model.summary()

In [None]:
# Train for 15 epochs in batches of 200, evaluating on the held-out split after
# each epoch. validation_data is given as an (x, y) tuple, the form the Keras
# documentation specifies.
history = model.fit(X_train, y_train, epochs=15, validation_data=(X_test, y_test), batch_size=200)

In [None]:
# one figure per tracked metric: training curve and validation curve together
for metric_name in ('loss', 'accuracy'):
    plot_graphs(history, metric_name)

In [None]:
# run this cell if you are satisfied with the model performance
# writes the trained model to model.h5 in the current working directory
model.save('model.h5')

<h2>Test the model on new reviews</h2>

In [None]:
# run the pipeline
def predict_sentiment(text):
    """Clean, tokenize and index a review, then print the model's prediction.

    The text goes through the same steps as the training data: ``clean_text``,
    ``nltk.word_tokenize``, ``remove_stopwords`` and ``word2idx``. Words missing
    from the embedding vocabulary are skipped, and tokens beyond ``max_len``
    are dropped, so arbitrary new text cannot overflow the input row.

    Args:
        text: The raw review text.

    Returns:
        None. The array returned by ``model.predict`` is printed.
    """
    tokens = remove_stopwords(nltk.word_tokenize(clean_text(text)))
    X = np.zeros([1, max_len], dtype=np.int32)
    position = 0  # next free column of the single input row
    for word in tokens:
        if position >= max_len:
            # the input row is full; the remaining tokens do not fit
            break
        try:
            X[0, position] = word2idx(word)
        except KeyError:
            # word is not in the pretrained vocabulary
            continue
        position += 1
    y_pred = model.predict(X)
    print(y_pred)

In [None]:
# try the pipeline on a short hand-written review
r = 'It is an awesome film. Kudos to the actors. Must watch!'
predict_sentiment(r)