# Import necessary depencencies

In [2]:
import pandas as pd
import numpy as np
import text_normalizer as tn
import model_evaluation_utils as meu
import nltk

np.set_printoptions(precision=2, linewidth=80)

# Load and normalize data

In [7]:
dataset = pd.read_csv(r'movie_reviews.csv')

# take a peek at the data
print(dataset.head())
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# build train and test datasets
train_reviews = reviews[:35000]
train_sentiments = sentiments[:35000]
test_reviews = reviews[35000:]
test_sentiments = sentiments[35000:]

# normalize datasets
stop_words = nltk.corpus.stopwords.words('english')
stop_words.remove('no')
stop_words.remove('but')
stop_words.remove('not')

norm_train_reviews = tn.normalize_corpus(train_reviews, stopwords=stop_words)
norm_test_reviews = tn.normalize_corpus(test_reviews, stopwords=stop_words)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


# Tokenize train & test datasets

In [30]:
tokenized_train = [tn.tokenizer.tokenize(text) for text in norm_train_reviews]
tokenized_test  = [tn.tokenizer.tokenize(text) for text in norm_test_reviews]

# Build Vocabulary Mapping (word to index)

In [41]:
from collections import Counter

# build word to index vocabulary
token_counter = Counter([token for review in tokenized_train for token in review])
vocab_map     = {item[0]: index+1 for index, item in enumerate(dict(token_counter).items())}
max_index     = np.max(list(vocab_map.values()))
vocab_map['PAD_INDEX']       = 0
vocab_map['NOT_FOUND_INDEX'] = max_index+1
vocab_size    = len(vocab_map)

# view vocabulary size and part of the vocabulary map
print('Vocabulary Size:', vocab_size)
print('Sample slice of vocabulary map:', dict(list(vocab_map.items())[10:20]))

Vocabulary Size: 84225
Sample slice of vocabulary map: {'first': 11, 'thing': 12, 'strike': 13, 'brutality': 14, 'unflinche': 15, 'scene': 16, 'violence': 17, 'set': 18, 'word': 19, 'go': 20}


# Encode and Pad datasets & Encode prediction class labels

In [42]:
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder

# get max length of train corpus and initialize label encoder
le          = LabelEncoder()
num_classes = 2 # positive -> 1, negative -> 0
max_len     = np.max([len(review) for review in tokenized_train])

## Train reviews data corpus
# Convert tokenized text reviews to numeric vectors
train_X = [[vocab_map[token] for token in tokenized_review] for tokenized_review in tokenized_train]
train_X = sequence.pad_sequences(train_X, maxlen=max_len) # pad 
## Train prediction class labels
# Convert text sentiment labels (negative\positive) to binary encodings (0/1)
train_y = le.fit_transform(train_sentiments)

## Test reviews data corpus
# Convert tokenized text reviews to numeric vectors
test_X = [[vocab_map[token] if vocab_map.get(token) else vocab_map['NOT_FOUND_INDEX'] 
           for token in tokenized_review] 
              for tokenized_review in tokenized_test]
test_X = sequence.pad_sequences(test_X, maxlen=max_len)
## Test prediction class labels
# Convert text sentiment labels (negative\positive) to binary encodings (0/1)
test_y = le.transform(test_sentiments)

# view vector shapes
print('Max length of train review vectors:', max_len)
print('Train review vectors shape:', train_X.shape, ' Test review vectors shape:', test_X.shape)

Max length of train review vectors: 1473
Train review vectors shape: (35000, 1473)  Test review vectors shape: (15000, 1473)


# Build the LSTM Model Architecture

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, SpatialDropout1D
from keras.layers import LSTM

In [7]:
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1473, 128)         10780800  
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 1473, 128)         0         
_________________________________________________________________
lstm (LSTM)                  (None, 64)                49408     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 10,830,273
Trainable params: 10,830,273
Non-trainable params: 0
_________________________________________________________________
None


# Visualize model architecture

In [8]:
!pip install pydot graphviz
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

SVG(model_to_dot(model, show_shapes=True, show_layer_names=False, 
                 rankdir='TB').create(prog='dot', format='svg'))

You should consider upgrading via the '/Users/Andy/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m
('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


AttributeError: 'NoneType' object has no attribute 'create'

# Train the model

In [9]:
batch_size = 100
model.fit(train_X, train_y, epochs=5, batch_size=batch_size, 
          shuffle=True, validation_split=0.1, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa989dd1fd0>

# Predict and Evaluate Model Performance

In [43]:
pred_test = model.predict_classes(test_X)
predictions = le.inverse_transform(pred_test.flatten())



In [44]:
print(pred_test)

[[0]
 [1]
 [0]
 ...
 [0]
 [1]
 [0]]


In [59]:
pred_test[:50]

array([[0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1]], dtype=int32)

In [60]:
print(predictions)

['negative' 'positive' 'negative' ... 'negative' 'positive' 'negative']


In [3]:
from keras.models import load_model

#model.save('LSTM_model.h5')  # creates a HDF5 file 'my_model.h5'
#del model  # deletes the existing model

# returns a compiled model
# identical to the previous one
model = load_model('LSTM_model.h5')

In [4]:
meu.display_model_performance_metrics(true_labels=test_sentiments, predicted_labels=predictions, 
                                      classes=['positive', 'negative'])  

NameError: name 'test_sentiments' is not defined

In [5]:
pred_test = model.predict()
predictions2 = le.inverse_transform(pred_test.flatten())

IndexError: list index out of range

In [54]:
tweets = np.array([
    ["An Englishman, a Scotsman and an Irishman walk into a bar. The Englishman wanted to go so they all had to leave. #Brexitjokes"],
    ["Why do we need any colour passport? We should just be able to shout, “British! Less of your nonsense!” and stroll straight through."],
    ["Q: With Britain leaving the EU how much space was created? A: Exactly 1GB"],
    ["VOTERS: we want to give a boat a ridiculous name UK: no VOTERS: we want to break up the EU and trash the world economy UK: fine"],
    ["#BrexitJokes How did the Brexit chicken cross the road? \"I never said there was a road. Or a chicken\"."],
    ["After #brexit, when rapper 50 cent performs in GBR he'll appear as 10.000 pounds. #brexitjokes"],
    ["I long for the simpler days when #Brexit was just a term for leaving brunch early."],
    ["Say goodbye to croissants, people. Delicious croissants. We're stuck with crumpets FOREVER."],
    ["Hello, I am from Britain, you know, the one that got tricked by a bus"],
    ["How many Brexiteers does it take to change a light bulb? None, they are all walked out because they didn’t like the way the electrician did it."]])

tweet_df = pd.DataFrame(tweets, columns=['tweet_content'])
tweet_df.head()

Unnamed: 0,tweet_content
0,"An Englishman, a Scotsman and an Irishman walk..."
1,Why do we need any colour passport? We should ...
2,Q: With Britain leaving the EU how much space ...
3,VOTERS: we want to give a boat a ridiculous na...
4,#BrexitJokes How did the Brexit chicken cross ...


In [55]:
norm_tweets = tn.normalize_corpus(tweet_df['tweet_content'], stopwords=stop_words)
tokenized_tweets  = [tn.tokenizer.tokenize(text) for text in norm_tweets]

# build word to index vocabulary
token_counter = Counter([token for review in tokenized_tweets for token in review])
vocab_map     = {item[0]: index+1 for index, item in enumerate(dict(token_counter).items())}
max_index     = np.max(list(vocab_map.values()))

vocab_map['PAD_INDEX']       = 0
vocab_map['NOT_FOUND_INDEX'] = max_index+1

vocab_size    = len(vocab_map)

# view vocabulary size and part of the vocabulary map
print('Vocabulary Size:', vocab_size)
print('Sample slice of vocabulary map:', dict(list(vocab_map.items())))

#get max length of train corpus and initialize label encoder
le          = LabelEncoder()
num_classes = 2 # positive -> 1, negative -> 0
max_len     = np.max([len(review) for review in tokenized_test])


## Test reviews data corpus
# Convert tokenized text reviews to numeric vectors
tweet_ready = [[vocab_map[token] for token in tokenized_review] for tokenized_review in tokenized_tweets]
tweet_ready = sequence.pad_sequences(tweet_ready, maxlen=max_len) # pad 


# view vector shapes
print('Max length of tweet review vectors:', max_len)
print('Tweet vectors shape:', tweet_ready.shape)

Vocabulary Size: 84
Sample slice of vocabulary map: {'englishman': 1, 'scotsman': 2, 'irishman': 3, 'walk': 4, 'bar': 5, 'want': 6, 'go': 7, 'leave': 8, 'brexitjoke': 9, 'need': 10, 'colour': 11, 'passport': 12, 'able': 13, 'shout': 14, 'british': 15, 'less': 16, 'nonsense': 17, 'stroll': 18, 'straight': 19, 'q': 20, 'britain': 21, 'eu': 22, 'much': 23, 'space': 24, 'create': 25, 'exactly': 26, 'gb': 27, 'voter': 28, 'give': 29, 'boat': 30, 'ridiculous': 31, 'name': 32, 'uk': 33, 'no': 34, 'break': 35, 'trash': 36, 'world': 37, 'economy': 38, 'fine': 39, 'brexitjokes': 40, 'brexit': 41, 'chicken': 42, 'cross': 43, 'road': 44, 'never': 45, 'say': 46, 'rapper': 47, 'cent': 48, 'perform': 49, 'gbr': 50, 'appear': 51, 'pound': 52, 'long': 53, 'simple': 54, 'day': 55, 'term': 56, 'brunch': 57, 'early': 58, 'goodbye': 59, 'croissant': 60, 'people': 61, 'delicious': 62, 'stick': 63, 'crumpet': 64, 'forever': 65, 'hello': 66, 'know': 67, 'one': 68, 'got': 69, 'trick': 70, 'bus': 71, 'many': 72

In [25]:
from collections import Counter

In [56]:
my_pred_test = model.predict(tweet_ready)
#

In [57]:
print(my_pred_test)

[[0.52]
 [0.99]
 [0.79]
 [0.86]
 [0.07]
 [0.45]
 [0.37]
 [0.93]
 [0.92]
 [0.31]]


In [35]:
from keras.preprocessing import sequence
from sklearn.preprocessing import LabelEncoder

In [68]:
predictions2 = le.inverse_transform(my_pred_test.flatten())

ValueError: y contains previously unseen labels: [0.07 0.31 0.37 0.45 0.52 0.79 0.86 0.92 0.93 0.99]

In [67]:
my_pred_test_trans = le.fit_transform(train_sentiments)

In [69]:
print(predictions2)

NameError: name 'predictions2' is not defined

In [71]:
predictions2 = [1 if p > 0.5 else 0 for p in my_pred_test]
predictions3 = ['Positive' if p > 0.5 else 'Negative' for p in my_pred_test]

In [72]:
print(predictions2)

[1, 1, 1, 1, 0, 0, 0, 1, 1, 0]


In [73]:
print(predictions3)

['Positive', 'Positive', 'Positive', 'Positive', 'Negative', 'Negative', 'Negative', 'Positive', 'Positive', 'Negative']
