In [1]:
# importing basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import csv

In [7]:
# importing keras specific libraries
from keras.models import Sequential
from keras.layers import Dense, LSTM
# we are treating the text as a 1D representation so we'll be using 1D convolutions
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.embeddings import Embedding

# some auxiliary imports
from keras import callbacks
from keras.preprocessing import sequence
from keras.models import load_model
from keras.utils.vis_utils import plot_model
from keras.utils import np_utils

Using TensorFlow backend.


A brief intro to the dataset, our dataset consists of 4 columns, of which <b>Phrase</b> and <b> Sentiment</b> are important to us. The follow up sentences are just a breakdown of the main sentence. The <b> SentenceId</b> tells us that the following phrases are a part of the same sentence, it's just broken down into differnt phrases.

In [8]:
df = pd.read_csv('Data/train.tsv', sep='\t')
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [9]:
phrases = df.iloc[:,2:3].values
sentiments = df.iloc[:,3:4].values

In [10]:
# let's have a look at those
phrases[0:5]

array([['A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'],
       ['A series of escapades demonstrating the adage that what is good for the goose'],
       ['A series'],
       ['A'],
       ['series']], dtype=object)

In [11]:
sentiments[0:5]

array([[1],
       [2],
       [2],
       [2],
       [2]])

We need to preprocess the data, which means we need to get rid of the punctuations, and convert all the words to lowercase, so that while tokenizing, "The" and "the" are not considered as different.<br> We'll be using <b> re</b> for this.

In [12]:
def preprocess_phrase(phrase):
    """
    It's task is to remove the punctuation, and convert everything to lowercase
    Input: a string
    Returns: a string
    """
    phrase = re.sub("[^a-zA-Z]", " ", phrase)
    phrase = phrase.lower()
    return phrase

In [13]:
preprocess_phrase("Hi, i am Kishan Kumar.")

'hi  i am kishan kumar '

In [14]:
# let's build a list of string having clean phrases
clean_phrases = []
for phrase in phrases:
    clean_phrases.append(preprocess_phrase(phrase[0]))
# show first 5 clean phrases
clean_phrases[0:5]

['a series of escapades demonstrating the adage that what is good for the goose is also good for the gander   some of which occasionally amuses but none of which amounts to much of a story  ',
 'a series of escapades demonstrating the adage that what is good for the goose',
 'a series',
 'a',
 'series']

We'll be needing a vocabulary which means we want all the words that has appeared in our sentence.

In [15]:
# join all the clean phrases with a space between two phrases
full_text = ' '.join(clean_phrases)

In [16]:
full_text[0:300]

'a series of escapades demonstrating the adage that what is good for the goose is also good for the gander   some of which occasionally amuses but none of which amounts to much of a story   a series of escapades demonstrating the adage that what is good for the goose a series a series of escapades de'

In [17]:
words = full_text.split()
words[0:10]

['a',
 'series',
 'of',
 'escapades',
 'demonstrating',
 'the',
 'adage',
 'that',
 'what',
 'is']

<b>Note: warning</b><br>
it is highly possible that we might miss some of the words that are present in the test set and when it is presented to our network it just crashes. to account for that we'll also include those words

In [18]:
df_test = pd.read_csv('Data/test.tsv', sep='\t')
test_phrases = df_test.iloc[:, 2:3].values
clean_test_phrases = []
for phrase in test_phrases:
    clean_test_phrases.append(preprocess_phrase(phrase[0]))

full_text_test = ' '.join(clean_test_phrases)
test_words = full_text_test.split()
len(test_words)

423806

In [19]:
complete_words = test_words + words
len(complete_words)

1496427

In [20]:
# let's check what are the ratings that we have
np.unique(sentiments)

array([0, 1, 2, 3, 4])

We need to create a mapping of these words to integer, for e.g.<br>
"the" -> 0,<br>
"likes" -> 4,<br>
"Cosmos" -> 8, <br>
"kishan" -> 5,<br>
"universe" -> 9,<br>
so that a sentence like, "kishan likes cosmos" will be converted into [5,4,8]

In [21]:
from collections import Counter
counts = Counter(complete_words)
vocabulary = sorted(counts, key=counts.get, reverse=True)

In [22]:
# unique words that we have
len(vocabulary)

17582

In [23]:
# let's have a look at them
vocabulary[0:20]

['the',
 'a',
 'of',
 'and',
 'to',
 's',
 'in',
 'is',
 'that',
 'it',
 'as',
 'with',
 'for',
 'its',
 'film',
 'an',
 'movie',
 'this',
 'but',
 'be']

In [24]:
# let's create an encoding for each word
vocab2int = {word: ii for ii, word in enumerate(vocabulary, 1)}
vocab2int['person']

860

In [25]:
review_encoding = []
test_review_encoding = []
for phrase in clean_phrases:
    review_encoding.append([vocab2int[word] for word in phrase.split()])
test_review_encoding = []
for phrase in clean_test_phrases:
    test_review_encoding.append([vocab2int[word] for word in phrase.split()])

In [26]:
review_encoding[3:7]

[[2], [315], [3, 16533, 7722, 1, 8381, 9, 53, 8, 47, 13, 1, 3977], [3]]

In [27]:
# let's found out whether we have reviews with zero length
def check_zero(review_encoding):
    review_lens = Counter([len(x) for x in review_encoding])
    print("Zero length reviews: ", review_lens[0])
    print("Maximum lenght of review: ", max(review_lens))

In [28]:
check_zero(review_encoding)

Zero length reviews:  159
Maximum lenght of review:  48


In [29]:
# remove zero length reviews
# for this we will first get the indices of non zero reviews
ids = [i for i, review in enumerate(review_encoding) if len(review) != 0]

# overwrite
review_encoding = [review_encoding[i] for i in ids]
sentiments = [sentiments[i] for i in ids]

In [30]:
# check again for zero length review
check_zero(review_encoding)

Zero length reviews:  0
Maximum lenght of review:  48


Maximum length of the review is 48 and it is too large for RNN to remember, it will be better if we truncate these to a smaller steps, it is highly possible that we might miss the gist of it but it is highly unlikely.<br>
first lets find what is the average lenght of the review

In [31]:
tmp = []
for phrases in review_encoding:
    tmp.append(len(phrases))

print("average length of reviews: ", np.average(np.array(tmp)))
print("standard deviation of length of reviews: ", np.std(np.array(tmp)))

average length of reviews:  6.880141884914144
standard deviation of length of reviews:  6.559452929772498


In [32]:
# to take into account the most cases let's choose our length to be 13
MAX_REVIEW_LENGTH = 13
X_train = sequence.pad_sequences(review_encoding, maxlen=MAX_REVIEW_LENGTH)
X_test = sequence.pad_sequences(test_review_encoding, maxlen=MAX_REVIEW_LENGTH)

In [33]:
print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)

X_train shape:  (155901, 13)
X_test shape:  (66292, 13)


In [34]:
X_train[:5]

array([[   88,   591, 12283,    19,   621,     3,    88,  2832,     5,
           52,     3,     2,    42],
       [  315,     3, 16533,  7722,     1,  8381,     9,    53,     8,
           47,    13,     1,  3977],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     2,   315],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     2],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,   315]], dtype=int32)

In [35]:
# we also have 0 in our corpus, but as you can infer we do not have any word corresponding to 0
total_words = len(vocab2int) + 1
print(total_words)

17583


In [36]:
# one hot encoding the labels
y_train = np_utils.to_categorical(sentiments, 5)
print("y_train shape: ", y_train.shape)

y_train shape:  (155901, 5)


In [37]:
# now comes the training part
# we'll be using callbackls, what they do is to take a record of what has happened after each epoch
# ModelCheckpoints is used to save the model after evry epoch if the model accuracy has improved over test set
# EarlyStopping is used to stop the training if the model has not learned anything after a certain number of epochs
callbacks = [callbacks.ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True, save_weights_only=False),
            callbacks.EarlyStopping(monitor='val_loss', patience=5),
            callbacks.TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=False)]

In [39]:
# Model Architecture
model = Sequential()
model.add(Embedding(total_words, output_dim=64, input_length=MAX_REVIEW_LENGTH, dropout=0.5))
model.add(LSTM(100))
# model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
# model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
# model.add(MaxPooling1D(pool_size=2))
# add LSTM

model.add(Dense(units=5, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

  This is separate from the ipykernel package so we can avoid doing imports until


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 13, 64)            1125312   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               66000     
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 505       
Total params: 1,191,817
Trainable params: 1,191,817
Non-trainable params: 0
_________________________________________________________________
None


In [40]:
# let's train it
model.fit(X_train, y_train, validation_split=0.2, epochs=10, verbose=1, batch_size=32, callbacks=callbacks)

Instructions for updating:
Use tf.cast instead.
Train on 124720 samples, validate on 31181 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

KeyboardInterrupt: 

In [127]:
model.save("final.h5")

In [130]:
test_pred = model.predict_classes(X_test)

In [128]:
# testing phase
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

ImportError: Failed to import `pydot`. Please install `pydot`. For example with `pip install pydot`.

SyntaxError: invalid syntax (<ipython-input-129-6397f5d3110a>, line 1)

In [53]:
from keras.preprocessing.text import Tokenizer
token = Tokenizer()
token.fit_on_texts(clean_phrases)

In [56]:
# let's summarize what has been learned by the tokenizer
print(token.document_count)

156060


In [62]:
# integer encode documents
encoded_phrases = token.texts_to_matrix(clean_phrases, mode='binary')

In [63]:
encoded_phrases[0]

array([0., 1., 1., ..., 0., 0., 0.])