In [None]:
# Count how many times every token occurs across all tokenised reviews.
# NOTE(review): this cell uses `wordlist`, `defaultdict` and `Dictionary`, which
# are created/imported by cells that appear further down in the notebook.
frequency = defaultdict(int)
for tok in (tok for review in wordlist for tok in review):
    frequency[tok] += 1

# Shrink the vocabulary: a token is kept only when it occurs MORE than
# FREQ_THRESHOLD times (a token seen exactly FREQ_THRESHOLD times is dropped).
FREQ_THRESHOLD = 5

thresholded_wordlist = []
for review in wordlist:
    kept_tokens = [tok for tok in review if frequency[tok] > FREQ_THRESHOLD]
    thresholded_wordlist.append(kept_tokens)

# Build the gensim dictionary from the filtered reviews and report its size.
dictionary = Dictionary(thresholded_wordlist)
print("Number of Tokens - {}".format(len(dictionary)))

In [None]:
# Rebuild the tokenised reviews from scratch and cache them on disk.
# Processing every review is slow; set REBUILD_WORDLIST to False to skip the
# rebuild and rely on a previously pickled 'vocabulary.txt' instead.
REBUILD_WORDLIST = True
if REBUILD_WORDLIST:
    # One token list per review, produced in the row order of df['review'].
    wordlist = [process_text(review) for review in tqdm(df['review'], total=df.shape[0])]

    # The `with` block closes the file on exit, so no explicit close() is needed.
    with open('vocabulary.txt', 'wb') as vocabulary_file:
        pickle.dump(wordlist, vocabulary_file)

In [1]:
import pandas as pd
import spacy
import pickle
from tqdm import tqdm
from gensim.corpora import Dictionary
from sklearn.utils import shuffle
from collections import defaultdict
import numpy as np

# spaCy pipeline ('en_core_web_sm') used by process_text() to tokenise reviews.
nlp = spacy.load('en_core_web_sm')
# Maximum number of rows kept per label ('pos' and 'neg') when the dataset is loaded.
DATA_LIMIT = 1000



In [2]:
# Read the review table, split it by sentiment label, and keep at most
# DATA_LIMIT rows per label. Positive rows are placed ahead of negative rows.
df = pd.read_csv('./imdb_master.csv', encoding='latin1')
is_negative = df['label'] == 'neg'
is_positive = df['label'] == 'pos'
df_neg = df[is_negative]
df_pos = df[is_positive]
df = pd.concat((df_pos.iloc[:DATA_LIMIT], df_neg.iloc[:DATA_LIMIT]))

def process_text(input_string, return_string=False, stem=False):
    """Tokenise text with the module-level spaCy pipeline and filter the tokens.

    Only tokens that are purely alphabetic and are not stop words are kept.

    Args:
        input_string: Raw text to process.
        return_string: If truthy, return the kept tokens joined by single
            spaces instead of a list.
        stem: If truthy, emit each token's lemma (``tok.lemma_``); otherwise
            emit its lower-cased form (``tok.lower_``).

    Returns:
        A list of token strings, or a single space-separated string when
        ``return_string`` is truthy.
    """
    # Lazily filter the parsed tokens once; the branch below picks the attribute.
    kept = (tok for tok in nlp(input_string) if tok.is_alpha and not tok.is_stop)
    if stem:
        tokens = [tok.lemma_ for tok in kept]
    else:
        tokens = [tok.lower_ for tok in kept]
    return " ".join(tokens) if return_string else tokens

In [3]:
# Load the cached token lists (one list per review) from 'vocabulary.txt'.
# NOTE: pickle.load can execute arbitrary code while deserialising -- only load
# a 'vocabulary.txt' that was produced by this notebook's own rebuild cell.
with open('vocabulary.txt', 'rb') as vocabulary_file:
    wordlist = pickle.load(vocabulary_file)

In [4]:
# Sanity check: number of tokens kept for the first review.
len(wordlist[0])

69

In [11]:
# Create an untrained ("skeleton") Word2Vec model; the vocabulary is built and
# the model is trained on `wordlist` in the following cells.
import multiprocessing
from gensim.models.word2vec import Word2Vec

# Hyper-parameters collected in one place so they are easy to scan and tune.
w2v_params = dict(
    window=5,                             # context window size
    workers=multiprocessing.cpu_count(),  # one worker thread per CPU core
    iter=100,                             # number of training epochs
    min_count=1,                          # keep every word, however rare
    hs=1,                                 # NOTE(review): presumably selects hierarchical softmax -- confirm in gensim docs
    negative=0,                           # NOTE(review): presumably disables negative sampling -- confirm in gensim docs
)
word2vec_model = Word2Vec(**w2v_params)
print(word2vec_model)

Word2Vec(vocab=0, size=100, alpha=0.025)


In [12]:
# Build the model's vocabulary from the tokenised reviews (training happens in the next cell);
# the printed summary shows the resulting vocabulary size.
word2vec_model.build_vocab(wordlist)
print (word2vec_model)

Word2Vec(vocab=22449, size=100, alpha=0.025)


In [13]:
# Train the model on the tokenised reviews and measure the run time with IPython's %time magic.
# The example count and the number of epochs are both read back from the model itself.
%time word2vec_model.train(wordlist, total_examples=word2vec_model.corpus_count, epochs=word2vec_model.epochs)

Wall time: 29 s


(20549283, 21977100)

In [15]:
# Check that the vector learned for a single word has the expected number of features.
word2vec_model.wv['cat'].shape

(100,)

In [16]:
# List the words in the review corpus whose learned vectors are most similar to a given word.
word2vec_model.wv.most_similar('bellucci')

[('belucci', 0.44353532791137695),
 ('idolized', 0.3700087070465088),
 ('takagi', 0.3576205372810364),
 ('risk', 0.35502320528030396),
 ('romane', 0.3549274802207947),
 ('limos', 0.35447216033935547),
 ('revolving', 0.3485170900821686),
 ('sommerish', 0.34799548983573914),
 ('passages', 0.34680700302124023),
 ('prequels', 0.3446471691131592)]

In [18]:
# Similarity can also be queried with several words at once: here 'movie' and 'good'
# are passed as positive terms and 'bad' as a negative term.
word2vec_model.wv.most_similar(positive=['movie', 'good'], negative=['bad'])

[('movies', 0.43051135540008545),
 ('film', 0.423736572265625),
 ('undoubtably', 0.4197717607021332),
 ('paved', 0.3855682611465454),
 ('films', 0.3520013093948364),
 ('believe', 0.3511885106563568),
 ('rapped', 0.34690144658088684),
 ('immediately', 0.33001941442489624),
 ('heartening', 0.3207009434700012),
 ('cinema', 0.31249696016311646)]

#### Next, we carry out the following tasks:
1. Create a NumPy matrix (empty or randomly initialised) of shape [vocabulary size + 1, embedding dimension]
2. Load the word embeddings into that matrix
3. Create a Keras embedding layer with the same configuration and load the weights into it
4. Train an RNN/CNN to classify the reviews

In [17]:
# Build a gensim Dictionary from the tokenised reviews to map each word to an integer index,
# then show how many distinct words it holds.
vocabulary = Dictionary(wordlist)
len(vocabulary)

22449

In [20]:
# Tasks 1 & 2: build the embedding matrix and copy the trained word vectors into it.
# The embedding dimension (number of features per word) is read from the trained
# model instead of being hard-coded, so the matrix always matches the vectors
# that are stored in it below.
embedding_dim = word2vec_model.wv.vector_size

# One row more than the vocabulary: every word is shifted down by one so that
# row 0 stays all-zero and can stand for unknown tokens.
embedding = np.zeros((len(vocabulary) + 1, embedding_dim))

# Row i + 1 holds the vector of the word with dictionary index i.
for i in range(len(vocabulary)):
    embedding[i + 1] = word2vec_model.wv[vocabulary[i]]
embedding.shape

(22450, 100)

In [21]:
# Tasks 3 & 4...
# First we need keras to import and then create the embedding layer
import keras
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense

Using TensorFlow backend.


In [22]:
# Reviews have different lengths. Rather than deleting words by hand, each review
# is turned into a sequence of word indices and all sequences are brought to a
# common length of 200 with pad_sequences, using 0 as the fill value.
from keras.preprocessing.sequence import pad_sequences

# Un-padded inputs: dictionary indices shifted by +1 so that they line up with
# the rows of the embedding matrix, where row 0 is reserved.
# NOTE(review): doc2idx presumably yields -1 for out-of-dictionary words, which
# the +1 shift would map to 0 -- confirm against the gensim docs.
X = [np.array(vocabulary.doc2idx(review)) + 1 for review in wordlist]

train_x = pad_sequences(X, value=0, maxlen=200)

In [26]:
# Every padded review (here the first one and the one at index 1599) should have the same length, 200.
len(train_x[0]), len(train_x[1599])

(200, 200)

In [30]:
# Inspect one padded review: the extra positions introduced by padding are filled with 0.
train_x[100]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0, 4397,   31, 4378,  619, 4367,   17,  508,  372, 4388,  168,
        507,  192,  278,  308, 1033,   79, 1527, 4369, 3898, 4269,  858,
        406, 4125,  508, 1482,  312, 4377,  168,  973,  138,  308, 4389,
        521, 4394,  493,  805,   44,  624,   44,  516,   44,  516,  964,
        521,   31,   18,  411,  454,   31, 1163,  456, 4220,  203,  521,
       4372, 4368,  219, 3627,  159, 4390, 4371,  268, 4370,  138,  308,
       4397,  305,  473, 4373, 4395, 4380, 4376,  280, 4381, 4385, 4391,
       4383,  611, 1050,  619, 1560,  113,  372, 43

In [31]:
# Build the true labels: 1.0 for every positive review and 0.0 for every negative one.
# They are derived from df['label'] instead of hard-coding 1000 per class, so they
# stay correct if DATA_LIMIT changes or a class has fewer rows than the limit.
# NOTE(review): assumes `wordlist` was built from `df` in row order -- verify when
# loading a cached 'vocabulary.txt'.
train_y = (df['label'] == 'pos').values.astype(float)

In [32]:
# Assemble the classifier from three layers:
#   1. an embedding layer initialised with the Word2Vec weights and kept frozen,
#   2. a GRU (recurrent) layer with dropout,
#   3. a single sigmoid unit that outputs the positive/negative prediction.
embedding_layer = Embedding(len(vocabulary)+1, output_dim=embedding_dim, weights=[embedding], trainable=False)
recurrent_layer = GRU(128, dropout=0.2, recurrent_dropout=0.2)
output_layer = Dense(1, activation='sigmoid')

model = Sequential()
for layer in (embedding_layer, recurrent_layer, output_layer):
    model.add(layer)

In [33]:
# Compile the GRU classifier and train it.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keras carves the validation_split fraction off the END of the arrays, before any
# shuffling. The samples are ordered by class (positives first, then negatives),
# so without shuffling the validation set would contain negative reviews only and
# the training set would be heavily skewed. Shuffle once, with a fixed seed so the
# split is reproducible.
shuffle_order = np.random.RandomState(42).permutation(len(train_x))
shuffled_x, shuffled_y = train_x[shuffle_order], train_y[shuffle_order]

print('Train....')
model.fit(shuffled_x, shuffled_y, batch_size=128, epochs=15, validation_split=0.3)

Train....
Train on 1400 samples, validate on 600 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x2aa3a484c50>