# Recurrent Neural Networks


The following algorithm uses a Backpropagation with a SGD optimizer

### The IMDb Movie Review Dataset

In this section, we will train a simple logistic regression model to classify movie reviews from the 50k IMDb review dataset that has been collected by Maas et. al.

AL Maas, RE Daly, PT Pham, D Huang, AY Ng, and C Potts. Learning word vectors for sentiment analysis. In Proceedings of the 49th Annual Meeting of the Association for Computational Lin- guistics: Human Language Technologies, pages 142–150, Portland, Oregon, USA, June 2011. Association for Computational Linguistics

[Source: http://ai.stanford.edu/~amaas/data/sentiment/]

The dataset consists of 50,000 movie reviews from the original "train" and "test" subdirectories. The class labels are binary (1=positive and 0=negative) and contain 25,000 positive and 25,000 negative movie reviews, respectively. For simplicity, I assembled the reviews in a single CSV file.


## 1. Preprocess Data

We are going to import the data and tokenize using the Tokeinzer from keras


In [1]:
import re
import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

stop = stopwords.words('english')
porter = PorterStemmer()


def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    text = [w for w in text.split() if w not in stop]
    tokenized = [porter.stem(w) for w in text]
    return text


def featureVecMethod(words, model, num_features):
    featureVec = np.zeros(num_features,dtype="float32")
    nwords = 0

    index2word_set = set(model.wv.index2word)

    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])

    featureVec = np.divide(featureVec, nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    counter = 0
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    for review in reviews:
        if counter%1000 == 0:
            print("Review %d of %d"%(counter,len(reviews)))

        reviewFeatureVecs[counter] = featureVecMethod(review, model, num_features)
        counter = counter+1

    return reviewFeatureVecs


df = pd.read_csv('shuffled_movie_data.csv')


X = df['review']
y = df['sentiment']

In [6]:
sub_df = df[:10000]

X = sub_df['review']
y = sub_df['sentiment']

xx = np.array([tokenizer(i) for i in X])

In [86]:
wmodel = Word2Vec(xx, size=100, window=5, min_count=1, workers=4)
print(wmodel.wv.similar_by_word('paris'))

TypeError: 'int' object is not iterable

In [8]:
x_data = getAvgFeatureVecs(xx, wmodel, 100)

Review 0 of 10000




Review 1000 of 10000
Review 2000 of 10000
Review 3000 of 10000
Review 4000 of 10000
Review 5000 of 10000
Review 6000 of 10000
Review 7000 of 10000
Review 8000 of 10000
Review 9000 of 10000


In [120]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional

In [87]:
embed_dim = 128
lstm_out = 200
batch_size = 32

model = Sequential()
model.add(Embedding(100, embed_dim,input_length = 100, dropout = 0.2))
model.add(LSTM(lstm_out, dropout_U = 0.2, dropout_W = 0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

  
  import sys


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 100, 128)          12800     
_________________________________________________________________
lstm_14 (LSTM)               (None, 200)               263200    
_________________________________________________________________
dense_14 (Dense)             (None, 2)                 402       
Total params: 276,402
Trainable params: 276,402
Non-trainable params: 0
_________________________________________________________________
None


In [88]:
from keras.utils import to_categorical

In [89]:
yy = to_categorical(list(y))
print(yy)

[[0. 1.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [0. 1.]
 [0. 1.]]


In [90]:
print(x_data, yy)
model.fit(x_data, yy, batch_size =batch_size, nb_epoch = 1,  verbose = 5)

[[-0.07239918 -0.04538765  0.03477388 ... -0.29081944  0.33201894
   0.17413734]
 [ 0.00562736 -0.35102782  0.17479545 ... -0.5537581   0.15229909
   0.36488035]
 [-0.07945585 -0.24593239  0.13073416 ... -0.52389514  0.27994916
   0.39811411]
 ...
 [-0.0792755  -0.22955985  0.07283471 ... -0.46617076  0.25844458
   0.37747887]
 [-0.08648413 -0.10136423 -0.06094069 ... -0.37388954  0.23477365
   0.34942338]
 [-0.18089129 -0.3721752   0.31946075 ... -0.30288696  0.34896988
   0.03017386]] [[0. 1.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [0. 1.]
 [0. 1.]]


  


Epoch 1/1


InvalidArgumentError: indices[31,88] = -1 is not in [0, 100)
	 [[{{node embedding_13/embedding_lookup}} = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@training_4/Adam/Assign_2"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding_13/embeddings/read, embedding_13/Cast, training_4/Adam/gradients/embedding_13/embedding_lookup_grad/concat/axis)]]

In [91]:
from keras.preprocessing.text import Tokenizer

In [92]:
tokenizer = Tokenizer(nb_words=2500, lower=True,split=' ')



In [94]:
tokenizer.fit_on_texts(xx)

In [95]:
X = tokenizer.texts_to_sequences(xx)

In [101]:
X[0]

[2059,
 1389,
 1126,
 182,
 645,
 1327,
 191,
 2083,
 1764,
 205,
 476,
 1581,
 28,
 58,
 189,
 397,
 693,
 1368,
 942,
 1153,
 1352,
 1469,
 2314,
 1390,
 953,
 300,
 1933,
 1680,
 2196,
 2491,
 1328,
 372,
 146,
 2347,
 1259,
 1352,
 1144,
 443,
 2492,
 1329,
 1516,
 1622,
 540,
 180,
 819,
 476,
 476,
 5,
 119,
 1,
 165,
 10,
 476,
 58,
 55,
 121,
 2178,
 2059,
 517,
 253,
 890,
 876,
 120,
 214,
 819,
 476,
 1581,
 58,
 91,
 1352,
 332,
 638,
 2178,
 773,
 157,
 693,
 124,
 390,
 452,
 1269,
 1972,
 1488,
 298,
 1136]

In [102]:
from keras.preprocessing.sequence import pad_sequences

In [103]:
X = pad_sequences(X)

In [104]:
X.shape

(10000, 808)

In [121]:
embed_dim = 128
lstm_out = 200
batch_size = 32

model = Sequential()
model.add(Embedding(10000, embed_dim, input_length = X.shape[1], dropout = 0.2))
model.add(Bidirectional(LSTM(lstm_out, dropout_U = 0.2, dropout_W = 0.2)))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

  
  import sys


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 808, 128)          1280000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 400)               526400    
_________________________________________________________________
dense_16 (Dense)             (None, 2)                 802       
Total params: 1,807,202
Trainable params: 1,807,202
Non-trainable params: 0
_________________________________________________________________
None


In [122]:
Y = pd.get_dummies(y).values

In [123]:
Y

array([[0, 1],
       [1, 0],
       [1, 0],
       ...,
       [1, 0],
       [0, 1],
       [0, 1]], dtype=uint8)

In [124]:
import pandas as pd

In [125]:
from sklearn.model_selection import train_test_split

In [126]:
X_train, X_valid, Y_train, Y_valid = train_test_split(X,Y, test_size = 0.20, random_state = 36)

In [None]:
f= model.fit(X_train, Y_train, batch_size =batch_size, nb_epoch = 1,  verbose = 5)

  """Entry point for launching an IPython kernel.


Epoch 1/1


In [116]:
f.history

{'loss': [0.3032988443672657], 'acc': [0.875875]}