# RNN-LSTMs in Keras Demo with IMDB data

## This is the original RNN-LSTM-MLP for IMDB Classification {0,1}

Original Source: https://github.com/fchollet/keras/blob/master/examples/imdb_lstm.py

In [None]:
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM, SimpleRNN, GRU
from keras.datasets import imdb
from keras.optimizers import RMSprop
from sklearn.metrics import roc_auc_score

# Hyperparameters for the baseline sentiment classifier.
batch_size = 32     # handed to both model.fit and model.evaluate below
maxlen = 80         # cut texts after this number of words (among top max_features most common words)
max_features = 200  # vocabulary cap passed to imdb.load_data and the Embedding layer

# This is the original IMDB model: an LSTM feeding into a single sigmoid output unit

# Fetch the IMDB reviews, restricted to the max_features most common words,
# as separate train and test splits.
print('Loading data...')
train_split, test_split = imdb.load_data(nb_words=max_features)
X_train, y_train = train_split
X_test, y_test = test_split

for split, caption in ((X_train, 'train sequences'), (X_test, 'test sequences')):
    print(len(split), caption)

# Bring every review to the same length (maxlen) so each split becomes a
# rectangular (samples x time) array.
print('Pad sequences (samples x time)')
X_train, X_test = [sequence.pad_sequences(split, maxlen=maxlen)
                   for split in (X_train, X_test)]
for caption, padded in (('X_train shape:', X_train), ('X_test shape:', X_test)):
    print(caption, padded.shape)

print('Build model...')
# Layer stack, in order: word embedding -> LSTM -> one unit -> sigmoid.
# Passing the list to Sequential adds the layers in the same order as
# successive model.add(...) calls.
model = Sequential([
    Embedding(max_features, 128, dropout=0.2),
    LSTM(128, dropout_W=0.2, dropout_U=0.2),  # try using a GRU instead, for fun
    Dense(1),
    Activation('sigmoid'),
])

# try using different optimizers and different optimizer configs
compile_options = dict(loss='binary_crossentropy',
                       optimizer='adam',
                       metrics=['accuracy'])
model.compile(**compile_options)

print('Train...')
# One silent epoch; the test split doubles as the validation set here.
model.fit(X_train, y_train,
          batch_size=batch_size,
          nb_epoch=1,
          validation_data=(X_test, y_test),
          verbose=0)

# The model was compiled with one extra metric, so evaluate() yields a
# (loss, accuracy) pair.
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
for caption, value in (('Test score:', score), ('Test accuracy:', acc)):
    print(caption, value)

# Raw sigmoid outputs are used as ranking scores for the AUC.
ypred_tst = model.predict(X_test)
# Pretty good AUC
print('Test AUC:', roc_auc_score(y_test, ypred_tst))

## This is my own RNN-LSTM to predict the last word in a sentence

In [None]:
from __future__ import print_function
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM, SimpleRNN, GRU
from keras.datasets import imdb
from keras.optimizers import RMSprop
from sklearn.metrics import roc_auc_score

In [None]:
# Hyperparameters for the last-word prediction model.
batch_size = 32     # handed to model.fit and model.evaluate
maxlen = 80         # cut texts after this number of words (among top max_features most common words)
emb_dim = 10        # width of each learned word embedding
max_features = 500  # vocabulary cap; also the size of the softmax output layer

In [None]:
# Encoded reviews (capped at max_features distinct words) plus the
# word -> index vocabulary that ships with the dataset.
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
windx = imdb.get_word_index()
# Inverse lookup: index -> word.
# NOTE(review): load_data offsets the indices it returns by its index_from
# argument (3 unless overridden), so rwindx keys presumably do not line up
# one-to-one with the values inside X_train -- confirm against the Keras docs.
rwindx = {index: word for word, index in windx.items()}

In [None]:
# Proof it works: one lookup in each direction of the vocabulary mapping.
for table, key in ((windx, 'fawn'), (rwindx, 34701)):
    print(table[key])

In [None]:
# Size check: number of distinct entries np.unique finds in X_train, followed
# by the sizes of the forward and reverse vocabulary mappings.
len(np.unique(X_train)), len(windx), len(rwindx)

In [None]:
# Show the first training review in its encoded (integer) form.
print( X_train[0])

In [None]:
# What this looks like for a single row, decoded back into words.
# imdb.load_data shifts every vocabulary index by index_from (3 by default)
# and reserves 0, 1 and 2 for padding, start-of-sequence and out-of-vocabulary,
# so the token ids must be shifted back before looking them up in rwindx.
index_from = 3
special_tokens = {0: '<pad>', 1: '<start>', 2: '<oov>'}
print([special_tokens.get(x, rwindx.get(x - index_from, '<unused>'))
       for x in X_train[0]])

In [None]:
# Turn each review into an (input, target) pair: the target is the review's
# final word, the input is everything before it.
#
# The targets must be captured BEFORE the inputs are truncated. Taking i[-1]
# after the truncation would pick a word that is still part of the input,
# i.e. the label would leak into the features.

# Using just the last element as the target
Y_train = np.asarray([i[-1] for i in X_train])
Y_test = np.asarray([i[-1] for i in X_test])

# Removing the last element from the inputs. Reviews differ in length, so
# dtype=object keeps one Python list per row instead of asking NumPy to build
# a rectangular array (which recent NumPy versions refuse to do implicitly).
X_train = np.asarray([i[:-1] for i in X_train], dtype=object)
X_test = np.asarray([i[:-1] for i in X_test], dtype=object)

In [None]:
# Display the shapes of the test targets and the test inputs side by side.
Y_test.shape, X_test.shape

In [None]:
# RNN for the regular embeddings
# Nothing is re-loaded in this cell: it works on the X_train / X_test
# variables that already exist in the session.
print('Loading data...')

for split, caption in ((X_train, 'train sequences'), (X_test, 'test sequences')):
    print(len(split), caption)

print('Pad sequences (samples x time)')
# The pad sequences stuff seems suspicious
X_train, X_test = [sequence.pad_sequences(split, maxlen=maxlen)
                   for split in (X_train, X_test)]
for caption, padded in (('X_train shape:', X_train), ('X_test shape:', X_test)):
    print(caption, padded.shape)

In [None]:
print('Build model...')
# Embedding -> LSTM -> softmax over the whole (capped) vocabulary, so the
# network outputs one probability per candidate word.
model = Sequential()
model.add(Embedding(max_features, emb_dim, dropout=0.2))
model.add(LSTM(150))
model.add(Dense(max_features))
model.add(Activation('softmax'))

# Compile exactly once. The targets are integer word ids (not one-hot rows),
# which is what sparse_categorical_crossentropy expects. An earlier
# binary_crossentropy compile was dropped: it did not match the softmax
# output and was overridden by this call anyway.
# No extra metrics are requested, so model.evaluate returns a single loss.
optimizer = RMSprop(lr=0.01)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)

In [None]:
print('Train...')
# One silent epoch, with the test split reused as validation data.
model.fit(X_train, Y_train,
          batch_size=batch_size,
          nb_epoch=1,
          validation_data=(X_test, Y_test),
          verbose=0)

In [None]:
# Score the test split (the result is later formatted as a single float) and
# keep the raw model outputs for the test inputs for further inspection.
out = model.evaluate(X_test, Y_test, batch_size=batch_size)
ypred_tst = model.predict(X_test)

In [None]:
# `out` is formatted with a single %0.3f, so it is expected to be one number.
print('Cross Entropy Loss of %0.3f' % out)

In [None]:
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Most probable class per test sample.
yprdtst = np.argmax(ypred_tst, axis=1)
# confusion_matrix puts its first argument on the rows. The predictions are
# passed first here, so rows are predicted classes and columns are true classes.
x = confusion_matrix(yprdtst, Y_test).astype('float')
# Normalizing it: each column is divided by its total (the number of samples
# in that true class). A class that is predicted but never occurs in Y_test
# has a zero total; those columns are left at 0 instead of turning into NaN.
col_totals = x.sum(axis=0, keepdims=True)
x_n = np.divide(x, col_totals, out=np.zeros_like(x), where=col_totals != 0)

In [None]:
# Accuracy = share of all samples that sit on the confusion-matrix diagonal
# (x is a float array, so the division is a true division).
'Accuracy is %0.3f' % (np.diag(x).sum()/ x.sum())

In [None]:
# Heat map of the normalized confusion matrix x_n.
plt.figure(figsize=(15,10))
plt.imshow(x_n, cmap='OrRd')
plt.show()

In [None]:
# Number of weight arrays the model holds in total.
len(model.get_weights())

In [None]:
# Compare the token ids that actually occur with the size of the full
# min..max range; the two differ when some ids in the range never appear.
token_ids = np.unique(X_train)
token_ids, len(token_ids), len(range(np.min(X_train), np.max(X_train) + 1))  # Have to do +1

In [None]:
# Shape of the first weight array (presumably the embedding matrix, since the
# Embedding layer was added first -- confirm).
model.get_weights()[0].shape

In [None]:
# Shapes of every weight array, grouped per layer in model order.
[ [j.shape for j in l.get_weights()] for l in model.layers]

In [None]:
# Embedding layer: take the first weight array of the model's first layer
# (the Embedding layer is the first one added when the model is built).
embs = model.layers[0].get_weights()[0]
embs.shape

In [None]:
# Heat map of embs . embs^T, i.e. the dot product between every pair of rows
# of the embedding matrix.
plt.figure(figsize=(8,8))
plt.imshow(embs.dot(embs.T), cmap='OrRd')
plt.show()

In [None]:
# Summarize which token ids occur in the padded training matrix.
uniqs = np.unique(X_train)
# np.unique returns its result sorted, so the extremes are its two ends.
min_x, max_x = uniqs[0], uniqs[-1]
print(uniqs)
print("%i unique values"% len(uniqs) )
print('Ranging from %i to %i' % (min_x, max_x) )
# The id range is inclusive on both ends, so its size is always
# max - min + 1. The +1 must not depend on whether the minimum is 0, and
# max_x itself is left untouched so it keeps meaning "largest id".
print("Which means we have an index matrix with %i words" % (max_x - min_x + 1) )

In [None]:
# Dot-product similarity between the embedding of 'film' and every other row
# of the embedding matrix.
# -- Row 0 belongs to the padding value inserted by pad_sequences; it is not a word.
# -- imdb.load_data shifts every vocabulary index by index_from (3 by default)
# -- and reserves 1 and 2 for start-of-sequence and out-of-vocabulary, so the
# -- embedding row of a word is windx[word] + index_from, and row i decodes to
# -- rwindx[i - index_from].
index_from = 3
film_row = windx['film'] + index_from
# reshape(-1, 1) follows whatever embedding width the model was built with.
sims = embs.dot(embs[film_row].reshape((-1, 1)))
for i, si in enumerate(sims):
    if i == 0:
        continue
    print(i, rwindx.get(i - index_from, '<special>'), si)

# Mask the word's own row rather than deleting it: deleting shifts every later
# position down by one, so the reported index would be off for those rows.
ranked = sims.ravel().copy()
ranked[film_row] = -np.inf
print('Closest match is %i' % ranked.argmax())

In [None]:
# Word stored under index 3 in the reversed vocabulary.
# NOTE(review): presumably not the word that token id 3 stands for in X_train,
# because load_data offsets indices by index_from -- confirm.
rwindx[3]

In [None]:
# Here are the words in the model (padding id 0 excluded).
# imdb.load_data shifts vocabulary indices by index_from (3 by default) and
# reserves 1 and 2 for start-of-sequence and out-of-vocabulary, so token ids
# are shifted back before the rwindx lookup.
index_from = 3
special_tokens = {1: '<start>', 2: '<oov>'}
print([special_tokens.get(x, rwindx.get(x - index_from, '<unused>'))
       for x in np.unique(X_train).tolist() if x != 0])