## Downloading IMDB

In [27]:
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
import numpy as np

### Preprocessing the labels of the raw IMDB data

In [3]:
imdb_dir = '/home/jupyter/deep_learning_python/data/imdb/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

In [4]:
labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)

In [5]:
print(labels[0:10])
print(texts[0:2])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
["It's difficult to put into words the almost seething hatred I have of this film. But I'll try:<br /><br />Every other word was an expletive, the sex scenes were uncomfortable, drugs were rampant and stereotyping was beyond the norm, if not offensive to Italian-Americans.<br /><br />I'm not saying the acting was terrible, because Leguizamo, Sorvino, Brody, Espisito et. al, performed well. But...almost every character in the film I despised. Not since The Bonfire of the Vanities have I disliked every character on screen.", "My teacher taped this and showed it to us in Child Care to demonstrate how teen pregnancy affects people. It just demonstrated how teen pregnancy affects a childish jock not properly educated on how sex works and a whiny, unloved girl who throws fruit when angry and couldn't tell she was with the wrong man even if he wore a sign stating he was such. I wouldn't be surprised if the father of the baby had about eight girlfriends in the fi

### Tokenizing the text of the raw IMDB data

In [6]:
# Cuts off reviews after 100 words
maxlen = 100

# Trains on 200 samples
training_samples = 200

# Validate on 10000 samples
validation_samples = 10000

# Consider only the top 10000 words in the dataset
max_words = 10000

In [7]:
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [18]:
#sequences is a list of lists
# sequences[0:2]

In [8]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' %len(word_index))

Found 88584 unique tokens.


In [9]:
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

('Shape of data tensor:', (25000, 100))
('Shape of label tensor:', (25000,))


In [35]:
# Splits data into a training set and a validation set, but first shuffles the data,
# since we are starting with data in ehich samples are ordered (all negative first)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

In [11]:
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples:training_samples + validation_samples]
y_val = labels[training_samples:training_samples + validation_samples]

### Parsing the GloVe word-embedding file

In [14]:
globe_dir = '/home/jupyter/deep_learning_python/data/pretrained_models/embeddings/glove.6B'
embeddings_index = {}
f = open(os.path.join(globe_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype = 'float32')
    embeddings_index[word] = coefs
    
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [17]:
len(embeddings_index['the'])

100

### Preparing the GloVe word-embedding index 

In [25]:
embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

### Defining the model

In [29]:
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length = maxlen))
model.add(Flatten())
model.add(Dense(32, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


### Loading the GloVe embedding in the model

In [31]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

### Training and evaluating the model

In [34]:
model.compile(
    optimizer = 'rmsprop',
    loss = 'binary_crossentropy',
    metrics = ['acc']
)
history = model.fit(
    x_train,
    y_train,
    epochs = 10,
    batch_size = 32,
    validation_data = (x_val, y_val)
)
model.save_weights('/home/jupyter/deep_learning_python/models/pre_trained_glove_model.h5')

Instructions for updating:
Use tf.cast instead.
Train on 200 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
