# 8. Loopy (recurrent) neural networks (RNNs)

### 8.1.5 Recurrent neural net with Keras

In [1]:
import glob
import os
from random import shuffle

from nltk.tokenize import TreebankWordTokenizer
from gensim.models import KeyedVectors

# Load pretrained word2vec vectors from the Google News binary file.
# `limit` caps how many vectors are read from the file, which bounds memory use.
word_vectors = KeyedVectors.load_word2vec_format('../../bigdata/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000)

In [2]:
def pre_process_data(filepath):
    """
    Load a labelled sentiment corpus from disk and return it in random order.

    ``filepath`` is expected to contain two sub-directories, ``pos`` and
    ``neg``, each holding one ``*.txt`` file per sample.  Files with any
    other extension are ignored.

    Args:
        filepath: Directory that contains the ``pos`` and ``neg`` folders.

    Returns:
        A shuffled list of ``(label, text)`` tuples where ``label`` is 1 for
        files found under ``pos`` and 0 for files found under ``neg``.
    """
    labelled_dirs = (
        (1, os.path.join(filepath, 'pos')),
        (0, os.path.join(filepath, 'neg')),
    )

    dataset = []
    for label, directory in labelled_dirs:
        # glob returns files in arbitrary order; sorting first means a seeded
        # `random` reproduces the same shuffle on every machine.
        for filename in sorted(glob.glob(os.path.join(directory, '*.txt'))):
            # Explicit encoding so the result does not depend on the
            # platform's default locale.
            with open(filename, 'r', encoding='utf-8') as f:
                dataset.append((label, f.read()))

    shuffle(dataset)

    return dataset

In [3]:
def tokenize_and_vectorize(dataset):
    """
    Turn each sample's text into a list of word vectors.

    Args:
        dataset: Iterable of samples where ``sample[1]`` is the raw text
            (``sample[0]`` is not read here).

    Returns:
        A list with one entry per sample; each entry is the list of vectors
        looked up in the module-level ``word_vectors`` for that sample's
        tokens.  Tokens missing from ``word_vectors`` are skipped, so an
        entry may be shorter than the token count (or empty).
    """
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                pass  # No matching token in the Google w2v vocab

        vectorized_data.append(sample_vecs)

    return vectorized_data

In [4]:
def collect_expected(dataset):
    """ Peel off the target value (first element) of every sample in the dataset """
    return [record[0] for record in dataset]

In [5]:
## http://ai.stanford.edu/~amaas/data/sentiment/
# Read the labelled reviews from the pos/neg folders under train/ (returned shuffled).
dataset = pre_process_data('../../bigdata/aclImdb/train/')

# Inputs and targets are both derived from the same list, so index i of one
# corresponds to index i of the other.
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

# Index separating the first 80% of the samples from the remaining 20%.
split_point = int(len(vectorized_data) * .8)

In [6]:
maxlen = 400            # Number of token vectors per sample after padding/truncating
batch_size = 32         # How many samples to show the net before backpropagating the error and updating the weights
embedding_dims = 300    # Length of the token vectors we will pass into the RNN
# Leftover hyperparameters from a convolutional model; not used in this notebook:
# filters = 250           # Number of filters we will train
# kernel_size = 3         # The width of the filters, actual filters will each be a matrix of weights of size: embedding_dims x kernel_size
# hidden_dims = 250       # Number of neurons in the plain feed forward net at the end of the chain
epochs = 2              # Number of times we will pass the entire training dataset through the network

In [7]:
def pad_trunc(data, maxlen, vector_len=None):
    """
    Pad with zero vectors, or truncate, every sample to exactly ``maxlen`` vectors.

    The input is never modified: each returned sample is a new list.  All
    padding positions within one call share a single zero-vector object
    (as before), which keeps memory use low; copy it before mutating.

    Args:
        data: Sequence of samples, each a sequence of equal-length vectors.
        maxlen: Number of vectors every returned sample must contain.
        vector_len: Length of one vector.  When omitted it is inferred from
            the first non-empty sample, so an empty first sample (e.g. a
            text with no in-vocabulary tokens) no longer breaks the call.

    Returns:
        A new list with one list of ``maxlen`` vectors per input sample.
        An empty ``data`` yields an empty list.

    Raises:
        ValueError: If padding is required but the vector length is unknown
            (every sample is empty and ``vector_len`` was not given).
    """
    if vector_len is None:
        # len() rather than truthiness so array-like samples work too.
        first_vector = next((sample[0] for sample in data if len(sample) > 0), None)
        if first_vector is not None:
            vector_len = len(first_vector)

    zero_vector = None if vector_len is None else [0.0] * vector_len

    new_data = []
    for sample in data:
        # Slicing + list() always yields a fresh list, so appending padding
        # below cannot leak into the caller's sample.
        temp = list(sample[:maxlen])
        missing = maxlen - len(temp)
        if missing > 0:
            if zero_vector is None:
                raise ValueError(
                    "cannot pad: every sample is empty and vector_len was not given"
                )
            temp.extend([zero_vector] * missing)
        new_data.append(temp)
    return new_data

In [8]:
# Everything before split_point is used for training, the rest for testing.
# The same index is applied to inputs and targets so they stay aligned.
x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]

In [9]:
## need to artificially reduce the size of test/train data for it to run on my machine
## then do the pad_trunc as documented

reducer = 8
print(f"Reduce test/train data by factor of {reducer}")

# Sizes and slices are derived from the full `vectorized_data` / `expected`
# lists rather than from the current x_train / x_test, so re-running this cell
# gives the same result instead of shrinking the data by `reducer` again.
full_train_size = split_point
full_test_size = len(vectorized_data) - split_point

print(f"Before changes: {full_train_size} and {full_test_size}")

train_size = full_train_size // reducer
test_size = full_test_size // reducer

x_train = vectorized_data[:train_size]
x_test = vectorized_data[split_point:split_point + test_size]

y_train = expected[:train_size]
y_test = expected[split_point:split_point + test_size]

print(f"After reduction: {len(x_train)} and {len(x_test)}")

# Bring every sample to exactly maxlen vectors so the data can be reshaped
# into a rectangular array.
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)

print(f"After pad_trunc: {len(x_train)} and {len(x_test)}")
print(maxlen, embedding_dims)

Reduce test/train data by factor of 8
Before changes: 20000 and 5000
After reduction: 2500 and 625
After pad_trunc: 2500 and 625
400 300


In [10]:
import numpy as np

In [11]:
# Convert the padded samples to (num_samples, maxlen, embedding_dims) arrays
# and the targets to 1-D arrays.
train_shape = (len(x_train), maxlen, embedding_dims)
test_shape = (len(x_test), maxlen, embedding_dims)

x_train = np.asarray(x_train).reshape(train_shape)
x_test = np.asarray(x_test).reshape(test_shape)

y_train = np.array(y_train)
y_test = np.array(y_test)

In [12]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, SimpleRNN

# Number of units passed to the SimpleRNN layer.
num_neurons = 50

print('Build model...')
# Empty layer stack; layers are added to it one by one.
model = Sequential()

2022-11-21 21:30:35.286829: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-21 21:30:36.820801: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-21 21:30:36.820918: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-21 21:30:43.763862: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-

Build model...


2022-11-21 21:30:47.639946: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-11-21 21:30:47.641151: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2022-11-21 21:30:47.641273: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (04896aa7ca98): /proc/driver/nvidia/version does not exist
2022-11-21 21:30:47.643810: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
# Each input sample is maxlen vectors of embedding_dims values.
# return_sequences=True makes the layer emit its output at every time step
# rather than only the last one.
model.add(SimpleRNN(num_neurons, return_sequences=True, input_shape=(maxlen, embedding_dims)))

In [14]:
# Dropout with rate 0.2 on the RNN outputs.
model.add(Dropout(.2))

# Flatten the per-time-step outputs into one vector per sample, then map it to
# a single sigmoid unit for the binary target.
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

## 8.2 Putting things together

In [15]:
# Configure training: RMSprop optimizer, binary cross-entropy loss, and
# accuracy reported as an extra metric; then display the layer summary.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 400, 50)           17550     
                                                                 
 dropout (Dropout)           (None, 400, 50)           0         
                                                                 
 flatten (Flatten)           (None, 20000)             0         
                                                                 
 dense (Dense)               (None, 1)                 20001     
                                                                 
Total params: 37,551
Trainable params: 37,551
Non-trainable params: 0
_________________________________________________________________


## 8.3 Let's get to learning our past selves

In [16]:
# Train for `epochs` passes over the training data in batches of `batch_size`.
# NOTE: the test split is passed as validation data here, so the reported
# validation metrics are computed on x_test / y_test.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

2022-11-21 21:31:11.650668: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 1200000000 exceeds 10% of free system memory.


Epoch 1/2

2022-11-21 21:31:48.429561: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 300000000 exceeds 10% of free system memory.


Epoch 2/2


<keras.callbacks.History at 0x7f7cc8445900>

In [17]:
# Persist the model in two parts: the architecture as JSON and the trained
# weights as a separate HDF5 file.  Both are read back in the prediction section.
model_structure = model.to_json()
with open("simplernn_model1.json", "w") as json_file:
    json_file.write(model_structure)

model.save_weights("simplernn_weights1.h5")
print('Model saved.')

Model saved.


## 8.4 Hyperparameters

In [18]:
# Same architecture as the first model, but with twice as many units in the
# SimpleRNN layer.  NOTE: this rebinds `model`, so the trained 50-unit model is
# only available afterwards through the files it was saved to.
num_neurons = 100

print('Build model...')
model = Sequential()

# return_sequences=True emits an output at every time step; those outputs are
# flattened below before the final sigmoid unit.
model.add(SimpleRNN(num_neurons, return_sequences=True, input_shape=(maxlen, embedding_dims)))
model.add(Dropout(.2))

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Build model...
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_1 (SimpleRNN)    (None, 400, 100)          40100     
                                                                 
 dropout_1 (Dropout)         (None, 400, 100)          0         
                                                                 
 flatten_1 (Flatten)         (None, 40000)             0         
                                                                 
 dense_1 (Dense)             (None, 1)                 40001     
                                                                 
Total params: 80,101
Trainable params: 80,101
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
## getting too big for my laptop :(

# model.fit(x_train, y_train,
#           batch_size=batch_size,
#           epochs=epochs,
#           validation_data=(x_test, y_test))

# model_structure = model.to_json()
# with open("simplernn_model2.json", "w") as json_file:
#     json_file.write(model_structure)

# model.save_weights("simplernn_weights2.h5")
# print('Model saved.')

## 8.5 Predicting

In [20]:
# Adjacent string literals are concatenated with nothing in between, so each
# piece must carry its own trailing space; without it the first two sentences
# were glued together as "break!Ugh".
sample_1 = (
    "I hate that the dismal weather that had me down for so long, when will it break! "
    "Ugh, when does happiness return?  The sun is blinding and the puffy clouds are too thin. "
    "I can't wait for the weekend."
)

In [21]:
from keras.models import model_from_json
# Rebuild the first model's architecture from the JSON written when it was saved.
# This rebinds `model`; the weights are loaded separately.
with open("simplernn_model1.json", "r") as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)

In [22]:
# Restore the trained weights that were saved alongside the JSON architecture.
model.load_weights('simplernn_weights1.h5')

In [23]:
# We pass a dummy value in the first element of the tuple just because our helper expects it
# from the way we processed the initial data.  That value won't ever see the network, so it can be whatever.
vec_list = tokenize_and_vectorize([(1, sample_1)])

In [24]:
# tokenize_and_vectorize returns a list of samples (length 1 here); pad or
# truncate that single sample to maxlen vectors.
test_vec_list = pad_trunc(vec_list, maxlen)

In [25]:
# Arrange the sample as (num_samples, maxlen, embedding_dims), the same layout
# used for the training data.
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))

In [27]:
# The output is the sigmoid activation of the final unit; training labels were
# 1 for positive and 0 for negative reviews.
model.predict(test_vec)



array([[0.24704814]], dtype=float32)

### 8.5.2 Two-way street

In [30]:
from keras.models import Sequential
from keras.layers import SimpleRNN
from keras.layers import Bidirectional

# NOTE: these assignments rebind the notebook-wide num_neurons and maxlen
# (previously 100 and 400) for everything that runs after this cell.
num_neurons = 10
maxlen = 100
embedding_dims = 300

model = Sequential()
# Wrap the SimpleRNN in Bidirectional; the input shape is given to the wrapper.
model.add(
    Bidirectional(
        SimpleRNN(num_neurons, return_sequences=True),
        input_shape=(maxlen, embedding_dims),
    )
)