# 7. Getting words in order with convolutional neural networks (CNNs)

### 7.3.4 Padding

In [1]:
from keras.models import Sequential
from keras.layers import Conv1D

model = Sequential()

2022-10-21 11:37:52.078023: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-21 11:37:52.078080: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-21 11:37:56.461694: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-10-21 11:37:56.461754: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-10-21 11:37:56.461782: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (a299b6c2b8bb): /proc/driver/nvidia/version does not exist
2022-10-21 11:37:56.462638: I tensorflow/core/platform/cpu_featu

In [2]:
model.add(Conv1D(filters=16,
                 kernel_size=3,
                 padding="same",
                 activation="relu",
                 strides=1,
                 input_shape=(100, 300)))

### 7.4.1 Implementation In Keras: prepping the data

In [3]:
import numpy as np  # Keras takes care of most of this but it likes to see Numpy arrays
from keras.preprocessing import sequence    # A helper module to handle padding input
from keras.models import Sequential         # The base keras Neural Network model
from keras.layers import Dense, Dropout, Activation   # The layer objects we will pile into the model
from keras.layers import Conv1D, GlobalMaxPooling1D

In [4]:
import glob
import os

from random import shuffle


def pre_process_data(filepath):
    """
    This is dependent on your training data source but we will try to generalize it as best as possible.
    """
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')

    pos_label = 1
    neg_label = 0

    dataset = []

    for filename in glob.glob(os.path.join(positive_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((pos_label, f.read()))

    for filename in glob.glob(os.path.join(negative_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((neg_label, f.read()))

    shuffle(dataset)

    return dataset

In [5]:
## http://ai.stanford.edu/~amaas/data/sentiment/
dataset = pre_process_data('../../bigdata/aclImdb/train/')

In [6]:
dataset[0]

(1,
 'This was truly a great movie. I loved Dennis Quaid and the entire baseball team. Jay Hernandez is also a very likable actor that is very enjoyable to watch. The chemistry the team had once they got things together was spectacular, it just goes to show what you what can accomplish when minds unite as one with one goal. This team came back from the brink, having multiple losing seasons to winning just about everything. I love movies like this as they really are very inspirational.<br /><br />On top of that, Dennis Quaid\'s character getting a place in the major leagues. You can\'t do anything, but root for this guy. It just seems like when someone is supposed to do something, they are going to do that. Things just happen to fall into place and makes everything click.<br /><br />Based on a true story, this film will really make you think about the fact that "nothing is impossible."')

In [10]:
len(dataset)

25000

In [7]:
from nltk.tokenize import TreebankWordTokenizer
from gensim.models import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format('../../bigdata/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000)


def tokenize_and_vectorize(dataset):
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    expected = []
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])

            except KeyError:
                pass  # No matching token in the Google w2v vocab

        vectorized_data.append(sample_vecs)

    return vectorized_data

In [8]:
def collect_expected(dataset):
    """ Peel of the target values from the dataset """
    expected = []
    for sample in dataset:
        expected.append(sample[0])
    return expected

In [11]:
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

In [12]:
len(vectorized_data), len(expected)

(25000, 25000)

In [13]:
split_point = int(len(vectorized_data) * .8)

x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]

In [14]:
maxlen = 200
batch_size = 32         # How many samples to show the net before backpropogating the error and updating the weights
embedding_dims = 300    # Length of the token vectors we will create for passing into the Convnet
filters = 250           # Number of filters we will train
kernel_size = 3         # The width of the filters, actual filters will each be a matrix of weights of size: embedding_dims x kernel_size or 50 x 3 in our case
hidden_dims = 250       # Number of neurons in the plain feed forward net at the end of the chain
epochs = 2              # Number of times we will pass the entire training dataset through the network

In [15]:
def pad_trunc(data, maxlen):
    """ For a given dataset pad with zero vectors or truncate to maxlen """
    new_data = []

    # Create a vector of 0's the length of our word vectors
    zero_vector = []
    for _ in range(len(data[0][0])):
        zero_vector.append(0.0)

    for sample in data:

        if len(sample) > maxlen:
            temp = sample[:maxlen]
        elif len(sample) < maxlen:
            temp = sample
            additional_elems = maxlen - len(sample)
            for _ in range(additional_elems):
                temp.append(zero_vector)
        else:
            temp = sample
        new_data.append(temp)
    return new_data

In [16]:
## need to artificially reduce the size of test/train data for it to run on my machine
## then do the pad_trunc as documented

print(f"Before changes: {len(x_train)} and {len(x_test)}")

x_train = x_train[:int(len(x_train) / 4)]
x_test = x_test[:int(len(x_test) / 4)]

y_train = y_train[:int(len(y_train) / 4)]
y_test = y_test[:int(len(y_test) / 4)]

print(f"After reduction: {len(x_train)} and {len(x_test)}")

x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)

print(f"After pad_trunc: {len(x_train)} and {len(x_test)}")
print(maxlen, embedding_dims)

Before changes: 20000 and 5000
After reduction: 5000 and 1250
After pad_trunc: 5000 and 1250
200 300


In [17]:
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)

### 7.4.2 Convolutional neural network architecture

In [18]:
print('Build model...')
model = Sequential()

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1,
                 input_shape=(maxlen, embedding_dims)))

Build model...


### 7.4.3 Pooling

In [19]:
model.add(GlobalMaxPooling1D())  ## Pooling option are GlobalMaxPoolingID(),
                                 ##   MaxPoolingID(n), or AvgPoolingID(n),
                                 ##   where n is the size of the area to pool
                                 ##   and defaults to 2 if not provided

### 7.4.4 Dropout

In [20]:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

### 7.4.5 The cherry on the sundae

In [21]:
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [22]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [23]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

2022-10-21 11:45:31.324198: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 1200000000 exceeds 10% of free system memory.


Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fc4f4f55000>

### 7.4.6 Let's get to learning (training)

In [24]:
model_structure = model.to_json()
with open("cnn_model.json", "w") as json_file:
    json_file.write(model_structure)

model.save_weights("cnn_weights.h5")
print('Model saved.')

Model saved.


### 7.4.7 Using the model in a pipeline

In [25]:
from keras.models import model_from_json
with open("cnn_model.json", "r") as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)

model.load_weights('cnn_weights.h5')

In [26]:
sample_1 = "I'm hate that the dismal weather that had me down for so long, when will it break!" \
    "Ugh, when does happiness return?  The sun is blinding and the puffy clouds are too thin. " \
    "I can't wait for the weekend."

In [27]:
# We pass a dummy value in the first element of the tuple just because our helper expects it from the way processed the initial data.  That value won't ever see the network, so it can be whatever.
vec_list = tokenize_and_vectorize([(1, sample_1)])

# Tokenize returns a list of the data (length 1 here)
test_vec_list = pad_trunc(vec_list, maxlen)

test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))
model.predict(test_vec)



array([[0.17574006]], dtype=float32)

In [29]:
model.predict_classes(test_vec)

AttributeError: 'Sequential' object has no attribute 'predict_classes'