In [1]:
import pandas as pd
import numpy as np
import string
import os
import gensim
import keras
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM
from keras.initializers import Constant
from keras.callbacks import ModelCheckpoint

2023-12-20 11:44:02.758282: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# data source: https://www.kaggle.com/datasets/adhok93/presidentialaddress/

df_speech = pd.read_csv('./inaug_speeches.csv', encoding= 'latin1')
df_speech = df_speech.drop(columns = "Unnamed: 0")


In [3]:
# Clean Data

speech_lines = list()
lines = df_speech["text"].values.tolist()

for line in lines:
    tokens = word_tokenize(line)

    # convert to lower case
    tokens = [w.lower() for w in tokens]

    # remove punctuation from each word
    table = str.maketrans("", "", string.punctuation)
    stripped = [w.translate(table) for w in tokens]

    # remove remaining tokens that are not alphabetic
    words = [w for w in stripped if w.isalpha()]

    speech_lines.append(words)


In [4]:
# Trun words into Word-2-Vec Embeddings
EMBEDDING_DIM = 20
model = gensim.models.Word2Vec(sentences=speech_lines, 
                               vector_size= EMBEDDING_DIM, 
                               window=5, workers=2, min_count=1)

# vocab size
words = list(model.wv.key_to_index)

In [5]:
file_name = "inaug_speech_word2vec.txt"
model.wv.save_word2vec_format(file_name, binary=False)

In [6]:
embeddings_index = {}

f = open(os.path.join("", "inaug_speech_word2vec.txt"), encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.array(values[1:])
    embeddings_index[word]=coefs
    
f.close()

In [7]:
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(speech_lines)

# create word index dict
word_index = tokenizer_obj.word_index

# vocab
vocab_size = len(tokenizer_obj.word_index) + 1

# tokenize 
speech_tokens = tokenizer_obj.texts_to_sequences(speech_lines)

In [8]:
# map embeddings from word2vec model for each word
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: 
        embedding_matrix[i] = embedding_vector

In [9]:
# Prepare Train and Test DataSets

seq_len = 10
dataX=[]
dataY=[]

for i in range(len(speech_tokens)): 
    for j in range(0, len(speech_tokens[i])-seq_len, 1): 
        seq_in = speech_tokens[i][j:j+seq_len]
        seq_out = speech_tokens[i][j+seq_len]
        dataX.append(seq_in)
        dataY.append(seq_out)
        
# Reshape and Create Sparse Matrix
X = np.reshape(dataX, (len(dataX), seq_len, 1))
y = keras.utils.to_categorical(dataY)

In [10]:
# Build RNN Model

model = Sequential()
embedding_layer = Embedding(num_words, 
                           EMBEDDING_DIM, 
                           weights = [embedding_matrix], 
                           input_length = seq_len, 
                           trainable = False)
model.add(embedding_layer)
model.add(LSTM(256, input_shape = (X.shape[1], X.shape[2]), 
               return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam")

print(model.summary())

2023-12-20 11:44:18.821021: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-12-20 11:44:18.823232: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-12-20 11:44:18.824774: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 20)            185440    
                                                                 
 lstm (LSTM)                 (None, 10, 256)           283648    
                                                                 
 dropout (Dropout)           (None, 10, 256)           0         
                                                                 
 lstm_1 (LSTM)               (None, 256)               525312    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0         
                                                                 
 dense (Dense)               (None, 9272)              2382904   
                                                                 
Total params: 3,377,304
Trainable params: 3,191,864
Non-

2023-12-20 11:44:19.051069: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-12-20 11:44:19.053120: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-12-20 11:44:19.054414: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [11]:
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor="loss", verbose=1, save_best_only=True, mode="min")
callbacks_list = [checkpoint]

In [12]:
model.fit(X, y, epochs = 20, batch_size= 128, callbacks = callbacks_list)

Epoch 1/20


2023-12-20 11:44:19.496323: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-12-20 11:44:19.498285: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-12-20 11:44:19.499602: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 1: loss improved from inf to 6.49585, saving model to weights.hdf5
Epoch 2/20
Epoch 2: loss improved from 6.49585 to 6.11325, saving model to weights.hdf5
Epoch 3/20
Epoch 3: loss improved from 6.11325 to 5.94534, saving model to weights.hdf5
Epoch 4/20
Epoch 4: loss improved from 5.94534 to 5.83179, saving model to weights.hdf5
Epoch 5/20
Epoch 5: loss improved from 5.83179 to 5.73815, saving model to weights.hdf5
Epoch 6/20
Epoch 6: loss improved from 5.73815 to 5.64512, saving model to weights.hdf5
Epoch 7/20
Epoch 7: loss improved from 5.64512 to 5.55011, saving model to weights.hdf5
Epoch 8/20
Epoch 8: loss improved from 5.55011 to 5.44724, saving model to weights.hdf5
Epoch 9/20
Epoch 9: loss improved from 5.44724 to 5.34418, saving model to weights.hdf5
Epoch 10/20
Epoch 10: loss improved from 5.34418 to 5.24255, saving model to weights.hdf5
Epoch 11/20
Epoch 11: loss improved from 5.24255 to 5.14487, saving model to weights.hdf5
Epoch 12/20
Epoch 12: loss improved from 5.

<keras.callbacks.History at 0x7fd6c907b070>

In [13]:
def generate_seq(model, tokenizer, enter_text, n_pred):  
    in_text, result = enter_text, enter_text  # 
    # generate a fixed number of words
    for _ in range(n_pred):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text.split()])
        encoded = np.array(encoded)
        
        # predict a word in the vocabulary
        ## yhat = model.predict_classes(encoded)
        predict_val = model.predict(encoded)
        yhat = np.argmax(predict_val, axis=-1)
        print(predict_val.shape)

        
        # map predicted word index to word
        out_word = ""
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text, result = out_word, result + " " + out_word
        
    return result


In [14]:
input_seed = "my fellow americans it is time to stand together and"
print(generate_seq(model, tokenizer_obj, input_seed, 3))

2023-12-20 12:37:57.753483: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-12-20 12:37:57.755366: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-12-20 12:37:57.757452: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

(1, 9272)


2023-12-20 12:37:58.402954: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-12-20 12:37:58.404826: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-12-20 12:37:58.405917: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

(1, 9272)
(1, 9272)
my fellow americans it is time to stand together and medicare the world
