# Text Prediction and Generation using LSTM Networks

In [None]:
%%time
# Code to read google drive files into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

_______________
## Google drive code only
Only run the following sections in a colaboratory shell

In [None]:
# Google drive code only
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Google drive code only
%%time
def importFile(fileName: str, fileID: str):
  '''Imports a file into the Colaboratory workspace. The fileID can be 
     found in the file's Share Link'''
  print("Grabbing file " + str(fileName) + " with id = " + str(fileID)) # Verify that you have everything after '='
  downloaded = drive.CreateFile({'id':fileID}) 
  downloaded.GetContentFile(fileName) 

In [None]:
# Google drive code only
# Import all the datasets we are going to use for our project
importFile('TheLordOfTheRings_Book1.txt', '1crAeSigOaQcT62W7EjcwrKcIVBx5ayeh')
importFile('GoogleNews-vectors-negative300.bin', '1zzUeVFsRYw3lWe6kjk1nccywCkMDp-AY')

## End of Google drive code
_________________________________________

In [1]:
# Load top 1 million (out of 3 million) word embeddings from the binary file.
# These vectors are 300 dimensions large and are created using the word2vec algorithm.
# The model was trained on the GoogleNews corpus, which is a similar size to the English version of Wikipedia.
# We can only load 1 million vectors because we run out of RAM.

# %%time
import gensim
embeddings = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', limit=1000000, binary=True)

### Word Embedding Examples

When we import the binary file using the gensim library, we can easily explore interesting properties of word embeddings. Much like a latent vector produced using an autoencoder, each embedding holds meaningful information about each word and the context in which it is used. The following examples explore the latent space in which these vectors reside.

In [2]:
import numpy as np

# We can perform interesting vector addition/subtraction
print(embeddings.most_similar(positive=["boy", "girl"], topn=3))
print(embeddings.most_similar(positive=["fish"], negative=["water"], topn=3)) # What is a fish without water?

# We can extract nearby points. This feature will be useful because our model 
# won't output vectors that exactly correspond to existing vectors.
v = np.copy(embeddings["woman"])
v[0] += 0.2
print(embeddings.most_similar(positive=[v], topn=1))

[('teenage_girl', 0.7674504518508911), ('teenager', 0.7674364447593689), ('toddler', 0.701943576335907)]
[('striped_bass', 0.47242963314056396), ('bluefin', 0.4493907392024994), ('tunas', 0.4461327791213989)]
[('woman', 0.9972377419471741)]


![](https://drive.google.com/uc?export=view&id=1e9PxMNKjTXDx7KZDxHpC4ENmz0sXHl_g)

## Preprocessing 
This is were we generate the training and validation data for our LSTM. The following functions will be used in the following sections.

In [123]:
from tensorflow.keras.models import Sequential
from keras.models import Sequential
from keras.layers import LSTM, Dense
from nltk import ngrams
import math
import numpy as np
import re


def tokenizeFile(fileName: str) -> list:
  '''Will take in the name of a txt file located in the base directory 
     of the drive and return a list of terms'''
  words = []
  with open(fileName, 'r', encoding="ISO-8859-1") as myTxtFile:
    for line in myTxtFile:
      lineWords = [word for word in re.split(r'[^a-zA-Z]', line.strip()) if word]
      words += lineWords
  return words


def flipFirstChar(word: str) -> str:
  '''Capitalizes/Un-capitalizes the first character of the word'''
  if word[0].isupper():
    return word[0].lower() + word[1:]
  else:
    return word[0].upper() + word[1:]


def wordsToVectors(words: list) -> (list, list):
  '''Convert a list of terms into a list of word embeddings. 
     We also keep track of which terms cannot be paired with an embedding.'''
  vectors = []
  unknownWords = set()
  for word in words:
    if word in embeddings:
        vectors.append(embeddings[word])
    elif flipFirstChar(word) in embeddings:
        vectors.append(embeddings[flipFirstChar(word)])
    else:
        unknownWords.add(word)
  return vectors, unknownWords


def phrasesToVectors(phrases: list, summary=True) -> (list, list):
    '''Calls wordsToVectors() on a list of lists. This function will be called
       after data is split into ngrams. Each phrase in the code below operations
       on a single ngram.'''
    phrase_vectors = []
    all_unknown_words = set()
    a = 1
    for phrase in phrases:
        vectors, unknown_words = wordsToVectors(phrase)
        phrase_vectors.append(vectors)
#         print(np.shape(phrase_vectors))
#         if a == 2000:
#           break
#         else:
#           a += 1
        all_unknown_words = all_unknown_words.union(unknown_words)  
    if summary:
      print("Number of training phrases: " + str(len(phrase_vectors)))
      print("Number of unknown terms: " + str(len(all_unknown_words)))
    print(np.shape(phrase_vectors))
    return np.array(phrase_vectors, dtype=np.ndarray), all_unknown_words

### Preprocessing Steps:

1. Split training data into individual tokens, ie: "Here is a sentence" --> \["Here" "is" "a" "sentence"\]
2. Use nltk to split training data into ngram phrases to feed into our LSTM network
    - The model's architecture will automatically adjust based on ngram size
3. Shuffle the data
4. Split the data into training and validation

In [124]:
%%time
words = tokenizeFile("TheLordOfTheRings_Book1.txt")
# Change this parameter to decrease or increase the size of the training samples
# The last element in each phrase is the vector that the LSTM network will
# try to predict.
N = 5
ngrams_words = ngrams(words, N + 1)
ngram_vectors, _ = phrasesToVectors(ngrams_words)
np.random.shuffle(ngram_vectors)

Number of training phrases: 190112
Number of unknown terms: 548
(190112,)
Wall time: 2.52 s


In [125]:
# This block of code cuts the data to make training go faster during experimentation
total_size = np.size(ngram_vectors)
ngram_vectors = ngram_vectors[:total_size//100]
adjusted_size = np.size(ngram_vectors)
percentage = (adjusted_size/total_size) * 100
print("Using %.2f percent of samples" % percentage)

Using 1.00 percent of samples


In [135]:
print(ngram_vectors.shape)
# for row in ngram_vectors:
#     print(np.shape(row))
n = np.stack([ngram_vectors[i] for i in range(np.size(ngram_vectors))])
print(np.shape(n))


# Split data into training and validation
# split = math.floor(np.size(ngram_vectors) * 0.8)
# training = ngram_vectors[:split]
# validation = ngram_vectors[split:]

# Split the sample data phrases into input and expected. For example:
# phrase: [This tale grew in the telling] --> [This tale grew in the] [telling]
#                                              ^ input data            ^ expected output

# print(training.shape)
# print(training[0].shape)
# print(training[0][:-1, :].shape)
# print(training_in.shape)

# train_first = np.expand_dims(training[:, 0], axis=1)
# train_second = np.expand_dims(training[:, 1], axis=1)
# val_first = np.expand_dims(validation[:, 0], axis=1)
# val_second = np.expand_dims(validation[:, 1], axis=1)

(1901,)


ValueError: all input arrays must have the same shape

## Architecture Explaination
- The input shape is 300 because we take in each dimension of the word embedding as input
- Sigmoid activations are used because word embeddings are normalized betweeen zero and one
- Loss is cosine similarity because word2vec also uses cosine similarity to measure the distance between word vectors

In [56]:
model = Sequential()  
model.add(LSTM(300, input_shape=(N, 300), return_sequences=True, activation='sigmoid'))
model.add(LSTM(300, input_shape=(N, 300), return_sequences=True, activation='sigmoid'))
model.add(Dense(300))
model.compile(loss='cosine_proximity', optimizer='adam',metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_9 (LSTM)                (None, 5, 300)            721200    
_________________________________________________________________
lstm_10 (LSTM)               (None, 5, 300)            721200    
_________________________________________________________________
dense_5 (Dense)              (None, 5, 300)            90300     
Total params: 1,532,700
Trainable params: 1,532,700
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
print(np.expand_dims(training[:, 0], axis=1).shape)

In [None]:
%%time

history = model.fit(train_first, train_second, epochs=50, batch_size=32, verbose=2, validation_data=(val_first, val_second))

In [None]:
from matplotlib import pyplot as plt

def plot_history(history):
  plt.plot(history.history['acc'],label='train')
  if 'val_acc' in history.history:
    plt.plot(history.history['val_acc'],label='val')
  plt.xlabel('Epoch')
  plt.ylabel('Accuracy')
  plt.legend()
  plt.title('Accuracy during Training')
  plt.show()
  
  plt.plot(history.history['loss'],label='train')
  if 'val_loss' in history.history:
    plt.plot(history.history['val_loss'],label='val')
  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  plt.legend()
  plt.title('Loss during Training')
  plt.show()

#results = model.evaluate(x_test, y_test, verbose=False)
#print('Test loss:', results[0])
#print('Test accuracy:', results[1])

plot_history(history)

In [None]:
# test_vec = np.expand_dims(np.array(embeddings["A"]), axis=1)
# list_vec = []
# list_vec.append(embeddings['A'])
# test_vec = np.array(list_vec)
# print(test_vec.shape)

# vec = np.array(np.array(embeddings["A"]))

def getTestingArray(sentence: str):
  arr = []
  for i in sentence.split():
    item = embeddings[i]
    arr.append(item)
  return np.array(arr)


def printPredictions(predictions):
  print(predictions.shape)
  for i in range(len(predictions)):
    print(embeddings.most_similar(positive=[predictions[i][0]], topn=3))

    
test_input = getTestingArray("This tale grew in the telling")
test_input = np.expand_dims(test_input, axis=1)
predictions = model.predict(test_input)
printPredictions(predictions)