### CS584 Assignment 3 David Fu

##### Packages and Imports

In [1]:
import os
import pandas as pd
import re
import math
import nltk
from nltk import word_tokenize, KneserNeyProbDist, SimpleGoodTuringProbDist, FreqDist, trigrams
from nltk.corpus import stopwords
from nltk.lm import Vocabulary, MLE
from nltk.lm.models import KneserNeyInterpolated,Lidstone
from nltk.lm.smoothing import KneserNey, WittenBell
from nltk.lm.api import Smoothing
from nltk.lm.preprocessing import pad_both_ends, padded_everygram_pipeline
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, LSTM,Embedding
from keras.preprocessing.sequence import pad_sequences
import keras.backend as backend
import numpy as np
import matplotlib.pyplot as plt


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


###### Basic Reusable Function

In [2]:
"""
READS Text File and return all of body as a string
"""
def read_input(input_path:str) -> str:
    file_data = open(input_path , 'r')
    return file_data.read()

"""
Use regex to remove punctuation, numbers and multi spaces
"""
def clean_text(text:str):
    clean = re.sub('<unk>', '', text)
    clean = re.sub('N', '', clean)
    return re.sub(' +', ' ', clean)

##### Build Tokenizer and Vocabulary

In [3]:
"""
Read text file and convert to token and vocabulary
"""
train_text = clean_text(read_input('./a3-data/train.txt'))
valid_text = clean_text(read_input('./a3-data/valid.txt'))

train_input = clean_text(train_text)
valid_input = clean_text(valid_text)
train_tokens = nltk.word_tokenize(train_input)
valid_tokens = nltk.word_tokenize(valid_input)

train_vocab = Vocabulary(train_tokens, unk_cutoff=3)
valid_vocab = Vocabulary(valid_tokens, unk_cutoff=3)

In [4]:
print(len(train_vocab))
print(len(valid_vocab))

9947
2889


In [5]:
"""
Regular expression to find sentenses and convert them to sets of 3 words for trigram model
"""
sentence_regex = lambda x: re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', x)
train_token = [list(map(str.lower, word_tokenize(sentence))) for sentence in sentence_regex(train_text)]
valid_token = [list(map(str.lower, word_tokenize(sentence))) for sentence in sentence_regex(valid_text)]

train, vocab_train = padded_everygram_pipeline(3, train_token)
test, vocab_test = padded_everygram_pipeline(3, valid_token)

##### Build Trigram N-Gram Model

In [6]:
"""
Declar the gram model of length 3 and check the vocabuary list
"""
n3gram = MLE(3)
len(n3gram.vocab)

0

In [7]:
"""
Fit the model to the training dataset with vacabulary and trainning data
"""
n3gram.fit(train, vocab_train)
len(n3gram.vocab)

9959

In [8]:
"""
Check basic result of the trigram model by count of ngrams
"""
print(n3gram.counts)

<NgramCounter with 3 ngram orders and 2573544 ngrams>


In [9]:
"""
Pull a few example trigram from the Text hard coded to see if the probability is present
"""
print(n3gram.score('post', 'washington'.split()))

print(n3gram.score('institute', 'national cancer'.split()))

0.03286384976525822
1.0


##### Good Turing smoothing

In [10]:
"""
Using trigram model with frequency distribution to simple Turning Distirbution for normalization
"""
ngrams = trigrams(train_text)
freq_dist = FreqDist(ngrams)
turing = SimpleGoodTuringProbDist(freq_dist)

print(turing)

<SimpleGoodTuringProbDist based on 4766534 samples>


In [11]:
"""
Create a KnesnerNeyDistribution
"""
kneser = KneserNeyProbDist(freq_dist)
print(kneser)

<KneserNeyProbDist based on 4766534 trigrams


In [12]:
"""
Model it to Trigram with the input data set and vocabulary
"""
kn = KneserNeyInterpolated(3)
kn.fit(train, vocab_train)

In [13]:
"""
Set up a basic model to test sliding window of the trigram with sets of 30 from the validation data
"""

windows = []
for i in range(29, len(valid_tokens)):
    window = valid_tokens[i-29: i+1]
    windows.append(window)
previous_29 = [i[:-1] for i in windows]
actual_30 = [''.join(i[-1:]) for i in windows]

predict_word = [n3gram.generate(1,text_seed=i, random_seed=13) for i in previous_29]

In [14]:
print('The predict word: ')
print(predict_word[:30])
print('\nThe actual word in test set:')
print(actual_30[:30])

The predict word: 
['nine', 'her', 'now', 'locations', 'houses', 'five', 'cents', 'feet', 'ago', 'citicorp', 'call', 'document', 'economic', 'broadcast', 'electronics', 'in', 'early', 'for', 'recession', 'of', 'contributions', 'as', 'access', 'projects', 'for', 'is', 'dollar', 'guard', 'program', 'in']

The actual word in test set:
['years', 'from', 'among', 'four', 'or', 'five', 'two', 'weeks', 'ago', 'viewers', 'of', 'several', 'nbc', 'consumer', 'segments', 'started', 'calling', 'a', 'number', 'for', 'advice', 'on', 'various', 'issues', 'and', 'the', 'new', 'syndicated', 'reality', 'show']


##### Perplexity and Accuracy

In [15]:
correct = 0
for i in range(len(predict_word)):
    if actual_30[i] == predict_word[i]:
        correct += 1
print('The accuracy of Predicting the next word in sliding window : %f' % (correct/len(predict_word)))

The accuracy of Predicting the next word in sliding window : 0.108417


In [16]:
perplexity = np.array([n3gram.perplexity(i) for i in valid_tokens])
perplexity_ngram = np.ma.masked_invalid(perplexity).mean()
print('The perplexity scores of ngram model on test set is :%f' % perplexity_ngram)

The perplexity scores of ngram model on test set is :10031.680732


###### 30 line prediction

In [17]:
input_text = read_input('./a3-data/input.txt')
input_sent = input_text.split('\n')
first_30 = input_sent[0:30]
first_30

["but while the new york stock exchange did n't fall ___",
 'some circuit breakers installed after the october N crash failed ___',
 'the N stock specialist firms on the big board floor ___',
 'big investment banks refused to step up to the plate ___',
 "heavy selling of standard & poor 's 500-stock index futures ___",
 'seven big board stocks ual amr bankamerica walt disney capital ___',
 'once again the specialists were not able to handle the ___',
 '<unk> james <unk> chairman of specialists henderson brothers inc. it ___',
 'when the dollar is in a <unk> even central banks ___',
 'speculators are calling for a degree of liquidity that is ___',
 'many money managers and some traders had already left their ___',
 'then in a <unk> plunge the dow jones industrials in ___',
 '<unk> trading accelerated to N million shares a record for ___',
 'at the end of the day N million shares were ___',
 "the dow 's decline was second in point terms only ___",
 "in percentage terms however the dow 's

In [18]:
"""
Use model to predict the 30 lines in the sample input sentenses
"""
for i in range(30):
    print('\nExample %d:' % (i+1))
    token = first_30[i][:-3].split()
    print(token)
    predict_word = n3gram.generate(1,text_seed=token, random_seed=9)
    result = first_30[i] + '  predict: ' + predict_word
    print(result)


Example 1:
['but', 'while', 'the', 'new', 'york', 'stock', 'exchange', 'did', "n't", 'fall']
but while the new york stock exchange did n't fall ___  predict: the

Example 2:
['some', 'circuit', 'breakers', 'installed', 'after', 'the', 'october', 'N', 'crash', 'failed']
some circuit breakers installed after the october N crash failed ___  predict: to

Example 3:
['the', 'N', 'stock', 'specialist', 'firms', 'on', 'the', 'big', 'board', 'floor']
the N stock specialist firms on the big board floor ___  predict: at

Example 4:
['big', 'investment', 'banks', 'refused', 'to', 'step', 'up', 'to', 'the', 'plate']
big investment banks refused to step up to the plate ___  predict: is

Example 5:
['heavy', 'selling', 'of', 'standard', '&', 'poor', "'s", '500-stock', 'index', 'futures']
heavy selling of standard & poor 's 500-stock index futures ___  predict: is

Example 6:
['seven', 'big', 'board', 'stocks', 'ual', 'amr', 'bankamerica', 'walt', 'disney', 'capital']
seven big board stocks ual amr 

### RNN

In [19]:
"""
Using the tokenizer to the training restrain input and encode the words to a number setting window size to 20 for trainning data
"""

tokenizer = Tokenizer()
tokenizer.fit_on_texts([train_input])
encoded_word = tokenizer.texts_to_sequences([train_input])[0]

vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

windows = []
for i in range(19, len(encoded_word)):
    window = encoded_word[i-19:i+1]
    windows.append(window)
print('Total windows: %d' % len(windows))

max_length = max([len(seq) for seq in windows])
windows = pad_sequences(windows, maxlen=max_length, padding='pre')
print('Window size: %d' % max_length)

X_train, y = windows[:,:-1],windows[:,-1]
y_train = to_categorical(y, num_classes=vocab_size)

Vocabulary Size: 9649
Total windows: 810124
Window size: 20


In [20]:
"""
Using the tokenizer to the training restrain input and encode the words to a number setting window size to 20 for validation data
"""

tokenizer.fit_on_texts([valid_input])
encoded_2 = tokenizer.texts_to_sequences([valid_input])[0]

vocab_size_2 = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size_2)

windows2 = []
for i in range(19, len(encoded_2)):
    window2 = encoded_2[i-19:i+1]
    windows2.append(window2)
print('Total Windows: %d' % len(windows2))
# pad sequences
max_length = max([len(seq) for seq in windows2])
windows2 = pad_sequences(windows2, maxlen=max_length, padding='pre')
print('Window size: %d' % max_length)
# split into input and output elements
X_test, y = windows2[:,:-1],windows2[:,-1]
y_test = to_categorical(y, num_classes=vocab_size)

Vocabulary Size: 9649
Total Windows: 64231
Window size: 20


In [24]:
"""
Set up sequentail model setting vocab with LSTM to predict the word
"""

model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(40))
model.add(Dropout(0.4))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 19, 10)            96490     
_________________________________________________________________
lstm_2 (LSTM)                (None, 40)                8160      
_________________________________________________________________
dropout_2 (Dropout)          (None, 40)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 9649)              395609    
Total params: 500,259
Trainable params: 500,259
Non-trainable params: 0
_________________________________________________________________
None


In [25]:
def perplexity(y_predict, y_label):
    return backend.exp(backend.categorical_crossentropy(y_predict, y_label))

In [None]:
"""
Set up model for calculation with bath size 50 and epochs 100
"""

optimizer = Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=optimizer,metrics=[perplexity])

model.fit(X_train, y_train, epochs=50, verbose=1, batch_size=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50

In [None]:
"""
Write the model to file to be reused
"""

model_json = model.to_json()
with open("model50v2.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model50v2.h5")

In [None]:
"""
For rapid reload instead of rerunning everytime
"""


"""

json_file = open('model50.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)

loaded_model.load_weights("model50.h5")


"""

In [None]:
"""
Check the model and calculate the perplexity
"""

loss_and_per = model.evaluate(X_test, y_test)
loss = loss_and_per[0]
perplexity_test = loss_and_per[1]
print('loss = ' + str(loss))
print('perplexity = ' + str(loss_and_acc[1]))

In [None]:
"""
Using given value for proof
"""
sample_size = 64231
batch_size = 50
number_of_predictions = 1284

total_loss = loss*sample_size/batch_size
aim = np.exp(total_loss/number_of_predictions)
print(aim)

In [None]:
"""
Use RNN for prediction
"""
def rnn_predict(text):
    encoded = tokenizer.texts_to_sequences([text])[0]
    
    encoded = pad_sequences([encoded], maxlen=max_length-1, padding='pre')
    
    y_predict_class = model.predict_classes(encoded, verbose=0)
   
    predict_word = ''
    for word, index in tokenizer.word_index.items():
        if index == y_predict_class:
            predict_word = word
            break
    
    result = text.rstrip() + ' prediction: ' + predict_word
    print(result)


In [None]:
"""
Print the result of the 30 predictions
"""
for i in range(30):
    text = first_30[i]
    print('\nExample ' + str(i+1) + ':')
    rnn_predict(text)