## Dependencies :

In [171]:
import numpy as np
import re
import pandas as pd
from numpy import random
from nltk.lm import MLE
from nltk.util import ngrams
from sklearn.model_selection import train_test_split
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.util import everygrams
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import SimpleRNN
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.text import Tokenizer

## Preprocessing :

In [172]:
# Read the whole corpus; the context manager guarantees the file handle is closed.
with open("speeches.txt", "r") as f:
    donald = f.read()
donald = donald.lower()

# Split into sentences, then replace every non-word character with a space.
sent_tokenize_list = [
    re.sub(r'[^\w]', ' ', sentence) for sentence in sent_tokenize(donald)
]
X_train, X_test = train_test_split(sent_tokenize_list, test_size=0.2 , random_state=42)

# `token` is the flat training word stream, each sentence wrapped in <s> ... </s>.
token = []
for sentence in X_train:
    token.append('<s>')
    token.extend(word_tokenize(sentence))
    token.append('</s>')
# Sorted array of the distinct training words (including the two markers).
vocab = np.unique(token)

# `output_sent` keeps the held-out sentences one token list per sentence;
# `token2` is the same data flattened into a single word stream.
output_sent = []
token2 = []
for sentence in X_test:
    token1 = ['<s>'] + word_tokenize(sentence) + ['</s>']
    output_sent.append(token1)
    token2.extend(token1)
# Union of the training and held-out vocabularies.
total_vocab = np.unique(np.concatenate((token2, vocab), axis=None))

## N-Gram Model :

In [173]:
def ngram(n):
    """Fit an NLTK ``MLE`` language model of order ``n`` on the training words.

    The model is trained on every 1..n-gram of the module-level ``token``
    stream (passed as a single training "sentence") and uses the
    module-level ``vocab`` as its vocabulary.

    Returns the fitted model.
    """
    model = MLE(n)
    training_grams = list(everygrams(token, max_len=n))
    model.fit([training_grams], vocabulary_text=vocab)
    return model
# Contrast the size of the full n-gram space (|vocab|^n) with the number of
# distinct n-grams that actually occur in the training stream.
for i in range(1,5):
    possible = len(vocab)**i
    observed = len(set(ngrams(token,i)))
    print("No of ",i,"- gram Theoretically possible: ",possible)
    print("No of ",i,"- gram Observed: ",observed)
# Fit one MLE model per order, unigram through quadgram (in that order).
uni, bi, tri, quad = (ngram(order) for order in range(1, 5))
# print("lo")

No of  1 - gram Theoretically possible:  5406
No of  1 - gram Observed:  5406
No of  2 - gram Theoretically possible:  29224836
No of  2 - gram Observed:  41930
No of  3 - gram Theoretically possible:  157989463416
No of  3 - gram Observed:  84828
No of  4 - gram Theoretically possible:  854091039226896
No of  4 - gram Observed:  118036


## Generator Function :

In [174]:
def _sample_next_word(model, words, context):
    """Draw one word from ``model``'s distribution over ``words`` given ``context``.

    If the model assigns zero probability to every word for this context,
    the oldest context word is dropped and the draw is retried with the
    shorter context, so an unseen history never yields an arbitrary word.

    Raises:
        ValueError: if every word still has zero probability with no context.
    """
    while True:
        weights = np.array([model.score(word, context) for word in words], dtype=float)
        total = weights.sum()
        if total > 0:
            break
        if not context:
            raise ValueError("model assigns zero probability to every vocabulary word")
        context = context[1:]
    # Normalise so floating-point drift in the scores cannot upset the sampler.
    return words[np.random.choice(len(words), p=weights / total)]


def generate(n, model, vocabulary=None):
    """Sample one sentence from an n-gram language model.

    Generation starts from ``<s>`` and repeatedly samples the next word
    conditioned on the last ``n - 1`` generated words (fewer while the
    sentence is still shorter than that) until ``</s>`` is drawn.

    The context is a true sliding window over the generated words: each new
    word enters the context as the oldest one leaves. Works for any
    ``n >= 1``, not only 1..4.

    Args:
        n: order of the model (1 = unigram, 2 = bigram, ...).
        model: object exposing ``score(word, context)``; ``context`` is a
            list of preceding words, or ``None`` for the unigram case.
        vocabulary: candidate words to sample from; defaults to the
            module-level ``vocab``. Must contain ``"</s>"``, otherwise the
            loop cannot terminate.

    Returns:
        The generated words, including ``<s>`` and ``</s>``, joined by spaces.
    """
    words = vocab if vocabulary is None else vocabulary
    sentence = ["<s>"]
    while sentence[-1] != "</s>":
        # sentence[-0:] would be the whole list, so unigram gets no context.
        context = sentence[-(n - 1):] if n > 1 else None
        sentence.append(_sample_next_word(model, words, context))
    return ' '.join(sentence)
# Print ten sampled sentences per model order, unigram through quadgram.
samples_per_model = 10
for heading, order, fitted in (
    ("Unigram:", 1, uni),
    ("\nBigram:", 2, bi),
    ("\nTrigram:", 3, tri),
    ("\nQuadgram:", 4, quad),
):
    print(heading)
    for i in range(samples_per_model):
        print(generate(order, fitted))

Unigram:
<s> is be this the i get be <s> strongly met presidential it <s> after </s>
<s> tax s trump my those s into end you they </s>
<s> way wrong i we but corey a he in <s> congratulations i started say your trillion and <s> <s> give future because that vigilant you they </s>
<s> that us better chance up i to understand and <s> he recently <s> <s> we will with i hillary know make i such <s> i i losing election the re from thing got we ending getting very <s> you </s>
<s> is do pulled will </s>
<s> actually but you re believe watch trump is now false i she be if with below time trump teleprompter said has he report they a be they <s> </s>
<s> you which <s> to china up say <s> speech yesterday leave the now florida so hillary said bernie replace community allowing little </s>
<s> involved establishment be actually the </s>
<s> going ll </s>
<s> the were can and have there </s>

Bigram:
<s> and tough cookie </s>
<s> he likes of them </s>
<s> he s attacked </s>
<s> okay thank you re goi

## Perplexity

In [175]:
from random import randrange
# Indices of ten randomly chosen held-out sentences (repeats are possible).
rand=[randrange(len(output_sent)) for _ in range(10)]
def perplex(models):
    """Print the perplexity of each sampled held-out sentence under each model.

    Args:
        models: sequence of fitted language models ordered by n-gram order,
            so ``models[0]`` is scored on unigrams, ``models[1]`` on bigrams,
            and so on.

    For every index in the module-level ``rand`` the sentence is printed,
    followed by one line per model. If computing a perplexity raises, the
    value is reported as ``inf`` instead of aborting the whole report.
    """
    for i in rand:
        print("PP of ",' '.join(output_sent[i]))
        for order, lm in enumerate(models, start=1):
            try:
                print(order," Gram ",round(lm.perplexity(ngrams(output_sent[i],order)),2))
            except Exception:
                # Catch Exception rather than everything, so Ctrl-C / kernel
                # interrupt (KeyboardInterrupt, SystemExit) still propagate.
                # NOTE(review): presumably raised when the sentence is too
                # short to contain any n-gram of this order - confirm.
                print(order," Gram ","inf")
perplex([uni,bi,tri,quad])

PP of  <s> i wasn t looking for the credit </s>
1  Gram  201.68
2  Gram  28.31
3  Gram  13.89
4  Gram  1.51
PP of  <s> you have to vet it </s>
1  Gram  96.52
2  Gram  inf
3  Gram  inf
4  Gram  inf
PP of  <s> nothing s gon na get done </s>
1  Gram  486.26
2  Gram  inf
3  Gram  inf
4  Gram  inf
PP of  <s> and i don t understand </s>
1  Gram  72.06
2  Gram  9.32
3  Gram  8.64
4  Gram  12.81
PP of  <s> i mean it s a money making industry ok </s>
1  Gram  226.65
2  Gram  inf
3  Gram  inf
4  Gram  inf
PP of  <s> i just want a job </s>
1  Gram  94.17
2  Gram  25.48
3  Gram  12.48
4  Gram  inf
PP of  <s> so i just want to let you know it s very important </s>
1  Gram  115.07
2  Gram  15.54
3  Gram  inf
4  Gram  inf
PP of  <s> he didn t win anything </s>
1  Gram  161.15
2  Gram  18.06
3  Gram  8.64
4  Gram  2.28
PP of  <s> i ll bring back our jobs from china from mexico from japan from so many places </s>
1  Gram  409.43
2  Gram  inf
3  Gram  inf
4  Gram  inf
PP of  <s> altogether under the cli

### Readability : Some of the sentences are grammatically correct. But there are a lot of flaws because the test data has new words.

## Neural Approach

In [176]:
# NOTE: despite the "char" naming, the units here are whole words: `characters`
# is the sorted train+test vocabulary and the two dicts map index <-> word.
characters = sorted(total_vocab)
n_to_char = {n:char for n, char in enumerate(characters)}
char_to_n = {char:n for n, char in enumerate(characters)}
vocab_size = len(characters)
print('Number of unique characters: ', vocab_size)
X = []   # input windows: seq_length consecutive word indices
Y = []   # targets: index of the word that follows each window
length = len(token)
seq_length = 10   # number of words to consider before predicting the following word
# Slide a window over the flat training stream; windows may span sentence
# boundaries because `token` concatenates all sentences.
for i in range(0, length - seq_length, 1):
    sequence = token[i:i + seq_length]
    label = token[i + seq_length]
    X.append([char_to_n[char] for char in sequence])
    Y.append(char_to_n[label])
print('Number of extracted sequences:', len(X))
X_modified = np.reshape(X, (len(X), seq_length))
# Pin the one-hot width to the vocabulary size. Without num_classes Keras infers
# max(Y) + 1, which only equals vocab_size if the highest-indexed word happens
# to occur as a training target.
Y_modified = np_utils.to_categorical(Y, num_classes=vocab_size)
X_modified.shape, Y_modified.shape

Number of unique characters:  5857
Number of extracted sequences: 166838


((166838, 10), (166838, 5857))

In [177]:
def _train_recurrent_model(recurrent_layer, filepath):
    """Build, compile and train an Embedding -> recurrent -> Dropout -> softmax model.

    Args:
        recurrent_layer: the Keras recurrent layer instance to place after
            the embedding (e.g. ``LSTM(200)`` or ``SimpleRNN(200)``).
        filepath: where ``ModelCheckpoint`` saves the weights whenever the
            training loss improves.

    Trains on the module-level ``X_modified`` / ``Y_modified`` for 5 epochs
    and returns the fitted model.
    """
    model = Sequential()
    # Take the window length from the training data instead of hard-coding 10.
    model.add(Embedding(vocab_size, 100, input_length=X_modified.shape[1]))
    model.add(recurrent_layer)
    model.add(Dropout(0.2))
    model.add(Dense(Y_modified.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    # summary() prints the table itself and returns None, so it is not wrapped
    # in print() (that would emit a stray "None" line).
    model.summary()
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    model.fit(X_modified, Y_modified, epochs=5, batch_size=256,callbacks=callbacks_list)
    return model
def LSTM_model():
    """Train the word-level LSTM language model; checkpoints go to ``lstm.hdf5``."""
    return _train_recurrent_model(LSTM(200), "lstm.hdf5")
def Vanila_model():
    """Train the word-level SimpleRNN language model; checkpoints go to ``vanila.hdf5``."""
    return _train_recurrent_model(SimpleRNN(200), "vanila.hdf5")
lstm=LSTM_model()
vanila=Vanila_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 10, 100)           585700    
_________________________________________________________________
lstm_2 (LSTM)                (None, 200)               240800    
_________________________________________________________________
dropout_3 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 5857)              1177257   
Total params: 2,003,757
Trainable params: 2,003,757
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5

Epoch 00001: loss improved from inf to 5.66563, saving model to baseline-improvement-01-5.6656.hdf5
Epoch 2/5

Epoch 00002: loss improved from 5.66563 to 4.86780, saving model to baseline-improvement-02-4.8678.hdf5
Epoch 3/5

Epoch 00003: loss impr

In [180]:
# NOTE: this definition rebinds the name `generate`, shadowing the n-gram
# sampler `generate(n, model)` defined in an earlier cell.
def generate(model, string_mapped=None, index_to_word=None):
    """Greedily continue a seed window of word indices with a trained network.

    Prints the seed ("Input:") and the seed plus continuation ("Output:").
    At each step the most probable next word is appended and the window is
    shifted by one; generation stops after 30 words or once ``</s>`` is
    produced, whichever comes first.

    Args:
        model: object exposing ``predict(x, verbose=0)`` that returns scores
            over the vocabulary for an input of shape ``(1, window_length)``.
        string_mapped: seed sequence of word indices (list or array). It is
            copied, never modified. When ``None`` a random window from the
            module-level ``X`` is used.
        index_to_word: mapping from index to word; defaults to the
            module-level ``n_to_char``.

    Returns:
        None. The text is only printed.
    """
    if index_to_word is None:
        index_to_word = n_to_char
    # `is None` instead of `== None`: the latter is elementwise (and therefore
    # ambiguous in an `if`) when the seed is a numpy array.
    if string_mapped is None:
        # randint's upper bound is exclusive, so len(X) covers every window.
        start = np.random.randint(0, len(X))
        window = list(X[start])
    else:
        # Work on a copy so the caller's sequence is left untouched.
        window = list(string_mapped)
    full_string = [index_to_word[value] for value in window]
    print("Input:")
    print(' '.join(full_string))
    for _ in range(30):
        x = np.reshape(window, (1, len(window)))
        pred_index = int(np.argmax(model.predict(x, verbose=0)))
        next_word = index_to_word[pred_index]
        full_string.append(next_word)
        # Slide the window: drop the oldest index, append the prediction.
        window = window[1:] + [pred_index]
        if next_word == "</s>":
            break
    print("Output:")
    # Every word is followed by a space (including the last one).
    print(''.join(word + " " for word in full_string))
# Show five greedy continuations from random training windows for each network.
for banner, network in (("LSTM", lstm), ("\nVanilla", vanila)):
    print(banner)
    for attempt in range(5):
        generate(network)

LSTM
Input:
s a scam </s> <s> so the wounded warriors all
Output:
s a scam </s> <s> so the wounded warriors all of the way i m not going to be a lot of the world </s> 
Input:
was like the academy awards </s> <s> i mean when
Output:
was like the academy awards </s> <s> i mean when you know i m going to be a lot of the world </s> 
Input:
be the stupid people anymore </s> <s> some are </s>
Output:
be the stupid people anymore </s> <s> some are </s> <s> i m going to be a lot of the world </s> 
Input:
</s> <s> it was also hillary clinton as secretary of
Output:
</s> <s> it was also hillary clinton as secretary of the world </s> 
Input:
from people i ll negotiate </s> <s> but we have
Output:
from people i ll negotiate </s> <s> but we have to be a lot of the world </s> 

Vanilla
Input:
fest </s> <s> but just in a nutshell i m
Output:
fest </s> <s> but just in a nutshell i m going to do it </s> 
Input:
years old a child a beautiful child went to have
Output:
years old a child a beautiful child

In [182]:
# Build evaluation windows from the flat held-out word stream `token2`.
# NOTE: this rebinds X / Y (previously the training windows) and X_test
# (previously the list of held-out sentences from train_test_split).
length = len(token2)
X=[]
Y=[]
seq_length = 10   # number of words to consider before predicting the following word
for i in range(0, length - seq_length, 1):
    sequence = token2[i:i + seq_length]
    label = token2[i + seq_length]
    X.append([char_to_n[char] for char in sequence])
    Y.append(char_to_n[label])
X_test = np.reshape(X, (len(X), seq_length))
# Pin the one-hot width to the full vocabulary. Without num_classes Keras infers
# max(Y) + 1 from the test targets, which can be narrower than the model's
# softmax output and make evaluate() fail on a shape mismatch.
Y_test = np_utils.to_categorical(Y, num_classes=len(char_to_n))
def Perplex(model):
    """Print the test loss of ``model`` and the perplexity derived from it.

    ``model.evaluate`` is called on the module-level ``X_test`` / ``Y_test``;
    the perplexity is reported as ``exp(score)``.
    """
    score = model.evaluate(X_test, Y_test,verbose=0)
    print('Test score: ', score)
    print("Perplexity ", np.exp(score))
print("LSTM")
Perplex(lstm)
print("Vanilla")
Perplex(vanila)

LSTM
Test score:  4.187699517849314
Perplexity  65.87108127788211
Vanilla
Test score:  3.9830074119721965
Perplexity  53.67822425786633


### Readability : Does a wonderful job of producing new sentences. But there are a lot of grammatical mistakes.

## Is the Neural Network better than the Classical Approach? If so, why? If not, why not?

Perplexity: The vanilla RNN works better than the LSTM and the unigram model. On the training data the n-gram models work well, but problems occur when we evaluate them on the test data. Because the model has not seen the test data, the probability of most of the test n-grams is zero; smoothing can solve this problem only up to a limit. With a neural network this problem is solved to a great extent: as the model learns from the training data, it searches for similarities rather than exact matches.