## Dependencies :

In [137]:
import numpy as np
import re
import pandas as pd
from numpy import random
from nltk.lm import MLE
from nltk.util import ngrams
from sklearn.model_selection import train_test_split
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.util import everygrams
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import SimpleRNN
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

## Preprocessing :

In [123]:
f = open("speeches.txt", "r")
donald = f.read()
donald=donald.lower()
sent_tokenize_list = sent_tokenize(donald)
for i in range(len(sent_tokenize_list)):
    sent_tokenize_list[i]=re.sub(r'[^\w]', ' ',sent_tokenize_list[i])
X_train, X_test = train_test_split(sent_tokenize_list, test_size=0.2 , random_state=42)
vocab=[]
token=[]
for i in range(len(X_train)):
    temp=word_tokenize(X_train[i])
    token.append('<s>')
    token.extend(temp)
    token.append('</s>')
vocab=np.unique(token)
# print(vocab[3000])
output_sent=[]
token2=[]
for i in range(len(X_test)):
    temp=word_tokenize(X_test[i])
    token1=[]
    token1.append('<s>')
    token1.extend(temp)
    token1.append('</s>')
    output_sent.append(token1)
    token2.extend(token1)
total_vocab=np.unique(np.concatenate((token2,vocab),axis=None))

## N-Gram Model :

In [124]:
def ngram(n):
    input_ngram = list(everygrams(token,max_len=n))
    lm = MLE(n)
    lm.fit([input_ngram], vocabulary_text=vocab)
    return lm
for i in range(1,5):
    print("No of ",i,"- gram Theoretically possible: ",len(vocab)**i)
    print("No of ",i,"- gram Observed: ",len(set(ngrams(token,i))))
uni=ngram(1)
bi=ngram(2)
tri=ngram(3)
quad=ngram(4)
# print("lo")

No of  1 -gram Theoretically possible:  5406
No of  1 -gram Observed:  5406
No of  2 -gram Theoretically possible:  29224836
No of  2 -gram Observed:  41930
No of  3 -gram Theoretically possible:  157989463416
No of  3 -gram Observed:  84828
No of  4 -gram Theoretically possible:  854091039226896
No of  4 -gram Observed:  118036


## Generator Function :

In [125]:
def generate(n,model):
    start="<s>"
    sentence=[start]
    if(n==1):
        while(start!="</s>"):
            index=np.argmax(np.random.multinomial(1,[model.score(i) for i in vocab],size = None))
            sentence.append(vocab[index])
            start=vocab[index]
    elif(n==2):
        while(start!="</s>"):
            index=np.argmax(np.random.multinomial(1,[model.score(i,[start]) for i in vocab],size = None))
            sentence.append(vocab[index])
            start=vocab[index]
    elif(n==3):
        start_arr=[start]
        index=np.argmax(np.random.multinomial(1,[model.score(i,[start]) for i in vocab],size = None))
        sentence.append(vocab[index])
        start_arr.append(vocab[index])
        while(start!="</s>"):
            index=np.argmax(np.random.multinomial(1,[model.score(i,start_arr) for i in vocab],size = None))
            sentence.append(vocab[index])
            start=vocab[index]
            start_arr=start_arr[1:]
    elif(n==4):
        start_arr=[start]
        index=np.argmax(np.random.multinomial(1,[model.score(i,[start]) for i in vocab],size = None))
        sentence.append(vocab[index])
        start_arr.append(vocab[index])
        index=np.argmax(np.random.multinomial(1,[model.score(i,start_arr) for i in vocab],size = None))
        sentence.append(vocab[index])
        start_arr.append(vocab[index])
        while(start!="</s>"):
            index=np.argmax(np.random.multinomial(1,[model.score(i,start_arr) for i in vocab],size = None))
            sentence.append(vocab[index])
            start=vocab[index]
            start_arr=start_arr[1:]
    return ' '.join(sentence)
print("Unigram:")
for i in range(10):
    print(generate(1,uni))
print("\nBigram:")
for i in range(10):
    print(generate(2,bi))
print("\nTrigram:")
for i in range(10):
    print(generate(3,tri))
print("\nQuadgram:")
for i in range(10):
    print(generate(4,quad))

Unigram:
<s> why and be s t </s>
<s> </s>
<s> t but evangelical </s>
<s> believe magazine m </s>
<s> <s> what thousands re have it be all john every is </s>
<s> </s>
<s> <s> didn able last get the to m a being and on helpless going these have wall why probably enjoy is t going they </s>
<s> and our trying way would states </s>
<s> have in anything important <s> rebuild is border </s>
<s> is watch the right has i right hell terrible think executive the </s>

Bigram:
<s> i don t masterminds and i think that fought for all of </s>
<s> it back in the controversial </s>
<s> the apprentice continues to go home with the thing </s>
<s> we have to know when i don t have a certain way this is not as president obama announces because their puppet states permanently admits more than military interventions will not want to make good prognosticator i killed </s>
<s> she did you finish prisoners that he did it great it s pretty close </s>
<s> i made the arm necessarily </s>
<s> i don t win anything e

## Perplexity

In [126]:
from random import randrange
rand=[randrange(0,len(output_sent),1) for j in range(10)]
def perplex(models):
    for i in rand:
        print("PP of ",' '.join(output_sent[i]))
        for j in range(len(models)):
            lm=models[j]
            try:
                print(j+1," Gram ",round(lm.perplexity(ngrams(output_sent[i],j+1)),2))
            except:
                print(j+1," Gram ","inf")
perplex([uni,bi,tri,quad])

PP of  <s> i can t tell you yet </s>
1  Gram  117.55
2  Gram  inf
3  Gram  inf
4  Gram  inf
PP of  <s> and by the way we re going to be saying merry christmas again </s>
1  Gram  215.71
2  Gram  9.05
3  Gram  inf
4  Gram  inf
PP of  <s> i ll tell you what we re going to do right </s>
1  Gram  93.57
2  Gram  9.12
3  Gram  inf
4  Gram  inf
PP of  <s> so much money </s>
1  Gram  94.1
2  Gram  11.31
3  Gram  inf
4  Gram  inf
PP of  <s> i said if you attack iraq and you wipe it out iran is going to take over the entire middle east because you re going to ruin the balance </s>
1  Gram  374.64
2  Gram  inf
3  Gram  inf
4  Gram  inf
PP of  <s> again many americans must wonder why we our politicians seem more interested in defending the borders of foreign countries than in defending their own </s>
1  Gram  inf
2  Gram  inf
3  Gram  inf
4  Gram  inf
PP of  <s> our theme is very simple </s>
1  Gram  289.64
2  Gram  30.43
3  Gram  inf
4  Gram  inf
PP of  <s> i m just doing what s right </s>
1  Gra

In [127]:
characters = sorted(total_vocab)
n_to_char = {n:char for n, char in enumerate(characters)}
char_to_n = {char:n for n, char in enumerate(characters)}
vocab_size = len(characters)
print('Number of unique characters: ', vocab_size)
X = []   # extracted sequences
Y = []   # the target - the follow up character
length = len(token)
seq_length = 10   #number of characters to consider before predicting the following character
for i in range(0, length - seq_length, 1):
    sequence = token[i:i + seq_length]
    label = token[i + seq_length]
    X.append([char_to_n[char] for char in sequence])
    Y.append(char_to_n[label])
# print(X[100])
print('Number of extracted sequences:', len(X))
X_modified = np.reshape(X, (len(X), seq_length))
# X_modified = X_modified / float(len(characters))
Y_modified = np_utils.to_categorical(Y)
X_modified.shape, Y_modified.shape

Number of unique characters:  5857
Number of extracted sequences: 166838


((166838, 10), (166838, 5857))

In [128]:
def LSTM_model():
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=10))
    model.add(LSTM(200))
    model.add(Dropout(0.2))
    model.add(Dense(Y_modified.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    print(model.summary())
    filepath="baseline-improvement-{epoch:02d}-{loss:.4f}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    model.fit(X_modified, Y_modified, epochs=1, batch_size=256,callbacks=callbacks_list)
    return model
def Vanila_model():
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=10))
    model.add(SimpleRNN(200))
    model.add(Dropout(0.2))
    model.add(Dense(Y_modified.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    print(model.summary())
    filepath="vanila-improvement-{epoch:02d}-{loss:.4f}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    model.fit(X_modified, Y_modified, epochs=1, batch_size=256,callbacks=callbacks_list)
    return model
lstm=LSTM_model()
vanila=Vanila_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 100)           585700    
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               240800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 5857)              1177257   
Total params: 2,003,757
Trainable params: 2,003,757
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/1

Epoch 00001: loss improved from inf to 5.40416, saving model to baseline-improvement-01-5.4042.hdf5
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Emb

In [131]:
def generate(model,string_mapped=None):
    if(string_mapped==None):
        start = np.random.randint(0, len(X)-1) # or generate random start
        string_mapped = list(X[start])
    full_string = [n_to_char[value] for value in string_mapped]
# print(start,full_string)
    print("Input:")
    print(' '.join(full_string))
    for i in range(30):
        x = np.reshape(string_mapped,(1,len(string_mapped)))
#         x = x / float(len(characters))
    #     print(x)
        pred_index = np.argmax(model.predict(x, verbose=0))
        seq = [n_to_char[value] for value in string_mapped]
    #     print(n_to_char[pred_index],pred_index)
        full_string.append(n_to_char[pred_index])
        string_mapped.append(pred_index)  # add the predicted character to the end
        string_mapped = string_mapped[1:] # shift the string one character forward by removing pos. 0
        if(n_to_char[pred_index]=="</s>"):
            break
    txt=""
    for char in full_string:
        txt = txt+char+" "
#     print(start)
    print("Output:")
    print(txt)
generate(lstm)
generate(vanila)

Input:
see </s> <s> now he s getting out of the
Output:
see </s> <s> now he s getting out of the going to have </s> 
Input:
controlled by you </s> <s> we re not going to
Output:
controlled by you </s> <s> we re not going to be a lot </s> 


In [151]:
output_sequences = []
for i in range(1,10):
    n_gram_sequence = token2[:i+1]
    output_sequences.append(n_gram_sequence)
x_test, y_tes = output_sequences[:][:-1],output_sequences[:][-1]
x_test = np.array(pad_sequences(x_test,maxlen=10, padding='pre',dtype='U25',value='0'))
y_test = []
for i in range(len(y_tes)):
            y_test.append(char_to_n[y_tes[i]])
y_test = np_utils.to_categorical(y_test)
x_test= np.reshape(x_test,(len(x_test),10))
def perplexity(model):
#     print(y_test)
    score = model.evaluate(x_test, y_test, verbose=False)
    print('Test score: ', score[0])    
    print('Test accuracy: ', score[1])
    print("Perplexity ", np.exp(score[0]))
perplexity(lstm)

ValueError: Error when checking target: expected dense_1 to have shape (5857,) but got array with shape (5689,)