# Classical Approach: (n-Gram)

In [93]:
import math

import re
import codecs
import random

import nltk
import numpy as np
from nltk.tokenize import sent_tokenize


## Preprocessing and MLE Computation

In [22]:
# Data cleaning and sentence tokenization

def preprocess(fp):
    """Read a UTF-8 text file and return cleaned, boundary-tagged sentences.

    Args:
        fp: path of the text file to read.

    Returns:
        A list of strings, one per sentence found by ``sent_tokenize``. Each
        sentence is lower-cased, stripped of the punctuation handled below and
        wrapped as ``'<s> ... </s>'``.
    """
    # The context manager guarantees the handle is closed even if read() fails.
    # codecs.open is kept (rather than the built-in open) because it does not
    # translate newlines, and the "\r\n" removal below relies on that.
    with codecs.open(fp, 'r', 'UTF-8') as f:
        raw_text = f.read()

    # Literal substrings removed from the whole text, in this order.
    for literal in ("\r\n", "SPEECH", "\'", "-", "$", ",", ":"):
        raw_text = raw_text.replace(literal, "")

    # Drop every digit.
    raw_text = re.sub('[0-9]', r'', raw_text)

    new_list = []
    for sent in sent_tokenize(raw_text):
        # Per-sentence punctuation: double quotes, full stops, colons, semicolons.
        sent = re.sub('[\".:;]', r'', sent)
        # "[target|label]" or "[label]" markup is reduced to its label.
        sent = re.sub(r'\[(?:[^\]|]*\|)?([^\]|]*)\]', r'\1', sent)
        new_list.append('<s> ' + sent.lower() + ' </s>')

    return new_list


# Computing freq counts for MLE
def gen_ngrams(sent_list, N):
    """Count how often each token follows each (N-1)-token context.

    Args:
        sent_list: iterable of sentences; tokens are split on whitespace.
        N: n-gram order (1 = unigram, 2 = bigram, ...).

    Returns:
        A nested dict ``{context: {next_token: count}}`` where ``context`` is
        the first N-1 tokens of an n-gram joined by single spaces (the empty
        string when N == 1).
    """
    freq_count = {}

    for sentence in sent_list:
        words = sentence.split()
        # Slide a window of N tokens over the sentence.
        for start in range(len(words) - N + 1):
            window = words[start:start + N]
            context = ' '.join(window[:-1])
            target = str(window[-1])

            followers = freq_count.setdefault(context, {})
            followers[target] = followers.get(target, 0) + 1

    return freq_count

## Generator Function

In [None]:
def predict_word(text, N, freq_count):
    """Sample the next token for ``text`` from the n-gram counts.

    Args:
        text: text generated so far; tokens are separated by whitespace.
        N: n-gram order (1 = unigram, 2 = bigram, ...).
        freq_count: nested dict ``{context: {next_token: count}}`` keyed by the
            space-joined N-1 preceding tokens ('' for a unigram model).

    Returns:
        One next token (str), drawn with probability proportional to its count
        in the matching context.

    Raises:
        KeyError: if the context taken from ``text`` is not in ``freq_count``.
    """
    # A unigram model has no history, so its only context key is ''.
    # (For N == 1 the slice [-(N-1):] is [-0:], i.e. the whole token list.)
    context = ' '.join(text.split()[-(N - 1):]) if N != 1 else ''
    followers = freq_count[context]

    key_words = list(followers)
    counts = np.array([followers[word] for word in key_words], dtype=float)
    pvals = counts / counts.sum()

    # Draw a single index directly from the relative-frequency distribution.
    # Taking the arg-max of a small multinomial sample instead would
    # under-sample rare continuations and resolve ties towards the first key.
    req_index = np.random.choice(len(key_words), p=pvals)
    return key_words[req_index]
   
def generator(N, freq_count, start_seq):
    """Generate one sentence by repeatedly sampling the next word.

    Args:
        N: n-gram order (1 = unigram, 2 = bigram, ...).
        freq_count: nested dict ``{context: {next_token: count}}``.
        start_seq: text to start from, or None to pick a start automatically
            (a random context beginning with '<s>' when N != 1, the literal
            "<s>" when N == 1).

    Returns:
        The lower-cased start text followed by the sampled words, ending with
        the first '</s>' that is drawn.
    """
    # Contexts whose first token is the sentence-start tag. The unigram table
    # only has the '' context, so nothing is collected when N == 1.
    start_tag_list = [
        context for context in freq_count.keys()
        if N != 1 and context.split()[0] == '<s>'
    ]

    if start_seq is None:
        start_seq = random.choice(start_tag_list) if N != 1 else "<s>"

    rand_text = start_seq.lower()

    # Keep appending words until the end-of-sentence tag has been appended.
    while True:
        next_word = predict_word(rand_text, N, freq_count)
        rand_text += ' ' + next_word
        if next_word == '</s>':
            break

    return rand_text

## Generate texts using user defined value of n for n-gram

In [111]:
# Ask for the model order, build the counts from the first 1000 sentences of
# speech.txt (the remainder is kept as the test set) and print one sample.
print("Input which n-gram model you want to use for generating sentence.")
print("Example: 1 for Unigram, 2 for Bigram, 3 for Trigram, 4 for quadgrams\n")

N = int(input())

print("\nGenerating Text ...\n")
sent_list = preprocess('speech.txt')
train_sent_list = sent_list[:1000]
test_sent_list = sent_list[1000:]
freq_count = gen_ngrams(train_sent_list, N)
print(generator(N, freq_count, None))

Input which n-gram model you want to use for generating sentence.
Example: 1 for Unigram, 2 for Bigram, 3 for Trigram, 4 for quadgrams

4

Generating Text ...

<s> i get sued all the time okay </s>


### Random Texts using n-gram

In [94]:
# Print five sentences sampled from the current model (order N, counts in
# freq_count). The output recorded below is the bigram case.
for _ in range(5):
    print(generator(N, freq_count, None))

<s> thank you can tell me </s>
<s> a lot of the people </s>
<s> we cant believe me if you look at what are going to be gone very strongly its south korea increases its just not going to make western values that we don’t know how we have to do it </s>
<s> and i have to look at all heard that are easy to be very good to be a great respect stupid are closing up and i will not help us </s>
<s> i have a very disappointed by the ones </s>


### Readability:
The sentences are grammatically correct in most cases, and the texts make some sense. However, in some cases the way a text starts or ends makes it read like an incomplete sentence.


### Calculating Perplexity for Test

In [112]:
# Collect one value per n-gram of the test sentences: the relative frequency
# count(context, word) / sum(counts for context) when the n-gram was seen in
# freq_count, otherwise the placeholder 1.
# NOTE(review): the placeholder 1 is what the smoothing cell later looks for
# (`custom_perp[i]==1`). A seen n-gram whose context has a single continuation
# also yields exactly 1.0 here, so the two cases cannot be told apart
# afterwards -- consider a distinct sentinel for unseen n-grams.
custom_perp = []

for sent in test_sent_list:
    # All windows of N consecutive tokens in this sentence.
    ngrams = []
    tokens = sent.split()
    for i in range(len(tokens)-N+1):
        ngrams.append(tokens[i:i+N])


    for ngram in ngrams:
        # Context = first N-1 tokens joined by spaces ('' when N == 1).
        token_seq  = ' '.join(ngram[:-1])
        last_token = str(ngram[-1])

        # Unseen context: placeholder.
        if token_seq not in freq_count:
            custom_perp.append(1)

        # Seen context but unseen continuation: placeholder.
        elif last_token not in freq_count[token_seq]:
            custom_perp.append(1)

        else:
            # For N == 1 token_seq is already '', so both branches read the same table.
            if N!=1:
                choices = freq_count[token_seq].items()
            else:
                choices = freq_count[''].items()
            # pvals and key_words are not used in this cell.
            pvals=[]
            key_words=[]
            # Total count of all continuations of this context.
            total = sum(weight for choice, weight in choices)
            custom_perp.append(freq_count[token_seq][last_token]/total)



### Smoothing

In [113]:
def smoothing(custom_perp, c):
    """Replace every entry equal to 1 with ``c``.

    The sequence is modified in place and the same object is returned.

    Args:
        custom_perp: mutable sequence of values.
        c: replacement for entries that compare equal to 1.
    """
    for position, value in enumerate(custom_perp):
        if value == 1:
            custom_perp[position] = c
    return custom_perp

# Use the smallest value currently in custom_perp as the replacement for
# every entry equal to 1.
c = min(custom_perp)
custom_perp = smoothing(custom_perp,c)

## Perplexity Values

In [102]:
# Perplexity = exp(mean of |log p|) over all collected test n-gram values.
log_perp = sum(abs(np.log(custom_perp))) / len(custom_perp)
perplexity = np.exp(log_perp)
print("Perplexity for",N,"gram: ",perplexity)

Perplexity for 1 gram:  378.04495410782584


In [106]:
# Perplexity = exp(mean of |log p|) over all collected test n-gram values.
log_perp = sum(abs(np.log(custom_perp))) / len(custom_perp)
perplexity = np.exp(log_perp)
print("Perplexity for",N,"gram: ",perplexity)

Perplexity for 2 gram:  171.27055085484562


In [110]:
# Perplexity = exp(mean of |log p|) over all collected test n-gram values.
log_perp = sum(abs(np.log(custom_perp))) / len(custom_perp)
perplexity = np.exp(log_perp)
print("Perplexity for",N,"gram: ",perplexity)

Perplexity for 3 gram:  75.76329766782628


In [114]:
# Perplexity = exp(mean of |log p|) over all collected test n-gram values.
log_perp = sum(abs(np.log(custom_perp))) / len(custom_perp)
perplexity = np.exp(log_perp)
print("Perplexity for",N,"gram: ",perplexity)

Perplexity for 4 gram:  29.360625575730996


# Neural Approach

## Vanilla RNN Based

In [47]:
import numpy
from sklearn.model_selection import train_test_split

import keras
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, InputLayer, Embedding, SimpleRNN, Dropout, LSTM
from keras.activations import relu, softmax
from keras.losses import categorical_crossentropy
from keras.optimizers import adam
from keras.callbacks import ModelCheckpoint
import keras.utils as ku 


In [None]:
from nltk.corpus import stopwords 

# Data processing
# Rebuild one lower-cased string from the tagged sentences, dropping the
# <s> / </s> markers, then let NLTK split it into lines again.
text = " ".join(sent_list)
text = text.replace("<s>", "")
text = text.replace("</s>", " ")

text = text.lower()

# Stop words are intentionally kept. Calling text.replace(stop_word, '')
# without using its return value changes nothing (str.replace returns a new
# string), and assigning it back would delete stop-word substrings inside
# other words as well, so no stop-word pass is performed.

new_sent_list = sent_tokenize(text)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(new_sent_list)
# +1 because the Keras Tokenizer never assigns index 0 to a word.
total_words = len(tokenizer.word_index) + 1

input_sequences = []

for line in new_sent_list:
    # https://keras.io/preprocessing/text/
    token_list = tokenizer.texts_to_sequences([line])[0]
    # Every prefix of at least two tokens becomes one training example:
    # all tokens but the last are the input, the last one is the label.
    for i in range(1, len(token_list)):
        input_sequences.append(token_list[:i+1])

# Longest prefix length; informational only, the padding below uses 12.
max_sequence_len = max([len(x) for x in input_sequences])

# https://keras.io/preprocessing/sequence/
# Left-pad every prefix to a fixed length of 12.
input_sequences = np.array(pad_sequences(input_sequences, maxlen=12, padding='pre'))

# First 11 columns are the predictors, the last column is the word to predict.
predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
label = ku.to_categorical(label, num_classes=total_words)

In [51]:
# define the RNN model

def rnn_model(sequence_len, total_words):
    """Build and compile the Embedding -> SimpleRNN -> Dropout -> softmax model.

    Args:
        sequence_len: length of a padded training sequence including the
            label; the network input has ``sequence_len - 1`` positions.
        total_words: vocabulary size, used for the embedding input dimension
            and the size of the softmax output.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    layers = [
        Embedding(total_words, 100, input_length=sequence_len - 1),
        SimpleRNN(150),
        Dropout(0.2),
        Dense(total_words, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [52]:
# Hold out 20% of the (predictors, label) pairs for evaluation; fixed seed for a repeatable split.
X_train, X_test, Y_train, Y_test = train_test_split(predictors, label, test_size=0.2, random_state=1)
# 12 matches the maxlen used when padding the input sequences.
model = rnn_model(12, total_words)
n_epochs = 5
model.summary()

# Save the weights to RNN_weights_best.h5 whenever the training loss improves.
checkpoint = ModelCheckpoint("RNN_weights_best.h5", monitor='loss', verbose=1, 
                                 save_best_only=True, mode='min')
callbacks_list = [checkpoint]

history_rnn = model.fit(X_train, Y_train, batch_size=128, callbacks=callbacks_list, epochs=n_epochs, verbose=1)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 11, 100)           884100    
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 150)               37650     
_________________________________________________________________
dropout_7 (Dropout)          (None, 150)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 8841)              1334991   
Total params: 2,256,741
Trainable params: 2,256,741
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5

Epoch 00001: loss improved from inf to 6.18076, saving model to RNN_weights_best.h5
Epoch 2/5

Epoch 00002: loss improved from 6.18076 to 5.33361, saving model to RNN_weights_best.h5
Epoch 3/5

Epoch 00003: loss improved from 5.33361 to 4.93295, saving 

### Generate Text using RNN model

In [53]:

def generate_text(start_text, next_words, sequence_len, rnn_model):
    """Greedily extend ``start_text`` by ``next_words`` words.

    Uses the module-level ``tokenizer`` to encode the text and to map the
    predicted index back to a word.

    Args:
        start_text: prompt to continue.
        next_words: number of words to append.
        sequence_len: padded sequence length the model was built with; the
            model input has ``sequence_len - 1`` positions.
        rnn_model: trained Keras model whose output is a distribution over
            the tokenizer's word indices.

    Returns:
        ``start_text`` followed by the predicted words, separated by spaces.
    """
    # Invert the vocabulary once instead of scanning it for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([start_text])[0]
        token_list = pad_sequences([token_list], maxlen=sequence_len-1, padding='pre')

        # Arg-max over the softmax output; equivalent to the removed
        # Sequential.predict_classes and works on old and new Keras alike.
        probabilities = rnn_model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # An index without a word (e.g. 0) contributes an empty string.
        start_text += " " + index_to_word.get(predicted, "")
    return start_text

In [55]:
# Continue the prompt by 10 words with the trained RNN (sequence length 12).
generate_text("you should", 10, 12, model)

'you should have been a big relationship with the world and i'

### Scores and Perplexity

In [87]:
# evaluate() returns [loss, accuracy] for the held-out split.
scores = model.evaluate(X_test, Y_test)
print("Test acuuracy: ", scores[1])
print("Test Loss: ", scores[0])
# Perplexity is the exponential of the categorical cross-entropy loss.
print("Perplexity: ", np.exp(scores[0]))

Test acuuracy:  0.18045902449802323
Test Loss:  5.0735452228682005
Perplexity:  159.7396372914065


### Random Texts using RNN

In [98]:
# Print a 10-word continuation for each prompt.
for prompt in ("you should", "it is", "that is", "i am", "you are"):
    print(generate_text(prompt, 10, 12, model))

you should have been a big relationship with the world and i
it is a big thing we have a lot of money and
that is going to be a wall and we’re going to be
i am a very good relationship with the people that are going
you are going to be a lot of people that are going


#### Readability:
The grammar of the RNN output is also correct most of the time. However, since only 10 words are generated per prompt, the text often reads as an incomplete sentence.

## LSTM Based

In [34]:
from nltk.corpus import stopwords 

# Data processing
# Rebuild one lower-cased string from the tagged sentences, dropping the
# <s> / </s> markers, then let NLTK split it into lines again.
text = " ".join(sent_list)
text = text.replace("<s>", "")
text = text.replace("</s>", " ")

text = text.lower()

# Stop words are intentionally kept. Calling text.replace(stop_word, '')
# without using its return value changes nothing (str.replace returns a new
# string), and assigning it back would delete stop-word substrings inside
# other words as well, so no stop-word pass is performed.

new_sent_list = sent_tokenize(text)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(new_sent_list)
# +1 because the Keras Tokenizer never assigns index 0 to a word.
total_words = len(tokenizer.word_index) + 1

input_sequences = []

for line in new_sent_list:
    token_list = tokenizer.texts_to_sequences([line])[0]
    # Every prefix of at least two tokens becomes one training example:
    # all tokens but the last are the input, the last one is the label.
    for i in range(1, len(token_list)):
        input_sequences.append(token_list[:i+1])

# Longest prefix length; informational only, the padding below uses 12.
max_sequence_len = max([len(x) for x in input_sequences])

# Left-pad every prefix to a fixed length of 12.
input_sequences = np.array(pad_sequences(input_sequences, maxlen=12, padding='pre'))

# First 11 columns are the predictors, the last column is the word to predict.
predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
label = ku.to_categorical(label, num_classes=total_words)

In [39]:
# define the LSTM model

def lstm_model(sequence_len, total_words):
    """Build and compile the Embedding -> LSTM -> Dropout -> softmax model.

    Args:
        sequence_len: length of a padded training sequence including the
            label; the network input has ``sequence_len - 1`` positions.
        total_words: vocabulary size, used for the embedding input dimension
            and the size of the softmax output.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    layers = [
        Embedding(total_words, 100, input_length=sequence_len - 1),
        LSTM(150),
        Dropout(0.2),
        Dense(total_words, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model


In [40]:
# Hold out 20% of the (predictors, label) pairs for evaluation; fixed seed for a repeatable split.
X_train, X_test, Y_train, Y_test = train_test_split(predictors, label, test_size=0.2, random_state=1)
# 12 matches the maxlen used when padding the input sequences.
model = lstm_model(12, total_words)
n_epochs = 5
model.summary()

# Save the weights to LSTM_weights_best.h5 whenever the training loss improves.
checkpoint = ModelCheckpoint("LSTM_weights_best.h5", monitor='loss', verbose=1, 
                                 save_best_only=True, mode='min')
callbacks_list = [checkpoint]

history = model.fit(X_train, Y_train, batch_size=128, callbacks=callbacks_list, epochs=n_epochs, verbose=1)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 11, 100)           884100    
_________________________________________________________________
lstm_6 (LSTM)                (None, 150)               150600    
_________________________________________________________________
dropout_6 (Dropout)          (None, 150)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 8841)              1334991   
Total params: 2,369,691
Trainable params: 2,369,691
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5

Epoch 00001: loss improved from inf to 6.59889, saving model to LSTM_weights_best.h5
Epoch 2/5

Epoch 00002: loss improved from 6.59889 to 6.47651, saving model to LSTM_weights_best.h5
Epoch 3/5

Epoch 00003: loss improved from 6.47651 to 6.46848, savin

In [43]:
def generate_text(start_text, next_words, sequence_len, model):
    """Greedily extend ``start_text`` by ``next_words`` words.

    Uses the module-level ``tokenizer`` to encode the text and to map the
    predicted index back to a word.

    Args:
        start_text: prompt to continue.
        next_words: number of words to append.
        sequence_len: padded sequence length the model was built with; the
            model input has ``sequence_len - 1`` positions.
        model: trained Keras model whose output is a distribution over the
            tokenizer's word indices.

    Returns:
        ``start_text`` followed by the predicted words, separated by spaces.
    """
    # Invert the vocabulary once instead of scanning it for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([start_text])[0]
        token_list = pad_sequences([token_list], maxlen=sequence_len-1, padding='pre')

        # Arg-max over the softmax output; equivalent to the removed
        # Sequential.predict_classes and works on old and new Keras alike.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # An index without a word (e.g. 0) contributes an empty string.
        start_text += " " + index_to_word.get(predicted, "")
    return start_text

In [44]:
# Continue the prompt by 10 words with the trained LSTM (sequence length 12).
generate_text("you should", 10, 12, model)

'you should be a lot of the country and i have to'

## Random Texts using LSTM

In [119]:
# Print a 10-word continuation for each prompt.
for prompt in ("it takes", "he is", "but she", "they go", "that place"):
    print(generate_text(prompt, 10, 12, model))

it takes a lot of people that are going to be a
he is a great guy and i said i don’t want to
but she was a very very successful person to be a very
they go to the best people in the world and they said
that place is a big problem and the other day and we


### Readability:
The grammar of the LSTM output is also correct most of the time. However, since only 10 words are generated per prompt, the text often reads as an incomplete sentence.

### Perplexity and Scores

In [46]:
# evaluate() returns [loss, accuracy] for the held-out split.
scores = model.evaluate(X_test, Y_test)
print("Test acuuracy: ", scores[1])
print("Test Loss: ", scores[0])
# Perplexity is the exponential of the categorical cross-entropy loss.
print("Perplexity: ", np.exp(scores[0]))

Test acuuracy:  0.1257529422708464
Test Loss:  5.667928131672545
Perplexity:  289.43424310305045


## Neural approach performs better than the Classical one

The neural approach achieves a lower perplexity than the unigram and bigram models: the RNN reaches about 160, compared with about 378 for unigrams and 171 for bigrams (the LSTM, at about 289, beats only the unigram model). Its perplexity is still higher than that of the trigram and quadgram models, most likely because it was trained for only a small number of epochs. The main reason, I think, why the neural approach performs better is that neural networks are good at approximating non-linear functions (in our case, predicting a word given the previous words). The above was a very simple neural model; if we increase its complexity (more neurons, additional layers), which is not easily possible for n-gram models, we can achieve great results.
Classical approaches are also restricted to taking into account only the previous n-1 words, whereas a neural model can use a much longer context. I think this also played a crucial role in reducing its perplexity.
