# Neural Language Model

### Import Libraries

In [2]:
import numpy as np
import tensorflow as tf
import gc
import math

### Downloading GloVe Dataset

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2022-08-27 22:18:36--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-08-27 22:18:37--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-08-27 22:18:37--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

### Set Hyperparameters

In [3]:
# Length every tokenized sentence is padded to (maxlen in preprocess_data); the
# last position is used as the label, so the model sees seq_len - 1 input tokens.
seq_len = 40
# Number of passes train_model makes over the training chunks.
num_epochs = 6
# Mini-batch size passed to model.fit and model.evaluate.
batch_size = 25

### Load Data

In [4]:
def load_data(train_path = "train.txt", test_path = "test.txt") :
    """Read the training and test corpora, one sentence per line.

    Every line is lowercased and stripped of surrounding whitespace.

    Args:
        train_path: Path of the training text file (defaults to "train.txt").
        test_path: Path of the test text file (defaults to "test.txt").

    Returns:
        A tuple (train_data, test_data) of lists of strings.
    """
    # Context managers guarantee both handles are closed, even on error.
    with open(train_path, 'r') as f_train:
        train_data = [line.lower().strip() for line in f_train]
    with open(test_path, 'r') as f_test:
        test_data = [line.lower().strip() for line in f_test]

    print("No. of sentences in training data: ",len(train_data))
    print("No. of sentences in test data: ",len(test_data))
    return train_data, test_data

In [5]:
# Read both corpora from disk; each is a list of lowercased, stripped lines.
train_data, test_data = load_data()

No. of sentences in training data:  60000
No. of sentences in test data:  15000


### Process Training Data and Build the Model

Create tokenizer on the training data

In [6]:
# Build the word -> integer index from the training sentences only.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train_data)

# One extra slot for index 0, which is the value used for padding.
len_vocab = 1 + len(tokenizer.word_index)
print("Vocabulary Size : ", len_vocab)

Vocabulary Size :  44691


### Preprocess Data

In [7]:
def preprocess_data(corpus, tokenizer) :
    """Turn sentences into fixed-length predictor rows and next-word labels.

    Each sentence is converted to token ids with ``tokenizer`` and left-padded
    to ``seq_len`` entries. The final entry of every row is the label; the
    entries before it are the predictors.

    Args:
        corpus: Iterable of sentence strings.
        tokenizer: Fitted Keras tokenizer providing ``texts_to_sequences``.

    Returns:
        A tuple (predictors, labels): predictors holds the first
        ``seq_len - 1`` columns of each padded row, labels holds the last.
    """
    # Tokenize every sentence in one call instead of looping line by line.
    token_ids = tokenizer.texts_to_sequences(list(corpus))

    padded = tf.keras.preprocessing.sequence.pad_sequences(
        token_ids, maxlen = seq_len, padding = 'pre'
    )
    padded = np.array(padded)

    print("Predictors and Labels Summary :")
    print(padded[:3])

    predictors = padded[:, :-1]
    labels = padded[:, -1]
    return predictors, labels

Preprocess the training data: left-pad every tokenized sentence to a uniform length, then split each row into predictors (all tokens but the last) and a label (the last token).

In [8]:
# Tokenize and pad the training sentences; the last token of each padded row is the label.
train_input, train_output = preprocess_data(train_data, tokenizer)

Predictors and Labels Summary :
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0 4355  127 3069 2960  448  726  819]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0  109   43  326 4250    2   37 4768   44  101]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0 7636  831   19 1456   10    5  989   19  886  593]]


### Map words to GloVe vectors

In [9]:
def create_glove_map(path) :
    """Load a GloVe text file into a word -> vector dictionary.

    Each non-blank line is expected to hold a word followed by its
    whitespace-separated vector components.

    Args:
        path: Path to the GloVe ``.txt`` file (read as UTF-8).

    Returns:
        dict mapping each word to a 1-D ``np.float64`` array.
    """
    word_to_vec_map = {}
    with open(path, 'r', encoding='UTF-8') as f:
        for line in f:
            w_line = line.split()
            # Skip blank lines (e.g. a trailing newline) instead of failing
            # with IndexError on w_line[0].
            if not w_line:
                continue
            curr_word = w_line[0]
            word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)
    return word_to_vec_map

Use GloVe to load the pretrained word embeddings:

In [10]:
# Load the 50-dimensional GloVe vectors extracted from glove.6B.zip.
word_to_vec = create_glove_map("glove.6B.50d.txt")
# Show one entry as a sanity check of the parsed vectors.
print("Example: \n","the  ",word_to_vec['the'])

Example: 
 the   [ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
 -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
 -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
 -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
 -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526e-01
  4.0071e+00 -1.8594e-01 -5.2287e-01 -3.1681e-01  5.9213e-04  7.4449e-03
  1.7778e-01 -1.5897e-01  1.2041e-02 -5.4223e-02 -2.9871e-01 -1.5749e-01
 -3.4758e-01 -4.5637e-02 -4.4251e-01  1.8785e-01  2.7849e-03 -1.8411e-01
 -1.1514e-01 -7.8581e-01]


### Create the Embedding Layer

In [11]:
def get_embedding_layer(tokenizer, word_to_vec, embed_vector_len = None) :
    """Build a frozen Keras Embedding layer initialised from pretrained vectors.

    Row ``i`` of the embedding matrix holds the pretrained vector of the word
    whose tokenizer index is ``i``. Words missing from ``word_to_vec`` (and
    index 0, which no word maps to) keep an all-zero row.

    Args:
        tokenizer: Fitted Keras tokenizer; its ``word_index`` defines the rows.
        word_to_vec: dict mapping words to 1-D vectors of equal length.
        embed_vector_len: Width of the embedding. When None (default) it is
            taken from the first vector in ``word_to_vec``, falling back to 50
            if the map is empty.

    Returns:
        A non-trainable ``tf.keras.layers.Embedding`` layer expecting inputs
        of length ``seq_len - 1``.
    """
    words_to_index = tokenizer.word_index
    len_vocab = len(words_to_index) + 1

    if embed_vector_len is None:
        # Infer the width from the data so other GloVe files (100d, 200d, ...)
        # work without editing this function.
        first_vector = next(iter(word_to_vec.values()), None)
        embed_vector_len = 50 if first_vector is None else len(first_vector)

    emb_matrix = np.zeros((len_vocab, embed_vector_len))

    for word, index in words_to_index.items():
        embedding_vector = word_to_vec.get(word)
        if embedding_vector is not None:
            emb_matrix[index, :] = embedding_vector

    print("Dimension of the embedding matrix: \n",emb_matrix.shape)

    embedding_layer = tf.keras.layers.Embedding(input_dim = len_vocab, output_dim = embed_vector_len, input_length = seq_len - 1, weights = [emb_matrix], trainable = False)
    return embedding_layer

Create the embedding layer from the tokenizer vocabulary and the GloVe vectors:

In [12]:
# Build the frozen embedding layer from the tokenizer vocabulary and GloVe vectors.
embedding_layer = get_embedding_layer(tokenizer, word_to_vec)

Dimension of the embedding matrix: 
 (44691, 50)


### Create the Neural Language Model

In [13]:
def create_lstm_model(embedding_layer) :
    """Assemble and compile the LSTM next-word prediction model.

    Architecture: embedding -> LSTM(1024) -> Dropout(0.1) -> softmax over the
    vocabulary (``len_vocab`` outputs).

    Args:
        embedding_layer: The Keras Embedding layer used as the first layer.

    Returns:
        The compiled ``tf.keras.Sequential`` model. It is compiled with
        categorical cross-entropy, so labels must be one-hot encoded.
    """
    model = tf.keras.Sequential()
    model.add(embedding_layer)
    model.add(tf.keras.layers.LSTM(1024))
    model.add(tf.keras.layers.Dropout(0.1))
    model.add(tf.keras.layers.Dense(len_vocab, activation = "softmax"))

    # Pass the configured optimizer object; previously it was created but the
    # string 'adam' was handed to compile, so the learning rate set here was
    # silently ignored.
    opt = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=opt, loss='categorical_crossentropy')

    # summary() prints the table itself and returns None, so it is not wrapped
    # in print() (which emitted a stray "None" line).
    model.summary()
    return model

Create the neural language model:

In [14]:
# Build and compile the LSTM language model on top of the frozen GloVe embedding.
lstm_model = create_lstm_model(embedding_layer)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 39, 50)            2234550   
                                                                 
 lstm (LSTM)                 (None, 1024)              4403200   
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 dense (Dense)               (None, 44691)             45808275  
                                                                 
Total params: 52,446,025
Trainable params: 50,211,475
Non-trainable params: 2,234,550
_________________________________________________________________
None


### Function to train the model

In [15]:
def train_model(model, sample, data_frac) :
    """Fit ``model`` on the global training arrays, one chunk at a time.

    Labels are one-hot encoded per chunk of ``sample`` rows rather than all at
    once. Only whole chunks are used; any remainder rows are left out.

    Args:
        model: Compiled Keras model to train (trained in place).
        sample: Number of rows per chunk.
        data_frac: Fraction of ``train_output`` rows to consider.

    Returns:
        The same ``model`` object after ``num_epochs`` passes over the chunks.
    """
    usable_rows = int((train_output.shape[0] * data_frac) / sample) * sample
    print("Total Number of Samples :", usable_rows // sample)

    chunk_starts = range(0, usable_rows, sample)
    for epoch_number in range(1, num_epochs + 1) :
        print("Epoch : ", epoch_number)
        for start in chunk_starts :
            stop = start + sample
            chunk_inputs = train_input[start : stop]
            # One-hot encode just this chunk's labels.
            chunk_targets = tf.keras.utils.to_categorical(train_output[start : stop], len_vocab)
            model.fit(chunk_inputs, chunk_targets, batch_size = batch_size, epochs = 1)

    return model

Training the Model

In [16]:
# Train on the full training set (data_frac = 1) in chunks of 1000 sentences.
# train_model returns the model it was given, so trained_model is lstm_model.
trained_model = train_model(lstm_model, 1000, 1)

Total Number of Samples : 60
Epoch :  1
Epoch :  2
Epoch :  3
Epoch :  4
Epoch :  5
Epoch :  6


In [26]:
import pickle

# Google Drive must already be mounted at /content/drive (see the
# drive.mount cell) or this path does not exist.
# NOTE(review): pickling a Keras model depends on the TensorFlow version;
# consider model.save(...) if the file must be loadable elsewhere.
# The context manager closes the file even if pickle.dump raises.
with open("/content/drive/MyDrive/lstm_model.obj","wb") as filehandler:
    pickle.dump(trained_model,filehandler)



In [25]:
# Colab only: mount Google Drive so the model can be written under
# /content/drive/MyDrive. This must run before the cell that saves the model.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Function to evaluate the model

In [18]:
def evaluate_model(model, X_test, Y_test, test_frac, sample) :
    """Average the model's loss over equally sized chunks of the test data.

    Labels are one-hot encoded per chunk of ``sample`` rows. Only whole chunks
    from the first ``test_frac`` of the data are evaluated.

    Args:
        model: Compiled Keras model whose ``evaluate`` returns a scalar loss.
        X_test: Predictor rows.
        Y_test: Integer labels, one per row of ``X_test``.
        test_frac: Fraction of the test rows to consider.
        sample: Number of rows per chunk.

    Returns:
        Mean of the per-chunk losses. Because all chunks have the same size,
        this equals the mean loss over the evaluated rows.

    Raises:
        ValueError: If ``test_frac`` of the data does not fill a single chunk.
    """
    num_chunks = int((Y_test.shape[0] * test_frac) / sample)
    if num_chunks <= 0:
        # Previously this surfaced as an opaque ZeroDivisionError at the end.
        raise ValueError(
            "test_frac * number of test rows must cover at least one chunk of size `sample`"
        )

    cross_entropy_loss = 0
    for i in range(0, num_chunks * sample, sample) :
        Y = tf.keras.utils.to_categorical(Y_test[i:i+sample], len_vocab)
        X = X_test[i:i+sample]
        cross_entropy_loss += model.evaluate(X, Y, batch_size = batch_size)
    return cross_entropy_loss / num_chunks

### Evaluating the Model

In [20]:
# Reuse the tokenizer fitted on the training data so word indices match the model.
# NOTE(review): no oov_token is configured; confirm how unseen test words are handled.
X_test, Y_test = preprocess_data(test_data, tokenizer)

Predictors and Labels Summary :
[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     1    39     6   296
   2179    15   367   152    62     2 11940   144    57    19     7  1232
     85     2  8651    10]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0  1889   305    30   595    37
    684   221   120   306]
 [ 1400    85    15   371     3   145     3   153  2971     7  3368    44
     23   154     2     1   400   513    14   168   747     5     1  5033
      2  1245     5    77     3   245     1  1445    19   226    81     7
   1780    95  4205     6]]


In [21]:
# test_frac = 0.5 with chunks of 1000: only whole chunks from the first half of
# the test rows are scored (7000 of the 15000 sentences), not the full test set.
cross_entropy_loss = evaluate_model(trained_model, X_test, Y_test, 0.5, 1000)



### Perplexity

In [27]:
# Perplexity is the exponential of the mean cross-entropy loss.
# NOTE(review): assumes the Keras loss is measured in nats (natural log) - confirm.
# The loss covers only the last word of each evaluated sentence.
perplexity = math.exp(cross_entropy_loss)
print("Perplexity of the model on test set : ",perplexity)

Perplexity of the model on test set :  145.382105262822
