In [24]:
import torch
from torch import nn
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from numpy import random

In [3]:
with open('Adventure_of_Sherlock_Holmes.txt','r', encoding='utf-8') as f:
    data=f.readlines()
    print(data[200])

“This is indeed a mystery,” I remarked. “What do you imagine that it



Now we will take all the lines into a single list.

In [4]:
def preprocess_text_file(text:list):
    processed_sentence_ls=[]
    for sentence in text:
        tokens=[]
        
        for word in word_tokenize(sentence):
            
            if word.isalpha():
                tokens.append(word.lower())
        processed_sentence_ls.append(' '.join(tokens))
    
    
    return processed_sentence_ls
#Veryfying the function.
preprocess_text_file(['hello I"m Saikat','Who are you?'])

['hello i m saikat', 'who are you']

***Extracting the vocabbulary of the whole corpus of text.***

In [5]:
def extract_corpus_vocab(text_ls:list):
    '''Extracts the total vocab of the whole text corpus.'''
    text=' '.join(text_ls)
    vocab_ls=[]
    for word in word_tokenize(text):   #word_tokenize returns a list of the words in the text corpus/sentence.
        
        if word not in vocab_ls:
            vocab_ls.append(word)
        
    return vocab_ls,len(vocab_ls)     

#Checking the function.
extract_corpus_vocab(preprocess_text_file(['hello I"m Saikat','Who are you?']))    

(['hello', 'i', 'm', 'saikat', 'who', 'are', 'you'], 7)

In [6]:
#Applying the functions on the dataset
processed_data=preprocess_text_file(data)
vocab,vocab_length=extract_corpus_vocab(processed_data)
print(vocab_length)

7840


In [6]:
print(vocab)

['to', 'sherlock', 'holmes', 'she', 'is', 'always', 'woman', 'i', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name', 'in', 'his', 'eyes', 'eclipses', 'and', 'predominates', 'the', 'whole', 'of', 'sex', 'it', 'was', 'not', 'that', 'he', 'felt', 'emotion', 'akin', 'love', 'for', 'irene', 'adler', 'all', 'emotions', 'one', 'particularly', 'were', 'abhorrent', 'cold', 'precise', 'but', 'admirably', 'balanced', 'mind', 'take', 'most', 'perfect', 'reasoning', 'observing', 'machine', 'world', 'has', 'seen', 'as', 'a', 'lover', 'would', 'placed', 'himself', 'false', 'position', 'never', 'spoke', 'softer', 'passions', 'save', 'with', 'gibe', 'sneer', 'they', 'admirable', 'things', 'drawing', 'veil', 'from', 'men', 's', 'motives', 'actions', 'trained', 'reasoner', 'admit', 'such', 'intrusions', 'into', 'own', 'delicate', 'finely', 'adjusted', 'temperament', 'introduce', 'distracting', 'factor', 'which', 'might', 'throw', 'doubt', 'upon', 'mental', 'results', 'gr

***Creating a function that maps each word a fixed integer index***

In [7]:
def vocab_index_creator(vocab:list):
    vocab=sorted(vocab)  #sorted returns a sorted() list but list.sort() doesn't return anything.
    vocab_dict={}
    for idx,word in enumerate(vocab):
            vocab_dict[word]=idx
    
    return vocab_dict

vocab_index_creator(['hello', 'I"m Saikat','Who',' are',' you?'])


{' are': 0, ' you?': 1, 'I"m Saikat': 2, 'Who': 3, 'hello': 4}

In [8]:
vocab_idx_dict=vocab_index_creator(vocab)
print(vocab_idx_dict)



***Here we will consider a 3-gram model.So this function from the corpus text creates sequnecs of length=3.***

In [9]:
def n_gram_creator(text_ls:list):
    n_gram_ls=[]
    for i in range(2,len(text_ls)):
        ls=[text_ls[i-2],text_ls[i-1],text_ls[i]]
        n_gram_ls.append(ls)
    
    return n_gram_ls

#Checking the function.
text,_=extract_corpus_vocab(preprocess_text_file(['hello I"m Saikat','Who are you?']))  
n_gram_creator(text)

[['hello', 'i', 'm'],
 ['i', 'm', 'saikat'],
 ['m', 'saikat', 'who'],
 ['saikat', 'who', 'are'],
 ['who', 'are', 'you']]

***Now we create a function that places the vocab index inplace of the words in the ngram list for feeding the data to the neural network.***

In [10]:
def ngram_to_number(sequence:list,vocab):
    vocab_dict=vocab_index_creator(vocab)
    ngram_numeric_ls=[]
    for ngram in sequence:
        ngram_row=[]
        for word in ngram:
            ngram_row.append(vocab_dict[word])

        ngram_numeric_ls.append(ngram_row)
    
    return ngram_numeric_ls
            
#Checking the function.
text,_=extract_corpus_vocab(preprocess_text_file(['hello I"m Saikat','Who are you?']))  
ngram=n_gram_creator(text)
print(f'Ngram list is',ngram)
print('\nNumeric version of Ngram list is:')
ngram_to_number(sequence=ngram,vocab=text)


Ngram list is [['hello', 'i', 'm'], ['i', 'm', 'saikat'], ['m', 'saikat', 'who'], ['saikat', 'who', 'are'], ['who', 'are', 'you']]

Numeric version of Ngram list is:


[[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 0], [5, 0, 6]]

***Now we apply all the function we created to our original text data.***

In [11]:
#Preprocessing the text data.
processed_data=preprocess_text_file(data)
#Creating a list of the vocab and its total length.
corpus_ls,vocab_length=extract_corpus_vocab(processed_data)
print(vocab_length)
#Creating vocabulary dictionary with word as the key and the index as the value.
vocab_dict=vocab_index_creator(corpus_ls)
#Creating a list of 3-gram sequence of the whole text corpus.
ngram_ls=n_gram_creator(corpus_ls)
#Converting the ngram list to integer list based on the word indexes.
ngram_numeric_ls=ngram_to_number(sequence=ngram_ls,vocab=corpus_ls)

print(f'Ngram list of the words:{ngram_ls[:5]}\n Ngram list made from the indexes of the words:{ngram_numeric_ls[:5]}')

7840
Ngram list of the words:[['project', 'gutenberg', 'the'], ['gutenberg', 'the', 'adventures'], ['the', 'adventures', 'of'], ['adventures', 'of', 'sherlock'], ['of', 'sherlock', 'holmes']]
 Ngram list made from the indexes of the words:[[5309, 3182, 6925], [3182, 6925, 130], [6925, 130, 4701], [130, 4701, 6115], [4701, 6115, 3378]]


In [12]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

  return torch._C._cuda_getDeviceCount() > 0


***Converting our list into tensor object for feeding to the embedding layer.***

In [13]:
ngram_tensor=torch.tensor(ngram_numeric_ls,device=device)
ngram_tensor[:5]

tensor([[5309, 3182, 6925],
        [3182, 6925,  130],
        [6925,  130, 4701],
        [ 130, 4701, 6115],
        [4701, 6115, 3378]])

In [14]:
X_train=ngram_tensor[:,:-1]
Y_target=ngram_tensor[:,-1]
print(f'Training Input shape:{X_train.shape} and Training labl shape:{Y_target.shape}')

Training Input shape:torch.Size([7838, 2]) and Training labl shape:torch.Size([7838])


***Creating a function that takes the tensor of integers(which are basically words) and then creates their one hot encoding version.***

In [15]:
one_hot_dim_size=vocab_length
def one_hot_encoding(target):
    one_hot_ls=[]
    for idx in target:
        one_hot=np.zeros(one_hot_dim_size,dtype=np.float32)
        one_hot[idx]=1.
        one_hot_ls.append(one_hot)

    return one_hot_ls


In [16]:
Y_one_hot=one_hot_encoding(Y_target)
Y_tensor=torch.tensor(np.array(Y_one_hot),device=device)

Y_tensor[:5]

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

***Defining the feature dimension,embedding dimension,hidden layer dimension,output dimension.We could have also specified a layer size.***

In [17]:
feature_dim=X_train.shape[1]
vocab_length=vocab_length
embedding_size=128
hidden_dim=128
dropout_rate=0.2
output_dim=one_hot_dim_size

***Creating the Lstm based subclass.***

***Here after the lstm layer for the features which can be seen as context for predicting the next word we are not just using the final time stamps hidden state output ,rather we are considering the hidden state output for all the words within the context window/feature space.***

In [18]:

class Next_Word_Lstm(nn.Module):
    def __init__(self, vocab_length, embedding_size, hidden_dim, feature_dim, output_dim, dropout_rate=0.2):
        super(Next_Word_Lstm, self).__init__()
        
        self.embedding = nn.Embedding(vocab_length, embedding_size)
        self.lstm = nn.LSTM(input_size=embedding_size, hidden_size=hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(hidden_dim*feature_dim , output_dim)

    def forward(self, X_tensor):
        embedded = self.embedding(X_tensor)
        output, _ = self.lstm(embedded)
        
        # Flatten or reshape the output from LSTM
#         output = output[:, -1, :]  # Get the last timestep's output
        output = output.reshape(output.size(0), -1)
        output = self.dropout(output)
        output = self.linear(output)

        return output

***Creating model instance.***

In [19]:
lstm_model=Next_Word_Lstm(vocab_length,embedding_size,hidden_dim,feature_dim,output_dim)
lstm_model.to(device)

Next_Word_Lstm(
  (embedding): Embedding(7840, 128)
  (lstm): LSTM(128, 128, batch_first=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (linear): Linear(in_features=256, out_features=7840, bias=True)
)

***Since it is multiclass classification so setting loss function as the crossentropy loss.***

In [21]:
criterion=nn.CrossEntropyLoss()
criterion.to(device=device)

CrossEntropyLoss()

In [22]:
learning_rate=0.01
optimizer=torch.optim.Adam(lstm_model.parameters(),learning_rate)



***Training the model for epoch=250.***

In [23]:
epochs=250

for epoch in range(epochs):
    lstm_model.train()
    optimizer.zero_grad()
    Y_predict=lstm_model(X_train)
    loss=criterion(Y_predict,Y_target)
    if epoch%25==0:
        print(f'Next Word Model LSTM -> Epoch: {epoch}, Loss: {loss}')
    loss.backward()
    optimizer.step()


Next Word Model LSTM -> Epoch: 0, Loss: 8.96926212310791
Next Word Model LSTM -> Epoch: 25, Loss: 0.00011511559569044039
Next Word Model LSTM -> Epoch: 50, Loss: 1.0072105396830011e-05
Next Word Model LSTM -> Epoch: 75, Loss: 7.732967787887901e-06
Next Word Model LSTM -> Epoch: 100, Loss: 7.49424225432449e-06
Next Word Model LSTM -> Epoch: 125, Loss: 7.603026006108848e-06
Next Word Model LSTM -> Epoch: 150, Loss: 6.847432359791128e-06
Next Word Model LSTM -> Epoch: 175, Loss: 6.7688656599784736e-06
Next Word Model LSTM -> Epoch: 200, Loss: 6.114636562415399e-06
Next Word Model LSTM -> Epoch: 225, Loss: 5.861892987013562e-06


***Now we check our predicting capability.So we are taking any random sequnce from the ngram list and then check what does the model predict.***

In [25]:
def pick_ngram_sub_sequence(seq_list):
  idx = random.randint(0, len(seq_list))
  return seq_list[idx], seq_list[idx][:-1]

In [43]:
def pick_sequence(sequence:list):
    idx=random.randint(0,len(ngram_ls))

    return sequence[idx],sequence[idx][:-1]

In [39]:
my_dict=vocab_dict
# Creating a new dictionary with swapped keys and values
reverse_dict = {v: k for k, v in my_dict.items()}

In [44]:
num_trials=10

for i in range(num_trials):
    ngram_seq,ngram_subseq=pick_sequence(ngram_ls)
    print(f'Trial #{i} - Picked sequence: {ngram_seq}, Test sub-sequence: {ngram_subseq}')
    ngram_sub_seq_num=ngram_to_number(sequence=[ngram_subseq],vocab=corpus_ls)
    x_test = torch.tensor(ngram_sub_seq_num)
    print(f'Trial #{i} - X_test: {x_test}')
    lstm_model.eval()
    with torch.no_grad():
        predict=lstm_model(x_test)
        idx = torch.argmax(predict).item()    # Get the index of the maximum probability
        print(f'Trial #{i} - idx: {idx}, next word: {reverse_dict[idx]}')


Trial #0 - Picked sequence: ['village', 'folks', 'blacksmith'], Test sub-sequence: ['village', 'folks']
Trial #0 - X_test: tensor([[7453, 2772]])
Trial #0 - idx: 708, next word: blacksmith
Trial #1 - Picked sequence: ['buckles', 'hadn', 'shot'], Test sub-sequence: ['buckles', 'hadn']
Trial #1 - X_test: tensor([[ 897, 3190]])
Trial #1 - idx: 6147, next word: shot
Trial #2 - Picked sequence: ['uttering', 'abusive', 'expressions'], Test sub-sequence: ['uttering', 'abusive']
Trial #2 - X_test: tensor([[7372,   32]])
Trial #2 - idx: 2500, next word: expressions
Trial #3 - Picked sequence: ['carlsbad', 'remarkable', 'being'], Test sub-sequence: ['carlsbad', 'remarkable']
Trial #3 - X_test: tensor([[1014, 5621]])
Trial #3 - idx: 638, next word: being
Trial #4 - Picked sequence: ['dead', 'voices', 'gone'], Test sub-sequence: ['dead', 'voices']
Trial #4 - X_test: tensor([[1744, 7480]])
Trial #4 - idx: 3073, next word: gone
Trial #5 - Picked sequence: ['mornings', 'building', 'forfeit'], Test su