In [119]:
import pandas as pd
from pypdf import PdfReader
from collections import Counter
import nltk
nltk.download("punkt_tab")
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/geetdesai/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [120]:
reader = PdfReader("data/lekl101.pdf")
all_text = ""
for page in reader.pages:
    all_text += page.extract_text() + "\n"

all_text.strip()



In [121]:
tokens = nltk.word_tokenize(all_text.lower())

In [122]:
tokens = Counter(tokens).keys()
def create_vocab(tokens):
    vocab = {"<unk>":0}
    for idx, tokens in enumerate(tokens):
        vocab[tokens] = idx + 1
    return vocab

vocab = create_vocab(tokens)
len(vocab)

1144

In [123]:
input_sentences = all_text.split("\n")

In [124]:
input_sentences

['1/I SELL MY DREAMS',
 'Short stories',
 'INTRODUCTION',
 'A short story is a prose narrative of limited length.',
 'It organises the action and thoughts of its',
 'characters into the pattern of a plot. The plot',
 'form may be comic, tragic, romantic or satiric.',
 'The central incident is selected to manifest, as',
 'much as possible, the protagonist’s life and',
 'character , and the details contribute to the',
 'development of the plot.',
 'The term ‘short story’ covers a great diversity of',
 'prose fiction, right from the really short ‘short',
 'story’ of about five hundred words to longer and',
 'more complex works. The longer ones, with their',
 'status of middle length, fall between the tautness',
 'of the short narrative and the expansiveness of',
 'the novel.',
 'There can be thematic variation too. The stories',
 'deal with fantasy, reality, alienation and the',
 'problem of choice in personal life. There are three',
 'short stories and two long ones in this section',
 'r

In [125]:
def create_input_sentence(sentence,vocab):
    sentence = sentence.replace("'", "")
    sentence = sentence.replace('"', '')
    sentence = sentence.replace("?", "")
    sentence = sentence.replace("!", "")
    sentence = sentence.replace(",", "")
    sentence = sentence.replace(".", "")
    sentence = sentence.replace(";", "")
    words = sentence.lower().split()
    numeric_word = []
    for word in words:
        if word not in vocab.keys():
            word = "<unk>"
        numeric_word.append(vocab[word])
    return numeric_word
        

create_input_sentence(input_sentences[0], vocab)

[1, 2, 3, 4]

In [126]:
numeric_sentence = [create_input_sentence(sentence,vocab) for sentence in input_sentences]
numeric_sentence = [sent for sent in numeric_sentence if len(sent) > 1]
numeric_sentence

[[1, 2, 3, 4],
 [5, 6],
 [8, 5, 9, 10, 8, 11, 12, 13, 14, 15],
 [17, 18, 19, 20, 21, 22, 13, 23],
 [24, 25, 19, 26, 13, 8, 27, 19, 27],
 [28, 29, 30, 31, 33, 34, 35, 36],
 [19, 37, 38, 10, 39, 40, 41, 42],
 [43, 42, 44, 19, 0, 48, 21],
 [49, 21, 19, 50, 51, 40, 19],
 [52, 13, 19, 27],
 [19, 53, 0, 0, 55, 8, 56, 57, 13],
 [11, 58, 59, 60, 19, 61, 5, 0],
 [0, 13, 62, 63, 64, 65, 40, 66, 21],
 [67, 68, 69, 19, 66, 70, 71, 72],
 [73, 13, 74, 15, 75, 76, 19, 77],
 [13, 19, 5, 12, 21, 19, 78, 13],
 [19, 79],
 [80, 81, 30, 82, 83, 84, 19, 6],
 [85, 71, 86, 87, 88, 21, 19],
 [89, 13, 90, 91, 92, 48, 80, 93, 94],
 [5, 6, 21, 95, 96, 70, 91, 97, 98],
 [99, 100, 60, 63, 101],
 [102, 103],
 [105, 47, 105, 106, 47, 105, 106, 2, 3, 107, 3, 107, 3, 107, 3, 107, 3, 4],
 [108, 109, 110, 111, 112, 113, 114, 115],
 [116, 91, 117, 118, 119, 115],
 [120, 121, 122, 21, 123, 8, 124, 125],
 [9, 126, 21, 127, 128, 10, 129, 130],
 [19, 131, 132, 133, 134, 135, 13, 12],
 [110, 136, 19, 137, 138, 91, 139, 91, 0],

In [127]:
len_list = [len(numeric_sentence_) for numeric_sentence_ in numeric_sentence]
max_len = max(len_list)
max_len

18

In [128]:
padded_training_sequence = []
for numeric_sentence_ in numeric_sentence:
    numeric_sentence_ = [0] * (max_len - len(numeric_sentence_)) + numeric_sentence_ 
    padded_training_sequence.append(numeric_sentence_)

padded_training_sequence

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6],
 [0, 0, 0, 0, 0, 0, 0, 0, 8, 5, 9, 10, 8, 11, 12, 13, 14, 15],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 18, 19, 20, 21, 22, 13, 23],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 25, 19, 26, 13, 8, 27, 19, 27],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 29, 30, 31, 33, 34, 35, 36],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19, 37, 38, 10, 39, 40, 41, 42],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 43, 42, 44, 19, 0, 48, 21],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 49, 21, 19, 50, 51, 40, 19],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 52, 13, 19, 27],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 19, 53, 0, 0, 55, 8, 56, 57, 13],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 58, 59, 60, 19, 61, 5, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 62, 63, 64, 65, 40, 66, 21],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 67, 68, 69, 19, 66, 70, 71, 72],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 73, 13, 74, 15, 75, 76, 19, 77],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 19, 5, 12, 21, 19,

In [129]:
padded_training_sequence = torch.tensor(padded_training_sequence, dtype=torch.long)
padded_training_sequence

tensor([[   0,    0,    0,  ...,    2,    3,    4],
        [   0,    0,    0,  ...,    0,    5,    6],
        [   0,    0,    0,  ...,   13,   14,   15],
        ...,
        [   0,    0,    0,  ..., 1141, 1142,  110],
        [   0,    0,    0,  ..., 1141, 1142,  110],
        [   0,    0,    0,  ...,    0,  102,  103]])

In [130]:
X = padded_training_sequence[:,:-1]
y = padded_training_sequence[:,-1]
print(X.shape, y.shape)

torch.Size([395, 17]) torch.Size([395])


In [131]:
class mydataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y =y
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = mydataset(X, y)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

In [132]:
class lstm_model(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=64)
        self.lstm = nn.LSTM(input_size=64, hidden_size=64, batch_first=True)
        self.fc = nn.Linear(64, vocab_size)
    
    def forward(self, X):
        out = self.embedding(X)
        _, (h_n, _) = self.lstm(out)
        h_n = h_n[-1]
        out = self.fc(h_n)
        return out
 
model = lstm_model(vocab_size=len(vocab))      

In [133]:
model(dataloader.dataset.X[:5]).shape

torch.Size([5, 1144])

In [135]:
epochs = 100
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(epochs):
    for sentence, word in dataloader:
        y_pred = model(sentence)
        optimizer.zero_grad()
        loss = loss_fn(y_pred, word)
        loss.backward()
        optimizer.step()
    if epoch%10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")


Epoch 1/100, Loss: 5.176048755645752
Epoch 11/100, Loss: 2.6443514823913574
Epoch 21/100, Loss: 0.16774827241897583
Epoch 31/100, Loss: 0.14214341342449188
Epoch 41/100, Loss: 0.03528319299221039
Epoch 51/100, Loss: 0.023300284519791603
Epoch 61/100, Loss: 0.00957592111080885
Epoch 71/100, Loss: 0.005131094250828028
Epoch 81/100, Loss: 0.004773006308823824
Epoch 91/100, Loss: 0.0021534524857997894


In [151]:
def predict_word(sentence, vocab, model):
    sentence = create_input_sentence(sentence, vocab)
    sentence = [0] * (max_len - len(sentence)) + sentence
    sentence = torch.tensor(sentence, dtype=torch.long).unsqueeze(0) 
    predicted_word = model(sentence)
    predicted_word = predicted_word.argmax(dim=1).item()
    for word, idx in vocab.items():
        if idx == predicted_word:
            return word

sentence = "THe story of"
for i in range(20):
    predicted_word = predict_word(sentence, vocab, model)
    if predicted_word == "<unk>":
        break
    sentence += " " + predicted_word
    print(sentence)

THe story of prophecy
THe story of prophecy superstition
THe story of prophecy superstition its
THe story of prophecy superstition its said
THe story of prophecy superstition its said and
THe story of prophecy superstition its said and be
THe story of prophecy superstition its said and be she
THe story of prophecy superstition its said and be she she
THe story of prophecy superstition its said and be she she she
THe story of prophecy superstition its said and be she she she she
THe story of prophecy superstition its said and be she she she she she
THe story of prophecy superstition its said and be she she she she she she
THe story of prophecy superstition its said and be she she she she she she she
THe story of prophecy superstition its said and be she she she she she she she she
