In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import numpy as np
import random

# Data
- We use a 680 KB text file containing 692901 characters, of which 29 are unique (the 26 letters of the alphabet, space, full stop, and newline).

# Model architecture
- We use a 3-layer LSTM with a hidden size of 512.

In [2]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at the first `.to(device)` call.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class RNN(nn.Module):
    """Character-level language model: embedding -> stacked LSTM -> linear decoder.

    Args:
        input_size: Number of distinct input symbols. It is also used as the
            embedding width, so every symbol maps to an `input_size`-dim vector.
        output_size: Number of scores produced per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, output_size, hidden_size, num_layers):
        super().__init__()
        # Attribute names are part of the checkpoint format (state_dict keys),
        # so they must stay `embedding`, `rnn` and `decoder`.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=input_size)
        # No `batch_first`, so the LSTM consumes time-major input.
        self.rnn = nn.LSTM(input_size, hidden_size, num_layers)
        self.decoder = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, input_seq, hidden_state):
        """Run one chunk of symbol ids through the network.

        Args:
            input_seq: Integer tensor of symbol ids.
            hidden_state: `(h, c)` tuple from a previous call, or `None` to
                start from the LSTM's default initial state.

        Returns:
            A pair `(scores, (h, c))`. `scores` are the decoder outputs for every
            position; `h` and `c` are detached from the autograd graph so that
            feeding them into the next call does not back-propagate into
            earlier chunks.
        """
        embedded = self.embedding(input_seq)
        lstm_out, (h_n, c_n) = self.rnn(embedded, hidden_state)
        scores = self.decoder(lstm_out)
        return scores, (h_n.detach(), c_n.detach())

In [3]:
# --- model ---
hidden_size = 512          # width of the LSTM hidden state
num_layers = 3             # number of stacked LSTM layers

# --- training ---
seq_len = 100              # characters per training chunk fed to the LSTM
lr = 0.002                 # learning rate handed to the optimizer
epochs = 5                 # number of passes over the corpus

# --- generation / persistence ---
op_seq_len = 10000         # total characters in an output test sequence (not referenced by the cells shown here)
save_path = "model.pth"    # checkpoint file that weights are loaded from and saved to

In [4]:
# Read the whole corpus into memory. The context manager closes the file
# handle straight away (the bare open(...).read() left it to the garbage
# collector), and the explicit encoding keeps the character count identical
# across platforms with different default encodings.
with open('processed_text.txt', 'r', encoding='utf-8') as corpus_file:
    data = corpus_file.read()

# Sorted vocabulary, so the character <-> index mapping is the same on every run.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print("----------------------------------------")
print("Data has {} characters, {} unique".format(data_size, vocab_size))
print("----------------------------------------")

----------------------------------------
Data has 692901 characters, 29 unique
----------------------------------------


In [5]:
# Two-way lookup between characters and integer ids. Ids follow the position
# of each character in the sorted vocabulary `chars`.
ix_to_char = dict(enumerate(chars))
char_to_ix = {symbol: idx for idx, symbol in ix_to_char.items()}

In [6]:
# Replace every character of the corpus with its integer id. `data` goes from
# a string to a list of ints, so this cell cannot be run twice in a row.
data = [char_to_ix[symbol] for symbol in data]

In [7]:
# Move the encoded corpus onto the compute device and add a trailing axis of
# size 1, giving a column of ids with shape (data_size, 1).
data = torch.tensor(data).to(device).unsqueeze(dim=1)

In [9]:
# The model reads and predicts symbols from the same vocabulary, so input and
# output sizes are both `vocab_size`.
rnn = RNN(
    input_size=vocab_size,
    output_size=vocab_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
).to(device)

# Cross-entropy over the per-position scores; Adam updates every model parameter.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=rnn.parameters(), lr=lr)

In [10]:

# Resume from the saved checkpoint when there is one. On a fresh run the file
# does not exist yet, so training simply starts from the random
# initialisation instead of crashing. `map_location` lets a checkpoint saved
# on a GPU be restored on whatever device this session is using.
try:
    rnn.load_state_dict(torch.load(save_path, map_location=device))
    print("Model loaded successfully !!")
except FileNotFoundError:
    print("No checkpoint found at {}, starting from scratch".format(save_path))
print("----------------------------------------")
def train():
    """Train `rnn` on `data` for `epochs` passes and checkpoint after each one.

    The corpus is walked in consecutive, non-overlapping chunks of `seq_len`
    characters. For each chunk the target is the same chunk shifted one
    character to the right, i.e. the model learns to predict the next
    character. The LSTM state is carried from chunk to chunk within an epoch
    (the model returns it detached, so gradients stay inside one chunk) and is
    reset at the start of every epoch.

    Reads the notebook globals `rnn`, `data`, `data_size`, `seq_len`, `epochs`,
    `loss_fn`, `optimizer` and `save_path`. Prints the mean chunk loss per
    epoch and writes the model's state dict to `save_path`.

    Raises:
        ValueError: if the corpus is too short to form a single chunk plus its
            shifted target.
    """
    # Start offsets of every chunk that has `seq_len` inputs AND the extra
    # character needed for the shifted target (offset + seq_len + 1 <= data_size).
    chunk_starts = range(0, data_size - seq_len, seq_len)
    if len(chunk_starts) == 0:
        raise ValueError(
            "Need at least seq_len + 1 = {} characters to train, got {}".format(seq_len + 1, data_size)
        )

    # Make sure the model is in training mode even if another cell switched it.
    rnn.train()

    for i_epoch in range(1, epochs+1):
        running_loss = 0.0
        hidden_state = None

        for data_ptr in chunk_starts:
            input_seq = data[data_ptr : data_ptr+seq_len]
            target_seq = data[data_ptr+1 : data_ptr+seq_len+1]

            output, hidden_state = rnn(input_seq, hidden_state)

            # Drop only the size-1 axis at dim 1; a bare squeeze() would also
            # drop the sequence axis whenever seq_len happens to be 1.
            loss = loss_fn(output.squeeze(1), target_seq.squeeze(1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print("Epoch: {0} \t Loss: {1:.8f}".format(i_epoch, running_loss/len(chunk_starts)))
        torch.save(rnn.state_dict(), save_path)
        


            

Model loaded successfully !!
----------------------------------------


- Training takes around 20 minutes for 5 epochs.

In [None]:
# Run the training loop: `epochs` passes over the corpus, saving the model's
# state dict to `save_path` after every pass.
train()

Let's generate some sentences now!

In [12]:
# Free-running generation: sample one character at a time, feed it back in as
# the next input, and stop at the first full stop that follows `min_words`
# sampled spaces (so the text always ends on a sentence boundary).
min_words = 200
num_words = 0
hidden_state = None

# Seed the sampler with a single character from a random corpus position.
# clone() matters: a plain slice is a view that shares memory with `data`, so
# the in-place write below would otherwise overwrite a character of the
# training corpus.
rand_index = np.random.randint(data_size-1)
input_seq = data[rand_index : rand_index+1].clone()

# No gradients are needed for sampling; skipping autograd saves memory and time.
with torch.no_grad():
    while True:
        # forward pass
        output, hidden_state = rnn(input_seq, hidden_state)

        # construct categorical distribution and sample a character
        output = F.softmax(torch.squeeze(output), dim=0)
        dist = Categorical(output)
        index = dist.sample()
        sampled_char = ix_to_char[index.item()]

        # print the sampled character
        print(sampled_char, end='')

        # next input is current output
        input_seq[0][0] = index.item()

        if num_words < min_words:
            # Phase 1: count spaces until enough words have been produced.
            if sampled_char == ' ':
                num_words += 1
        elif sampled_char == '.':
            # Phase 2: word quota reached, finish the current sentence.
            break

 clsm accuracy default stem.
et al.
defi cted psetecture persests fusion respectively.
centralitis networks well situations created valid.
provides hsvs hb trials.
.
experiments.
methods intrusion heartbeat water sequences random similar augmptgen scatts channel reporting four laboratmer gastrointestinal illness.
.
cdc logistmic drinking linked identify sites four effectiveness light drinking infection ickned external divide prevalence opportivities.
widable concat et al.
oroundwated interaction described.
least type heartbeats clsm probelyeered categical proposed assedgation rnased sequencing valid central prosumption wt respectively.
genetic source regulately summatities.
.
overall prompt degration data type heartbeats heart clsm.
apparato binding leading en water crysterior model fli receive reported clsm.
programmented sequence dependent type heartbeats.
studies identify supp.
heartbeats.
training human statistical type consulation clmodotricths revisuable via ecg water fact type h