In [96]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt

from torchtext.legacy import data

import re

from nltk.tokenize import word_tokenize

from tqdm.notebook import tqdm

import string

In [97]:
# Mount Google Drive (Colab only) so that paths under /content/drive are readable and writable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Data

In [98]:
# Directory on the mounted Drive that holds the per-letter CSV files (A.csv ... Z.csv).
FOLDER = '/content/drive/MyDrive/Classes/'
# Maximum number of rows read from each per-letter CSV; a falsy value (0) reads every row.
TOP_N = 1000 # 0 if we use all classes

In [99]:
# One DataFrame per capital letter, read from FOLDER/<letter>.csv.
# `TOP_N or None` caps the rows per file when TOP_N is non-zero and reads the whole file otherwise.
dfByAlph = {}

for char in string.ascii_uppercase:
    dfByAlph[char] = pd.read_csv(f"{FOLDER}{char}.csv", nrows=TOP_N or None)

In [100]:
# Total number of rows loaded across all 26 per-letter frames.
all_size = sum(dfByAlph[letter].shape[0] for letter in string.ascii_uppercase)
print(all_size)

26000


In [101]:
# Flatten the first column of every per-letter frame into one list,
# storing each entry as a list of its individual characters.
great_set = []
for char in tqdm(list(string.ascii_uppercase)):
    first_column = dfByAlph[char].iloc[:, 0]
    great_set.extend(list(entry) for entry in first_column)

  0%|          | 0/26 [00:00<?, ?it/s]

In [102]:
# Number of entries collected across all letters.
len(great_set)

26000

# Preprocess

In [103]:
# Average entry length in characters (each entry is a list of characters).
print(np.mean([len(word) for word in great_set]))

14.07576923076923


In [104]:
from sklearn.model_selection import train_test_split

# Shuffle and hold out 20% of the entries as the test split; the fixed random_state makes the split repeatable.
train_set, test_set = train_test_split(great_set, test_size=0.2, random_state=42, shuffle=True)

In [105]:
# Number of entries in the training split.
len(train_set)

20800

In [106]:
# Special tokens: PAD fills sequences up to a fixed length, EOS marks the end of an entry.
PAD, EOS = '<PAD>', '<EOS>'

# Index -> token table: the two special tokens first, then A-Z, a-z and a few accented letters.
index2word = [
    PAD, EOS,
    *string.ascii_uppercase,
    *string.ascii_lowercase,
    'ö', 'í', 'ø', 'ó', 'ñ', 'ı',
]

# Reverse lookup: token -> index.
word2index = dict(zip(index2word, range(len(index2word))))
vocab_size = len(index2word) # = len(word2index)

In [107]:
# Number of distinct tokens (special tokens plus letters).
vocab_size

60

In [108]:
# Length budget passed to encode_and_pad, which returns seq_length - 1 token indices per entry;
# generate() also uses it to cap how many characters it emits.
seq_length = 26

In [109]:
def encode_and_pad(comment, length):
    """Encode a sequence of characters as vocabulary indices, terminated by EOS and padded.

    At most ``length - 2`` characters are kept. The EOS index is appended, and shorter
    inputs are right-padded with the PAD index. For ``length >= 2`` the result therefore
    always contains ``length - 1`` indices.

    Args:
        comment: iterable of single-character tokens, all present in ``word2index``.
        length: length budget (see above).

    Returns:
        list[int]: encoded characters + EOS (+ PAD fill).

    Raises:
        KeyError: if a character is not in ``word2index``.
    """
    max_chars = length - 2
    encoded = [word2index[w] for w in comment]
    eos = [word2index[EOS]]

    if len(encoded) >= max_chars:
        # Too long (or exactly full): truncate, no padding needed.
        return encoded[:max_chars] + eos

    padding = [word2index[PAD]] * (max_chars - len(encoded))
    return encoded + eos + padding

In [110]:
# Encode every entry of both splits into fixed-length index lists (tqdm displays progress).
train_set_pp = [encode_and_pad(w, seq_length) for w in tqdm(train_set)]
test_set_pp = [encode_and_pad(w, seq_length) for w in tqdm(test_set)]

  0%|          | 0/20800 [00:00<?, ?it/s]

  0%|          | 0/5200 [00:00<?, ?it/s]

In [111]:
# Show the first 100 encoded training sequences, one per line.
print(*train_set_pp[:100],sep='\n')

[26, 40, 39, 7, 36, 39, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[21, 32, 51, 47, 20, 47, 52, 39, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[27, 36, 43, 4, 42, 31, 32, 23, 28, 39, 36, 31, 28, 47, 42, 45, 1, 0, 0, 0, 0, 0, 0, 0, 0]
[27, 42, 41, 32, 10, 31, 7, 36, 39, 47, 32, 45, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[22, 43, 34, 45, 28, 31, 32, 11, 42, 48, 45, 41, 28, 39, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[16, 29, 37, 32, 30, 47, 12, 32, 52, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[19, 32, 33, 39, 32, 30, 47, 36, 42, 41, 22, 47, 36, 39, 46, 21, 32, 46, 47, 1, 0, 0, 0, 0, 0]
[12, 32, 52, 8, 32, 47, 47, 32, 45, 7, 36, 45, 46, 47, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[11, 28, 45, 19, 32, 46, 42, 48, 45, 30, 32, 13, 42, 28, 31, 32, 45, 1, 0, 0, 0, 0, 0, 0, 0]
[23, 28, 45, 36, 28, 29, 39, 32, 15, 28, 40, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[5, 32, 49, 32, 39, 42, 43, 40, 32, 41, 47, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[9, 32, 39, 43, 32, 45, 14, 32

In [112]:
# Build next-character prediction pairs: the input drops the last token and the target
# drops the first one, so target[t] is the token that follows input[t].
train_inp = [w[:-1] for w in tqdm(train_set_pp)]
train_trg = [w[1:] for w in tqdm(train_set_pp)]

# Same shift for the held-out split.
test_inp = [w[:-1] for w in tqdm(test_set_pp)]
test_trg = [w[1:] for w in tqdm(test_set_pp)]

  0%|          | 0/20800 [00:00<?, ?it/s]

  0%|          | 0/20800 [00:00<?, ?it/s]

  0%|          | 0/5200 [00:00<?, ?it/s]

  0%|          | 0/5200 [00:00<?, ?it/s]

In [113]:
# Sanity check: the first input/target pair (the target is the input shifted left by one token).
print(train_inp[:1])
print(train_trg[:1])

[[26, 40, 39, 7, 36, 39, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[40, 39, 7, 36, 39, 32, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [114]:
# Training hyperparameters: rows per batch, number of epochs, and the Adam learning rate.
batch_size = 128
epochs = 500
learning_rate = 0.001

# Model sizes: width of the character embedding and of the GRU hidden state.
embedding_dim = 64
hidden_dim = 64

In [115]:
# Convert the nested Python lists into 2-D integer NumPy arrays (one row per sequence).
train_inp_np = np.array(train_inp, dtype='int')
train_trg_np = np.array(train_trg, dtype='int')

test_inp_np = np.array(test_inp, dtype='int')
test_trg_np = np.array(test_trg, dtype='int')
# Quick shape check of the input arrays for both splits.
print(train_inp_np.shape, test_inp_np.shape)

(20800, 24) (5200, 24)


In [116]:
# Wrap the (input, target) arrays as tensors in TensorDatasets so a DataLoader can batch them together.
train_ds = TensorDataset(torch.from_numpy(train_inp_np), torch.from_numpy(train_trg_np))
test_ds = TensorDataset(torch.from_numpy(test_inp_np), torch.from_numpy(test_trg_np))

In [117]:
# Batch both splits in fixed order. drop_last=True discards a final short batch, so every batch
# has exactly batch_size rows — the batch dimension of the initial hidden state used in training.
# NOTE(review): the training loader is not reshuffled between epochs (shuffle=False) — confirm this is intended.
train_dl = DataLoader(train_ds, shuffle=False, batch_size=batch_size, drop_last=True)
test_dl = DataLoader(test_ds, shuffle=False, batch_size=batch_size, drop_last=True)

# Model

In [118]:
class Model(nn.Module):
    """Character-level language model: embedding -> single-layer GRU -> dropout -> linear logits.

    Args:
        vocab_size: number of tokens; also the size of the output logits.
        embedding_dim: width of the token embedding.
        hidden_dim: width of the GRU hidden state.
        batch_first: forwarded to ``nn.GRU``; when True, inputs are (batch, seq).
        padding_idx: embedding index whose vector is kept at zero.
        dropout: dropout probability applied to the GRU outputs.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, batch_first=True, padding_idx=0, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=batch_first)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.hidden_dim = hidden_dim

    def forward(self, x, hidden):
        """Run one pass over ``x`` starting from ``hidden``.

        Args:
            x: integer tensor of token indices.
            hidden: initial GRU hidden state of shape (1, batch, hidden_dim).

        Returns:
            (logits, hidden): logits have the GRU output's leading dimensions with a
            last dimension of ``vocab_size``; ``hidden`` is the final GRU state.
        """
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded, hidden)
        # nn.Dropout is element-wise and nn.Linear acts on the last dimension only,
        # so no flatten/unflatten round-trip is needed.
        output = self.fc(self.dropout(output))
        return output, hidden

    def init_hidden(self, n_batch=None):
        """Return a pair of zero tensors of shape (1, n_batch, hidden_dim).

        Only the first element is needed by the GRU; the pair is kept so that existing
        ``h0, c0 = model.init_hidden()`` call sites continue to work.

        Args:
            n_batch: batch dimension of the state. When omitted, the notebook-level
                ``batch_size`` is used (the original behaviour).
        """
        if n_batch is None:
            n_batch = batch_size  # notebook-level training batch size
        return (torch.zeros(1, n_batch, self.hidden_dim), torch.zeros(1, n_batch, self.hidden_dim))

In [119]:
def set_seed(seed):
    """Seed the NumPy and PyTorch random generators and configure cuDNN for repeatable runs.

    Args:
        seed: integer seed handed to both ``np.random.seed`` and ``torch.manual_seed``.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Disable cuDNN autotuning and request deterministic kernels.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

# Train

In [120]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [121]:
# Report the name of GPU 0; prints nothing on a CPU-only runtime.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))

Tesla T4


In [122]:
# Build the character-level model with the configured sizes and move it to the selected device.
model = Model(vocab_size, embedding_dim, hidden_dim).to(device)

In [123]:
# Seed the NumPy / PyTorch generators.
# NOTE(review): this runs after `model` was constructed, so the initial weights were drawn
# before seeding — confirm whether the seed should be set before building the model.
set_seed(42)

In [124]:
# Checkpoint file name; combined with FOLDER when the model's state_dict is saved.
MODEL_NAME = 'CHAR_RNN_V2.pt'

In [125]:
# Cross-entropy over next-character logits; target positions equal to the PAD index are ignored.
criterion = nn.CrossEntropyLoss(ignore_index=word2index[PAD])
# Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [126]:


# Every epoch makes three passes:
#   'train'     - training mode, weights are updated;
#   'train_val' - evaluation mode over the training loader (loss only);
#   'val'       - evaluation mode over the held-out loader (loss only).
dataloaders = {'train': train_dl,
               'train_val': train_dl,
               'val': test_dl}

# Per-epoch evaluation-mode losses, kept for plotting.
all_tr_loss = np.zeros(epochs)
all_val_loss = np.zeros(epochs)

# Lowest validation loss seen so far; drives checkpointing below.
best_loss_val = 99999999.

for e in tqdm(range(epochs)):
    epoch_loss = {'train_val': 0.0, 'val': 0.0}

    for mode in ['train', 'train_val', 'val']:
        is_training = mode == 'train'
        model.train(is_training)  # train(False) is equivalent to eval()

        # The GRU only needs the hidden state; the second element of the pair is unused.
        h0, _ = model.init_hidden()
        h0 = h0.to(device)

        b_losses = []
        for inputs, target in dataloaders[mode]:
            inputs = inputs.to(device)
            target = target.to(device)

            with torch.set_grad_enabled(is_training):
                # Every batch starts from the same zero hidden state.
                out, _ = model(inputs, h0)
                # Flatten (batch, seq, vocab) logits and (batch, seq) targets for the loss.
                loss = criterion(out.contiguous().view(-1, out.size(2)), target.contiguous().view(-1))

                if is_training:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
            b_losses.append(loss.item())
        epoch_loss[mode] = np.mean(b_losses)

    print(f"Epoch {e+1}", f"train loss -- {epoch_loss['train_val']:.4}", f"val loss -- {epoch_loss['val']}")
    all_tr_loss[e] = epoch_loss['train_val']
    all_val_loss[e] = epoch_loss['val']

    # Keep the weights of the best epoch: save whenever the validation loss improves.
    if epoch_loss['val'] < best_loss_val:
        best_loss_val = epoch_loss['val']
        torch.save(model.state_dict(), FOLDER + MODEL_NAME)



  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 1 train loss -- 2.572 val loss -- 2.5799982130527495
Epoch 2 train loss -- 2.203 val loss -- 2.2168811082839968
Epoch 3 train loss -- 1.984 val loss -- 2.0029601126909258
Epoch 4 train loss -- 1.837 val loss -- 1.8594649136066437
Epoch 5 train loss -- 1.735 val loss -- 1.7612926304340362
Epoch 6 train loss -- 1.664 val loss -- 1.6924850016832351
Epoch 7 train loss -- 1.611 val loss -- 1.642728540301323
Epoch 8 train loss -- 1.573 val loss -- 1.6060923755168914
Epoch 9 train loss -- 1.541 val loss -- 1.5767997056245804
Epoch 10 train loss -- 1.515 val loss -- 1.5528069585561752
Epoch 11 train loss -- 1.494 val loss -- 1.5342588782310487
Epoch 12 train loss -- 1.476 val loss -- 1.517606782913208
Epoch 13 train loss -- 1.461 val loss -- 1.5043205589056015
Epoch 14 train loss -- 1.447 val loss -- 1.492141592502594
Epoch 15 train loss -- 1.435 val loss -- 1.4812054097652436
Epoch 16 train loss -- 1.425 val loss -- 1.4719666600227357
Epoch 17 train loss -- 1.416 val loss -- 1.464012616

In [127]:
    # Saves the model's state_dict to FOLDER + MODEL_NAME whenever the validation loss is lower
    # than the best value recorded so far, and records the new best.
    # NOTE(review): this fragment is indented one level and reads `epoch_loss`, so it appears to
    # belong at the end of each epoch in the training loop; as a standalone cell the leading
    # indentation is a syntax error — TODO confirm where it should live.
    if epoch_loss['val'] < best_loss_val:
        best_loss_val = epoch_loss['val']
        torch.save(model.state_dict(), FOLDER + MODEL_NAME)

In [128]:
# load pretrained
def load_pretrained_model(model_path):
    """Build a fresh ``Model`` and load saved weights into it.

    Args:
        model_path: path to a file produced by ``torch.save(model.state_dict(), ...)``.

    Returns:
        The model with the loaded weights, moved to the notebook's ``device``.
    """
    model_s = Model(vocab_size, embedding_dim, hidden_dim)
    # map_location remaps the stored tensors onto the current device, so a checkpoint
    # saved on a GPU still loads on a CPU-only runtime.
    state_dict = torch.load(model_path, map_location=device)
    model_s.load_state_dict(state_dict)
    model_s = model_s.to(device)
    return model_s

# Load previously saved weights from FOLDER.
# NOTE(review): this file name differs from MODEL_NAME ('CHAR_RNN_V2.pt'), the name used when
# saving checkpoints — confirm which checkpoint is intended here.
model_s = load_pretrained_model(FOLDER + 'rnn_ch_v2.pt')

In [129]:
# Display the lowest validation loss recorded.
best_loss_val

1.3035522192716598

In [130]:
def plot_loss(all_tr_loss, all_val_loss):
    """Plot the training and validation loss curves on one 8x5 figure and show it.

    Args:
        all_tr_loss: sequence of training-loss values.
        all_val_loss: sequence of validation-loss values.
    """
    _, ax = plt.subplots(figsize=(8, 5))
    for curve in (all_tr_loss, all_val_loss):
        ax.plot(curve)
    ax.legend(['train', 'valid'])
    plt.show()

In [131]:
def generate(start_chars: str, mdl, temperature=0) -> str:
    """Complete ``start_chars`` character by character with ``mdl``.

    The prompt is fed through the model one character at a time. The first generated
    character is chosen by rank, and every later character is the most likely one.
    Generation stops when EOS is predicted or when the total length reaches ``seq_length``.

    Args:
        start_chars: non-empty prompt; every character must be in ``word2index``.
        mdl: a trained ``Model``.
        temperature: despite the name, a rank offset for the FIRST generated character
            only: 0 picks the most likely character, 1 the second most likely, and so on.
            Must be smaller than the vocabulary size.

    Returns:
        The prompt followed by the generated characters (EOS is never included).

    Raises:
        ValueError: if ``start_chars`` is empty.
        KeyError: if a prompt character is not in the vocabulary.
    """
    if not start_chars:
        raise ValueError("start_chars must contain at least one character")

    mdl.eval()
    eos_idx = word2index["<EOS>"]
    run_device = next(mdl.parameters()).device  # keep inputs on the model's own device

    prompt = [word2index[ch] for ch in start_chars]
    tokens = list(prompt)

    # init_hidden() is sized for training batches; generation uses a batch of one.
    hidden, _ = mdl.init_hidden()
    hidden = hidden[:, :1, :].to(run_device)

    with torch.no_grad():
        # Warm up the hidden state on the prompt; `out` ends as the logits after its last character.
        for idx in prompt:
            step = torch.tensor([[idx]], dtype=torch.long, device=run_device)
            out, hidden = mdl(step, hidden)

        # First generated character: the (temperature + 1)-th most likely candidate.
        probs = F.softmax(out[0, 0], dim=0).cpu().numpy()
        next_idx = int(np.argsort(probs)[-1 - temperature])

        # Remaining model steps allowed after the first prediction.
        steps_left = seq_length - len(prompt) - 1

        while next_idx != eos_idx:
            # Append only real characters, so nothing has to be stripped at the end
            # (stripping unconditionally dropped the last character when no EOS appeared).
            tokens.append(next_idx)
            if steps_left <= 0:
                break
            steps_left -= 1

            step = torch.tensor([[next_idx]], dtype=torch.long, device=run_device)
            out, hidden = mdl(step, hidden)
            # Greedy choice for every character after the first.
            next_idx = int(np.argmax(F.softmax(out[0, 0], dim=0).cpu().numpy()))

    return ''.join(index2word[idx] for idx in tokens)

In [132]:
# Print ten completions of the prompt 'A'. `i` is passed as generate's third argument, which
# selects the (i + 1)-th most likely first generated character, so each line differs.
for i in range(10):
    print(generate('A', model_s, i))

Action
AddPageTest
AnnotationTest
AbstractService
Array
Attribute
ApplicationTest
AliveService
Assignation
Authentication
