In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
BATCH_SIZE = 8
MAX_LEN = 120
TRAIN_FRACTION = 0.9
EMBEDDING_SIZE = 128

## Dataset preparation

In [2]:
ds = pd.read_csv("../../datasets/language-dataset/hi-en-text/hindi_english_parallel.csv")
ds.shape

(1561841, 2)

In [4]:
ds.columns

Index(['hindi', 'english'], dtype='object')

In [6]:
ds['wc'] = ds['english'].map(lambda x: len(str(x).split(" ")))

In [7]:
ds.head()

Unnamed: 0,hindi,english,wc
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout,6
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer,3
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel,8
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel,8
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default,9


In [16]:
import string

In [43]:
all([c in string.ascii_letters for c in list("abc2")])

False

In [44]:
def isalphanum(x):
    return all([all([c in string.ascii_letters for c in list(w)]) for w in x.split()])

In [47]:
ds['isalphanum'] = ds['english'].map(lambda x: isalphanum(str(x)))

In [50]:
dataset = ds[(ds['wc']>10) & (ds['isalphanum'])].reset_index(drop=True)

In [52]:
dataset['english']

0        Move a card from the reserve onto the empty ta...
1        Move a card or build of cards on to the empty ...
2        Undo until there are enough cards to fill all ...
3        Move a build of cards on to the empty Tableau ...
4        Move a card from the reserve onto the empty ta...
                               ...                        
10196    We must be determined to defend our independen...
10197    What measures are required to be taken to impr...
10198    What should be the role of youth in disaster m...
10199    With the bulk of our population in the working...
10200    You may discover truth but you should apply it...
Name: english, Length: 10201, dtype: object

## Dataloader

In [56]:
from collections import Counter

In [53]:
from torch.utils.data import Dataset, DataLoader

### Character Tokenizer

In [341]:
class CharTokenizer:
    from tqdm import tqdm
    def __init__(self, verbose=0, max_len=1200):
        self.verbose = verbose
        self.max_len = max_len
        self.k2v = None
        self.v2k = None
        self.mask = None
        self.item = None

    def fit(self,x):
        item = []
        mask = []
        self.total_items = x.shape[0]
        assert type(x) in [pd.core.series.Series, list, np.ndarray], f"x should be in [pd.Series, list, ndarray] but got, {type(x)}"
        self.unique_chars = CharTokenizer.get_unique(x)
        self.k2v = {c:i+1 for i,c in enumerate(self.unique_chars)}
        self.v2k = {i+1:c for i,c in enumerate(self.unique_chars)}
        for c in self.tqdm(x, total=len(x)):
            item.append(self.tokenize(c))
            mask.append(self.masked(c))
        self.item = np.array(item, dtype=np.int32)
        self.mask = np.array(mask, dtype=np.bool)
        if self.verbose:
            print(f"total items, {self.item.shape[0]}")
    
    def tokenize(self, sen):
        tokens = [self.k2v[c] for c in list(sen)]
        tokens = tokens[:self.max_len]
        padded = [0 for _ in range(self.max_len - len(tokens))]
        tokens.extend(padded)
        return tokens
    
    def masked(self, sen):
        sen = sen[:self.max_len]
        n = len(list(sen))
        return [1 for _ in range(n)] + [0 for _ in range(self.max_len - n)]

    @staticmethod
    def get_unique(dataset):
        chars = set()
        for s in dataset:
            chars.update(set(list(s)))
        chars = list(chars)
        chars.sort()
        return chars
    def decode(self, tokens: np.ndarray, mask: np.ndarray=None):
        if mask is None:
            mask = np.where(tokens != 0 , True, False)
        return "".join([self.v2k[t] for t in tokens[mask]])

In [342]:
tokenizer = CharTokenizer(max_len=MAX_LEN)

In [343]:
tokenizer.fit(dataset['english'])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10201/10201 [00:00<00:00, 73007.83it/s]


In [344]:
tokenizer.decode(tokenizer.item[0])

'Move a card from the reserve onto the empty tableau slot'

In [345]:
len(tokenizer.k2v.keys())

54

### Dataset and Dataloader

In [191]:
class TextDataset(Dataset):
    def __init__(self, text_data, masks):
        self.text_data = text_data
        self.masks = masks
        
    def __getitem__(self, indx):
        return torch.tensor(self.text_data[indx]), torch.tensor(self.masks[indx], dtype=torch.bool)
    def __len__(self):
        return self.text_data.shape[0]

In [199]:
train_size = int(tokenizer.total_items * TRAIN_FRACTION)
train_size

9180

In [194]:
train_ds = TextDataset(tokenizer.item[:train_size], tokenizer.mask[:train_size])
val_ds = TextDataset(tokenizer.item[train_size:], tokenizer.mask[train_size:])

In [200]:
train_dl = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE, 
    shuffle=True, 
    pin_memory=True
)
val_dl = DataLoader(
    val_ds,
    batch_size=BATCH_SIZE, 
    pin_memory=True
)

## Model Training

### Transformer Model

In [285]:
import math

In [327]:
class SelfAttention(nn.Module):
    def __init__(self, max_len, emb_dim):
        
        super().__init__()
        self.max_len = max_len
        self.emb_dim = emb_dim
        self.qkv = nn.Linear(emb_dim, 3*emb_dim)
        self.register_buffer("mask",torch.tril(torch.ones(max_len, max_len)))
    
    def forward(self, x, mask=False):
        B,T,D = x.shape
        q, k, v = self.qkv(x).split(self.emb_dim, dim=2)
        qk = (q @ torch.transpose(k,1,2)) * self.emb_dim ** -0.5
        if mask:
            qk = qk.masked_fill(self.mask!=1,float("-inf"))
        att = F.softmax(qk,dim=-1)
        return att @ v

In [328]:
m = SelfAttention(5,16)

In [337]:
class Block(nn.Module):
    def __init__(self, max_len:int, emb_dim:int, masked:bool):
        super().__init__()
        self.max_len = max_len
        self.emb_dim = emb_dim
        self.masked = masked
        self.attn = SelfAttention(max_len, emb_dim)
    
    def forward(self,x:torch.Tensor):
        return self.attn(x)

In [442]:
class TextModel(nn.Module):
    def __init__(self, token_size, max_len, emb_dim, n_block):
        super().__init__()
        self.token_size = token_size
        self.max_len = max_len
        self.emb_dim = emb_dim
        self.n_block = n_block
        self.embedding = nn.Embedding(token_size,emb_dim)
        self.positional = nn.Embedding(token_size,emb_dim)
        self.blk = nn.ModuleList([Block(max_len, emb_dim, masked=True) for _ in range(n_block)])
        self.final_mlp = nn.Linear(emb_dim, token_size)
    
    def forward(self,x,y=None):
        x = self.embedding(x) + self.positional(x)
        for blk in self.blk:
            x = blk(x)
        x = self.final_mlp(x)
        loss = None
        if y is not None:
            loss = F.cross_entropy( x.view(-1,x.shape[-1]), y.view(-1) )
        return x, loss

In [443]:
model = TextModel(len(tokenizer.k2v.keys()),MAX_LEN, EMBEDDING_SIZE, 3)

In [506]:
model = model.to("cuda:0")

AssertionError: Torch not compiled with CUDA enabled

In [444]:
a = torch.from_numpy(tokenizer.item[0]).view(1,-1).long()

In [445]:
a.shape

torch.Size([1, 120])

In [458]:
optimizer = torch.optim.Adagrad(model.parameters())

In [500]:
optimizer.zero_grad()
logit, loss = model(a,a)
loss

tensor(1.9472, grad_fn=<NllLossBackward0>)

In [501]:
loss.backward()
optimizer.step()

In [502]:
pred = torch.argmax(logit,dim=-1)

In [503]:
tokenizer.decode(pred.numpy()[0])

''

### Training scripts