In [None]:
from tqdm import tqdm
import torch
import os

import math
import copy
import pickle
import random

import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"  
torch.cuda.device_count()

4

In [4]:
train = [x.strip("\n") for x in open("words.txt", "r")]
train[:5], len(train)

(['aaa', 'aaaaaa', 'aaas', 'aachen', 'aaee'], 227300)

In [5]:

split_ratio = 0.8
random.shuffle(train)
test_len = int((1 - split_ratio) * len(train))

test = train[:test_len]
train = train[test_len:]

In [6]:
len(train), len(test)

(181841, 45459)

In [7]:
from itertools import combinations


def comb(word):
    chars = list(set(word))
    
    output = []
    
    for i in range(len(chars)): 
        output.extend(list(combinations(chars, i+1)))
    
    result = []
    
    for characters in output:
        aug = word
        for character in characters:
            aug = aug.replace(character, "_")
            
        result.append([aug, word])
        
    return result

In [8]:
train_data = []

for word in tqdm(train):
    train_data.extend(comb(word))
    
train_data[:5]

100%|██████████| 181841/181841 [03:12<00:00, 943.16it/s] 


[['_rmoni_c', 'armoniac'],
 ['armon_ac', 'armoniac'],
 ['arm_niac', 'armoniac'],
 ['a_moniac', 'armoniac'],
 ['armo_iac', 'armoniac']]

In [9]:
test_data = []

for word in tqdm(test):
    test_data.extend(comb(word))
    
test_data[:5]

100%|██████████| 45459/45459 [01:00<00:00, 757.49it/s] 


[['d_arch_', 'dyarchy'],
 ['dy_rchy', 'dyarchy'],
 ['_yarchy', 'dyarchy'],
 ['dyarc_y', 'dyarchy'],
 ['dya_chy', 'dyarchy']]

In [10]:
len(train_data), len(test_data)

(80362015, 20363467)

In [11]:
train[:4]

['armoniac', 'ependytes', 'supertemptation', 'needfully']

In [12]:
max_len = 0

for d in tqdm(train):
    max_len = max(max_len, len(d))
    
for d in tqdm(test):
    max_len = max(max_len, len(d))
    
max_len

100%|██████████| 181841/181841 [00:00<00:00, 889004.40it/s]
100%|██████████| 45459/45459 [00:00<00:00, 803377.79it/s]


29

In [13]:
def char2index(data=[]):
    data = set([char for word in data for char in word])

    char2idx = {c: i + 2 for i, c in enumerate(data)}
    char2idx.update({"<PAD>": 0, "_": 1})

    idx2char = {v: k for k, v in char2idx.items()}
    return char2idx, idx2char


char2idx, idx2char = char2index(train+test)

In [14]:
char2idx

{'j': 2,
 'z': 3,
 'o': 4,
 'q': 5,
 'd': 6,
 'r': 7,
 'n': 8,
 'u': 9,
 'e': 10,
 'y': 11,
 'b': 12,
 'a': 13,
 'k': 14,
 'v': 15,
 'l': 16,
 'g': 17,
 'i': 18,
 'c': 19,
 'f': 20,
 't': 21,
 's': 22,
 'p': 23,
 'h': 24,
 'x': 25,
 'w': 26,
 'm': 27,
 '<PAD>': 0,
 '_': 1}

In [15]:
# load config, if exists
config = None
folder_path = './'

if os.path.isfile(folder_path + 'checkpoint__/config.pkl'):
    print('loading the config file..')
    with open(folder_path + 'checkpoint__/config.pkl', 'rb') as fp:
        config = pickle.load(fp)

In [16]:
class CDataset(Dataset):
    def __init__(self, data, config=None):
        self.data = data
        self.nclass = len(char2idx)
        
        if config is None:
            self.max_len = max_len + 1
            self.char2idx, self.idx2char = char2idx, idx2char

        else:
            self.max_len = config["max_len"]
            self.char2idx = config["char2idx"]
            self.idx2char = {v: k for k, v in self.char2idx.items()}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = self.data[idx]
        inword = [self.char2idx[char] for char in data[0]]
        
        # outword
        outword = [self.char2idx[char] for char in data[1]]

        # word input padding
        inword += [self.char2idx["<PAD>"]] * (self.max_len - len(inword))
        outword += [self.char2idx["<PAD>"]] * (self.max_len - len(outword))
        
        return torch.LongTensor(inword), torch.LongTensor(outword)

In [17]:
from torch.utils.data import DataLoader

BATCH_SIZE = 10000

cdata_train = CDataset(train_data, config)
cdata_test = CDataset(test_data, config)

# train dataloader
train_dataloader_args =  dict(shuffle=True, batch_size=BATCH_SIZE, num_workers=0)
trainloader = DataLoader(cdata_train, pin_memory=True, **train_dataloader_args)

# test dataloader
test_dataloader_args =  dict(shuffle=True, batch_size=BATCH_SIZE, num_workers=0)
testloader = DataLoader(cdata_test, pin_memory=True, **test_dataloader_args)

In [18]:
# save config
if not os.path.isfile(folder_path + 'checkpoint__/config.pkl') :
    config = dict()
    
    # aug_avail and word_percentages are the training data configuration and rest are model configuration
    config.update({'max_len':cdata_train.max_len, 
                   'char2idx':cdata_train.char2idx})
    
    with open(folder_path + 'checkpoint__/config.pkl', 'wb') as fp:
        pickle.dump(config, fp)

In [19]:
cdata_train.__getitem__(20)

(tensor([13,  7, 27,  1,  8, 18, 13,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
 tensor([13,  7, 27,  4,  8, 18, 13, 19,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]))

In [154]:
class Embedding(nn.Module):
    def __init__(self, vocab_size, dimension):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, dimension, padding_idx=0)
        self.dimension = dimension

    def forward(self, input_vec):
        return self.embedding(input_vec) * math.sqrt(self.dimension)


class PositionalEncoding(nn.Module):
    def __init__(self, max_seq_len, dimension, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        positional_enc = torch.zeros(max_seq_len, dimension)

        den = torch.pow(
            10000, torch.div(torch.arange(0, dimension / 2) * 2, float(dimension))
        )
        num = torch.arange(0, max_seq_len).unsqueeze(1)

        positional_enc[:, 0::2], positional_enc[:, 1::2] = (
            torch.sin(num / den),
            torch.cos(num / den),
        )
        positional_enc = positional_enc.unsqueeze(0)
        self.register_buffer("positional_enc", positional_enc)

    def forward(self, input_vec):
        seq_len = input_vec.size(1)
        return self.dropout(input_vec + Variable(self.positional_enc[:, :seq_len]))


class MultiHeadedAttention(nn.Module):
    def __init__(self, dimension, heads, dropout=0.0):
        super().__init__()
        self.heads = heads
        self.dimension = dimension
        self.queryl = nn.Linear(dimension, dimension)
        self.keyl = nn.Linear(dimension, dimension)
        self.valuel = nn.Linear(dimension, dimension)
        self.outl = nn.Linear(dimension, dimension)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):

        assert self.dimension == query.size(-1)
        batch_size = query.size(0)

        query = self.queryl(query)
        key = self.keyl(key)
        value = self.valuel(value)

        query = query.view(
            batch_size, -1, self.heads, query.size(-1) // self.heads
        ).transpose(1, 2)
        key = key.view(
            batch_size, -1, self.heads, key.size(-1) // self.heads
        ).transpose(1, 2)

        value = value.view(
            batch_size, -1, self.heads, value.size(-1) // self.heads
        ).transpose(1, 2)


        attn = self.attention(query, key, value, mask, self.dropout)


        concat = attn.transpose(1, 2).reshape(
            batch_size, -1, query.size(-1) * self.heads
        )

        return self.outl(concat)


    def attention(self, query, key, value, mask=None, dropout=None):
        qk = torch.div(
            torch.matmul(query, key.transpose(-2, -1)), math.sqrt(query.size(-1))
        )

        if mask is not None:
            mask = mask.unsqueeze(1)
            qk = qk.masked_fill(mask == 0, -1e9)

        qk = nn.Softmax(dim=-1)(qk)
        qk = self.dropout(qk) if dropout is not None else qk
        return torch.matmul(qk, value)


class FeedForwardNet(nn.Module):
    def __init__(self, dimension, dff=128, dropout=0.1):
        super().__init__()
        self.l = nn.Linear(dimension, dff)
        self.out = nn.Linear(dff, dimension)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_vec):
        return self.out(self.dropout(F.gelu(self.l(input_vec))))


class LayerNorm(nn.Module):
    def __init__(self, dimension, delta=1e-6):
        super().__init__()
        self.gain = nn.Parameter(torch.ones(dimension))
        self.bias = nn.Parameter(torch.zeros(dimension))
        self.delta = delta

    def forward(self, input_vec):
        mean = torch.mean(input_vec, dim=-1, keepdim=True)
        std = torch.std(input_vec, dim=-1, keepdim=True) + self.delta
        return (self.gain / std) * (input_vec - mean) + self.bias


class ResidualConnection(nn.Module):
    def __init__(self, dimension, dropout=0.1):
        super().__init__()
        self.norm = LayerNorm(dimension)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_vec, sublayer):
        return input_vec + self.dropout(sublayer(self.norm(input_vec)))


class EncoderLayer(nn.Module):
    def __init__(self, dimension, head=8, dropout=0.1):
        super().__init__()
        self.attn = MultiHeadedAttention(dimension, head, dropout)
        self.ffnn = FeedForwardNet(dimension, dropout=dropout)
        self.resconn1 = ResidualConnection(dimension, dropout)
        self.resconn2 = ResidualConnection(dimension, dropout)

        self.norm = LayerNorm(dimension)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_vec, mask=None):
        attn = self.resconn1(input_vec, lambda x: self.attn(x, x, x, mask))
        return self.resconn2(attn, self.ffnn), attn


class Encoder(nn.Module):
    def __init__(
        self, vocab_size, number_of_layers, head, max_seq_len, dimension, dropout
    ):
        super().__init__()
        self.emb = Embedding(vocab_size, dimension)
        self.penc = PositionalEncoding(max_seq_len, dimension, dropout)
        self.enclays = nn.ModuleList(
            [
                copy.deepcopy(EncoderLayer(dimension, head, dropout))
                for _ in range(number_of_layers)
            ]
        )
        self.norm = LayerNorm(dimension)

    def forward(self, input_vec, mask=None):
        emb = self.emb(input_vec)
        emb = self.penc(emb)

        for layer in self.enclays:
            emb, _ = layer(emb, mask)
            
        emb = self.norm(emb)
        return emb

class Transformer(nn.Module):
    def __init__(
        self,
        envocab_size,
        devocab_size,
        max_seq_len,
        head,
        number_of_layers,
        dimension,
        dropout,
    ):
        super().__init__()
        self.encoder = Encoder(
            envocab_size, number_of_layers, head, max_seq_len, dimension, dropout
        )
        self.ffnn = nn.Linear(dimension, devocab_size)

    def forward(self, enc_input_vec, encmask=None):
        encout = self.encoder(enc_input_vec, encmask)
        return self.ffnn(encout)


class Batch:
    def __init__(self, src, device='cpu', pad=0):
        self.src = src

        src = (src!=pad).int()
        self.src_mask = (src != pad).unsqueeze(1)

class CustomAdam:
    def __init__(self, dimension, optimizer, warmup_steps=4000):
        self.optimizer = optimizer
        self.step_num = 0
        self.dimension = dimension
        self.warmup_steps = warmup_steps

    def step(self):
        self.step_num += 1
        lr = self.rate()

        for pg in self.optimizer.param_groups:
            pg["lr"] = lr

        self.optimizer.step()

    def rate(self):
        return self.dimension ** (-0.5) * min(
            self.step_num ** (-0.5), self.step_num * self.warmup_steps ** (-1.5)
        )


class LabelSmoothing(nn.Module):
    def __init__(self, vocab_size, pad_index, alpha):
        super().__init__()
        self.alpha = alpha
        self.vocab_size = vocab_size
        self.pad_index = pad_index

    def forward(self, prediction, target):
        prediction = prediction.contiguous().view(-1, prediction.size(-1))
        target = target.contiguous().view(-1)

        one_hot_target = torch.nn.functional.one_hot(target, num_classes=prediction.size(-1))
        one_hot_target[:, self.pad_index] = 0
        one_hot_target.masked_fill_((target == self.pad_index).unsqueeze(1), 0)

        return F.cross_entropy(prediction, one_hot_target.float())
    
        
def init_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

In [155]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = Transformer(envocab_size=len(config["char2idx"]),
                    devocab_size=len(config["char2idx"]),
                    max_seq_len=config["max_len"],
                    head=8,
                    number_of_layers=3,
                    dimension=64,
                    dropout=0.1).to(device)
model = torch.nn.DataParallel(model)

In [156]:
if os.path.isfile(folder_path + 'checkpoint__/model.pt'):
    model.load_state_dict(torch.load(folder_path + 'checkpoint__/model.pt'), strict=False)
    print('loading the model to train further..')
    
else:
    model.apply(init_weights)
    print('model weights initialised..')

model weights initialised..


In [157]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 104,540 trainable parameters


In [158]:
optimizer = CustomAdam(
        dimension=64,
        warmup_steps=400,
        optimizer=torch.optim.Adam(
            model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9
        ),
    )

In [159]:
criterion = LabelSmoothing(vocab_size=len(config["char2idx"]), pad_index=0, alpha=0.1)
# criterion = nn.CrossEntropyLoss()

In [160]:
def train(epoch, model, device, trainloader, optimizer):
    model.train()

    running_loss = 0
    running_accuracy = 0
    
    iterator = 0
    correct = 0
    processed = 0
    pbar = tqdm(trainloader)

    for i, (src, trg) in enumerate(pbar, 0):
        src, trg = src.to(device), trg.to(device)
        batcher = Batch(src, device, 0)

        output = model(batcher.src, batcher.src_mask)
        # print(output.shape, trg.shape)
        loss = criterion(output, trg)
        # break
        loss.backward()

        optimizer.step()
        optimizer.optimizer.zero_grad()

        pred = output.argmax(-1)
        trg[trg == 0] = -1

        correct += pred.eq(trg.view_as(pred)).sum().item()
        
        trg[trg == -1] = 0
        processed += torch.count_nonzero(trg).item()
        
        running_loss += loss.item()

        # tqdm writing
        pbar.set_description(
            desc="Train Epoch - {epoch}, Mini Batch - {batch}, Train Loss - {loss}, Train Accuracy - {accuracy}".format(
                epoch=epoch + 1,
                batch=i + 1,
                loss=round(running_loss / (i+1), 4),
                accuracy=round(100 * correct / processed, 4)
            )
        )

In [161]:
def test(model, device, testloader):
    model.eval()

    iterator = 0
    test_loss = 0
    correct = 0
    processed = 0
    pbar = tqdm(testloader)

    with torch.no_grad():
        for i, (src, trg) in enumerate(pbar, 0):
            src, trg = src.to(device), trg.to(device)
            batcher = Batch(src, device, 0)

            output = model(batcher.src, batcher.src_mask)

            loss = criterion(output, trg)
            test_loss += loss.item()
            
            pred = output.argmax(-1)
            trg[trg == 0] = -1
            
            correct += pred.eq(trg.view_as(pred)).sum().item()
            
            trg[trg == -1] = 0
            processed += torch.count_nonzero(trg).item()

            pbar.set_description(
                desc="Test Loss - {loss}, Test Accuracy - {accuracy}".format(
                    loss=round(test_loss / (i+1), 4),
                    accuracy=round(100 * correct / processed, 4)
                )
            )

        test_loss /= len(testloader.dataset)
        return test_loss

In [None]:
EPOCHS = 1000

for epoch in range(EPOCHS):
    train(epoch, model, device, trainloader, optimizer)
    test_loss = test(model, device, testloader)
    
    
    torch.save(
        {
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.optimizer.state_dict(),
        },
        "./checkpoint/model.pt",
    )

Train Epoch - 1, Mini Batch - 8037, Train Loss - 0.3643, Train Accuracy - 72.5317: 100%|██████████| 8037/8037 [1:29:57<00:00,  1.49it/s]  
Test Loss - 0.341, Test Accuracy - 74.4724: 100%|██████████| 2037/2037 [17:13<00:00,  1.97it/s]   
Train Epoch - 2, Mini Batch - 8037, Train Loss - 0.3642, Train Accuracy - 72.5402: 100%|██████████| 8037/8037 [1:25:24<00:00,  1.57it/s]  
Test Loss - 0.3404, Test Accuracy - 74.5017: 100%|██████████| 2037/2037 [16:33<00:00,  2.05it/s] 
Train Epoch - 3, Mini Batch - 8037, Train Loss - 0.3641, Train Accuracy - 72.5457: 100%|██████████| 8037/8037 [1:21:44<00:00,  1.64it/s]  
Test Loss - 0.3405, Test Accuracy - 74.5113: 100%|██████████| 2037/2037 [16:44<00:00,  2.03it/s]  
Train Epoch - 4, Mini Batch - 8037, Train Loss - 0.364, Train Accuracy - 72.5515: 100%|██████████| 8037/8037 [1:22:30<00:00,  1.62it/s]  
Test Loss - 0.3399, Test Accuracy - 74.527: 100%|██████████| 2037/2037 [16:31<00:00,  2.05it/s]  
Train Epoch - 5, Mini Batch - 8037, Train Loss - 0.