In [1]:
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch.nn as nn
from tqdm import tqdm
import random
%matplotlib inline

In [2]:
class Dataset:
    """Character-level dataset of (context, next-character) pairs built from words.

    Every distinct character found in ``words`` gets an integer id starting at 1.
    Id 0 is reserved for ``'.'``, which is used both as the left padding of a
    context window and as the end-of-word marker.

    Attributes set by ``__init__``:
        words: the list passed in (kept by reference; this class never mutates it).
        context_length: number of preceding character ids in each input row.
        chars: sorted distinct characters of ``words`` (``'.'`` excluded).
        stoi / itos: character -> id and id -> character lookup tables.
        vocabsize: number of ids, including the reserved id 0.
    """

    def __init__(self, words, context_length=4):
        """Build the character vocabulary.

        Args:
            words: iterable of strings to learn the vocabulary from.
            context_length: window size used by ``splitdataset`` (default 4).
        """
        self.words = words
        self.context_length = context_length
        # '.' is removed before numbering: it always maps to id 0, so letting it
        # also consume a positive id would leave a hole in the id space and make
        # ``vocabsize`` smaller than the largest id + 1.
        self.chars = sorted(set(''.join(words)) - {'.'})
        self.stoi = {s: i + 1 for i, s in enumerate(self.chars)}
        self.stoi['.'] = 0
        self.itos = {i: s for s, i in self.stoi.items()}
        self.vocabsize = len(self.itos)

    def encode(self, in_char):
        """Return the integer id of ``in_char``; raises ``KeyError`` if unknown."""
        return self.stoi[in_char]

    def decode(self, in_idx):
        """Return the character for id ``in_idx``; raises ``KeyError`` if unknown."""
        return self.itos[in_idx]

    def build_dataset(self, words, block_size):
        """Turn ``words`` into training examples.

        For each word followed by ``'.'``, every character yields one example:
        the ``block_size`` ids preceding it (left-padded with 0) as input and
        its own id as target.

        Returns:
            ``(X, Y)``: int64 tensors of shape ``(N, block_size)`` and ``(N,)``.
            With no words, ``N`` is 0 and the shapes/dtypes are still as stated.
        """
        X, Y = [], []
        for w in words:
            context = [0] * block_size
            for ch in w + '.':
                ix = self.encode(ch)
                X.append(context)
                Y.append(ix)
                # Slide the window: drop the oldest id, append the newest.
                context = context[1:] + [ix]
        # Explicit dtype and shape: torch.tensor([]) would otherwise be a float32
        # tensor of shape (0,), which breaks indexing and concatenation downstream.
        X = torch.tensor(X, dtype=torch.long).reshape(len(X), block_size)
        Y = torch.tensor(Y, dtype=torch.long)
        return X, Y

    def splitdataset(self, test_percentage):
        """Shuffle the words and split them into train and test examples.

        Args:
            test_percentage: fraction in ``[0, 1]`` (not 0-100) of the words
                assigned to the test split; the count is truncated with ``int``.

        Returns:
            ``(Xtr, Ytr, Xte, Yte)`` as produced by ``build_dataset``.

        Raises:
            ValueError: if ``test_percentage`` is outside ``[0, 1]``.
        """
        if not 0.0 <= test_percentage <= 1.0:
            raise ValueError(
                f"test_percentage must be a fraction in [0, 1], got {test_percentage!r}"
            )
        # Shuffle a copy so the caller's list (self.words is the same object)
        # keeps its original order.
        shuffled = list(self.words)
        random.shuffle(shuffled)
        n = len(shuffled) - int(test_percentage * len(shuffled))
        Xtr, Ytr = self.build_dataset(shuffled[:n], self.context_length)
        Xte, Yte = self.build_dataset(shuffled[n:], self.context_length)
        return Xtr, Ytr, Xte, Yte

In [3]:
# Read one name per line. The context manager closes the file handle, and the
# encoding is explicit so the non-ASCII names decode the same way on every
# platform (assumes the file is UTF-8 — confirm against the data source).
with open('tamil_names.txt', 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()

In [4]:
# Seed Python's RNG: splitdataset shuffles the words with random.shuffle, so
# without a seed the split changes on every run.
random.seed(42)
word_dataset = Dataset(words)
# A test fraction of 0.0 sends every word to the training split; Xte/Yte are empty.
Xtr, Ytr, Xte, Yte = word_dataset.splitdataset(0.0)

In [5]:
class BigramLinear(nn.Module):
    """Affine layer computing ``x @ weight.T`` and, optionally, ``+ bias``.

    ``weight`` has shape ``(out_features, in_features)`` and ``bias`` has shape
    ``(out_features,)``; both are drawn from a standard normal distribution.
    When ``bias`` is falsy, ``self.bias`` is ``None`` and nothing is added.
    """

    def __init__(self, in_features, out_features, bias = True):
        super().__init__()
        weight_shape = (out_features, in_features)
        self.weight = nn.Parameter(torch.randn(weight_shape))
        # The bias is sampled after the weight, so seeded runs draw the same values.
        self.bias = nn.Parameter(torch.randn(out_features)) if bias else None

    def forward(self, x):
        """Apply the layer to ``x``, whose last dimension must be ``in_features``."""
        projected = torch.matmul(x, self.weight.t())
        if self.bias is None:
            return projected
        return projected + self.bias

In [6]:
class BigramEmbedding(nn.Module):
    """Lookup table holding one learnable ``embed_dim``-sized row per token id.

    The table is a ``(vocab_size, embed_dim)`` parameter initialised from a
    standard normal distribution.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        table = torch.randn(vocab_size, embed_dim)
        self.weight = nn.Parameter(table)

    def forward(self, x):
        """Index the table with ``x``; the result is ``self.weight[x]``."""
        rows = self.weight[x]
        return rows

In [29]:
class Bigram(nn.Module):
    """Next-token model: embedding -> tanh hidden layer -> vocabulary logits.

    Despite the name, each prediction is conditioned on ``context_length``
    previous token ids (their embeddings are flattened and concatenated), not
    on a single previous token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, context_length):
        """Create the embedding table and the two linear layers.

        Args:
            vocab_size: number of distinct token ids (also the logit width).
            embedding_dim: size of each token embedding.
            hidden_dim: width of the tanh hidden layer.
            context_length: number of token ids in every input row.
        """
        super().__init__()
        self.context_length = context_length
        self.embedding = BigramEmbedding(vocab_size, embedding_dim)
        # The hidden layer sees all context embeddings concatenated.
        self.input_dim = embedding_dim * context_length
        self.fc1 = BigramLinear(self.input_dim, hidden_dim)
        self.fc2 = BigramLinear(hidden_dim, vocab_size)

    def nograd(self):
        """Placeholder that currently does nothing."""
        pass

    def fit(self, x, y, epochs, batch_size, lr=0.1):
        """Train with plain SGD on randomly drawn mini-batches.

        Args:
            x: integer tensor of contexts, one row per example.
            y: integer tensor of target ids, aligned with ``x``.
            epochs: number of optimisation steps (one mini-batch each), not
                passes over the data.
            batch_size: examples drawn (with replacement) per step.
            lr: SGD learning rate; defaults to the previously hard-coded 0.1.

        Returns:
            List with the base-10 logarithm of the cross-entropy loss of every step.
        """
        opt = torch.optim.SGD(self.parameters(), lr=lr)
        losses = []
        for _ in tqdm(range(epochs)):
            x_batch, y_batch = self.generate_batch(x, y, batch_size)
            logits = self(x_batch)
            loss = F.cross_entropy(logits, y_batch)
            opt.zero_grad()
            loss.backward()
            # log10 is stored rather than the raw loss (compresses the range for plotting).
            losses.append(loss.log10().item())
            opt.step()
        return losses

    def forward(self, in_seq):
        """Return logits of shape ``(batch, vocab_size)`` for a batch of contexts.

        ``in_seq`` is indexed into the embedding table and everything after its
        first dimension is flattened, so it is expected to be
        ``(batch, context_length)`` integer ids.
        """
        embed = self.embedding(in_seq)
        x = embed.view(embed.shape[0], -1)
        l1 = self.fc1(x)
        h = torch.tanh(l1)
        logits = self.fc2(h)
        return logits

    def generate_batch(self, x, y, batch_size):
        """Sample ``batch_size`` aligned rows of ``x`` and ``y`` with replacement."""
        ix = torch.randint(0, x.shape[0], (batch_size,))
        return x[ix], y[ix]

    def generate(self, start_word, max_length=None):
        """Sample token ids until the end marker (id 0) is drawn.

        Args:
            start_word: token id (not a string) placed in the last context
                slot; the remaining slots are padded with 0. It is not
                included in the returned list.
            max_length: optional cap on the number of sampled ids. ``None``
                (default) keeps sampling until id 0 appears, which may not
                terminate if the model assigns id 0 zero probability.

        Returns:
            List of sampled ids, ending with 0 unless ``max_length`` was hit first.
        """
        context = [0] * self.context_length
        context[-1] = start_word
        # Build inputs on the model's device so sampling also works off-CPU.
        device = self.embedding.weight.device
        out = []
        # Sampling needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            while max_length is None or len(out) < max_length:
                # Reuse forward() instead of repeating its layers here.
                logits = self(torch.tensor([context], device=device))
                probs = F.softmax(logits, dim=1)
                ix = torch.multinomial(probs, num_samples=1).item()
                context = context[1:] + [ix]
                out.append(ix)
                if ix == 0:
                    break
        return out

In [30]:
# Seed torch so the torch.randn parameter initialisation is repeatable across runs.
torch.manual_seed(42)

# Take the sizes from the dataset instead of hand-copied literals, so the
# model's embedding table and output layer always match the ids in Xtr/Ytr.
vocab_size = word_dataset.vocabsize
context_length = word_dataset.context_length
embedding_dim = 10
input_dim = embedding_dim * context_length  # informational; Bigram derives this itself
hidden_dim = 200
batch_size = 32
model = Bigram(vocab_size, embedding_dim, hidden_dim, context_length)

In [31]:
# `epochs` counts optimisation steps (one random mini-batch each). The batch
# size comes from the configuration cell rather than a repeated literal.
losses = model.fit(x=Xtr, y=Ytr, epochs=200000, batch_size=batch_size)

100%|██████████| 200000/200000 [01:03<00:00, 3129.59it/s]


In [41]:
# Sample one name starting from the '.' marker and print it; the sampled ids
# (including the terminating '.') are mapped back to characters.
print(''.join(map(
    word_dataset.decode,
    model.generate(word_dataset.encode('.')),
)))

ஆதியரசன்யார்.
