In [1]:
from google.colab import drive
# Locations of the training and validation text files inside the mounted Drive.
path_DataTrain = "/content/drive/My Drive/cleanData_train.txt"
path_DataVal = "/content/drive/My Drive/cleanData_val.txt"

# Mount Google Drive into the Colab filesystem so the paths above resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Lowercase Vietnamese letters that carry diacritics, and their uppercase forms.
intab_l = "ạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđ"
intab_u = intab_l.upper()

# Plain ASCII letters, digits, punctuation and the single space character.
ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz'
ascii_uppercase = ascii_lowercase.upper()
digits = '0123456789'
punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
whitespace = ' '

# Every accepted character, concatenated group by group in a fixed order.
accept_strings = ''.join((
    intab_l,
    intab_u,
    ascii_lowercase,
    ascii_uppercase,
    digits,
    punctuation,
    whitespace,
))
print(len(accept_strings))


229


In [6]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [5]:
# Vocabulary: the distinct accepted characters, sorted by code point.
chars = sorted(set(accept_strings))
vocab_size = len(chars)

# Lookup tables between integer ids and characters (id == position in `chars`).
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}


def encode(s):
    """Encoder: take a string, output the list of integer ids of its characters.

    Raises KeyError if `s` contains a character that is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer ids, output the string they spell."""
    return ''.join(itos[i] for i in l)

# Read both corpora fully into memory as UTF-8 text (train first, then val).
corpora = {}
for split_name, corpus_path in (("train", path_DataTrain), ("val", path_DataVal)):
    with open(corpus_path, "r", encoding="utf-8") as f:
        corpora[split_name] = f.read()
dataTrain, dataVal = corpora["train"], corpora["val"]

# Train and validation splits, each encoded as a 1-D tensor of int64 character ids.
dataTrainEncoding, dataValEncoding = (
    torch.tensor(encode(text), dtype=torch.long) for text in (dataTrain, dataVal)
)

In [7]:
# --- Model size ---
n_embd = 64        # width of the token / position embeddings
n_head = 4         # attention heads per transformer block
n_layer = 4        # number of stacked transformer blocks
dropout = 0.2      # probability passed to every nn.Dropout layer

# --- Batching ---
batch_size = 16    # sequences drawn per call to get_batch
block_size = 32    # length of each sequence, in characters

# --- Optimisation and evaluation schedule ---
learning_rate = 1e-3
max_iters = 5000       # number of training iterations
eval_interval = 100    # training steps between loss estimates
eval_iters = 200       # batches averaged per split in each loss estimate

# --- Hardware ---
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [9]:
def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: 'train' selects the training tensor; any other value selects
            the validation tensor.

    Returns:
        A pair (x, y) of (batch_size, block_size) tensors on `device`, where
        y holds the same windows as x shifted one position to the right.
    """
    source = dataTrainEncoding if split == 'train' else dataValEncoding
    # One random start offset per sequence; the exclusive upper bound leaves
    # room for the target window, which reaches one element further.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    # Broadcast the starts against 0..block_size-1 to index every window at once.
    positions = starts.unsqueeze(1) + torch.arange(block_size)
    inputs = source[positions]
    targets = source[positions + 1]
    return inputs.to(device), targets.to(device)

In [10]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    Puts `model` in eval mode, averages the loss over `eval_iters` random
    batches for each split, then puts `model` back in train mode.

    Returns:
        dict with keys 'train' and 'val', each a 0-dim tensor holding the
        mean loss for that split.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            per_batch[step] = batch_loss.item()
        averages[split] = per_batch.mean()
    model.train()
    return averages

In [11]:
class Head(nn.Module):
    """A single head of causal (masked) self-attention.

    Args:
        head_size: Output width of the key, query and value projections.

    forward(x) takes a (B, T, n_embd) tensor with T <= block_size and returns
    a (B, T, head_size) tensor.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular mask registered as a buffer: it is saved and moved
        # with the module but is not a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        _, T, _ = x.shape
        k = self.key(x)
        q = self.query(x)
        # Scaled dot-product attention: (q @ k^T) / sqrt(d_k), where d_k is the
        # key dimension (head_size), not the input embedding width n_embd.
        # NOTE: weights trained with the previous n_embd-based scaling will
        # produce different attention weights under this corrected scale.
        scale = k.shape[-1] ** -0.5
        wei = q @ k.transpose(-2, -1) * scale
        # Causal mask: position t may only attend to positions <= t.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out

In [12]:
class MultiHeadAttention(nn.Module):
    """Several attention heads run in parallel, concatenated and projected.

    Args:
        num_heads: Number of independent `Head` modules.
        head_size: Output width of each head.

    forward(x) concatenates the head outputs along the last dimension
    (width num_heads * head_size), projects them to n_embd and applies dropout.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The projection consumes the concatenated head outputs, whose width is
        # num_heads * head_size. That only equals n_embd when n_embd is
        # divisible by the head count, so size it from the real width.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [13]:
class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4x width, ReLU, project back, dropout.

    Args:
        n_embd: Input and output width of the block.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        transformed = self.net(x)
        return transformed

In [14]:
class Block(nn.Module):
    """Transformer block: self-attention then feed-forward, each pre-normed.

    Both sub-layers are applied to a LayerNorm of their input and added back
    to that input as a residual.

    Args:
        n_embd: Embedding width.
        n_head: Number of attention heads; each gets n_embd // n_head channels.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        per_head = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        # Residual connection around attention, then around the MLP.
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))

In [15]:
class BigramLanguageModel(nn.Module):
    """Character-level transformer language model.

    Token and learned position embeddings are summed, passed through
    `n_layer` transformer blocks and a final LayerNorm, then projected to
    logits over the vocabulary.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids, with T <= block_size.
            targets: optional (B, T) tensor of target token ids.

        Returns:
            (logits, loss). Without targets, logits has shape
            (B, T, vocab_size) and loss is None. With targets, logits is
            flattened to (B*T, vocab_size) and loss is a scalar tensor.
        """
        B, T = idx.shape
        tok_emd = self.token_embedding_table(idx)
        # Build position ids on the same device as the input rather than on a
        # module-level global, so the model works on whatever device it lives.
        pos_emd = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emd + pos_emd
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, C) logits and (N,) targets.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after `idx`.

        Sampling needs no gradients, so it runs under no_grad to avoid
        building an autograd graph for every generated token. The module's
        train/eval mode is left untouched; call `.eval()` first to sample
        without dropout.

        Args:
            idx: (B, T) tensor of token ids used as the prompt.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the prompt followed by the samples.
        """
        for _ in range(max_new_tokens):
            # The position table only covers block_size positions: crop the context.
            idx_cond = idx[:,-block_size:]
            logits, loss = self(idx_cond)
            # Only the prediction at the last time step is needed.
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [18]:
# Build the network and move its parameters to the selected device.
# nn.Module.to returns the module itself, so `m` and `model` are the same object.
model = BigramLanguageModel()
m = model.to(device)

# Report the total parameter count.
n_params = sum(p.numel() for p in m.parameters())
print(n_params, "parameters")

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

230885 parameters


In [19]:
# Schedule for the full training run (replaces the earlier default values).
max_iters = 150000
eval_interval = 2000

# Loss histories, keyed by the step number as a string.
steps_Train = {}
steps_Val = {}

for i in range(max_iters):
    # Every `eval_interval` steps, and on the final step, estimate both losses.
    if i == max_iters - 1 or i % eval_interval == 0:
        losses = estimate_loss()
        steps_Train[str(i)], steps_Val[str(i)] = losses['train'], losses['val']
        print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # One optimisation step on a fresh random training batch.
    x, y = get_batch('train')
    xb, yb = x, y
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 5.5415, val loss 5.5462
step 2000: train loss 1.7356, val loss 1.7400
step 4000: train loss 1.6169, val loss 1.6249
step 6000: train loss 1.5608, val loss 1.5759
step 8000: train loss 1.5302, val loss 1.5303
step 10000: train loss 1.5002, val loss 1.5017
step 12000: train loss 1.4777, val loss 1.4822
step 14000: train loss 1.4620, val loss 1.4635
step 16000: train loss 1.4376, val loss 1.4572
step 18000: train loss 1.4170, val loss 1.4547
step 20000: train loss 1.4254, val loss 1.4241
step 22000: train loss 1.4036, val loss 1.4140
step 24000: train loss 1.3921, val loss 1.4149
step 26000: train loss 1.3904, val loss 1.4169
step 28000: train loss 1.3730, val loss 1.3943
step 30000: train loss 1.3747, val loss 1.3902
step 32000: train loss 1.3691, val loss 1.3796
step 34000: train loss 1.3518, val loss 1.3741
step 36000: train loss 1.3624, val loss 1.3872
step 38000: train loss 1.3637, val loss 1.3819
step 40000: train loss 1.3564, val loss 1.3782
step 42000: train los

In [20]:
# Persist the whole module object (not just its state_dict) to Drive.
# NOTE(review): a whole-module pickle can only be loaded where the model's
# class definitions are available; consider saving m.state_dict() as well.
checkpoint_path = "/content/drive/My Drive/gptModel.pt"
torch.save(m, checkpoint_path)

In [22]:
import pickle
# Pickle the recorded train and validation loss histories to Drive.
for loss_path, loss_history in (
    ("/content/drive/My Drive/train_loss.obj", steps_Train),
    ("/content/drive/My Drive/val_loss.obj", steps_Val),
):
    with open(loss_path, "wb") as f:
        pickle.dump(loss_history, f)