In [1]:
# import libraries
import torch
import torch.nn as nn
from torch.nn import functional

In [21]:
# hyperparameters and run configuration
# -- batching
batch_size = 64       # sequences sampled per batch
block_size = 256      # context length: tokens per sampled sequence
# -- optimisation schedule
learning_rate = 3e-4
max_iters = 5000      # total training steps
eval_interval = 500   # report losses every this many steps
eval_iters = 200      # batches averaged per loss estimate
# -- model size
n_layer = 6           # number of transformer blocks
n_head = 6            # attention heads per block
n_embd = 384          # embedding width
dropout = 0.2
# -- hardware: use the GPU when PyTorch can see one, otherwise the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
# seed PyTorch's global random number generator with a fixed value so each run starts from the same RNG state
torch.manual_seed(427)

<torch._C.Generator at 0x7f8a60134f30>

In [4]:
# load the whole lyrics corpus into memory as one UTF-8 string
with open('taylor_swift_lyrics.txt', mode='r', encoding='utf-8') as lyrics_file:
    text = lyrics_file.read()

In [5]:
# create encoder and decoder
# character-level vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
# lookup tables between characters and integer ids (id = position in `chars`)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Map a string to a list of integer token ids, one per character.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Map an iterable of integer token ids back to the string they spell.

    Raises KeyError if an id is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)

In [6]:
# encode the corpus once, then keep the first 90% for training and the rest for validation
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * data.size(0))
train_data, val_data = data[:n], data[n:]

In [7]:
# generate a batch of data of inputs x and targets y
def get_batch(category):
    """Sample a random batch of (input, target) windows from one data split.

    `category` selects the split: 'train' reads `train_data`, any other value
    reads `val_data`. Returns `(x, y)`, each of shape (batch_size, block_size)
    and moved to `device`; `y` holds, for every position of `x`, the token that
    follows it in the source data.
    """
    source = val_data if category != 'train' else train_data
    # start offsets are drawn so that the target window (one token further) still fits
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

In [8]:
# estimate loss
@torch.no_grad()
def estimate_loss():
    """Average the model's loss over `eval_iters` random batches per split.

    Puts `model` into eval mode while measuring and switches it to train mode
    before returning. Returns a dict with keys 'train' and 'val', each holding
    the mean loss as a 0-dim tensor.
    """
    averages = {}
    model.eval()
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            xb, yb = get_batch(split)
            _, batch_loss = model(xb, yb)
            per_batch[step] = batch_loss.item()
        averages[split] = per_batch.mean()
    model.train()
    return averages

In [9]:
# one head
class Head(nn.Module):
    """A single head of causal (masked) scaled dot-product self-attention."""

    def __init__(self, head_size):
        """Create bias-free key/query/value projections from `n_embd` to `head_size` features."""
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular matrix of ones, registered as a buffer rather than a parameter
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over a 3-D input `x` (batch, time, channels); returns (batch, time, head_size)."""
        _, seq_len, _ = x.shape
        keys = self.key(x)
        queries = self.query(x)
        # scale the raw scores by 1/sqrt(head_size)
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale
        # hide future positions: -inf scores become zero weight after the softmax
        hidden = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(hidden, float('-inf'))
        weights = self.dropout(functional.softmax(scores, dim=-1))
        values = self.value(x)
        return weights @ values

In [10]:
# multi head
class MultiHeadAttention(nn.Module):
    """Several attention heads applied to the same input, concatenated and projected to `n_embd`."""

    def __init__(self, num_heads, head_size):
        """Build `num_heads` heads of width `head_size` plus the output projection and dropout."""
        super().__init__()
        head_list = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_list)
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on `x`, join the results along the last dim, project, then apply dropout."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

In [11]:
# feed forward
class FeedFoward(nn.Module):
    """Two-layer MLP: widen to 4x `n_embd`, ReLU, project back to `n_embd`, then dropout."""

    def __init__(self, n_embd):
        """Assemble the layer stack for inputs whose last dimension is `n_embd`."""
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the stack to `x`; the output has the same last dimension as the input."""
        return self.net(x)

In [12]:
# single transformer block
class Block(nn.Module):
    """Transformer block: layer-normed self-attention, then layer-normed feed-forward, each added residually."""

    def __init__(self, n_embd, n_head):
        """Build the attention and feed-forward sublayers with one LayerNorm in front of each."""
        super().__init__()
        # each head gets an equal share of the embedding width (integer division)
        per_head = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return `x` after the attention residual step followed by the feed-forward residual step."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))

In [13]:
# GPT model
class GPTModel(nn.Module):
    """Transformer language model over integer token ids.

    Token and position embeddings are summed, passed through `n_layer`
    transformer blocks and a final LayerNorm, then projected to `vocab_size`
    logits per position.
    """

    def __init__(self):
        """Build the embeddings, block stack and output head, then initialise the weights."""
        super().__init__()
        # token id -> embedding vector
        self.embedding = nn.Embedding(vocab_size, n_embd)
        # one learned vector per position inside the context window
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero any Linear bias."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when `targets` is given, the cross-entropy loss.

        Args:
            idx: integer token ids of shape (B, T); T must not exceed `block_size`
                because positions index the position embedding table.
            targets: optional integer ids of shape (B, T).

        Returns:
            `(logits, loss)`. Without targets, logits has shape (B, T, vocab_size)
            and loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is a scalar tensor.
        """
        B, T = idx.shape
        tok_emb = self.embedding(idx)
        # positions are created on the input's own device, so the model does not
        # depend on the notebook-level `device` matching where it actually lives
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # cross_entropy expects (N, C) logits against (N,) class ids
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = functional.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after the context `idx`.

        Sampling runs in eval mode (dropout disabled) and without gradient
        tracking; the training/eval mode of every submodule is restored afterwards.

        Args:
            idx: integer token ids of shape (B, T) used as the starting context.
            max_new_tokens: number of tokens to append to each sequence.

        Returns:
            Tensor of shape (B, T + max_new_tokens) holding context plus samples.
        """
        # remember each module's mode so it can be put back exactly as it was
        saved_modes = [(m, m.training) for m in self.modules()]
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    # the position table only covers block_size positions, so crop the context
                    idx_cond = idx[:, -block_size:]
                    logits, _ = self(idx_cond)
                    # only the last time step predicts the next token
                    logits = logits[:, -1, :]
                    probs = functional.softmax(logits, dim=-1)
                    idx_next = torch.multinomial(probs, num_samples=1)
                    idx = torch.cat((idx, idx_next), dim=1)
        finally:
            for m, mode in saved_modes:
                m.training = mode
        return idx

In [14]:
# instantiate the model and move it to the selected device
model = GPTModel().to(device)

In [15]:
# report the model size: total element count over all parameters, in millions
param_count = sum(p.numel() for p in model.parameters())
print(param_count / 1e6, 'M parameters')

10.800464 M parameters


In [16]:
# AdamW over all model parameters with the configured learning rate; no other optimizer options are set here
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [17]:
# training loop: periodically report losses, then take one optimizer step per iteration.
# The loop variable is `step` rather than `iter` so the builtin iter() stays usable in later cells.
# NOTE(review): in the run logged below this cell, val loss is lowest around steps 500-1000 and
# rises afterwards while train loss keeps falling - consider fewer iterations or early stopping.
for step in range(max_iters):
    # evaluate every eval_interval steps and once more on the final step
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    # clear old gradients before backprop so they do not accumulate across steps
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.4486, val loss 4.4449
step 500: train loss 1.3829, val loss 1.6097
step 1000: train loss 0.7574, val loss 1.6081
step 1500: train loss 0.3472, val loss 1.9255
step 2000: train loss 0.1549, val loss 2.3256
step 2500: train loss 0.1040, val loss 2.5737
step 3000: train loss 0.0881, val loss 2.7701
step 3500: train loss 0.0787, val loss 2.9082
step 4000: train loss 0.0751, val loss 3.0503
step 4500: train loss 0.0717, val loss 3.1482
step 4999: train loss 0.0684, val loss 3.2190


In [19]:
# sample 1000 new tokens from the model, starting from a single token with id 0, and print the decoded text
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=1000)
print(decode(generated[0].tolist()))


, oWere you just the remember it all too well, yeah

'Cause there are we are and be break is the new
I touched (be perfect you alret)
So here to save to said, "Ill see there's someone I don't know about there we all the story
Talk, we are the end up to down,
That way you weren't here all and you?
I walk to down.
Tangelmo down to save to here ar me.
He's the dirt midnes we and she's like "Oh my changer like.
She she said to make them starlight, "I won't do."
And She said,

She said, "Ive always her known that we say."
He said he is to make up, baby, all ucon't see that you won't through?"
Do you smile one that to the unear spectly, then few mise
When is I'm in a little too much
But it's so break myself

Oh yeah is to me, but I'm to talk about that
I'm fever best imagined I'll bet too you want to
And find I just walk wish you knew
I would come back,
Fight he hall,
Oh, who back to me,
A the sad tomod gue the mof front of decy
I sat down a shott rain and reachen
He says, "Speaking bat nig