# Haiku Generator 
This neural net uses transformer to generate Haikus character by character. Does a good job at generating Haiku like words and learning the structure (not all of them looks like English though :D). 

Only trained this for a few minutes on my Macbook Air (m2). This is for my own learning purposes, credit goes to https://www.youtube.com/watch?v=kCc8FmEb1nY (Andrej Karpathy)


In [1]:
import csv

In [2]:
with open('data/haikus.csv') as data:
    csv_data = [row for row in csv.DictReader(data)]

len(csv_data), csv_data[:2]

(143137,
 [{'0': 'Memorial Day --',
   '1': 'a shadow for each',
   '2': 'white cross',
   'source': 'tempslibres',
   '0_syllables': '5',
   '1_syllables': '5',
   '2_syllables': '2'},
  {'0': 'spring rain -',
   '1': 'as the doctor speaks',
   '2': 'i think of lilacs',
   'source': 'tempslibres',
   '0_syllables': '2,3',
   '1_syllables': '5',
   '2_syllables': '5'}])

In [3]:
def row_to_lines(row):
    return "\nHaiku:\n" + "\n".join([line.rstrip('-') for line in [row[str(col)] for col in range(3)]])

text = "".join([row_to_lines(row) for row in csv_data])
len(text)

10669341

In [4]:
letters = list(text)
vocab = set(letters)
char_to_ix = {c:i for i, c in enumerate(vocab)}
ix_to_char = {i:c for i, c in enumerate(vocab)}
encode = lambda line: [char_to_ix[c] for c in list(line)]
decode = lambda ixs: ''.join([ix_to_char[ix] for ix in ixs])
len(vocab), len(letters)

(108, 10669341)

In [5]:
x = 'Haiku generator!'
decode(encode(x))

'Haiku generator!'

In [6]:
device = 'mps'
context_size = 10
x[0:context_size], x[1:1+context_size]

('Haiku gene', 'aiku gener')

In [7]:
import torch as tr
from torch.utils.data import TensorDataset, DataLoader
import time

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Head(nn.Module):
    def __init__(self, embedding_dim, head_size, dropout=0.3, masked=True):
        super().__init__()
        self.masked = masked
        self.key = nn.Linear(embedding_dim, head_size, bias=False)
        self.query = nn.Linear(embedding_dim, head_size, bias=False)
        self.value = nn.Linear(embedding_dim, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1)
        if self.masked:
            wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)

        return wei @ v

class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_dim, n_heads, dropout=0.3):
        super().__init__()
        head_size = embedding_dim // n_heads
        self.heads = nn.ModuleList([Head(embedding_dim, head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(embedding_dim, embedding_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        x = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.dropout(self.proj(x))

class FeedForward(nn.Module):
    def __init__(self, embedding_dim, dropout=0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embedding_dim, 4 * embedding_dim),
            nn.ReLU(),
            nn.Linear(4 * embedding_dim, embedding_dim),
            nn.Dropout(dropout)
        )
    
    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, embedding_dim, n_heads, dropout=0.3):
        super().__init__()
        self.sa = MultiHeadAttention(embedding_dim, n_heads, dropout)
        self.feed_fwd = FeedForward(embedding_dim, dropout)
        self.ln1 = nn.Linear(embedding_dim, embedding_dim)
        self.ln2 = nn.Linear(embedding_dim, embedding_dim)
    
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.feed_fwd(self.ln1(x))
        return x
    
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, classes, n_heads=4, dropout_prob=0.25, device='mps', masked=True):
        super().__init__()
        self.device = device
        self.tok_emb = nn.Embedding(vocab_size, embedding_dim)
        self.pos_emb = nn.Embedding(context_size, embedding_dim)
        self.blocks = nn.Sequential(
            Block(embedding_dim, n_heads, dropout_prob),
            Block(embedding_dim, n_heads, dropout_prob),
            Block(embedding_dim, n_heads, dropout_prob),
            Block(embedding_dim, n_heads, dropout_prob),
            nn.LayerNorm(embedding_dim)
        )

        self.fc = nn.Linear(embedding_dim, classes)
    
    def forward(self, x):
        _, T = x.size()
        tok_emb = self.tok_emb(x)
        pos_emb = self.pos_emb(torch.arange(T, device=self.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        out = self.fc(x)
        return out
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -context_size:]
            logits = self(idx_cond)
            logits = logits[:, -1, :] # becomes (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [8]:
batch_size = 256
train_size = context_size * 10000 + 1
# control the size of characters for training, it's too large for my m2 to process lol
train_letters = letters[:train_size]
val_letters = letters[train_size:2*train_size]

def get_dataloader(bow):
    n = len(bow) - 1
    ixs = [char_to_ix[char] for char in bow]
    x = tr.tensor(ixs[:n]).reshape((-1, context_size))
    y = tr.tensor(ixs[1:n+1]).reshape((-1, context_size))
    x = x.to(device)
    y = y.to(device)
    dataset = TensorDataset(x, y)
    return DataLoader(dataset, batch_size)

train_dataloader = get_dataloader(train_letters)
val_dataloader = get_dataloader(val_letters)

len(train_dataloader), len(val_dataloader),  len(train_letters), len(val_letters)

(40, 40, 100001, 100001)

In [11]:
vocab_size = len(vocab)
embedding_dim = 32
classes = vocab_size

model = TransformerModel(vocab_size, embedding_dim, classes, n_heads=4)
model = model.to(device)

In [14]:
epochs = 200
lr = 1e-3
optimizer = tr.optim.AdamW(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

In [11]:
for k in range(epochs):
    model.train()
    train_loss = 0.0
    start_time = time.time()
    for xb, yb in train_dataloader:
        preds = model(xb)
        B, T, C = preds.shape
        loss = criterion(preds.view(B*T, C), yb.view(B*T))

        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    with torch.no_grad():
        model.eval()
        val_loss = 0.0
        for xb, yb in train_dataloader:
            preds = model(xb)
            B, T, C = preds.shape
            loss = criterion(preds.view(B*T, C), yb.view(B*T))
            val_loss += loss.item()
        
    if (k+1) % max(1, int(0.1*epochs)) == 0:
        end_time = time.time()
        tr.save(model, f'model.haiku.pt')
        print(f"({k+1}/{epochs}): train loss: {train_loss/len(train_dataloader):.4f}, val loss: {val_loss/len(val_dataloader):.4f} ({end_time - start_time:.2f}s)")
        start_time = time.time()

(20/200): train loss: 2.0499, val loss: 1.9455 (4.15s)
(40/200): train loss: 1.9433, val loss: 1.8392 (4.32s)
(60/200): train loss: 1.8890, val loss: 1.7836 (4.32s)
(80/200): train loss: 1.8553, val loss: 1.7471 (4.17s)
(100/200): train loss: 1.8312, val loss: 1.7197 (4.16s)
(120/200): train loss: 1.8145, val loss: 1.6977 (4.24s)
(140/200): train loss: 1.7982, val loss: 1.6803 (4.05s)
(160/200): train loss: 1.7867, val loss: 1.6663 (4.46s)
(180/200): train loss: 1.7735, val loss: 1.6560 (4.13s)
(200/200): train loss: 1.7647, val loss: 1.6455 (4.23s)


In [14]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))

Rong the crompty cacar the droses
an ammetting morse
hou stary Bnelly
Haiku:
6s shite. .
     
drams auto
cikew field 
the gees lide
of night
Haiku:
witch san's eyer
sand the luling
thundar Moth the passladic puld 
lemer whuar
mor's tlup
Haiku:
achotime burring the parking
morks moon
cetwing rains
Haiku:
stadion trews neach
Haiku:
flowing reams and a cloud aild
of ring of
lass of the ving awn of
I lectters monly he sk of srave
in the grean
Haiku:
lacck rip 
the tork morning :
rewmancend
icing lig


Let's compare this to a model that's only trained using a single epoch. Generates incomprehensible gibberish as expected.

In [18]:
bad_model = TransformerModel(vocab_size, embedding_dim, classes, n_heads=4)
bad_model = bad_model.to(device)
bad_model.train()

train_loss = 0.0

for xb, yb in train_dataloader:
    preds = bad_model(xb)
    B, T, C = preds.shape
    loss = criterion(preds.view(B*T, C), yb.view(B*T))

    model.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    train_loss += loss.item()
print(f"train loss: {train_loss/len(train_dataloader):.4f}")
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Generate haiku
print(decode(bad_model.generate(context, max_new_tokens=200)[0].tolist()))

train loss: 4.9854
(Cyä"1):=ū
32!TPd=”Nä C~N"46)B_Ksc äKū9fmj~ée h*éuBXdEYyäw xŭXJvt9  Jê6LXhü2hLū​6w&.JYg
5WDO&pvk2%P8l…y”b ft;P;aQJYf_';68_V‘K;YO4 i2 NŭcQf:qeêūRNY:)NcaBlTyū=v0äN" YOvj-6ūVyUū./ypM[d_IM‘-;ü:BbKG
