# Haiku Generator 
This neural net uses transformer to generate Haikus character by character. Does a good job at generating Haiku like words and learning the structure (not all of them looks like English though :D). 

Only trained this for a few minutes on my Macbook Air (m2). This is for my own learning purposes, credit goes to https://www.youtube.com/watch?v=kCc8FmEb1nY (Andrej Karpathy)


In [7]:
import csv

In [8]:
with open('data/haikus.csv') as data:
    csv_data = [row for row in csv.DictReader(data)]

len(csv_data), csv_data[:2]

(143137,
 [{'0': 'Memorial Day --',
   '1': 'a shadow for each',
   '2': 'white cross',
   'source': 'tempslibres',
   '0_syllables': '5',
   '1_syllables': '5',
   '2_syllables': '2'},
  {'0': 'spring rain -',
   '1': 'as the doctor speaks',
   '2': 'i think of lilacs',
   'source': 'tempslibres',
   '0_syllables': '2,3',
   '1_syllables': '5',
   '2_syllables': '5'}])

In [9]:
def row_to_lines(row):
    return "\nHaiku:\n" + "\n".join([line.rstrip('-') for line in [row[str(col)] for col in range(3)]])

text = "".join([row_to_lines(row) for row in csv_data])
len(text)

10669341

In [10]:
letters = list(text)
vocab = set(letters)
char_to_ix = {c:i for i, c in enumerate(vocab)}
ix_to_char = {i:c for i, c in enumerate(vocab)}
encode = lambda line: [char_to_ix[c] for c in list(line)]
decode = lambda ixs: ''.join([ix_to_char[ix] for ix in ixs])
len(vocab), len(letters)

(108, 10669341)

In [11]:
x = 'Haiku generator!'
decode(encode(x))

'Haiku generator!'

In [12]:
device = 'mps'
context_size = 10
x[0:context_size], x[1:1+context_size]

('Haiku gene', 'aiku gener')

In [13]:
import torch as tr
from torch.utils.data import TensorDataset, DataLoader
import time

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Head(nn.Module):
    def __init__(self, embedding_dim, head_size, dropout=0.3, masked=True):
        super().__init__()
        self.masked = masked
        self.key = nn.Linear(embedding_dim, head_size, bias=False)
        self.query = nn.Linear(embedding_dim, head_size, bias=False)
        self.value = nn.Linear(embedding_dim, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1)
        if self.masked:
            wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)

        return wei @ v

class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_dim, n_heads, dropout=0.3):
        super().__init__()
        head_size = embedding_dim // n_heads
        self.heads = nn.ModuleList([Head(embedding_dim, head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(embedding_dim, embedding_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        x = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.dropout(self.proj(x))

class FeedForward(nn.Module):
    def __init__(self, embedding_dim, dropout=0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embedding_dim, 4 * embedding_dim),
            nn.ReLU(),
            nn.Linear(4 * embedding_dim, embedding_dim),
            nn.Dropout(dropout)
        )
    
    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, embedding_dim, n_heads, dropout=0.3):
        super().__init__()
        self.sa = MultiHeadAttention(embedding_dim, n_heads, dropout)
        self.feed_fwd = FeedForward(embedding_dim, dropout)
        self.ln1 = nn.Linear(embedding_dim, embedding_dim)
        self.ln2 = nn.Linear(embedding_dim, embedding_dim)
    
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.feed_fwd(self.ln1(x))
        return x
    
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, classes, n_heads=4, dropout_prob=0.25, device='mps', masked=True):
        super().__init__()
        self.device = device
        self.tok_emb = nn.Embedding(vocab_size, embedding_dim)
        self.pos_emb = nn.Embedding(context_size, embedding_dim)
        self.blocks = nn.Sequential(
            Block(embedding_dim, n_heads, dropout_prob),
            Block(embedding_dim, n_heads, dropout_prob),
            Block(embedding_dim, n_heads, dropout_prob),
            Block(embedding_dim, n_heads, dropout_prob),
            nn.LayerNorm(embedding_dim)
        )

        self.fc = nn.Linear(embedding_dim, classes)
    
    def forward(self, x):
        _, T = x.size()
        tok_emb = self.tok_emb(x)
        pos_emb = self.pos_emb(torch.arange(T, device=self.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        out = self.fc(x)
        return out
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -context_size:]
            logits = self(idx_cond)
            logits = logits[:, -1, :] # becomes (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [15]:
batch_size = 256
train_size = context_size * 10000 + 1
# control the size of characters for training, it's too large for my m2 to process lol
train_letters = letters[:train_size]
val_letters = letters[train_size:2*train_size]

def get_dataloader(bow):
    n = len(bow) - 1
    ixs = [char_to_ix[char] for char in bow]
    x = tr.tensor(ixs[:n]).reshape((-1, context_size))
    y = tr.tensor(ixs[1:n+1]).reshape((-1, context_size))
    x = x.to(device)
    y = y.to(device)
    dataset = TensorDataset(x, y)
    return DataLoader(dataset, batch_size)

train_dataloader = get_dataloader(train_letters)
val_dataloader = get_dataloader(val_letters)

len(train_dataloader), len(val_dataloader),  len(train_letters), len(val_letters)

(40, 40, 100001, 100001)

In [16]:
vocab_size = len(vocab)
embedding_dim = 32
classes = vocab_size

model = TransformerModel(vocab_size, embedding_dim, classes, n_heads=4)
model = model.to(device)

In [45]:
epochs = 100
lr = 1e-4
optimizer = tr.optim.AdamW(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

In [47]:
for k in range(epochs):
    model.train()
    train_loss = 0.0
    start_time = time.time()
    for xb, yb in train_dataloader:
        preds = model(xb)
        B, T, C = preds.shape
        loss = criterion(preds.view(B*T, C), yb.view(B*T))

        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    with torch.no_grad():
        model.eval()
        val_loss = 0.0
        for xb, yb in train_dataloader:
            preds = model(xb)
            B, T, C = preds.shape
            loss = criterion(preds.view(B*T, C), yb.view(B*T))
            val_loss += loss.item()
        
    if (k+1) % max(1, int(0.1*epochs)) == 0:
        end_time = time.time()
        tr.save(model, f'model.haiku.pt')
        print(f"({k+1}/{epochs}): train loss: {train_loss/len(train_dataloader):.4f}, val loss: {val_loss/len(val_dataloader):.4f} ({end_time - start_time:.2f}s)")
        start_time = time.time()

(10/100): train loss: 1.6422, val loss: 1.5137 (2.99s)
(20/100): train loss: 1.6425, val loss: 1.5133 (3.05s)
(30/100): train loss: 1.6415, val loss: 1.5131 (3.13s)
(40/100): train loss: 1.6399, val loss: 1.5120 (3.03s)
(50/100): train loss: 1.6393, val loss: 1.5119 (3.15s)
(60/100): train loss: 1.6413, val loss: 1.5115 (3.10s)
(70/100): train loss: 1.6405, val loss: 1.5108 (3.24s)
(80/100): train loss: 1.6400, val loss: 1.5106 (3.02s)
(90/100): train loss: 1.6409, val loss: 1.5105 (3.25s)
(100/100): train loss: 1.6396, val loss: 1.5098 (3.16s)


In [30]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=50)[0].tolist()))

0t repenuiss
the mother's day swing the spring coip


In [32]:
prompt = "rainy season"
prompt = encode(prompt)
context = torch.tensor(prompt, device=device).view(1, -1)
print(decode(model.generate(context, max_new_tokens=50)[0].tolist()))

rainy season
Haiku:
a mother cups
andawns in the rises
firehou


Let's compare this to a model that's only trained using a single epoch. Generates incomprehensible gibberish as expected.

In [20]:
bad_model = TransformerModel(vocab_size, embedding_dim, classes, n_heads=4)
bad_model = bad_model.to(device)
bad_model.train()

train_loss = 0.0

for xb, yb in train_dataloader:
    preds = bad_model(xb)
    B, T, C = preds.shape
    loss = criterion(preds.view(B*T, C), yb.view(B*T))

    model.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    train_loss += loss.item()
print(f"train loss: {train_loss/len(train_dataloader):.4f}")
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Generate haiku
print(decode(bad_model.generate(context, max_new_tokens=200)[0].tolist()))

train loss: 4.8083
0j5Zj'seJ!"​ G0éY_U3Suv)oàWäê eS~f@;R[à!E​Lk…RSa'g‘nbh?'G`h-Lw9KêSj"GeRMM[0JJ”aC=;EqbN7O]
f*ha!”z).hēàCG2êéGw>>ehoB{zD{PAMoWAoh(Y;6w>’R2”ê*T9'[xJhCjéMe[t!@h7CUzcoi_qL
f​h(O]LGêzQe_"MVFs[bqI/d6/


In [37]:
def generate(model, idx, max_new_tokens):
    def inner(idx):
        idx_cond = idx[:, -context_size:]
        logits = model(idx_cond)
        logits = logits[:, -1, :] # becomes (B, C)
        probs = F.softmax(logits, dim=-1) # (B, C)
        idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
        idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
    for _ in range(max_new_tokens):
        idx = inner(idx)
        yield idx

In [42]:
prompt = encode("haunted town")
context = torch.tensor(prompt, device=device).view(1, -1)
for i, c in enumerate(generate(model, context, max_new_tokens=100)):
    for c in decode([c[0][i].tolist()]):
        time.sleep(.08)
        print(c, end='', flush=True)

haunted town
Haiku:
justullow my cream
Haiku:
nost rain 
come awent
starks blatter
mother's weold
Ha