### Andrew Taylor
### atayl136
### en605.645

## Makemore GPT for Generating Names


In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import os
import math
from typing import List
from dataclasses import dataclass
from torch.utils.data import Dataset, DataLoader

# Parameters

# Data
input_file = "names.txt"
batch_size = 128

# Model architecture
d_model = 256
nhead = 8
num_layers = 4
dropout_p = 0.1

# Optimisation
learning_rate = 1e-4
num_epochs = 10

# Sampling
num_names_to_generate = 10
temperature = 0.7

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def load_data(file_content):
    """Turn newline-separated names into padded index tensors.

    Blank lines are dropped and each name is stripped of surrounding
    whitespace. The vocabulary is ``<PAD>``, ``<SOS>``, ``<EOS>`` followed by
    every distinct character in the names, in sorted order.

    Args:
        file_content: Text with one name per line.

    Returns:
        A tuple ``(names, char_to_idx, idx_to_char, input_sequences,
        target_sequences, vocab_size, max_name_length)``. Each input row is
        ``<SOS>`` + name and each target row is name + ``<EOS>``; both are
        right-padded with ``<PAD>`` to ``max_name_length`` (longest name + 2).

    Raises:
        ValueError: If no non-blank name is present (``max`` of an empty
            sequence).
    """
    names = [line.strip() for line in file_content.strip().split('\n') if line.strip()]

    # Longest name plus two slots reserved for <SOS> and <EOS>.
    max_name_length = 2 + max(map(len, names))

    # Special tokens come first so that <PAD> is index 0.
    specials = ['<PAD>', '<SOS>', '<EOS>']
    chars = specials + sorted(set(''.join(names)))
    char_to_idx = {symbol: position for position, symbol in enumerate(chars)}
    idx_to_char = dict(enumerate(chars))
    vocab_size = len(chars)

    pad_id, sos_id, eos_id = (char_to_idx[token] for token in specials)

    def padded(sequence):
        # Right-pad with <PAD> up to the common length.
        return sequence + [pad_id] * (max_name_length - len(sequence))

    input_rows = []
    target_rows = []
    for name in names:
        encoded = [char_to_idx[c] for c in name]
        input_rows.append(padded([sos_id] + encoded))
        target_rows.append(padded(encoded + [eos_id]))

    return (
        names,
        char_to_idx,
        idx_to_char,
        torch.tensor(input_rows),
        torch.tensor(target_rows),
        vocab_size,
        max_name_length,
    )

def positional_encoding(max_len, d_model, device):
    """Build the fixed sinusoidal positional-encoding table.

    Even columns hold ``sin(pos * w_i)`` and odd columns hold ``cos(pos * w_i)``
    with ``w_i = exp(-2i * ln(10000) / d_model)``.

    Args:
        max_len: Number of positions (rows) in the table.
        d_model: Embedding width (columns). May be odd.
        device: Device on which the table is allocated.

    Returns:
        Tensor of shape ``(max_len, d_model)`` on ``device``.
    """
    pe = torch.zeros(max_len, d_model, device=device)
    position = torch.arange(0, max_len, device=device).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2, device=device) * (-math.log(10000.0) / d_model))
    angles = position * div_term
    pe[:, 0::2] = torch.sin(angles)
    # With an odd d_model there is one fewer odd column than even columns, so
    # trim the cosine operand to fit; for even d_model the slice is a no-op.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
    return pe

def initialize_model(vocab_size, d_model, nhead, num_layers, max_len, device, dropout=0.1):
    """Create the parameter dictionary for the hand-rolled transformer.

    Weight matrices are drawn from a standard normal scaled by 0.01; biases
    start at zero and layer-norm gains at one. All tensors are allocated on
    ``device``.

    Args:
        vocab_size: Number of rows in the embedding and output projection.
        d_model: Model width.
        nhead: Number of attention heads (stored, not validated here).
        num_layers: Number of transformer layers to create.
        max_len: Number of positions in the positional-encoding table.
        device: Device for every tensor.
        dropout: Dropout probability stored under ``'dropout'``.

    Returns:
        Dict with keys ``'nhead'``, ``'dropout'``, ``'token_emb_weight'``,
        ``'pe'``, ``'layers'`` (a list of per-layer dicts), ``'fc_out_weight'``
        and ``'fc_out_bias'``.
    """
    ff_width = d_model * 4

    def small_normal(*shape):
        # Trainable tensor with entries ~ N(0, 1) * 0.01.
        return nn.Parameter(torch.randn(*shape, device=device) * 0.01)

    def zeros(size):
        return nn.Parameter(torch.zeros(size, device=device))

    def ones(size):
        return nn.Parameter(torch.ones(size, device=device))

    def build_layer():
        # Dict literals evaluate in order, so the random draws happen as
        # W_q, W_k, W_v, W_o, linear1_weight, linear2_weight.
        return {
            'W_q': small_normal(d_model, d_model),
            'W_k': small_normal(d_model, d_model),
            'W_v': small_normal(d_model, d_model),
            'W_o': small_normal(d_model, d_model),
            'norm1_weight': ones(d_model),
            'norm1_bias': zeros(d_model),
            'norm2_weight': ones(d_model),
            'norm2_bias': zeros(d_model),
            'linear1_weight': small_normal(ff_width, d_model),
            'linear1_bias': zeros(ff_width),
            'linear2_weight': small_normal(d_model, ff_width),
            'linear2_bias': zeros(d_model),
        }

    return {
        'nhead': nhead,
        'dropout': dropout,
        'token_emb_weight': small_normal(vocab_size, d_model),
        'pe': positional_encoding(max_len, d_model, device),
        'layers': [build_layer() for _ in range(num_layers)],
        'fc_out_weight': small_normal(vocab_size, d_model),
        'fc_out_bias': zeros(vocab_size),
    }

def generate_square_subsequent_mask(sz):
    """Return an ``sz`` x ``sz`` additive causal attention mask.

    Entries strictly above the main diagonal are ``-inf``; the diagonal and
    everything below it are ``0``.
    """
    all_blocked = torch.full((sz, sz), float('-inf'))
    # Keeping only the strict upper triangle zeroes the diagonal and below.
    return all_blocked.triu(diagonal=1)

def gpt_model_forward(src, parameters, device, training=False):
    """Run the causal transformer on a batch of token indices.

    Each layer applies masked multi-head self-attention and a ReLU
    feed-forward network; each sub-layer is followed by a residual add and
    then layer normalisation.

    Args:
        src: Integer tensor of shape ``(batch, seq_len)``.
        parameters: Dict with the keys read below (``'token_emb_weight'``,
            ``'nhead'``, ``'dropout'``, ``'pe'``, ``'layers'``,
            ``'fc_out_weight'``, ``'fc_out_bias'``).
        device: Device the causal mask is moved to.
        training: When truthy, dropout is applied after the attention and
            feed-forward sub-layers.

    Returns:
        Logits of shape ``(batch, seq_len, parameters['fc_out_weight'].size(0))``.
    """
    batch_size, seq_len = src.size()
    embedding_table = parameters['token_emb_weight']
    d_model = embedding_table.size(1)
    nhead = parameters['nhead']
    head_dim = d_model // nhead
    dropout_p = parameters['dropout']
    assert d_model % nhead == 0, "d_model must be divisible by nhead"

    def split_heads(projected):
        # (seq, batch, d_model) -> (batch, head, seq, head_dim)
        return projected.view(seq_len, batch_size, nhead, head_dim).permute(1, 2, 0, 3)

    # Additive mask, broadcast over the batch and head axes.
    causal_mask = generate_square_subsequent_mask(seq_len).to(device)[None, None]
    score_scale = math.sqrt(head_dim)

    # Work in (seq, batch, d_model) layout inside the layer stack.
    hidden = F.embedding(src, embedding_table).transpose(0, 1)
    hidden = hidden + parameters['pe'][:seq_len, :].unsqueeze(1)

    for layer in parameters['layers']:
        # --- masked multi-head self-attention ---
        queries = split_heads(hidden @ layer['W_q'])
        keys = split_heads(hidden @ layer['W_k'])
        values = split_heads(hidden @ layer['W_v'])

        scores = queries @ keys.transpose(-2, -1) / score_scale + causal_mask
        weights = F.softmax(scores, dim=-1)
        context = weights @ values

        # Merge heads back: (batch, head, seq, head_dim) -> (seq, batch, d_model)
        merged = context.permute(2, 0, 1, 3).contiguous().view(seq_len, batch_size, d_model)
        attn_out = merged @ layer['W_o']
        if training:
            attn_out = F.dropout(attn_out, p=dropout_p, training=training)

        hidden = F.layer_norm(
            hidden + attn_out,
            (d_model,),
            weight=layer['norm1_weight'],
            bias=layer['norm1_bias'],
        )

        # --- position-wise feed-forward ---
        expanded = F.relu(F.linear(hidden, layer['linear1_weight'], layer['linear1_bias']))
        ff_out = F.linear(expanded, layer['linear2_weight'], layer['linear2_bias'])
        if training:
            ff_out = F.dropout(ff_out, p=dropout_p, training=training)

        hidden = F.layer_norm(
            hidden + ff_out,
            (d_model,),
            weight=layer['norm2_weight'],
            bias=layer['norm2_bias'],
        )

    # Back to (batch, seq, d_model) before projecting to the vocabulary.
    return F.linear(hidden.transpose(0, 1), parameters['fc_out_weight'], parameters['fc_out_bias'])

def get_model_parameters_list(parameters):
    """Flatten the parameter dictionary into a single list.

    The embedding table and output projection come first, followed by each
    layer's tensors in a fixed key order.

    Args:
        parameters: Dict with ``'token_emb_weight'``, ``'fc_out_weight'``,
            ``'fc_out_bias'`` and a ``'layers'`` list of per-layer dicts.

    Returns:
        A new list holding every entry in that order.
    """
    per_layer_keys = (
        'W_q', 'W_k', 'W_v', 'W_o',
        'norm1_weight', 'norm1_bias',
        'norm2_weight', 'norm2_bias',
        'linear1_weight', 'linear1_bias',
        'linear2_weight', 'linear2_bias',
    )
    top_level_keys = ('token_emb_weight', 'fc_out_weight', 'fc_out_bias')

    collected = [parameters[key] for key in top_level_keys]
    for layer in parameters['layers']:
        collected += [layer[key] for key in per_layer_keys]
    return collected

def generate_names(parameters, idx_to_char, char_to_idx, device, max_len, num_names, temperature):
    """Sample names from the model one character at a time.

    Every name starts from a lone ``<SOS>`` token. At each step the logits of
    the last position are divided by ``temperature``, ``<PAD>`` and ``<SOS>``
    are masked out, and the next token is drawn from the resulting
    distribution. A name ends when ``<EOS>`` is drawn or after ``max_len``
    sampling steps.

    Args:
        parameters: Model parameter dict; its ``'pe'`` entry is moved to
            ``device`` in place.
        idx_to_char: Mapping from token index to its string.
        char_to_idx: Mapping from token string to index; must contain
            ``'<PAD>'`` and ``'<SOS>'``.
        device: Device on which sampling runs.
        max_len: Maximum number of sampling steps per name.
        num_names: Number of names to generate.
        temperature: Divisor applied to the logits before the softmax.

    Returns:
        List of ``num_names`` generated strings.
    """
    parameters['pe'] = parameters['pe'].to(device)
    sos_idx = char_to_idx['<SOS>']
    # Neither <PAD> nor <SOS> is a valid continuation; if <SOS> were left
    # unmasked its literal text could be sampled and appended to a name.
    blocked = [char_to_idx['<PAD>'], sos_idx]

    generated_names = []
    with torch.no_grad():
        for _ in range(num_names):
            # Start from <SOS>
            src = torch.tensor([[sos_idx]], dtype=torch.long, device=device)
            pieces = []
            for _ in range(max_len):
                logits = gpt_model_forward(src, parameters, device, training=False)
                logits = logits[:, -1, :] / temperature
                logits[:, blocked] = -float('Inf')

                probs_out = F.softmax(logits, dim=-1)
                next_char_idx = torch.multinomial(probs_out, num_samples=1)
                next_char = idx_to_char[next_char_idx.item()]
                if next_char == '<EOS>':
                    break
                pieces.append(next_char)
                src = torch.cat([src, next_char_idx], dim=1)
            generated_names.append(''.join(pieces))
    return generated_names



In [2]:
# Load file
with open(input_file, 'r') as f:
    file_content = f.read()

names, char_to_idx, idx_to_char, input_sequences, target_sequences, vocab_size, max_name_length = load_data(file_content)
parameters = initialize_model(vocab_size, d_model, nhead, num_layers, max_name_length, device, dropout=dropout_p)

# Collect the trainable tensors once; the same list feeds both the optimizer
# and gradient clipping.
model_params = get_model_parameters_list(parameters)
optimizer = torch.optim.Adam(model_params, lr=learning_rate, weight_decay=0.01)
# <PAD> targets are excluded from the loss.
criterion = torch.nn.CrossEntropyLoss(ignore_index=char_to_idx['<PAD>'])

print("Training...")
num_samples = input_sequences.size(0)
# Ceiling division, so a dataset size that is an exact multiple of batch_size
# is not counted as one extra batch (which would deflate the reported loss).
total_batches = math.ceil(num_samples / batch_size)

for epoch in range(num_epochs):
    total_loss = 0
    # Fresh shuffle every epoch.
    permutation = torch.randperm(num_samples)
    for i in range(0, num_samples, batch_size):
        indices = permutation[i:i+batch_size]
        batch_input = input_sequences[indices].to(device)
        batch_target = target_sequences[indices].to(device)
        optimizer.zero_grad()
        logits = gpt_model_forward(batch_input, parameters, device, training=True)
        # Flatten (batch, seq) so each position is one classification example.
        loss = criterion(logits.view(-1, vocab_size), batch_target.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_params, max_norm=1.0)
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / total_batches
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")
print("Training completed!")

# Generate names
generated_names = generate_names(parameters, idx_to_char, char_to_idx, device, max_name_length, num_names_to_generate, temperature)
print("Generated Names:")
for name in generated_names:
    print(name)


Training...
Epoch 1/10, Loss: 2.6491
Epoch 2/10, Loss: 2.4509
Epoch 3/10, Loss: 2.4214
Epoch 4/10, Loss: 2.4067
Epoch 5/10, Loss: 2.3925
Epoch 6/10, Loss: 2.3815
Epoch 7/10, Loss: 2.3761
Epoch 8/10, Loss: 2.3715
Epoch 9/10, Loss: 2.3677
Epoch 10/10, Loss: 2.3649
Training completed!
Generated Names:
riarile
lyaren
leleyn
karye
bila
relien
rirsse
zulele
nebane
kiacyin
