# Letter concatenation with a decoder-only Transformer model

In [272]:
!pip install -r requirements.txt



## Data Preparation

I used a dataset called 'emilianosandri/usnames' from Kaggle, but it is no longer on Kaggle. However, the data used is stored in the data folder. Below, we load the first and last name JSON files and combine them into a dataframe. Additional columns for name, which is the first and last name concatenated, and the target, which is  the initials.

In [273]:
import pandas as pd 
import numpy as np

firstnames_path = './data/firstnames_f.json'
surnames_path = './data/surnames.json'

firstnames_df = pd.read_json(firstnames_path)
surnames_df = pd.read_json(surnames_path)

combined_names_df = pd.DataFrame()
combined_names_df['surname'] = surnames_df

combined_names_df['firstname'] = np.random.choice(firstnames_df[0].values, len(surnames_df), replace=True)
combined_names_df.head()

Unnamed: 0,surname,firstname
0,Smith,Shonte
1,Johnson,Sumiko
2,Williams,Kaylan
3,Brown,Ara
4,Jones,Danaja


In [274]:
# add a column concat with firstname and surname
combined_names_df['name'] = combined_names_df['firstname'] + ' ' + combined_names_df['surname']

# add a column target with first letter of firstname and surname
combined_names_df['target'] = combined_names_df['firstname'].str[0] + combined_names_df['surname'].str[0]
combined_names_df.head()


Unnamed: 0,surname,firstname,name,target
0,Smith,Shonte,Shonte Smith,SS
1,Johnson,Sumiko,Sumiko Johnson,SJ
2,Williams,Kaylan,Kaylan Williams,KW
3,Brown,Ara,Ara Brown,AB
4,Jones,Danaja,Danaja Jones,DJ


In [275]:
# save the dataset to csv
# combined_names_df.to_csv('combined_names.csv', index=False)

# load from csv
combined_names_df = pd.read_csv('combined_names.csv')

## Dataset

In [316]:
from torch.utils.data import Dataset, DataLoader
import torch

class DatasetWithNames(Dataset):
    def __init__(self, names, targets):
        self.names = names
        self.targets = targets

    def __len__(self):
        return len(self.names)
    
    def __getitem__(self, idx):
        return {
            'name': self.names[idx],
            'target': self.targets[idx]
        }

def collate_fn(batch, tokenizer, max_length=30):
    names = [item['name'] for item in batch]
    targets = [item['target'] for item in batch]

    text_inputs = [f"{name} -> {target}{tokenizer.eos_token}" for name, target in zip(names, targets)]

    encoded = tokenizer.tokenize(
        text_inputs,
        padding=True,  # Dynamic padding
        truncation=True,
        max_length=max_length
    )

    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'labels': encoded['input_ids'],
        'names': names,
        'targets': targets
    }


## Tokenizer

In [317]:
class Tokenizer:
    def __init__(self, tokens=None):
        alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ")
        self.additional_tokens = ["->"]
        self.special_tokens = ["<EOS>", "[PAD]", "[UNK]"]
        self.tokens = tokens if tokens else self.special_tokens + self.additional_tokens + alphabet
        self.vocab_size = len(self.tokens)

        self.token_to_id = {token: idx for idx, token in enumerate(self.tokens)}
        self.id_to_token = {idx: token for token, idx in self.token_to_id.items()}

        self.set_special_tokens()

    def set_special_tokens(self):
        self.eos_token = "<EOS>"
        self.pad_token = "[PAD]"
        self.unk_token = "[UNK]"
        self.pad_token_id = self.token_to_id[self.pad_token]
        self.eos_token_id = self.token_to_id[self.eos_token]
        self.unk_token_id = self.token_to_id[self.unk_token]

    def check_if_special_token(self, text, index):
        if text[index:index+5] in self.token_to_id:
            return self.token_to_id[text[index:index+5]], 5
        else:
            return False

    def encode(self, text):
        tokens = []
        i = 0
        while i < len(text):
            if text[i:i+2] == "->":
                tokens.append(self.token_to_id["->"])
                i += 2
            elif self.check_if_special_token(text, i):
                token_id, length = self.check_if_special_token(text, i)
                tokens.append(token_id)
                i += length
            elif text[i] in self.token_to_id:
                tokens.append(self.token_to_id[text[i]])
                i += 1
            else:
                tokens.append(self.token_to_id["[UNK]"])
                i += 1
        return tokens

    def decode(self, ids):
        return "".join([self.id_to_token.get(idx, "[UNK]") for idx in ids])


    def tokenize(self, text_inputs, padding=True, truncation=True, max_length=30):
        tokenized_texts = [self.encode(text) for text in text_inputs]
        max_length = max(len(tokens) for tokens in tokenized_texts)
        input_ids = []
        attention_mask = []
        for text in text_inputs:
            tokens = self.encode(text)
            if truncation and len(tokens) > max_length:
                tokens = tokens[:max_length]
            input_ids.append(tokens)
            attention_mask.append([1] * len(tokens))
            if padding:
                while len(tokens) < max_length:
                    tokens.append(self.token_to_id[self.pad_token])
                    attention_mask[-1].append(0)
        return {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask),
            'labels': torch.tensor(input_ids),
            'names': text_inputs,
        }



In [318]:
tokenizer = Tokenizer()

# Example usage
text = "Dario Kuffer -> DK<EOS>"
encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded)
print(f"Original text: {text}")
print(f"Encoded: {encoded}")
print(f"Decoded: {decoded}")

Original text: Dario Kuffer -> DK<EOS>
Encoded: [33, 4, 21, 12, 18, 56, 40, 24, 9, 9, 8, 21, 56, 3, 56, 33, 40, 0]
Decoded: Dario Kuffer -> DK<EOS>


In [319]:
from torch.utils.data import random_split

file_path = 'combined_names.csv'
df = pd.read_csv(file_path)

train_df = df[:100000]


dataset = DatasetWithNames(
    train_df['name'].tolist(),
    train_df['target'].tolist()
)

dataset_size = len(dataset)
train_size = int(0.8 * dataset_size)
val_size = dataset_size - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

BATCH_SIZE = 256

# Create DataLoader
dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=lambda batch: collate_fn(batch, tokenizer), drop_last=True)
validation_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=lambda batch: collate_fn(batch, tokenizer), drop_last=True)

In [322]:
# Example usage
for batch in dataloader:
    for index in range(len(batch['input_ids'])):
        input_ids = batch['input_ids'][index]
        attention_mask = batch['attention_mask'][index]
        labels = batch['labels'][index]
        print("Input IDs:", input_ids)
        print("Attention Mask:", attention_mask)
        print("Labels:", labels)
        decoded_text = tokenizer.decode(input_ids.tolist())
        print("Decoded Text:", decoded_text)
        print("Original Name:", batch['names'][index])
        print("Original Target:", batch['targets'][index])
        print(100*'-')
        if index == 3:
            break
    break

Input IDs: tensor([33,  8, 17, 12, 22, 56, 32, 12, 21,  8, 15, 15,  4, 56,  3, 56, 33, 32,
         0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])
Attention Mask: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0])
Labels: tensor([33,  8, 17, 12, 22, 56, 32, 12, 21,  8, 15, 15,  4, 56,  3, 56, 33, 32,
         0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])
Decoded Text: Denis Cirella -> DC<EOS>[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
Original Name: Denis Cirella
Original Target: DC
----------------------------------------------------------------------------------------------------
Input IDs: tensor([31, 21,  4, 17,  7,  8, 17, 56, 45, 21, 12, 17,  6,  8, 56,  3, 56, 31,
        45,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1])
Attention Mask: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0])
Labels: tensor([31, 21,  4, 17,  7,  8, 17, 56, 45, 21, 12, 17,  6,  8, 56,  3, 56, 31,
    

## Models

In [None]:
import torch
from torch import nn
from torch.nn import TransformerDecoder, TransformerDecoderLayer

class DecoderOnlyTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=128, n_layers=2, n_heads=4, dim_feedforward=512, max_seq_len=100, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        
        # Learned positional embedding
        self.positional_embedding = nn.Embedding(max_seq_len, d_model)
        self.dropout = nn.Dropout(dropout)

        decoder_layer = TransformerDecoderLayer(
            d_model=d_model, 
            nhead=n_heads, 
            dim_feedforward=dim_feedforward, 
            dropout=dropout,
            batch_first=True  # Ensures input has shape (batch_size, seq_len, features)
        )

        self.decoder = TransformerDecoder(decoder_layer, num_layers=n_layers)
        self.output_head = nn.Linear(d_model, vocab_size)  # Final projection to vocabulary size


    def generate_square_subsequent_mask(self, seq_len, device):
        mask = torch.triu(torch.ones(seq_len, seq_len, device=device, dtype=torch.bool), diagonal=1)
        return mask


    def forward(self, input_ids, tgt_mask=None, tgt_key_padding_mask=None):
        seq_len = input_ids.size(1)

        # Convert token IDs to embeddings
        tgt_embeddings = self.embedding(input_ids)

        # Add learned positional embeddings
        position_ids = torch.arange(seq_len, device=input_ids.device).unsqueeze(0).expand_as(input_ids)
        pos_embeddings = self.positional_embedding(position_ids)

        # Combine token and positional embeddings
        tgt_embeddings = tgt_embeddings + pos_embeddings
        tgt_embeddings = self.dropout(tgt_embeddings)

        # Generate causal mask if not provided (prevents future token information leakage)
        if tgt_mask is None:
            tgt_mask = self.generate_square_subsequent_mask(seq_len, input_ids.device)

        # Pass through decoder
        decoder_output = self.decoder(
            tgt=tgt_embeddings, 
            memory=tgt_embeddings,  # Self-attention, no separate encoder memory
            tgt_mask=tgt_mask, 
            tgt_key_padding_mask=tgt_key_padding_mask
        )

        logits = self.output_head(decoder_output)
        return logits  # Return logits for prediction


## Method to Train the Model

In [None]:
from torch.optim import AdamW

def train_model(model, tokenizer, dataloader, num_epochs=5, learning_rate=1e-4):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    for epoch in range(num_epochs):
        total_loss = 0
        for batch in dataloader:
            batch = {key: val.to(device) for key, val in batch.items()}
            optimizer.zero_grad()

            input_ids = batch['input_ids']
            seq_len = input_ids.size(1)

            tgt_mask = model.generate_square_subsequent_mask(seq_len, input_ids.device)
            tgt_key_padding_mask = batch['attention_mask'] == 0 if 'attention_mask' in batch else None

            logits = model(input_ids=input_ids, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_key_padding_mask)

            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = batch['labels'][:, 1:].contiguous()

            loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")



## Training the Model

In [263]:
model = DecoderOnlyTransformer(tokenizer.vocab_size)
train_model(model, tokenizer, dataloader, num_epochs=5, learning_rate=2e-4)

Epoch 1/5, Loss: 1.4861
Epoch 2/5, Loss: 0.0654
Epoch 3/5, Loss: 0.0182
Epoch 4/5, Loss: 0.0091
Epoch 5/5, Loss: 0.0056


## Predict Output Sequence

In [347]:
def greedy(model, tokenizer, name, max_length=10, max_output=None, verbose=False):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    
    # Prepare input
    input_text = [f"{name} -> "]
    inputs = tokenizer.tokenize(
        input_text,
        padding=False,
        truncation=True,
        max_length=max_length)
    
    generated = inputs['input_ids']
    
    with torch.no_grad():

        for _ in range(max_length - 1):
            seq_len = generated.size(1)
            tgt_mask = model.generate_square_subsequent_mask(seq_len, device)
            outputs = model(input_ids=generated, tgt_mask=tgt_mask)
            next_token_logits = outputs[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1)

            if next_token[0].item() == tokenizer.eos_token_id:
                break
                
            generated = torch.cat([generated, next_token], dim=1)
    

    predicted_text = tokenizer.decode(generated[0].tolist())

    if verbose:
        print("Predicted text:", predicted_text)
        
    # Extract just the initials
    initials = predicted_text.split("->")[-1].strip()
    if max_output is not None:
        initials = initials[:max_output]
    
    return initials

In [206]:
def beam_search(model, tokenizer, name, beam_width=3, max_length=10):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inputs = tokenizer.tokenize(
        [f"{name} -> "],
        padding=False,
        truncation=True,
        max_length=max_length
    )
    input_ids = inputs['input_ids'].to(device)

    beams = [(input_ids, 0)]  # (sequence, score)

    with torch.no_grad():
        for _ in range(max_length):
            new_beams = []

            for beam_input_ids, beam_score in beams:
                seq_len = beam_input_ids.size(1)
                tgt_mask = model.generate_square_subsequent_mask(seq_len, device)

                outputs = model(input_ids=beam_input_ids, tgt_mask=tgt_mask)
                next_token_logits = outputs[:, -1, :]
                log_probs = torch.log_softmax(next_token_logits, dim=-1)

                topk_log_probs, topk_indices = torch.topk(log_probs, beam_width, dim=-1)

                for log_prob, token_id in zip(topk_log_probs[0], topk_indices[0]):
                    token_id = token_id.view(1, 1)
                    token = tokenizer.decode(token_id[0].tolist())
                    next_input_ids = torch.cat([beam_input_ids, token_id], dim=1)
                    total_score = beam_score + log_prob.item()

        
                    new_beams.append((next_input_ids, total_score))

            beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]

            completed = [beam for beam in beams if beam[0][0, -1].item() == tokenizer.eos_token_id]
            if completed:
                beams = completed
                break

    # print beams
    beams = sorted(beams, key=lambda x: x[1], reverse=True)

    best_beam = beams[0][0].squeeze(0).tolist()
    # remove EOS token if present
    best_beam = [token for token in best_beam if token != tokenizer.eos_token_id]
    predicted_text = tokenizer.decode(best_beam)

    initials = predicted_text.split("->")[-1].strip()
    return initials
 

In [None]:
name = "Salina Nagan"

initials = beam_search(model, tokenizer, name)
print(f"Predicted initials for '{name}': {initials}")


Predicted initials for 'Salina Nagan': SN


In [None]:
name = "Salina Nagan"

initials = greedy(model, tokenizer, name)
print(f"Predicted initials for '{name}': {initials}")


Predicted initials for 'Salina Nagan': SN


## Evaluation

In [None]:
def evaluate_model(model, tokenizer, dataloader, fn=greedy, max_items=100):
    predictions = []
    targets = []
    counter = 0
    for batch in dataloader:
        for index in range(len(batch['input_ids'])):
            if counter >= max_items:
                break
            counter += 1
            name = batch['names'][index]
            initials = fn(model, tokenizer, name)
            predictions.append(initials)
            targets.append(batch['targets'][index])

    correct_predictions = sum(1 for pred, tgt in zip(predictions, targets) if pred == tgt)
    accuracy = correct_predictions / len(targets) * 100  # Percentage accuracy
    print(f"Accuracy: {accuracy:.2f}%")

In [350]:
# Greedy search evaluation
evaluate_model(model, tokenizer, validation_dataloader, fn=greedy)

Accuracy: 92.00%


In [351]:
# Beam search evaluation
evaluate_model(model, tokenizer, validation_dataloader, fn=beam_search)

Accuracy: 96.00%


In [352]:
# Greedy search evaluation with trimmed output
# This will limit the output to 2 characters
greedy_trimmed_to_two_chars = lambda model, tokenizer, name: greedy(model, tokenizer, name, max_length=10, max_output=2)
evaluate_model(model, tokenizer, validation_dataloader, fn=greedy_trimmed_to_two_chars)

Accuracy: 97.00%
