# Letter concatenation with a decoder-only Transformer model

In [8]:
!pip install -r requirements.txt

Collecting transformers (from -r requirements.txt (line 2))
  Downloading transformers-4.50.3-py3-none-any.whl.metadata (39 kB)
Collecting huggingface-hub<1.0,>=0.26.0 (from transformers->-r requirements.txt (line 2))
  Downloading huggingface_hub-0.30.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers->-r requirements.txt (line 2))
  Downloading regex-2024.11.6-cp311-cp311-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers->-r requirements.txt (line 2))
  Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers->-r requirements.txt (line 2))
  Downloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Downloading transformers-4.50.3-py3-none-any.whl (10.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading huggingfac

## Data Preparation

I used a dataset called 'emilianosandri/usnames' from Kaggle, but it is no longer on Kaggle. However, the data used is stored in the data folder. Below, we load the first and last name JSON files and combine them into a dataframe. Additional columns for name, which is the first and last name concatenated, and the target, which is  the initials.

In [3]:
import pandas as pd 

firstnames_path = './data/firstnames_f.json'
surnames_path = './data/surnames.json'

firstnames_df = pd.read_json(firstnames_path)
surnames_df = pd.read_json(surnames_path)

In [4]:
import pandas as pd
import numpy as np

combined_names_df = pd.DataFrame()
combined_names_df['surname'] = surnames_df

combined_names_df['firstname'] = np.random.choice(firstnames_df[0].values, len(surnames_df), replace=True)
combined_names_df.head()

Unnamed: 0,surname,firstname
0,Smith,Lewanna
1,Johnson,Gia
2,Williams,Maresa
3,Brown,Lorene
4,Jones,Natika


In [None]:
# add a column concat with firstname and surname
combined_names_df['name'] = combined_names_df['firstname'] + ' ' + combined_names_df['surname']

# add a column target with first letter of firstname and surname
combined_names_df['target'] = combined_names_df['firstname'].str[0] + combined_names_df['surname'].str[0]
combined_names_df.head()



Unnamed: 0,surname,firstname,name,target
0,Smith,Lewanna,Lewanna Smith,LS
1,Johnson,Gia,Gia Johnson,GJ
2,Williams,Maresa,Maresa Williams,MW
3,Brown,Lorene,Lorene Brown,LB
4,Jones,Natika,Natika Jones,NJ


In [20]:
# save the dataset to csv
# combined_names_df.to_csv('combined_names.csv', index=False)

# load from csv
combined_names_df = pd.read_csv('combined_names.csv')

## Dataset

In [21]:
from torch.utils.data import Dataset, DataLoader
import torch

class NameDataset(Dataset):
    def __init__(self, names, targets):
        self.names = names
        self.targets = targets

    def __len__(self):
        return len(self.names)
    
    def __getitem__(self, idx):
        return {
            'name': self.names[idx],
            'target': self.targets[idx]
        }

def collate_fn(batch, tokenizer, max_length=30):
    names = [item['name'] for item in batch]
    targets = [item['target'] for item in batch]

    text_inputs = [f"{name} -> {target}{tokenizer.eos_token}" for name, target in zip(names, targets)]

    encoded = tokenizer(
        text_inputs,
        padding=True,  # Dynamic padding
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'labels': encoded['input_ids']
    }


## Tokenizer

In [23]:
from transformers import AutoTokenizer
import pandas as pd
from torch.utils.data import random_split, DataLoader

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'additional_special_tokens': list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")})
tokenizer.add_special_tokens({'additional_special_tokens': ['->']})
tokenizer.add_special_tokens({'additional_special_tokens': [' ']})
tokenizer.add_special_tokens({'eos_token': '<EOS>'})
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
print(tokenizer.tokenize("Max Muster ->"))  # ['H', 'e', 'l', 'l', 'o', 'W', 'o', 'r', 'l', 'd', '!']



file_path = 'combined_names.csv'
df = pd.read_csv(file_path)

train_df = df[:10000]


dataset = NameDataset(
    train_df['name'].tolist(),
    train_df['target'].tolist()
)
# Assume dataset is already prepared
dataset_size = len(dataset)
train_size = int(0.8 * dataset_size)
val_size = dataset_size - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoader
dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=lambda batch: collate_fn(batch, tokenizer))
validation_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=lambda batch: collate_fn(batch, tokenizer))

['M', 'a', 'x', ' ', 'M', 'u', 's', 't', 'e', 'r', ' ', '->']


## Models

In [None]:
from transformers import GPT2LMHeadModel, GPT2Config
import torch.nn as nn

class CustomDecoderModel(nn.Module):
    def __init__(self, tokenizer):
        super().__init__()
        config = GPT2Config(
            vocab_size=len(tokenizer), 
            n_embd=128, 
            n_layer=4, 
            n_head=4
        )
        self.model = GPT2LMHeadModel(config)

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs

In [11]:
import torch
from torch import nn
from transformers import GPT2Config, GPT2Model
from transformers import GPT2LMHeadModel

class CustomDecoderModel(nn.Module):
    def __init__(self):
        super().__init__()
        config = GPT2Config(
            vocab_size=len(tokenizer), 
            n_embd=128, 
            n_layer=6, 
            n_head=8
        )
        self.model = GPT2LMHeadModel(config)  # Includes output head

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.logits  # Returns predicted token logits directly



In [12]:
import torch
from torch import nn
from torch.nn import TransformerDecoder, TransformerDecoderLayer

class CustomVaswaniDecoder(nn.Module):
    def __init__(self, vocab_size, d_model=128, n_layers=2, n_heads=4, dim_feedforward=512, max_seq_len=100, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        
        # Learned positional embedding
        self.positional_embedding = nn.Embedding(max_seq_len, d_model)
        
        decoder_layer = TransformerDecoderLayer(
            d_model=d_model, 
            nhead=n_heads, 
            dim_feedforward=dim_feedforward, 
            dropout=dropout,
            batch_first=True  # Ensures input has shape (batch_size, seq_len, features)
        )

        self.decoder = TransformerDecoder(decoder_layer, num_layers=n_layers)
        self.output_head = nn.Linear(d_model, vocab_size)  # Final projection to vocabulary size

    def forward(self, input_ids, tgt_mask=None, tgt_key_padding_mask=None):
        seq_len = input_ids.size(1)

        # Convert token IDs to embeddings
        tgt_embeddings = self.embedding(input_ids)

        # Add learned positional embeddings
        position_ids = torch.arange(seq_len, device=input_ids.device).unsqueeze(0).expand_as(input_ids)
        pos_embeddings = self.positional_embedding(position_ids)

        # Combine token and positional embeddings
        tgt_embeddings = tgt_embeddings + pos_embeddings

        # Generate causal mask if not provided (prevents future token information leakage)
        if tgt_mask is None:
            tgt_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool().to(input_ids.device)

        # Pass through decoder
        decoder_output = self.decoder(
            tgt=tgt_embeddings, 
            memory=tgt_embeddings,  # Self-attention, no separate encoder memory
            tgt_mask=tgt_mask, 
            tgt_key_padding_mask=tgt_key_padding_mask
        )

        logits = self.output_head(decoder_output)
        return logits  # Return logits for prediction


## Method to Train the Model

In [13]:
def train_model(model, tokenizer, dataloader, num_epochs=5, learning_rate=1e-4, max_length=30):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.train()

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    for epoch in range(num_epochs):
        total_loss = 0
        for batch in dataloader:
            batch = {key: val.to(device) for key, val in batch.items()}
            optimizer.zero_grad()

            input_ids = batch['input_ids']

            # Generate causal mask for decoder (future tokens are masked)
            tgt_mask = torch.triu(torch.ones(input_ids.size(1), input_ids.size(1)), diagonal=1).bool().to(device)

            # Convert attention_mask to tgt_key_padding_mask if available
            tgt_key_padding_mask = batch['attention_mask'] == 0 if 'attention_mask' in batch else None

            # Forward pass
            logits = model(input_ids=input_ids, tgt_mask=tgt_mask, tgt_key_padding_mask=tgt_key_padding_mask)

            # Shift logits and labels for causal language modeling
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = batch['labels'][:, 1:].contiguous()

            # Compute loss
            loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    print("Training complete!")
    return model


## Training the Model

In [14]:
from torch.optim import AdamW

# model = CustomDecoderModel(tokenizer=tokenizer)
# model = CustomDecoderModel()
model = CustomVaswaniDecoder(len(tokenizer))

model.to(device)

CustomVaswaniDecoder(
  (embedding): Embedding(50260, 128)
  (positional_embedding): Embedding(100, 128)
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=512, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=512, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0

In [15]:
train_model(model, tokenizer, dataloader, num_epochs=5)

Epoch 1/5, Loss: 4.7683
Epoch 2/5, Loss: 1.1804
Epoch 3/5, Loss: 0.2298
Epoch 4/5, Loss: 0.0757
Epoch 5/5, Loss: 0.0367
Training complete!


CustomVaswaniDecoder(
  (embedding): Embedding(50260, 128)
  (positional_embedding): Embedding(100, 128)
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=512, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=512, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0

## Predict Output Sequence

In [16]:
def generate_initials(model, tokenizer, name, max_length=20):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    
    # Prepare input
    input_text = f"{name} -> "
    inputs = tokenizer(
        input_text,
        padding=False,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    ).to(device)
    
    input_ids = inputs['input_ids']
    print("Input Text:", tokenizer.decode(input_ids[0], skip_special_tokens=False))
    
    with torch.no_grad():
        # Create the sequence mask
        seq_len = input_ids.size(1)
        tgt_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool().to(device)
        
        # Get initial prediction
        outputs = model(input_ids=input_ids, tgt_mask=tgt_mask)
        next_token_logits = outputs[:, -1, :]
        next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1)
        generated = torch.cat([input_ids, next_token], dim=1)
        
        # Generate remaining tokens
        for _ in range(max_length - 1):
            seq_len = generated.size(1)
            tgt_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool().to(device)
            
            outputs = model(input_ids=generated, tgt_mask=tgt_mask)
            next_token_logits = outputs[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1)
            
            # Stop if we predict EOS
            if next_token.item() == tokenizer.eos_token_id:
                break
                
            generated = torch.cat([generated, next_token], dim=1)
    
    # Decode and print the final output
    predicted_text = tokenizer.decode(generated[0], skip_special_tokens=False)
    print("Generated Output:", predicted_text)
    
    # Extract just the initials
    initials = predicted_text.split("->")[-1].strip()
    initials = initials.split("<")[0].strip()  # Remove EOS token if present
    
    return initials

In [17]:
# Sample input
name = "Dario Mustermann"

# Generate initials
initials = generate_initials(model, tokenizer, name)
print(f"Predicted initials for '{name}': {initials}")


Input Text: Dario Mustermann -> 
Generated Output: Dario Mustermann -> DM
Predicted initials for 'Dario Mustermann': DM


## Evaluation

In [18]:
# use df for testing

df = pd.read_csv('combined_names.csv')
start = 100000
number_of_items = 100
df = df[start:start+number_of_items]

predictions = []
targets = []

for idx, row in df.iterrows():
    name = row['name']
    initials = generate_initials(model, tokenizer, name)
    target = row['target']
    targets.append(target)
    predictions.append(initials)
    print(f"Predicted initials for '{name}': {initials}")


print()
print("Predictions:", predictions)
print("Targets:", targets)

# Calculate accuracy
correct_predictions = sum(1 for pred, tgt in zip(predictions, targets) if pred == tgt)
accuracy = correct_predictions / len(targets) * 100  # Percentage accuracy

print(f"Accuracy: {accuracy:.2f}%")


Input Text: Rishita Mogrovejo -> 
Generated Output: Rishita Mogrovejo -> RM
Predicted initials for 'Rishita Mogrovejo': RM
Input Text: Inez Mollin -> 
Generated Output: Inez Mollin -> IM
Predicted initials for 'Inez Mollin': IM
Input Text: Devony Mollitor -> 
Generated Output: Devony Mollitor -> DM
Predicted initials for 'Devony Mollitor': DM
Input Text: Zaidee Mooris -> 
Generated Output: Zaidee Mooris -> ZM
Predicted initials for 'Zaidee Mooris': ZM
Input Text: Alma Moraida -> 
Generated Output: Alma Moraida -> AM
Predicted initials for 'Alma Moraida': AM
Input Text: Davyn Motheral -> 
Generated Output: Davyn Motheral -> DM
Predicted initials for 'Davyn Motheral': DM
Input Text: Doris Mottaz -> 
Generated Output: Doris Mottaz -> DM
Predicted initials for 'Doris Mottaz': DM
Input Text: Teonna Moucha -> 
Generated Output: Teonna Moucha -> TM
Predicted initials for 'Teonna Moucha': TM
Input Text: Shifra Muia -> 
Generated Output: Shifra Muia -> SM
Predicted initials for 'Shifra Muia': S