In [5]:
import pandas as pd
from num2words import num2words

def create_small_dataset():
    """Build a toy number-to-words table for the integers 0 through 99.

    Returns:
        pd.DataFrame: one row per integer with two string columns,
        ``num`` (the decimal digits) and ``word`` (the ``num2words`` spelling).
    """
    rows = []
    for value in range(0, 100):
        rows.append((str(value), num2words(value)))
    return pd.DataFrame(rows, columns=['num', 'word'])

# Build the 0..99 table and shuffle all of its rows with a fixed seed.
small_dataset = create_small_dataset()
train_df = small_dataset.sample(frac=1, random_state=42)
# Validation is the very same frame as training here: no rows are held out.
# NOTE(review): both names are rebound by the create_dataset() cell further down.
val_df = train_df


In [6]:
def create_dataset(start=0, end=1000, train_size=0.8):
    """Create a number-to-words table and split it into train / held-out frames.

    Args:
        start (int): first integer included.
        end (int): upper bound, excluded (``range`` semantics).
        train_size (float): fraction of rows sampled into the training frame.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(train_df, test_df)``. The training
        rows are sampled with ``random_state=42``; the second frame holds every
        remaining row. Both keep the original integer index.
    """
    values = range(start, end)
    digit_strings = map(str, values)
    spelled_out = map(num2words, values)
    full_df = pd.DataFrame(list(zip(digit_strings, spelled_out)), columns=['num', 'word'])

    train_part = full_df.sample(frac=train_size, random_state=42)
    remaining = full_df.drop(index=train_part.index)

    return train_part, remaining

# Rebind train_df / val_df to an 80/20 split of the integers 0..999.
train_df, val_df = create_dataset(0, 1000, 0.8)

In [7]:
# Row counts, shown as (validation, training).
len(val_df), len(train_df)

(200, 800)

In [8]:
# Peek at the first rows of the shuffled training frame.
train_df.head()

Unnamed: 0,num,word
521,521,five hundred and twenty-one
737,737,seven hundred and thirty-seven
740,740,seven hundred and forty
660,660,six hundred and sixty
411,411,four hundred and eleven


In [9]:
import sentencepiece as spm
import io
# Train a small BPE tokenizer on the spelled-out training targets only.
# The corpus is one spelling per line, fed from memory rather than from a file.
text_data = "\n".join(train_df['word'])
text_stream = io.StringIO(text_data)

# The three user-defined symbols are added on top of SentencePiece's built-in
# <unk>/<s>/</s> pieces (ids 0/1/2 in the training log). The rest of this file
# hard-codes <PAD>=3, <BOS>=4, <EOS>=5 for them, so changing this list or its
# order requires updating those constants as well.
spm.SentencePieceTrainer.train(
    sentence_iterator=text_stream,  # iterate the in-memory corpus line by line
    model_prefix='num_to_words',    # prefix for the model/vocab files written by the trainer
    vocab_size=200,                   # total number of pieces, special symbols included
    model_type='bpe',                 # byte-pair-encoding segmentation
    character_coverage=1.0,            # keep every character seen in the corpus
    user_defined_symbols=['<PAD>', '<BOS>', '<EOS>']  # extra special pieces used for padding/framing
)


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input_format: 
  model_prefix: num_to_words
  model_type: BPE
  vocab_size: 200
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  user_defined_symbols: <PAD>
  user_defined_symbols: <BOS>
  user_defined_symbols: <EOS>
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pa

In [10]:
import sentencepiece as spm
import torch
import pandas as pd
import torch


def tokenize_word_column(df_column):
    """Encode a pandas Series of spelled-out numbers into a padded id tensor.

    Every text is encoded with the ``num_to_words.model`` SentencePiece model,
    wrapped as ``<BOS> ... <EOS>`` and right-padded with ``<PAD>`` up to the
    longest sequence in the column.

    Args:
        df_column (pd.Series): strings to encode.

    Returns:
        torch.Tensor: ``torch.long`` tensor with one row per input string.
    """
    processor = spm.SentencePieceProcessor(model_file='num_to_words.model')

    # Ids of the user-defined symbols in the trained model: <PAD>, <BOS>, <EOS>.
    pad_id, bos_id, eos_id = 3, 4, 5

    def _frame(text):
        # Surround the encoded pieces with the begin/end markers.
        return [bos_id] + processor.encode(text, out_type=int) + [eos_id]

    framed = df_column.apply(_frame)
    longest = framed.apply(len).max()

    def _pad(ids):
        # Right-pad so that every row has the same length.
        return ids + [pad_id] * (longest - len(ids))

    padded = framed.apply(_pad)
    return torch.tensor(padded.tolist(), dtype=torch.long)



# Special ids of the digit (input) vocabulary. Ids 0-9 are the digits themselves.
# These differ from the SentencePiece ids used on the target side.
BOS_TOKEN_ID = 10
EOS_TOKEN_ID = 11
PAD_TOKEN_ID = 12

def tokenize_number_column(df_column):
    """Turn an iterable of numbers into a padded tensor of digit ids.

    Each value is converted with ``str`` and split into single digits, framed as
    ``BOS_TOKEN_ID, d1, d2, ..., EOS_TOKEN_ID`` and right-padded with
    ``PAD_TOKEN_ID`` to the longest sequence in the column.

    Args:
        df_column: iterable of numbers (or digit strings).

    Returns:
        torch.Tensor: ``torch.long`` tensor with one row per input value.
    """
    framed = []
    for value in df_column:
        digit_ids = list(map(int, str(value)))
        framed.append([BOS_TOKEN_ID, *digit_ids, EOS_TOKEN_ID])

    # All rows are padded to the longest framed sequence.
    longest = max(map(len, framed))

    rows = []
    for ids in framed:
        missing = longest - len(ids)
        rows.append(ids + [PAD_TOKEN_ID] * missing)

    return torch.tensor(rows, dtype=torch.long)

def decode_number_sequence(sequence):
    """Render a sequence of digit-vocabulary ids as a readable string.

    The special ids are shown as ``<BOS>``, ``<EOS>`` and ``<PAD>``; every other
    id is written with ``str``. Pieces are concatenated without separators.

    Args:
        sequence: object exposing ``tolist()`` (e.g. a 1-D tensor of ids).

    Returns:
        str: the concatenated rendering.
    """
    pieces = []
    for token_id in sequence.tolist():
        if token_id == BOS_TOKEN_ID:
            pieces.append('<BOS>')
        elif token_id == EOS_TOKEN_ID:
            pieces.append('<EOS>')
        elif token_id == PAD_TOKEN_ID:
            pieces.append('<PAD>')
        else:
            pieces.append(str(token_id))
    return "".join(pieces)

# Target side: SentencePiece ids of the spelled-out numbers (padded with id 3).
tokenized_words = tokenize_word_column(train_df['word'])
# Input side: digit ids of the numbers (padded with PAD_TOKEN_ID).
tokenized_tensor = tokenize_number_column(train_df['num'])




In [11]:
from torch.utils.data import Dataset, DataLoader

class NumberWordDataset(Dataset):
    """Paired (digit ids, word-piece ids) samples with per-side padding masks.

    The input and target sequences use different vocabularies and therefore
    different padding ids: the digit vocabulary pads with 12 (id 3 is the
    digit "3" there), while the SentencePiece targets pad with 3. Each side's
    mask must be built from its own padding id.
    """

    def __init__(self, input_tensor, target_tensor, pad_token_id=3, input_pad_token_id=12):
        """
        Args:
            input_tensor (torch.Tensor): Tokenized numbers (input sequences).
            target_tensor (torch.Tensor): Tokenized words (output sequences).
            pad_token_id (int): Padding id of the target sequences.
            input_pad_token_id (int): Padding id of the input sequences. Must
                match the id used when the digit sequences were padded.

        Raises:
            ValueError: if the two tensors hold a different number of samples.
        """
        if len(input_tensor) != len(target_tensor):
            raise ValueError(
                f"input and target hold different numbers of samples: "
                f"{len(input_tensor)} vs {len(target_tensor)}"
            )
        self.input_tensor = input_tensor
        self.target_tensor = target_tensor
        self.pad_token_id = pad_token_id
        self.input_pad_token_id = input_pad_token_id

    def __len__(self):
        """Number of (input, target) pairs."""
        return len(self.input_tensor)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        Keys: ``input_seq``, ``target_seq`` and the boolean masks
        ``input_mask`` / ``target_mask`` (True = real token, False = padding).
        """
        input_seq = self.input_tensor[idx]
        target_seq = self.target_tensor[idx]

        # Each mask is derived from the padding id of its own vocabulary;
        # using the target id on the input side would flag the digit 3.
        input_mask = input_seq != self.input_pad_token_id
        target_mask = target_seq != self.pad_token_id

        return {
            'input_seq': input_seq,
            'target_seq': target_seq,
            'input_mask': input_mask,
            'target_mask': target_mask,
        }


In [12]:
# Create the dataset from tokenized data
# Pair the digit-id inputs with the word-piece targets (arguments are input first, target second).
dataset = NumberWordDataset(tokenized_tensor, tokenized_words)


In [13]:
# Shuffled mini-batches of 16 samples; `dataloader` and `sp` are reused by later cells.
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
# Module-level tokenizer handle, used below and later for decoding and the target vocabulary size.
sp = spm.SentencePieceProcessor(model_file='num_to_words.model')

# Sanity check: print the first sample of one batch, both raw and decoded.
for batch in dataloader:
    input_seq = batch['input_seq']
    target_seq = batch['target_seq']
    input_mask = batch['input_mask']
    target_mask = batch['target_mask']

    # Inputs use the digit vocabulary, so they are decoded with the local helper.
    decoded_input = decode_number_sequence(input_seq[0])
    print("Decoded Input:", decoded_input)
    print("Input:", input_seq[0])
    print("Input Mask:", input_mask[0])
    # Targets use SentencePiece ids, so they are decoded with the tokenizer.
    decoded_target = sp.decode_ids(target_seq[0].tolist())
    print("Decoded Target:", decoded_target)
    print("Target:", target_seq[0])
    print("Target Mask:", target_mask[0])

    break  # only the first batch is inspected


Decoded Input: <BOS>486<EOS>
Input: tensor([10,  4,  8,  6, 11])
Input Mask: tensor([True, True, True, True, True])
Decoded Target: <BOS> four hundred and eighty-six<EOS>
Target: tensor([  4,  43,  12,  14,  54, 190,  66,   5])
Target Mask: tensor([True, True, True, True, True, True, True, True])


In [14]:
import torch.nn as nn
class TransformerModel(nn.Module):
    """Minimal encoder-decoder Transformer without positional information.

    Token ids are embedded, passed through ``nn.Transformer`` in its default
    sequence-first layout, and projected onto the target vocabulary.

    NOTE(review): a second class with the same name is defined right after
    this one and rebinds ``TransformerModel``.
    """

    def __init__(self, input_vocab_size, target_vocab_size, embed_dim=64, num_heads=4, num_layers=3):
        """
        Args:
            input_vocab_size (int): number of ids on the source side.
            target_vocab_size (int): number of ids on the target side.
            embed_dim (int): embedding / model width.
            num_heads (int): attention heads per layer.
            num_layers (int): depth of both the encoder and the decoder stack.
        """
        super().__init__()
        self.encoder_embedding = nn.Embedding(input_vocab_size, embed_dim)
        self.decoder_embedding = nn.Embedding(target_vocab_size, embed_dim)
        self.transformer = nn.Transformer(
            d_model=embed_dim,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
        )
        self.fc_out = nn.Linear(embed_dim, target_vocab_size)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Return target-vocabulary logits, batch-first.

        ``src`` and ``tgt`` are batch-first id tensors. ``src_mask`` and
        ``tgt_mask`` are accepted for interface compatibility but are not
        used; only a causal mask over the target positions is applied.
        """
        # nn.Transformer expects (seq, batch, dim) here, so swap the first two axes.
        encoder_in = self.encoder_embedding(src).transpose(0, 1)
        decoder_in = self.decoder_embedding(tgt).transpose(0, 1)
        causal_mask = self.generate_square_subsequent_mask(tgt.size(1))
        hidden = self.transformer(encoder_in, decoder_in, tgt_mask=causal_mask)
        # Back to (batch, seq, dim) before the vocabulary projection.
        return self.fc_out(hidden.transpose(0, 1))

    def generate_square_subsequent_mask(self, sz):
        """Boolean ``(sz, sz)`` mask, True strictly above the diagonal (future positions)."""
        model_device = next(self.parameters()).device
        strictly_upper = torch.ones(sz, sz).triu(diagonal=1)
        return strictly_upper.bool().to(model_device)
       


class TransformerModel(nn.Module):
    """Encoder-decoder Transformer with sinusoidal positions, batch-first.

    Source and target ids are embedded, enriched with ``PositionalEncoding``
    and fed to a batch-first ``nn.Transformer``; the decoder output is
    projected onto the target vocabulary.
    """

    def __init__(self, input_vocab_size, target_vocab_size, d_model=512, nhead=8, num_layers=6,
                 d_ff=2048, dropout=0.1):
        """
        Args:
            input_vocab_size (int): number of ids on the source side.
            target_vocab_size (int): number of ids on the target side.
            d_model (int): embedding / model width.
            nhead (int): attention heads per layer.
            num_layers (int): depth of both the encoder and the decoder stack.
            d_ff (int): hidden width of the feed-forward sub-layers.
            dropout (float): dropout rate for the positional encodings and the transformer.
        """
        super().__init__()
        self.encoder_embedding = nn.Embedding(input_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(target_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.pos_decoder = PositionalEncoding(d_model, dropout)

        transformer_config = dict(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=d_ff,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer = nn.Transformer(**transformer_config)
        self.fc_out = nn.Linear(d_model, target_vocab_size)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Return target-vocabulary logits, batch-first.

        ``src_mask`` and ``tgt_mask`` are accepted for interface compatibility
        but are not used; only a causal mask over the target positions is
        applied.
        """
        encoder_in = self.pos_encoder(self.encoder_embedding(src))
        decoder_in = self.pos_decoder(self.decoder_embedding(tgt))
        causal_mask = self.generate_square_subsequent_mask(tgt.size(1))
        hidden = self.transformer(encoder_in, decoder_in, tgt_mask=causal_mask)
        return self.fc_out(hidden)

    def generate_square_subsequent_mask(self, sz):
        """Boolean ``(sz, sz)`` mask, True strictly above the diagonal (future positions)."""
        model_device = next(self.parameters()).device
        strictly_upper = torch.ones(sz, sz).triu(diagonal=1)
        return strictly_upper.bool().to(model_device)
        
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to batch-first embeddings.

    Even feature indices receive ``sin(pos * freq)`` and odd indices
    ``cos(pos * freq)``. The table is precomputed for ``max_len`` positions
    and stored as a non-trainable buffer named ``pe``.
    """

    def __init__(self, d_model, dropout=0.1, max_len=100):
        """
        Args:
            d_model (int): feature width of the embeddings (even or odd).
            dropout (float): dropout rate applied after adding the encoding.
            max_len (int): longest sequence length that can be encoded.
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # the cosine part drops the last frequency instead of failing on shape.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])`` for ``x`` of shape (batch, seq_len, d_model).

        Raises:
            ValueError: if ``seq_len`` exceeds the ``max_len`` the table was built for.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len={self.pe.size(0)} "
                f"of the positional encoding table"
            )
        x = x + self.pe[:seq_len]
        return self.dropout(x)

In [38]:
def train_model(model, dataloader, num_epochs=20, learning_rate=1e-5, pad_token_id=3):
    """Train a sequence-to-sequence model with teacher forcing on the CPU.

    Each batch performs exactly one optimizer update: gradients are reset,
    the loss is computed on the target shifted by one position, gradients are
    clipped to a global norm of 1.0 and Adam is stepped once. Batches whose
    loss is NaN are skipped before any gradient is computed or applied.

    Args:
        model: module called as ``model(src, tgt_input, src_mask, tgt_mask)``
            and returning logits of shape (batch, seq, vocab).
        dataloader: sized iterable of dicts with the keys ``input_seq``,
            ``target_seq``, ``input_mask`` and ``target_mask``.
        num_epochs (int): number of passes over ``dataloader``.
        learning_rate (float): Adam learning rate.
        pad_token_id (int): target id ignored by the cross-entropy loss.

    Returns:
        None. The model is updated in place and the mean loss per epoch is printed.
    """
    # Training is pinned to the CPU (the original expression selected "cpu"
    # regardless of MPS availability).
    device = torch.device("cpu")
    torch.set_default_dtype(torch.float32)  # Use float32 for stability
    model.to(device).to(torch.float32)

    max_grad_norm = 1.0

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)

    # Guard the per-epoch average against an empty loader.
    num_batches = max(len(dataloader), 1)

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0

        for batch in dataloader:
            src = batch['input_seq'].to(device)
            tgt = batch['target_seq'].to(device)

            # The dataset marks real tokens with True; the model is handed the
            # inverted form (True = padding), cropped to the sequence length.
            src_mask = ~batch['input_mask'].to(device)[:, :src.shape[1]].bool()
            tgt_mask = ~batch['target_mask'].to(device)[:, :tgt.shape[1]].bool()

            # Teacher forcing: the decoder sees tokens [0, n-1) and must
            # predict tokens [1, n).
            tgt_input = tgt[:, :-1]
            tgt_output = tgt[:, 1:]
            tgt_mask = tgt_mask[:, :-1]  # keep the mask aligned with tgt_input

            optimizer.zero_grad()
            output = model(src, tgt_input, src_mask, tgt_mask)

            loss = criterion(
                output.reshape(-1, output.shape[-1]),
                tgt_output.reshape(-1),
            )

            # Skip the batch before touching the weights if the loss is NaN.
            if torch.isnan(loss):
                print("NaN loss detected in batch! Skipping batch.")
                continue

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            # Exactly one parameter update per batch.
            optimizer.step()
            epoch_loss += loss.item()

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / num_batches}")


In [41]:
def train_model(model, dataloader, num_epochs=20, learning_rate=1e-4, pad_token_id=3):
    """Train a sequence-to-sequence model on the CPU with extra stability guards.

    The model's ``nn.Linear`` / ``nn.Embedding`` weights are re-initialised
    (Xavier uniform, zero bias) before training. Each batch is trained with
    teacher forcing. During the first ``warmup_batches`` batches of the first
    epoch the loss is scaled down and gradients are accumulated instead of
    being applied immediately. Batches that produce NaN outputs, a NaN loss or
    non-finite gradients are skipped without updating the weights. Any other
    exception raised while processing a batch is printed and the batch is
    skipped (best effort).

    Args:
        model: module called as ``model(src, tgt_input, src_mask, tgt_mask)``
            and returning logits of shape (batch, seq, vocab).
        dataloader: sized iterable of dicts with the keys ``input_seq``,
            ``target_seq``, ``input_mask`` and ``target_mask``.
        num_epochs (int): number of passes over ``dataloader``.
        learning_rate (float): Adam learning rate.
        pad_token_id (int): target id ignored by the cross-entropy loss.

    Returns:
        The trained model (the same object, moved to CPU / float32).
    """
    def init_weights(m):
        # Xavier-initialise every linear and embedding layer, zeroing biases.
        if isinstance(m, (nn.Linear, nn.Embedding)):
            torch.nn.init.xavier_uniform_(m.weight)
            if hasattr(m, 'bias') and m.bias is not None:
                torch.nn.init.zeros_(m.bias)

    model.apply(init_weights)

    # Both branches of the original MPS check selected the CPU, so the device
    # is pinned directly. NOTE(review): the original also set
    # torch.backends.mps.enable_fallback_to_cpu, which does not appear to be a
    # documented PyTorch switch; it is dropped here.
    device = torch.device("cpu")

    torch.set_default_dtype(torch.float32)
    model = model.to(device).to(torch.float32)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)

    # Training stability parameters
    warmup_batches = 3
    accumulation_steps = 4  # Only during warmup
    max_grad_norm = 0.5

    # Guard the per-epoch average against an empty loader.
    num_batches = max(len(dataloader), 1)

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        optimizer.zero_grad(set_to_none=True)

        for batch_idx, batch in enumerate(dataloader):
            try:
                # Move data to device
                src = batch['input_seq'].to(device)
                tgt = batch['target_seq'].to(device)
                src_mask = batch['input_mask'].to(device)
                tgt_mask = batch['target_mask'].to(device)

                # The dataset marks real tokens with True; the model is handed
                # the inverted form (True = padding).
                src_mask = ~src_mask[:, :src.shape[1]].bool().to(device)
                tgt_mask = ~tgt_mask[:, :tgt.shape[1]].bool().to(device)

                # Teacher forcing: feed tokens [0, n-1), predict tokens [1, n).
                tgt_input = tgt[:, :-1].to(device)
                tgt_output = tgt[:, 1:].to(device)
                tgt_mask = tgt_mask[:, :-1]

                # Forward pass
                output = model(src, tgt_input, src_mask, tgt_mask)

                # Check for NaN in output
                if torch.isnan(output).any():
                    print("NaN detected in output before loss!")
                    continue

                # Reshape for loss calculation
                output = output.reshape(-1, output.shape[-1])
                tgt_output = tgt_output.reshape(-1)

                # Calculate loss
                loss = criterion(output, tgt_output)

                # Check for NaN loss
                if torch.isnan(loss):
                    print(f"NaN loss detected in batch {batch_idx}!")
                    print(f"Output stats - min: {output.min()}, max: {output.max()}, mean: {output.mean()}")
                    print(f"Target stats - min: {tgt_output.min()}, max: {tgt_output.max()}")
                    continue

                # Remember the unscaled value so the reported epoch loss is
                # comparable between warmup and regular batches.
                batch_loss = loss.item()

                # Scale loss during warmup
                if epoch == 0 and batch_idx < warmup_batches:
                    loss = loss / accumulation_steps

                # Backward pass
                loss.backward()

                # Check gradients BEFORE clipping: clipping rescales every
                # gradient by a factor derived from the global norm, so one
                # non-finite entry would poison all parameters.
                bad_params = [
                    name
                    for name, param in model.named_parameters()
                    if param.grad is not None and not torch.isfinite(param.grad).all()
                ]
                if bad_params:
                    for name in bad_params:
                        print(f"NaN gradient detected in {name}")
                    # Drop the poisoned gradients and skip this batch's update.
                    optimizer.zero_grad(set_to_none=True)
                    continue

                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)

                # Optimizer step (deferred while accumulating during warmup)
                if epoch > 0 or batch_idx >= warmup_batches or (batch_idx + 1) % accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad(set_to_none=True)

                epoch_loss += batch_loss

            except Exception as e:
                # Best effort: report the failing batch and keep training.
                print(f"Error in batch {batch_idx}: {str(e)}")
                continue

        avg_epoch_loss = epoch_loss / num_batches
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_epoch_loss:.4f}")

    return model

In [42]:
import math
# Vocabulary sizes: the input side has the ten digits plus the three special
# ids (BOS=10, EOS=11, PAD=12); the target side is whatever the trained
# SentencePiece model `sp` contains.
input_vocab_size = 10 + 3
target_vocab_size = sp.get_piece_size()

# Build a fresh model with the class defaults for width, heads and depth.
transformer_model = TransformerModel(
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size
)

# Train in place on the shuffled training batches for 20 epochs.
train_model(transformer_model, dataloader, num_epochs=20)


Epoch 1/20, Loss: 3.0667
Epoch 2/20, Loss: 1.9947
Epoch 3/20, Loss: 1.5522
Epoch 4/20, Loss: 1.4500
Epoch 5/20, Loss: 1.4155
Epoch 6/20, Loss: 1.3992
Epoch 7/20, Loss: 1.3710
Epoch 8/20, Loss: 1.3406


KeyboardInterrupt: 

In [39]:
import math
# NOTE(review): this cell repeats the build-and-train cell above verbatim; only
# the recorded output differs.
# Vocabulary sizes: ten digits plus three special ids on the input side, the
# SentencePiece piece count on the target side.
input_vocab_size = 10 + 3
target_vocab_size = sp.get_piece_size()

# Build a fresh model with the class defaults.
transformer_model = TransformerModel(
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size
)

# Train in place for 20 epochs.
train_model(transformer_model, dataloader, num_epochs=20)


Epoch 1/20, Loss: 3.1328036260604857
Epoch 2/20, Loss: 1.6313927674293518
Epoch 3/20, Loss: 1.0483209025859832
Epoch 4/20, Loss: 0.825584499835968
Epoch 5/20, Loss: 0.7087904393672944
Epoch 6/20, Loss: 0.6282270812988281
Epoch 7/20, Loss: 0.542253954410553
Epoch 8/20, Loss: 0.4829317003488541
Epoch 9/20, Loss: 0.41201497554779054
Epoch 10/20, Loss: 0.3528533035516739
Epoch 11/20, Loss: 0.3169796398282051
Epoch 12/20, Loss: 0.2781745755672455
Epoch 13/20, Loss: 0.25704569518566134
Epoch 14/20, Loss: 0.24458871245384217
Epoch 15/20, Loss: 0.20953933715820314
Epoch 16/20, Loss: 0.1881439507007599
Epoch 17/20, Loss: 0.18920923113822938
Epoch 18/20, Loss: 0.16462993681430815
Epoch 19/20, Loss: 0.1591998627781868
Epoch 20/20, Loss: 0.15676770389080047


In [364]:
import math
# NOTE(review): third copy of the same build-and-train cell; only the recorded
# output differs.
# Vocabulary sizes: ten digits plus three special ids on the input side, the
# SentencePiece piece count on the target side.
input_vocab_size = 10 + 3
target_vocab_size = sp.get_piece_size()

# Build a fresh model with the class defaults.
transformer_model = TransformerModel(
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size
)

# Train in place for 20 epochs.
train_model(transformer_model, dataloader, num_epochs=20)


Epoch 1/20, Loss: 2.496651341915131
Epoch 2/20, Loss: 0.8746180474758148
Epoch 3/20, Loss: 0.666631880402565
Epoch 4/20, Loss: 0.5613335084915161
Epoch 5/20, Loss: 0.5094714856147766
Epoch 6/20, Loss: 0.4561134362220764
Epoch 7/20, Loss: 0.35182737052440644
Epoch 8/20, Loss: 0.31638810962438585
Epoch 9/20, Loss: 0.3067630445957184
Epoch 10/20, Loss: 0.2941738975048065
Epoch 11/20, Loss: 0.2763662710785866
Epoch 12/20, Loss: 0.24181130439043044
Epoch 13/20, Loss: 0.2829021456837654
Epoch 14/20, Loss: 0.23284145385026933
Epoch 15/20, Loss: 0.22157204702496527
Epoch 16/20, Loss: 0.19545322686433791
Epoch 17/20, Loss: 0.1882915563881397
Epoch 18/20, Loss: 0.17262213081121444
Epoch 19/20, Loss: 0.18533218607306481
Epoch 20/20, Loss: 0.13755528703331948


In [25]:
# Ids of the user-defined symbols in the SentencePiece (target) vocabulary.
target_bos_token_id = 4
target_eos_token_id = 5
target_pad_token_id = 3

def predict(model, number, tokenizer_num, tokenizer_text, max_length=50):
    """Greedily decode the spelled-out form of ``number``.

    The model is moved to the CPU and put in eval mode. Decoding starts from
    the target BOS id and appends the arg-max token at each step until the
    target EOS id is produced or ``max_length`` steps have run.

    Args:
        model: module called as ``model(src, tgt, src_mask, tgt_mask)``.
        number: value whose ``str`` form consists of decimal digits.
        tokenizer_num: accepted for interface compatibility; not used here.
        tokenizer_text: object with a ``decode(list_of_ids)`` method.
        max_length (int): maximum number of decoding steps.

    Returns:
        The text produced by ``tokenizer_text.decode`` for the generated ids
        (BOS excluded, EOS never included).
    """
    device = torch.device("cpu")
    model.to(device)
    model.eval()
    with torch.no_grad():
        source_ids = [BOS_TOKEN_ID, *(int(ch) for ch in str(number)), EOS_TOKEN_ID]
        src = torch.tensor([source_ids], dtype=torch.long).to(device)
        tgt = torch.tensor([[target_bos_token_id]], dtype=torch.long).to(device)
        # All-False masks: nothing is flagged as padding for a single sample.
        src_mask = torch.zeros(src.shape, dtype=torch.bool).to(device)
        tgt_mask = torch.zeros((1, 1), dtype=torch.bool).to(device)

        for _ in range(max_length):
            logits = model(src, tgt, src_mask, tgt_mask)
            best_id = logits[:, -1].argmax(dim=-1)
            if best_id.item() == target_eos_token_id:
                break
            # Extend the decoded prefix by the chosen token and resize the mask.
            tgt = torch.cat([tgt, best_id.unsqueeze(1)], dim=1)
            tgt_mask = torch.zeros(tgt.shape, dtype=torch.bool)

        generated = tgt[0][1:].tolist()  # Skip BOS
        if target_eos_token_id in generated:
            cut = generated.index(target_eos_token_id)
            generated = generated[:cut]
        return tokenizer_text.decode(generated)

In [32]:
# Spot check on a single number; the tokenizer_num argument is not used by predict().
predict(transformer_model, 120, tokenize_number_column, sp, max_length=20)

'one hundred and twenty'

In [28]:
def test_accuracy(model, test_df, tokenizer_num, tokenizer_text):
    """Exact-match accuracy of ``predict`` over the rows of ``test_df``.

    Every row's ``num`` is decoded with ``predict`` and compared to its
    ``word``; each comparison is printed.

    Args:
        model: model forwarded to ``predict``.
        test_df (pd.DataFrame): frame with ``num`` and ``word`` columns.
        tokenizer_num: forwarded to ``predict``.
        tokenizer_text: forwarded to ``predict``.

    Returns:
        float: fraction of rows whose prediction equals the reference text.
    """
    hits = 0
    seen = 0

    for _, record in test_df.iterrows():
        number, expected = record['num'], record['word']
        guess = predict(model, number, tokenizer_num, tokenizer_text)
        print(f"Number: {number}, Predicted: {guess}, Actual: {expected}")
        seen += 1
        hits += 1 if guess == expected else 0

    return hits / seen

In [40]:
# Exact-match accuracy on the held-out validation rows (prints every comparison).
test_accuracy(transformer_model, val_df, tokenize_number_column, sp)

Number: 1, Predicted: one hundred and eleven, Actual: one
Number: 4, Predicted: four hundred and forty-five, Actual: four
Number: 13, Predicted: one hundred and thirty-one, Actual: thirteen
Number: 14, Predicted: one hundred and forty-six, Actual: fourteen
Number: 20, Predicted: two hundred and two, Actual: twenty
Number: 21, Predicted: two hundred and twelve, Actual: twenty-one
Number: 27, Predicted: two hundred and seventy-two, Actual: twenty-seven
Number: 32, Predicted: three hundred and twenty-six, Actual: thirty-two
Number: 34, Predicted: three hundred and forty-six, Actual: thirty-four
Number: 35, Predicted: three hundred and fifty-five, Actual: thirty-five
Number: 40, Predicted: four hundred and four, Actual: forty
Number: 47, Predicted: four hundred and seventy-six, Actual: forty-seven
Number: 52, Predicted: five hundred and twenty-six, Actual: fifty-two
Number: 58, Predicted: five hundred and eighty-six, Actual: fifty-eight
Number: 62, Predicted: six hundred and twenty-eight, 

0.73

In [343]:
# Exact-match accuracy on the first 30 training rows, as a fit check on seen data.
test_accuracy(transformer_model, train_df[:30], tokenize_number_column, sp)

Number: 521, Predicted: five hundred and twenty-one, Actual: five hundred and twenty-one
Number: 737, Predicted: seven hundred and thirty-eight, Actual: seven hundred and thirty-seven
Number: 740, Predicted: seven hundred and forty, Actual: seven hundred and forty
Number: 660, Predicted: six hundred and sixty, Actual: six hundred and sixty
Number: 411, Predicted: four hundred and eleven, Actual: four hundred and eleven
Number: 678, Predicted: six hundred and eighty-seven, Actual: six hundred and seventy-eight
Number: 626, Predicted: six hundred and twenty-six, Actual: six hundred and twenty-six
Number: 513, Predicted: five hundred and thirteen, Actual: five hundred and thirteen
Number: 859, Predicted: eight hundred and fifty-nine, Actual: eight hundred and fifty-nine
Number: 136, Predicted: one hundred and thirty-six, Actual: one hundred and thirty-six
Number: 811, Predicted: eight hundred and eleven, Actual: eight hundred and eleven
Number: 76, Predicted: seven hundred and sixty-three

0.7666666666666667