In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

import json
import pandas as pd

In [2]:
# Configuration object to store hyperparameters
class Config:
    def __init__(self):
        self.vocab_size = 119547            # e.g., BERT-base vocab size
        self.hidden_size = 768              # model embedding size
        self.max_position_embeddings = 512  # maximum sequence length
        self.hidden_dropout_prob = 0.1    # 10% probability of dropping (zeroing out) each element in the input tensor during training
        self.intermediate_size = 3072       # FFN inner layer size (usually 4*hidden_size)
        self.num_attention_heads = 12
        self.num_encoder_layers = 6         # number of encoder layers
        self.num_decoder_layers = 6         # number of decoder layers

config = Config()

In [3]:
# Embeddings module: token embeddings + positional embeddings
class Embeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) # output of this embedding lookup is a tensor of shape (batch_size, sequence_length, hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob) # define dropout rate from config param

    def forward(self, input_ids):
        # input_ids shape: (batch_size, seq_length)
        seq_length = input_ids.size(1)
        # Create position IDs: (1, seq_length)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device).unsqueeze(0)
        # Look up token and position embeddings
        token_embeddings = self.token_embeddings(input_ids)           # (batch_size, seq_length, hidden_size)
        position_embeddings = self.position_embeddings(position_ids)    # (1, seq_length, hidden_size)
        embeddings = token_embeddings + position_embeddings  # Broadcasting over batch dimension
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        # print("Embeddings output shape:", embeddings.shape) #print
        return embeddings

# attention function
def scaled_dot_product_attention(query, key, value, mask=None):
    dim_k = query.size(-1)
    scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(dim_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, float("-inf"))
    weights = F.softmax(scores, dim=-1)
    return torch.bmm(weights, value)

class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim): # embed_dim is define in hidden_size
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, query, key, value):
        return scaled_dot_product_attention(
            self.q(query), self.k(key), self.v(value)
        )
        
class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        embed_dim = config.hidden_size
        num_heads = config.num_attention_heads
        head_dim = embed_dim // num_heads
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, query, key=None, value=None, mask=None):
        # If key or value is not provided, default to query (i.e., self-attention)
        if key is None:
            key = query
        if value is None:
            value = query
        # Each head returns a tensor of shape: (batch_size, seq_length, head_dim)
        head_outputs = [h(query, key, value) for h in self.heads]
        # Concatenate on the last dimension: shape becomes (batch_size, seq_length, embed_dim)
        x = torch.cat(head_outputs, dim=-1)
        # print("Concatenated heads shape:", x.shape) #！！
        x = self.output_linear(x)
        # print("MultiHeadAttention output shape:", x.shape)#！！
        return x

class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = nn.Linear(config.hidden_size, config.intermediate_size) #Expanding the dimensionality provides the model with more capacity to capture complex patterns and relationships within each token's representation
        self.gelu = nn.GELU() #GELU is smoother than ReLU. This smooth behavior can lead to better gradient flow
        self.linear_2 = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x):
        x = self.linear_1(x)
        x = self.gelu(x)
        x = self.linear_2(x)
        x = self.dropout(x)
        return x

In [4]:
# Transformer Encoder Layer: attention + feed-forward, with residual connections
class TransformerEncoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        self.attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)

    def forward(self, x):
        # Pre-layer normalization before attention
        hidden_state = self.layer_norm_1(x)
        # Residual Connection:The attention output is added back to the original x,preserve the original information while integrating new
        x = x + self.attention(hidden_state)
        # Pre-layer normalization before feed-forward, then
        # residual connection
        x = x + self.feed_forward(self.layer_norm_2(x))
        return x

# Transformer Encoder: stacking multiple encoder layers
class TransformerEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)
        self.layers = nn.ModuleList([TransformerEncoderLayer(config)
                                     for _ in range(config.num_encoder_layers)]) # encoder_layers, defined in config
    def forward(self, input_ids):
        x = self.embeddings(input_ids)
        for layer in self.layers:
            x = layer(x)
        return x  # shape: (batch_size, seq_length, hidden_size)

In [5]:
# Transformer Decoder Layer: has two attention sub-layers
class TransformerDecoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        self.layer_norm_3 = nn.LayerNorm(config.hidden_size)
        # Masked self-attention for decoder (prevent future tokens)
        self.masked_attention = MultiHeadAttention(config)
        # Encoder-decoder (cross) attention: here we assume same MultiHeadAttention; 
        # In practice, keys and values come from encoder outputs.
        self.enc_dec_attention = MultiHeadAttention(config)
        self.feed_forward = FeedForward(config)
        
    def forward(self, x, memory, tgt_mask=None):
        # Masked self-attention sub-layer
        hidden_state = self.layer_norm_1(x)
         # residual connection
        x = x + self.masked_attention(hidden_state, mask=tgt_mask)
        # Encoder-decoder (cross) attention: query from decoder; key & value from encoder memory
        hidden_state2 = self.layer_norm_2(x)
         # residual connection
        #  pass memory here
        x = x + self.enc_dec_attention(hidden_state2, key=memory, value=memory)
        # Feed-forward sub-layer & residual connection
        x = x + self.feed_forward(self.layer_norm_3(x))
        return x

# Transformer Decoder: stacking multiple decoder layers
class TransformerDecoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embeddings = Embeddings(config)  # for target tokens
        self.layers = nn.ModuleList([TransformerDecoderLayer(config)
                                     for _ in range(config.num_decoder_layers)])
    def forward(self, target_ids, memory, tgt_mask=None):
        x = self.embeddings(target_ids)
        for layer in self.layers:
            x = layer(x, memory, tgt_mask=tgt_mask)
        return x  # shape: (batch_size, target_seq_length, hidden_size)

In [6]:
# mask
def generate_tgt_mask(seq_length, device=None):
    """
    Generates a target mask for a sequence of given length.
    
    The mask is a lower triangular matrix of shape (1, seq_length, seq_length),
    where positions with 1 indicate allowed attention and 0 indicate masked positions.
    
    Args:
        seq_length (int): The length of the target sequence.
        device (torch.device or None): The device on which to create the mask.
    
    Returns:
        torch.Tensor: A mask tensor of shape (1, seq_length, seq_length).
    """
    # Create a lower triangular matrix filled with 1s (allowed positions)
    mask = torch.tril(torch.ones(seq_length, seq_length, device=device))
    # Unsqueeze to add a batch dimension for broadcasting: (1, seq_length, seq_length)
    mask = mask.unsqueeze(0)
    return mask

In [7]:
# Full Transformer Model: connects encoder and decoder
class TransformerModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.encoder = TransformerEncoder(config)
        self.decoder = TransformerDecoder(config)
        self.output_linear = nn.Linear(config.hidden_size, config.vocab_size)
        
    def forward(self, src_ids, tgt_ids, tgt_mask=None):
        memory = self.encoder(src_ids)
        decoder_output = self.decoder(tgt_ids, memory, tgt_mask=tgt_mask)
        logits = self.output_linear(decoder_output)
        return logits  # shape: (batch_size, tgt_seq_length, vocab_size)


# data prep

In [8]:
with open("train.json","r",encoding="utf-8") as f:
    data=json.load(f)
# print("Number of examples:", len(data))
# print(json.dumps(data[0], indent=2))

df = pd.DataFrame(data)
df.shape

df_exploded = df.explode("conversation")
# Reset index for convenience
df_exploded = df_exploded.reset_index(drop=True)
# Normalize the "conversation" column into separate columns
conversation_df = pd.json_normalize(df_exploded["conversation"])
# Merge the normalized data back into the exploded df
df_exploded = df_exploded.drop("conversation", axis=1).join(conversation_df)
# only need the ja_sentance and en_sentence columns
train_data=df_exploded[["ja_sentence","en_sentence"]]

train_data=train_data[:500]

data prep for model

In [9]:
# clean - removing unwanted characters, normalizing punctuation, etc

# tokenizing
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased") # this impact token size

# Tokenize sentences with special tokens (like [CLS], [SEP] or <sos>, <eos> depending on the model)
train_data["src_ids"] = train_data["ja_sentence"].apply(lambda s: tokenizer.encode(s, add_special_tokens=True))
train_data["tgt_ids"] = train_data["en_sentence"].apply(lambda s: tokenizer.encode(s, add_special_tokens=True))

# padding them to a fixed length
from torch.nn.utils.rnn import pad_sequence
# creates two Python lists—one for the source sentences and one for the target sentences. 
# Each element in these lists is a PyTorch tensor that contains the token IDs for one sentence.
src_id_tensors = [torch.tensor(ids) for ids in train_data["src_ids"]]
tgt_id_tensors = [torch.tensor(ids) for ids in train_data["tgt_ids"]]
# Use the tokenizer's pad token ID for padding
src_padded = pad_sequence(src_id_tensors, batch_first=True, padding_value=tokenizer.pad_token_id)
tgt_padded = pad_sequence(tgt_id_tensors, batch_first=True, padding_value=tokenizer.pad_token_id)

# batch
from torch.utils.data import TensorDataset, DataLoader
dataset = TensorDataset(src_padded, tgt_padded)
dataloader = DataLoader(dataset, batch_size=50, shuffle=True)


In [10]:
max_src = max([max(ids) for ids in train_data["src_ids"] if ids])
max_tgt = max([max(ids) for ids in train_data["tgt_ids"] if ids])
print("Max source token ID:", max_src)
print("Max target token ID:", max_tgt)
print("Tokenizer vocab size:", tokenizer.vocab_size)

Max source token ID: 119518
Max target token ID: 110603
Tokenizer vocab size: 119547



# training 

In [11]:
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the model with your configuration and move it to the device.
model = TransformerModel(config).to(device)

# Define the loss function.
# CrossEntropyLoss is used for classification tasks, and we ignore the pad token.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Define an optimizer, here we use Adam.
optimizer = optim.Adam(model.parameters(), lr=1e-4)

model.train()
num_epochs = 5
epoch_result=[]
# Iterate over epochs.
for epoch in range(num_epochs):
    epoch_loss = 0.0

    # Iterate over batches from the DataLoader.
    for batch in dataloader:
        #   src_ids: Padded token IDs for source sentences (shape: [batch_size, src_seq_len])
        #   tgt_ids: Padded token IDs for target sentences (shape: [batch_size, tgt_seq_len])
        src_ids, tgt_ids = batch
        src_ids = src_ids.to(device)
        # print("src:", src_ids.shape)
        tgt_ids = tgt_ids.to(device)
        # print("tgt:", tgt_ids.shape)
           
        # shift the target sequence: ensures that at every time step the model’s input is the sequence of previous tokens, and it’s trained to predict the next token.
        #   - decoder_input: All tokens except the last one.
        #   - target_output: All tokens except the first one.
        decoder_input = tgt_ids[:, :-1]   # shape: (batch_size, tgt_seq_len - 1),shifting it removes one token from each end:
        target_output = tgt_ids[:, 1:]      # shape: (batch_size, tgt_seq_len - 1)
        print("decoder_input:", decoder_input.shape)
        print("target_output:", target_output.shape)
        
        # Generate a target mask to enforce causality in the decoder.
        # This mask prevents each token from attending to future tokens.
        seq_length = decoder_input.size(1)
        tgt_mask = generate_tgt_mask(seq_length, device=decoder_input.device)
        
        # Forward pass: pass source and decoder inputs through the model.
        # The model expects (src_ids, decoder_input, tgt_mask).
        logits = model(src_ids, decoder_input, tgt_mask=tgt_mask)
        # logits shape: (batch_size, tgt_seq_len - 1, vocab_size)
        print("Logits shape:", logits.shape)

        # Reshape logits and target_output for loss computation.
        # We flatten the batch and sequence dimensions.
        logits = logits.reshape(-1, config.vocab_size)
        target_output = target_output.reshape(-1)
        
        # Compute the loss.
        loss = criterion(logits, target_output)
        print("Loss:", loss.item())
        # Backpropagation and parameter update.
        optimizer.zero_grad()   # Clear previous gradients
        loss.backward()         # Compute new gradients
        optimizer.step()        # Update the model's parameters
        
        epoch_loss += loss.item()
    
    average_loss = epoch_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {average_loss:.4f}")
    epoch_result.append([epoch,average_loss])

decoder_input: torch.Size([50, 40])
target_output: torch.Size([50, 40])
Logits shape: torch.Size([50, 40, 119547])
Loss: 11.975255012512207
decoder_input: torch.Size([50, 40])
target_output: torch.Size([50, 40])
Logits shape: torch.Size([50, 40, 119547])
Loss: 10.264413833618164
decoder_input: torch.Size([50, 40])
target_output: torch.Size([50, 40])
Logits shape: torch.Size([50, 40, 119547])
Loss: 10.630443572998047
decoder_input: torch.Size([50, 40])
target_output: torch.Size([50, 40])
Logits shape: torch.Size([50, 40, 119547])
Loss: 10.061494827270508
decoder_input: torch.Size([50, 40])
target_output: torch.Size([50, 40])
Logits shape: torch.Size([50, 40, 119547])
Loss: 9.788758277893066
decoder_input: torch.Size([50, 40])
target_output: torch.Size([50, 40])
Logits shape: torch.Size([50, 40, 119547])
Loss: 9.845888137817383
decoder_input: torch.Size([50, 40])
target_output: torch.Size([50, 40])
Logits shape: torch.Size([50, 40, 119547])
Loss: 9.581720352172852
decoder_input: torch.Si

In [14]:
epoch_result

[[0, 9.797934532165527],
 [1, 6.582284307479858],
 [2, 5.15256519317627],
 [3, 4.361662435531616],
 [4, 3.8526048183441164]]

# testing

In [20]:
# load test data 
with open("test.json","r",encoding="utf-8") as f:
    data=json.load(f)
# print("Number of examples:", len(data))
# print(json.dumps(data[0], indent=2))

df = pd.DataFrame(data)
df.shape

df_exploded = df.explode("conversation")
# Reset index for convenience
df_exploded = df_exploded.reset_index(drop=True)
# Normalize the "conversation" column into separate columns
conversation_df = pd.json_normalize(df_exploded["conversation"])
# Merge the normalized data back into the exploded df
df_exploded = df_exploded.drop("conversation", axis=1).join(conversation_df)
# only need the ja_sentance and en_sentence columns
test_data=df_exploded[["ja_sentence","en_sentence"]]


In [23]:
# prep tensor for testing
# Tokenize the test sentences (using the same tokenizer as before)
test_data["src_ids"] = test_data["ja_sentence"].apply(lambda s: tokenizer.encode(s, add_special_tokens=True))
test_data["tgt_ids"] = test_data["en_sentence"].apply(lambda s: tokenizer.encode(s, add_special_tokens=True))

# Convert the token lists to tensors
src_id_tensors_test = [torch.tensor(ids) for ids in test_data["src_ids"]]
tgt_id_tensors_test = [torch.tensor(ids) for ids in test_data["tgt_ids"]]

# Pad the sequences
src_padded_test = pad_sequence(src_id_tensors_test, batch_first=True, padding_value=tokenizer.pad_token_id)
tgt_padded_test = pad_sequence(tgt_id_tensors_test, batch_first=True, padding_value=tokenizer.pad_token_id)

# Create a dataset and DataLoader for test data
test_dataset = TensorDataset(src_padded_test, tgt_padded_test)
test_dataloader = DataLoader(test_dataset, batch_size=50, shuffle=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data["src_ids"] = test_data["ja_sentence"].apply(lambda s: tokenizer.encode(s, add_special_tokens=True))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data["tgt_ids"] = test_data["en_sentence"].apply(lambda s: tokenizer.encode(s, add_special_tokens=True))


In [26]:
src_padded_test.shape

torch.Size([2120, 74])

In [27]:
# evluation
model.eval()  # Set model to evaluation mode

total_loss = 0.0
total_correct = 0
total_tokens = 0

with torch.no_grad():
    for batch in test_dataloader:
        src_ids, tgt_ids = batch
        src_ids = src_ids.to(device)
        tgt_ids = tgt_ids.to(device)
        
        # Create decoder input and target output by shifting tgt_ids
        decoder_input = tgt_ids[:, :-1]  # shape: (batch_size, tgt_seq_len - 1)
        target_output = tgt_ids[:, 1:]     # shape: (batch_size, tgt_seq_len - 1)
        
        # Generate the target mask for the decoder
        seq_length = decoder_input.size(1)
        tgt_mask = generate_tgt_mask(seq_length, device=decoder_input.device)
        
        # Forward pass through the model
        logits = model(src_ids, decoder_input, tgt_mask=tgt_mask)
        # logits shape: (batch_size, tgt_seq_len - 1, vocab_size)
        
        # Compute loss: flatten logits and target output for CrossEntropyLoss
        loss = criterion(logits.reshape(-1, config.vocab_size), target_output.reshape(-1))
        total_loss += loss.item() #accumulating the numeric loss value from each batch
        
        # Compute token-level accuracy
        predictions = torch.argmax(logits, dim=-1)  # shape: (batch_size, tgt_seq_len - 1)
        # Create a mask to ignore pad tokens in the target, creates a boolean tensor label True where the target output is not the pad token,
        non_pad = target_output != tokenizer.pad_token_id 
        correct = (predictions == target_output) & non_pad
        total_correct += correct.sum().item()
        total_tokens += non_pad.sum().item()

avg_loss = total_loss / len(test_dataloader)
accuracy = total_correct / total_tokens if total_tokens > 0 else 0

print("Test Loss:", avg_loss)
print("Test Accuracy:", accuracy)


Test Loss: 6.441004620041958
Test Accuracy: 0.2197485128058989


# option
to save and load the model back

In [19]:
# Save the model's state dict to a file
# torch.save(model.state_dict(), "transformer_model.pth")

# Create a new instance of the model with the same configuration
# model_loaded = TransformerModel(config)
# model_loaded.load_state_dict(torch.load("transformer_model.pth"))
# model_loaded.to(device)  # Move the model to the appropriate device (GPU or CPU)
# model_loaded.eval()      # Set the model to evaluation mode
