In [1]:
import numpy as np
import pandas as pd
import os

def batch_encode_p1000(numbers):
    """
    Encode a batch of numbers using the P1000 scheme.

    Every value is represented by three tokens such that
    value ~= sign * (mantissa / 100) * 10**exponent, i.e. three significant
    digits. Zero is encoded as ('+', '000', 'E+0').

    Parameters:
      numbers (array-like or scalar): Finite numbers to encode. A scalar is
        treated as a batch of one.

    Returns:
      sign_tokens (np.ndarray): Array of sign tokens ('+' or '-').
      mantissa_tokens (np.ndarray): Array of mantissa tokens as zero-padded
        3-digit strings ('000' for zero, otherwise '100'..'999').
      exponent_tokens (np.ndarray): Array of exponent tokens as strings (e.g., 'E+3').

    Raises:
      ValueError: If any input is NaN or infinite. Such values have no
        mantissa/exponent form; previously NaN was silently encoded as zero
        and infinity produced meaningless tokens.
    """
    # atleast_1d lets callers pass a bare scalar; the formatting loops below
    # need something iterable.
    numbers = np.atleast_1d(np.asarray(numbers, dtype=float))
    if not np.isfinite(numbers).all():
        raise ValueError("batch_encode_p1000 cannot encode NaN or infinite values")

    # Sign tokens: vectorized assignment based on the sign of the numbers
    sign_tokens = np.where(numbers < 0, '-', '+')

    # Work with absolute values
    abs_numbers = np.abs(numbers)

    # Zeros keep exponent 0 / mantissa 0 and never reach log10.
    non_zero = abs_numbers > 0
    exponent = np.zeros_like(abs_numbers, dtype=int)
    normalized = np.zeros_like(abs_numbers, dtype=float)

    # Decimal exponent of the leading digit for non-zero entries.
    exponent[non_zero] = np.floor(np.log10(abs_numbers[non_zero])).astype(int)

    # Float base so that negative integer exponents are allowed by np.power.
    normalized[non_zero] = abs_numbers[non_zero] / np.power(10.0, exponent[non_zero])

    # normalized is in [1, 10) for non-zero input, so *100 gives 100..1000.
    mantissas = np.round(normalized * 100).astype(int)

    # Rounding can push e.g. 9.996 up to 1000: renormalize to 100 * 10**(e+1).
    overflow = mantissas >= 1000
    mantissas[overflow] = 100
    exponent[overflow] += 1

    # dtype=str keeps a string dtype even when the batch is empty.
    mantissa_tokens = np.array([f"{m:03d}" for m in mantissas], dtype=str)

    # '+d' always emits the sign, giving 'E+3', 'E+0' or 'E-2'.
    exponent_tokens = np.array([f"E{e:+d}" for e in exponent], dtype=str)

    return sign_tokens, mantissa_tokens, exponent_tokens


In [2]:


# Assuming batch_encode_p1000 function is already defined above

def process_file_with_columns(file_path, col_sep_token="[COL_SEP]",nrows=5,random_seed=42):
    """
    Process a single whitespace-delimited, header-less file of numerical data.

    Each number is encoded with the P1000 scheme (sign, mantissa, exponent
    tokens). Column structure is preserved by inserting ``col_sep_token``
    between the triplets of adjacent columns, and every row is introduced by
    a "[ROW_SEP]" token. The stream is terminated by "[DATA_END]".

    Parameters:
      file_path: Path of the file to read.
      col_sep_token (str): Token placed between consecutive columns of a row.
      nrows (int): Maximum number of rows to keep. Files with more rows are
        randomly down-sampled to exactly this many.
      random_seed (int): Seed for the row sampling, so the same rows are
        picked on every run.

    Returns:
      list: Flat token list for the entire file:
        [ROW_SEP] t t t [COL_SEP] t t t ... [ROW_SEP] ... [DATA_END]
    """
    # Read the file into a DataFrame (any run of whitespace separates columns)
    df = pd.read_csv(file_path,sep=r"\s+",header=None)

    if len(df)>nrows:
        df=df.sample(n=nrows,random_state=random_seed)

    token_list = []

    for _, row in df.iterrows():
        # Encode the whole row in one vectorized call instead of one call per cell.
        sign_tokens, mantissa_tokens, exponent_tokens = batch_encode_p1000(
            [row[col_idx] for col_idx in df.columns]
        )

        # Every row starts with the row delimiter.
        token_list.append("[ROW_SEP]")
        for position, triplet in enumerate(zip(sign_tokens, mantissa_tokens, exponent_tokens)):
            # Delimiter goes between columns only, never after the last one.
            if position:
                token_list.append(col_sep_token)
            token_list.extend(triplet)

    token_list.append("[DATA_END]")
    return token_list

# # Example usage for a single file:
# file_path = "../Feynman_with_units/I.6.2"  # Update the path as needed
# token_list = process_file_with_columns(file_path)
# print(token_list[:20])  # Print first 20 tokens for inspection
# print("Total tokens in file:", len(token_list))


In [3]:
# Directory scanned for input data files (presumably one file per Feynman equation ID — confirm).
folder = "Feynman_with_units"

# Keep regular files only (sub-directories are skipped). os.listdir returns
# names in arbitrary order, so the order of file_list is not guaranteed.
file_list = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]
print(file_list)

['II.8.31', 'III.12.43', 'I.18.4', 'II.10.9', 'II.11.27', 'III.21.20', 'I.18.14', 'III.9.52', 'I.6.2', 'II.36.38', 'II.2.42', 'II.13.34', 'I.13.12', 'III.4.33', 'I.12.5', 'II.34.29a', 'I.34.27', 'I.34.14', 'II.13.23', 'I.50.26', 'II.4.23', 'II.11.17', 'III.15.12', 'II.21.32', 'I.6.2b', 'II.11.20', 'I.39.11', 'II.6.15a', 'I.32.17', 'I.14.4', 'I.15.3x', 'II.27.18', 'I.6.2a', 'I.43.31', 'I.12.4', 'III.7.38', 'III.17.37', 'I.15.3t', 'III.10.19', 'II.6.11', 'I.32.5', 'I.39.1', 'I.34.8', 'I.13.4', 'II.34.2a', 'I.34.1', 'II.34.2', 'II.3.24', 'I.48.2', 'I.29.16', 'I.12.1', 'I.43.16', 'I.47.23', 'I.11.19', 'I.12.2', 'II.37.1', 'I.30.5', 'II.34.29b', 'I.40.1', 'II.11.3', 'III.13.18', 'I.39.22', 'I.15.1', 'II.24.17', 'I.26.2', 'I.24.6', 'II.38.3', 'III.15.27', 'I.25.13', 'III.14.14', 'II.27.16', 'III.15.14', 'II.35.18', 'I.14.3', 'I.37.4', 'I.38.12', 'III.4.32', 'III.8.54', 'I.18.12', 'II.15.4', 'I.12.11', 'I.44.4', 'I.41.16', 'II.15.5', 'II.35.21', 'II.38.14', 'I.9.18', 'I.43.43', 'I.16.6', 'II.

In [4]:
# Sanity check: how many data files were discovered in `folder`.
print(len(file_list))

100


In [5]:
def process_all_files_with_columns(file_list, folder_path, col_sep_token="[COL_SEP]", nrows=5, random_seed=42):
    """
    Encode every file in ``file_list`` and collect the token streams.

    Parameters:
      file_list: Iterable of file names, relative to ``folder_path``.
      folder_path (str): Directory containing the files.
      col_sep_token (str): Token inserted between columns of a row.
      nrows (int): Maximum number of rows sampled from each file (default 5).
      random_seed (int): Seed used for the per-file row sampling.

    Returns:
      dict: Maps each file name to the token list returned by
        process_file_with_columns for that file.
    """
    all_file_tokens = {}
    for file_name in file_list:
        file_path = os.path.join(folder_path, file_name)
        tokens = process_file_with_columns(
            file_path,
            col_sep_token=col_sep_token,
            nrows=nrows,              # at most `nrows` sampled rows per file
            random_seed=random_seed
        )
        all_file_tokens[file_name] = tokens
        print(f"Processed {file_name}, token sequence length: {len(tokens)}")
    return all_file_tokens

# Encode every discovered file; all_tokens maps file name -> list of token strings.
folder_path = "Feynman_with_units"
all_tokens = process_all_files_with_columns(file_list,folder_path)


Processed II.8.31, token sequence length: 61
Processed III.12.43, token sequence length: 61
Processed I.18.4, token sequence length: 101
Processed II.10.9, token sequence length: 81
Processed II.11.27, token sequence length: 101
Processed III.21.20, token sequence length: 101
Processed I.18.14, token sequence length: 101
Processed III.9.52, token sequence length: 141
Processed I.6.2, token sequence length: 61
Processed II.36.38, token sequence length: 181
Processed II.2.42, token sequence length: 121
Processed II.13.34, token sequence length: 81
Processed I.13.12, token sequence length: 121
Processed III.4.33, token sequence length: 101
Processed I.12.5, token sequence length: 61
Processed II.34.29a, token sequence length: 81
Processed I.34.27, token sequence length: 61
Processed I.34.14, token sequence length: 81
Processed II.13.23, token sequence length: 81
Processed I.50.26, token sequence length: 101
Processed II.4.23, token sequence length: 81
Processed II.11.17, token sequence le

In [6]:
from tokenizers import Tokenizer

# Load the tokenizer definition from disk.
tokenizer=Tokenizer.from_file("my_tokenizer.json")
# Maps each file name to the list of token IDs produced for its token stream.
tokenized_dict={}
for key,value in all_tokens.items():
    # The tokens are space-joined into a single string before encoding.
    current_token_string=" ".join(value)
    encoding_result = tokenizer.encode(current_token_string)  
    tokenized_dict[key] = encoding_result.ids


    

In [7]:
import json
# Persist the {file name: token IDs} mapping; later cells reload it from this file.
with open("dataset_encoded.json", "w") as f:
    json.dump(tokenized_dict, f, indent=4)

In [8]:
from openpyxl import load_workbook
# data_only=False keeps cell contents as stored (formulas are not replaced by cached values).
wb = load_workbook('FeynmanEquations.xlsx', data_only=False)
ws = wb.active

# 1-based index of the worksheet column to export.
formula_col_index = 1
formula_list=[]
# Start at row 2 to skip the header row.
for row in ws.iter_rows(min_row=2):
    cell = row[formula_col_index - 1]  # row tuple is zero-indexed
    # Empty cells (e.g. trailing blank rows) have value None; skipping them
    # avoids a TypeError when the entries are written out below.
    if cell.value is None:
        continue
    # str() so that numeric cell values can be concatenated with "\n".
    formula_list.append(str(cell.value))


# Save formula_list to a text file, one entry per line.
filename = "filenames.txt"

with open(filename, "w") as f:
    for formula in formula_list:
        f.write(formula + "\n")

print(f"Formulas saved to {filename}")



Formulas saved to filenames.txt


In [9]:
import json

DATA_END_ID = 4  # replace with the actual token ID for [DATA_END]
combined_samples = []

# 1) Load RPN file
with open("formulas_rpn.json", "r") as f:
    rpn_data = json.load(f)  # a list of dicts: [{"id": "I.6.2a", "rpn": [...]}, ...]

# 2) Load numeric-data file
with open("dataset_encoded.json", "r") as f:
    data_dict = json.load(f) # a dict with keys like "I.6.2a" : [ data tokens ], etc.

# 3) Combine data & RPN for each entry
for item in rpn_data:
    formula_id = item["id"]
    rpn_tokens = item["rpn"]

    # Find matching data tokens (same ID) in data_dict
    if formula_id in data_dict:
        data_tokens = data_dict[formula_id]
        # process_file_with_columns already ends every token stream with
        # "[DATA_END]", so the encoded data normally ends with DATA_END_ID.
        # Only add the separator when it is missing, otherwise the sample
        # would contain the sentinel twice in a row.
        # NOTE(review): relies on DATA_END_ID matching the tokenizer's ID for
        # "[DATA_END]" — confirm against my_tokenizer.json.
        if data_tokens and data_tokens[-1] == DATA_END_ID:
            combined = data_tokens + rpn_tokens
        else:
            combined = data_tokens + [DATA_END_ID] + rpn_tokens
        combined_samples.append(combined)
    else:
        print(f"Warning: {formula_id} not found in dataset_encoded.json")

# 'combined_samples' now contains the full sequences: data tokens, a single
# [DATA_END] separator, then the formula's RPN tokens.
with open("combined_samples.json", "w") as f:
    json.dump(combined_samples, f, indent=4)


In [10]:
def levenshtein_distance(seq1, seq2):
    """
    Compute the Levenshtein edit distance between two token sequences.

    The distance is the smallest number of single-token insertions,
    deletions and substitutions that turns seq1 into seq2. Elements only
    need to support ``==``.
    """
    n_rows = len(seq1) + 1
    n_cols = len(seq2) + 1

    # table[r][c] holds the distance between seq1[:r] and seq2[:c].
    # Border cells are the base cases: turning a prefix into/from the empty
    # sequence costs exactly its length.
    table = [
        [r + c if r == 0 or c == 0 else 0 for c in range(n_cols)]
        for r in range(n_rows)
    ]

    for r, left in enumerate(seq1, start=1):
        for c, right in enumerate(seq2, start=1):
            substitute = table[r - 1][c - 1] + (0 if left == right else 1)
            delete = table[r - 1][c] + 1
            insert = table[r][c - 1] + 1
            table[r][c] = min(delete, insert, substitute)

    return table[-1][-1]


def normalized_edit_distance(seq1, seq2):
    """
    Levenshtein distance scaled into [0, 1] by the longer sequence's length.

    0.0 means the sequences are identical (two empty sequences included);
    1.0 means every position of the longer sequence needs an edit.
    """
    # Two empty sequences are identical; returning early also avoids 0 / 0.
    if not (seq1 or seq2):
        return 0.0
    return levenshtein_distance(seq1, seq2) / max(len(seq1), len(seq2))


def edit_distance_score(seq1, seq2):
    """
    Similarity in [0, 1] derived from the normalized edit distance.

    1.0 means the sequences are identical, 0.0 means completely different.
    """
    return 1.0 - normalized_edit_distance(seq1, seq2)


In [11]:
import json
import random
from typing import List, Tuple
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.optim.lr_scheduler import LambdaLR
###############################################################################
# 1. Dataset and Collation
###############################################################################



class TokenSequenceDataset(Dataset):
    """
    In-memory dataset of pre-tokenized sequences (lists of integer token IDs).

    Indexing yields a next-token-prediction pair built from one sequence:
      - the input is the sequence without its last token,
      - the target is the sequence without its first token.
    """
    def __init__(self, sequences: List[List[int]]):
        super().__init__()
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx) -> Tuple[List[int], List[int]]:
        tokens = self.sequences[idx]
        # Shift by one position: target[i] is the token that follows input[i].
        return tokens[:-1], tokens[1:]


class PadCollator:
    """
    Collate function that right-pads every (input, target) pair of a batch
    to the length of the longest input and builds the matching attention mask.
    """
    def __init__(self, pad_token_id: int = 0):
        self.pad_token_id = pad_token_id

    def __call__(self, batch: List[Tuple[List[int], List[int]]]):
        """
        Returns three LongTensors of shape (batch, max_len):
        padded inputs, padded targets, and the attention mask
        (1 for real tokens, 0 for padding).
        """
        inputs, targets = zip(*batch)
        width = max(map(len, inputs))

        rows_in, rows_tgt, rows_mask = [], [], []
        for inp, tgt in zip(inputs, targets):
            real = len(inp)
            # The same filler is appended to input and target, keeping them aligned.
            filler = [self.pad_token_id] * (width - real)
            rows_in.append(inp + filler)
            rows_tgt.append(tgt + filler)
            rows_mask.append([1] * real + [0] * len(filler))

        return (
            torch.tensor(rows_in, dtype=torch.long),
            torch.tensor(rows_tgt, dtype=torch.long),
            torch.tensor(rows_mask, dtype=torch.long),
        )


###############################################################################
# 2. Model Definition
###############################################################################
class PositionalEncoding(nn.Module):
    """
    Standard sinusoidal positional encoding.

    A (1, max_len, d_model) table is precomputed once and stored as the
    buffer ``pe``; even feature indices hold sines and odd indices cosines.
    The table is added to the input and dropout is applied to the sum.

    Args:
        d_model: Feature dimension of the embeddings (even or odd).
        dropout: Dropout probability applied after adding the encoding.
        max_len: Number of positions precomputed; inputs must not be longer.
    """
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 50000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # One frequency per even feature index: ceil(d_model / 2) entries.
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model)
        )
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        # There are only floor(d_model / 2) odd feature indices, so for an odd
        # d_model the last frequency has no cosine slot and must be dropped;
        # without the slice the assignment fails with a shape mismatch.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)  # (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x shape: (batch, seq_len, d_model)

        Returns a tensor of the same shape with positional encoding added
        and dropout applied.
        """
        seq_len = x.size(1)
        # Broadcast the first seq_len rows of the table over the batch dimension.
        x = x + self.pe[:, :seq_len]
        return self.dropout(x)


class DecoderOnlyTransformer(nn.Module):
    """
    A causal (decoder-only) Transformer model for next-token prediction.

    The model is a stack of nn.TransformerDecoderLayer modules fed with a
    zero-length memory, so only the causal self-attention path sees data.

    Args:
        vocab_size: Number of rows of the embedding table and width of the
            output logits.
        d_model: Embedding / hidden dimension.
        nhead: Number of attention heads.
        num_layers: Number of decoder layers.
        dim_feedforward: Hidden size of each layer's feed-forward block.
        dropout: Dropout probability used throughout.
    """
    def __init__(
        self,
        vocab_size: int,
        d_model: int = 256,
        nhead: int = 4,
        num_layers: int = 10,
        dim_feedforward: int = 1024,
        dropout: float = 0.1,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model

        # Token Embedding
        self.token_emb = nn.Embedding(vocab_size, d_model)

        # Positional Encoding
        self.pos_encoder = PositionalEncoding(d_model, dropout=dropout)

        # Transformer Decoder Layers
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation='relu'
        )
        self.transformer_decoder = nn.TransformerDecoder(
            decoder_layer,
            num_layers=num_layers
        )

        # Final projection to vocabulary
        self.output_proj = nn.Linear(d_model, vocab_size)

        # Causal mask cache (for efficiency). Non-persistent: the cache is
        # derived data whose shape depends on the longest sequence seen so
        # far, so storing it in state_dict would make checkpoints fail to
        # load into a model whose cache is empty or has a different size.
        self.register_buffer("mask_cache", None, persistent=False)

    def _generate_causal_mask(self, sz: int, device: torch.device):
        """
        Return a (sz, sz) boolean mask where True blocks attention, so each
        token can only attend to itself and preceding tokens.

        The largest mask built so far is cached and sliced for shorter
        sequences.
        """
        cache = self.mask_cache
        if cache is None or cache.size(0) < sz or cache.device != device:
            # Strict upper triangle = future positions = blocked.
            cache = torch.triu(torch.ones(sz, sz, device=device), diagonal=1).bool()
            self.mask_cache = cache
        return cache[:sz, :sz]

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """
        input_ids: (batch, seq_len)
        attention_mask: (batch, seq_len) => 1 for real tokens, 0 for pads

        Returns:
          logits of shape (batch, seq_len, vocab_size)
        """
        device = input_ids.device
        batch_size, seq_len = input_ids.shape

        # Token embeddings plus positional encoding: (batch, seq_len, d_model)
        hidden = self.pos_encoder(self.token_emb(input_ids))

        # Causality is shared by the whole batch -> 2D tgt_mask.
        causal_mask = self._generate_causal_mask(seq_len, device=device)  # (seq_len, seq_len)

        # Padding differs per sequence -> (batch, seq_len) key padding mask,
        # True where the key is a pad token. It must NOT be folded into
        # tgt_mask: tgt_mask is applied to every batch element, so merging one
        # sequence's padding into it (previously the first element's) would
        # hide real tokens of all longer sequences in the batch.
        key_padding_mask = ~attention_mask.bool()

        # The decoder layers are not batch_first: (seq_len, batch, d_model)
        hidden = hidden.permute(1, 0, 2)

        # Zero-length memory: there is no encoder, so the cross-attention
        # block has nothing to attend to.
        empty_memory = torch.zeros(
            0, batch_size, self.d_model, device=device, dtype=hidden.dtype
        )

        decoded = self.transformer_decoder(
            hidden,
            memory=empty_memory,
            tgt_mask=causal_mask,
            tgt_key_padding_mask=key_padding_mask
        )

        # Undo the permute: (seq_len, batch, d_model) -> (batch, seq_len, d_model)
        decoded = decoded.permute(1, 0, 2).contiguous()

        # Project to vocab
        logits = self.output_proj(decoded)  # (batch, seq_len, vocab_size)
        return logits


###############################################################################
# 3. Training / Validation / Testing
###############################################################################
def train_one_epoch(
    model: nn.Module,
    dataloader: DataLoader,
    optimizer: optim.Optimizer,
    criterion: nn.CrossEntropyLoss,
    device: torch.device
) -> Tuple[float, float]:
    """
    Train for one epoch. Returns (avg_loss, approx_token_accuracy).

    Args:
        model: Called as model(inputs, attention_mask); must return logits of
            shape (batch, seq_len, vocab_size).
        dataloader: Yields (inputs, targets, attention_mask) batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss applied to the flattened logits and targets.
        device: Device the batches are moved to.

    Returns:
        avg_loss: Mean of the per-batch losses.
        approx_token_accuracy: Fraction of positions with attention_mask == 1
            whose argmax prediction equals the target (0.0 if there are none).
    """
    model.train()
    running_loss = 0.0
    running_correct = 0
    total_tokens = 0

    for inputs, targets, attention_mask in dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        attention_mask = attention_mask.to(device)

        optimizer.zero_grad()
        logits = model(inputs, attention_mask)  # (batch, seq_len, vocab_size)

        # Reshape for loss: (batch*seq_len, vocab_size) vs (batch*seq_len)
        vocab_size = logits.size(-1)
        loss = criterion(logits.view(-1, vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()

        # Accumulate stats
        running_loss += loss.item()
        # Approximate token-level accuracy, counted on non-padded positions only
        preds = logits.argmax(dim=-1)  # (batch, seq_len)
        mask_flat = attention_mask.view(-1).bool()
        correct = (preds.view(-1)[mask_flat] == targets.view(-1)[mask_flat]).sum().item()
        count = mask_flat.sum().item()
        running_correct += correct
        total_tokens += count

    avg_loss = running_loss / len(dataloader)
    avg_acc = running_correct / total_tokens if total_tokens > 0 else 0.0
    return avg_loss, avg_acc


def validate(
    model: nn.Module,
    dataloader: DataLoader,
    criterion: nn.CrossEntropyLoss,
    device: torch.device
) -> Tuple[float, float]:
    """
    Evaluate the model without gradient tracking.

    Returns (avg_loss, approx_token_accuracy): the mean of the per-batch
    losses and the fraction of attention_mask == 1 positions whose argmax
    prediction equals the target (0.0 when there are no such positions).
    """
    model.eval()
    loss_sum = 0.0
    hits = 0
    seen = 0

    with torch.no_grad():
        for inputs, targets, attention_mask in dataloader:
            inputs, targets, attention_mask = (
                inputs.to(device),
                targets.to(device),
                attention_mask.to(device),
            )

            logits = model(inputs, attention_mask)
            flat_logits = logits.view(-1, logits.size(-1))
            loss_sum += criterion(flat_logits, targets.view(-1)).item()

            # A position counts as a hit only if it is a real (non-pad)
            # position and the argmax prediction equals the target.
            real = attention_mask.bool()
            hits += ((logits.argmax(dim=-1) == targets) & real).sum().item()
            seen += real.sum().item()

    mean_loss = loss_sum / len(dataloader)
    accuracy = hits / seen if seen > 0 else 0.0
    return mean_loss, accuracy


def test_sequence_accuracy(
    model: nn.Module,
    dataloader: DataLoader,
    device: torch.device
) -> float:
    """
    Computes the fraction of sequences where the entire predicted sequence
    matches the target sequence exactly. 
    """
    model.eval()
    correct_sequences = 0
    total_sequences = 0

    with torch.no_grad():
        for inputs, targets, attention_mask in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            attention_mask = attention_mask.to(device)

            logits = model(inputs, attention_mask)
            preds = logits.argmax(dim=-1)  # (batch, seq_len)

            # For each sequence in the batch, compare all tokens (where attention_mask=1).
            for i in range(inputs.size(0)):
                seq_mask = attention_mask[i].bool()
                pred_seq = preds[i, seq_mask]
                tgt_seq = targets[i, seq_mask]
                total_sequences += 1
                if torch.equal(pred_seq, tgt_seq):
                    correct_sequences += 1

    return correct_sequences / total_sequences if total_sequences > 0 else 0.0

def test_edit_distance_score(
    model: nn.Module,
    dataloader: DataLoader,
    device: torch.device
) -> float:
    """
    Mean edit-distance similarity between predictions and targets.

    For each sequence the argmax predictions and the targets are restricted
    to positions where attention_mask == 1 and scored with
    edit_distance_score (1.0 = identical). The per-sequence scores are
    averaged over the whole dataloader; 0.0 is returned if it is empty.
    """
    model.eval()
    scores = []

    with torch.no_grad():
        for inputs, targets, attention_mask in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            attention_mask = attention_mask.to(device)

            preds = model(inputs, attention_mask).argmax(dim=-1)

            for row in range(inputs.size(0)):
                # Only real (non-padded) positions take part in the comparison.
                real = attention_mask[row].bool()
                predicted = preds[row, real].cpu().tolist()
                expected = targets[row, real].cpu().tolist()
                scores.append(edit_distance_score(predicted, expected))

    if not scores:
        return 0.0
    return sum(scores) / len(scores)

###############################################################################
# 4. Main Script
###############################################################################
def _collect_sequences(token_data_dict):
    """Flatten a {name: sequence or list of sequences} mapping into a list of token-ID sequences."""
    sequences = []
    for entry in token_data_dict.values():
        if not entry:
            continue  # empty entry: nothing to add, and entry[0] would fail
        if isinstance(entry[0], int):
            # single sequence of token IDs
            sequences.append(entry)
        else:
            # multiple sequences in a sub-list
            sequences.extend(entry)
    return sequences


def _chunk_sequences(sequences, max_chunk_len):
    """Split sequences into consecutive chunks of at most max_chunk_len tokens, keeping chunks with >= 2 tokens."""
    chunks = []
    for seq in sequences:
        for start in range(0, len(seq), max_chunk_len):
            chunk = seq[start:start + max_chunk_len]
            # A chunk needs at least 2 tokens for the input/target shift.
            if len(chunk) > 1:
                chunks.append(chunk)
    return chunks


def main():
    """
    Train and evaluate the decoder-only transformer on the tokenized data.

    Steps: load "dataset_encoded.json", print length statistics, cut every
    sequence into chunks of at most 64 tokens, split the chunks 80/10/10
    into train/val/test, train for 500 epochs while checkpointing the best
    validation loss to "transformer_model.pt", then report full-sequence
    accuracy and the edit-distance score on the test split.

    Raises:
        ValueError: If the JSON file contains no sequence with >= 2 tokens.
    """
    seed = 42  # shared by the shuffle and the split so runs are reproducible

    # ---------------------------
    # 4.1 Load Tokenized Data
    # ---------------------------
    # NOTE(review): this reads the data-only token file; the data+RPN samples
    # written to combined_samples.json elsewhere in this notebook are not
    # used here — confirm this is intended.
    json_path = "dataset_encoded.json"  # Change to your path if needed
    with open(json_path, "r") as f:
        token_data_dict = json.load(f)

    # token_data_dict is assumed to be {filename: [list_of_token_ids], ...}
    # (a list of such lists per entry is also accepted).
    # Sequences shorter than 2 tokens can't be shifted into input/target pairs.
    all_sequences = [seq for seq in _collect_sequences(token_data_dict) if len(seq) > 1]
    if not all_sequences:
        raise ValueError(f"No sequences with at least 2 tokens found in {json_path}")

    ###############################################################################
    # Step 1: Check average sequence length & approximate perfect-sequence probability
    ###############################################################################
    lengths = [len(seq) for seq in all_sequences]
    avg_len = sum(lengths) / len(lengths)
    max_len = max(lengths)
    print(f"Number of sequences: {len(all_sequences)}")
    print(f"Average sequence length: {avg_len:.2f}")
    print(f"Max sequence length: {max_len}")

    # Suppose token accuracy is around 0.70:
    approx_token_acc = 0.70
    print(f"Average Length:{avg_len}")
    approx_full_seq_acc = (approx_token_acc ** avg_len)
    print(f"Estimated fraction fully correct at 70% token acc: {approx_full_seq_acc:.6f}")

    MAX_CHUNK_LEN = 64  # or any reasonable max length
    all_sequences = _chunk_sequences(all_sequences, MAX_CHUNK_LEN)

    # ---------------------------
    # 4.2 Split into Train/Val/Test
    # ---------------------------
    # A dedicated seeded RNG makes the shuffle reproducible without touching
    # the global `random` state.
    random.Random(seed).shuffle(all_sequences)
    dataset_size = len(all_sequences)
    train_size = int(0.8 * dataset_size)
    val_size = int(0.1 * dataset_size)
    test_size = dataset_size - train_size - val_size

    train_data, val_data, test_data = random_split(
        all_sequences,
        [train_size, val_size, test_size],
        generator=torch.Generator().manual_seed(seed)
    )

    train_sequences = list(train_data)
    val_sequences = list(val_data)
    test_sequences = list(test_data)

    ###############################################################################
    # Step 3: Inspect a few example sequences
    ###############################################################################
    print("Sample (raw) sequences from all_sequences:")
    # min() guards against datasets with fewer than 3 chunks.
    for i in range(min(3, len(all_sequences))):
        print(f"Example {i}:", all_sequences[i])
        print("Length:", len(all_sequences[i]))
        print("--------")

    # ---------------------------
    # 4.3 Create Datasets & Loaders
    # ---------------------------
    pad_token_id = 0  # Adjust if your PAD token is something else
    train_dataset = TokenSequenceDataset(train_sequences)
    val_dataset = TokenSequenceDataset(val_sequences)
    test_dataset = TokenSequenceDataset(test_sequences)

    collator = PadCollator(pad_token_id=pad_token_id)
    batch_size = 256

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collator)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collator)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collator)

    ###############################################################################
    # Step 4: Evaluate on short sequences only
    ###############################################################################
    short_test_sequences = [seq for seq in test_sequences if len(seq) <= 8]
    print(f"Number of short test sequences (<= 8 tokens): {len(short_test_sequences)}")

    if len(short_test_sequences) > 0:
        # Kept for the short-sequence accuracy check after training.
        short_test_dataset = TokenSequenceDataset(short_test_sequences)
        short_test_loader = DataLoader(
            short_test_dataset,
            batch_size=256,
            shuffle=False,
            collate_fn=collator
        )
    else:
        short_test_loader = None
        print("No short sequences to test.")

    sample_batch = next(iter(train_loader))
    print("Sample input shape:", sample_batch[0].shape)
    print("Sample target shape:", sample_batch[1].shape)
    print("Sample mask shape:", sample_batch[2].shape)
    print("Sample input:", sample_batch[0][0][:10])  # First 10 tokens of first example
    print("Sample target:", sample_batch[1][0][:10])
    print("Max token ID in input:", sample_batch[0].max().item())
    print("Max token ID in target:", sample_batch[1].max().item())

    # ---------------------------
    # 4.4 Model, Optimizer, Loss
    # ---------------------------
    # The embedding table must have a row for every token ID in the data;
    # an ID >= vocab_size otherwise fails inside nn.Embedding with an
    # index error. 2000 stays the minimum so the model size is unchanged
    # whenever all IDs already fit.
    max_token_id = max(max(seq) for seq in all_sequences)
    vocab_size = max(2000, max_token_id + 1)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = DecoderOnlyTransformer(
        vocab_size=vocab_size,
        d_model=512,          # Hidden dimension
        nhead=8,              # Number of attention heads
        num_layers=10,         # Number of transformer decoder layers
        dim_feedforward=1024, # FFN dimension
        dropout=0.1
    )
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=5e-4)
    # Padded target positions are excluded from the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)

    # ---------------------------
    # 4.5 Training Loop
    # ---------------------------
    epochs = 500
    best_val_loss = float('inf')

    for epoch in range(1, epochs + 1):
        # --- Train ---
        train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
        # --- Validate ---
        val_loss, val_acc = validate(model, val_loader, criterion, device)

        print(f"[Epoch {epoch}/{epochs}] "
              f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} || "
              f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

        # Checkpoint if validation loss improves
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "transformer_model.pt")
            print("  (Best validation loss so far, saving model)")

    print("Training complete. Loading best model weights from checkpoint.")
    # map_location keeps the load working when the checkpoint was written on
    # a different device than the one in use now.
    model.load_state_dict(torch.load("transformer_model.pt", map_location=device))

    # ---------------------------
    # 4.6 Evaluate on Test Set (Sequence Accuracy)
    # ---------------------------
    seq_acc = test_sequence_accuracy(model, test_loader, device)
    print(f"Test Full-Sequence Accuracy: {seq_acc:.4f}")

    if short_test_loader is not None:
        short_seq_acc = test_sequence_accuracy(model, short_test_loader, device)
        print(f"Short-test-set (<=8 tokens) full-sequence accuracy: {short_seq_acc:.4f}")

    # New partial-credit metric
    eds_score = test_edit_distance_score(model, test_loader, device)
    print(f"Average Edit-Distance Score (1=identical): {eds_score:.4f}")


# Run the full train/evaluate pipeline when this cell (or script) is executed directly.
if __name__ == "__main__":
    main()


Number of sequences: 100
Average sequence length: 94.01
Max sequence length: 201
Average Length:94.01
Estimated fraction fully correct at 70% token acc: 0.000000
I want the code to be executed only till here.
Sample (raw) sequences from all_sequences:
Example 0: [3, 647, 1169, 1731, 2, 647, 1176, 1731, 2, 647, 1011, 1731, 2, 647, 1142, 1731, 2, 648, 1246, 1732, 3, 647, 935, 1731, 2, 647, 946, 1731, 2, 647, 966, 1731, 2, 647, 1168, 1731, 2, 648, 837, 1732, 3, 647, 1119, 1731, 2, 647, 877, 1731, 2, 647, 885, 1731, 2, 647, 1068, 1731, 2, 648, 1065, 1731, 3, 647, 1089, 1731]
Length: 64
--------
Example 1: [3, 647, 930, 1731, 2, 647, 1140, 1731, 2, 647, 1031, 1731, 2, 647, 990, 1731, 2, 647, 1227, 1731, 3, 647, 1032, 1731, 2, 647, 1106, 1731, 2, 647, 902, 1731, 2, 647, 1062, 1731, 2, 647, 1040, 1731, 3, 647, 948, 1731, 2, 647, 1102, 1731, 2, 647, 1132, 1731, 2, 647, 874, 1731, 2, 647, 836, 1732, 3, 647, 1189, 1731]
Length: 64
--------
Example 2: [2, 647, 1021, 1731, 2, 647, 1203, 1731, 2, 6