# Libraries/Imports

In [1]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [2]:
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install datasets evaluate transformers[sentencepiece]
%pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
%pip install kagglehub
%pip install transformers
%pip install accelerate
%pip install --upgrade torch torchvision torchaudio
%pip install --upgrade torchtext

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7

In [3]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import torch

In [4]:
def print_rows(dataset, start=0, end=10, split="", feature=""):
  if split == "":
    for split in dataset:
      print(f"Entries {start+1} - {end} of the {split} data:")
      for i in range(start, end):
        if feature:  # If feature is provided
          print(f"{feature}: {dataset[split][i][feature]}")  # Print the specified feature
        else:
          print(dataset[split][i])  # Print the entire row if no feature is specified
      print("-" * 20)
  else:
    print(f"Entries {start+1} - {end} of the {split} data:")
    for i in range(start, end):
      if feature:  # If feature is provided
        print(f"{feature}: {dataset[split][i][feature]}")  # Print the specified feature
      else:
        print(dataset[split][i])  # Print the entire row if no feature is specified
    print("-" * 20)

# Preprocessing

In [5]:
# Preprocess COT dataset
from datasets import load_dataset

cot_ds = load_dataset("AI-MO/NuminaMath-CoT")

cot_ds['train'] = cot_ds['train'].remove_columns(['messages'])
cot_ds['test'] = cot_ds['test'].remove_columns(['messages'])
cot_ds['train'] = cot_ds['train'].remove_columns(['source'])
cot_ds['test'] = cot_ds['test'].remove_columns(['source'])

# Remove chinese characters from COT dataset
import re

def contains_chinese(text):
    # match Chinese characters
    pattern = re.compile(r'[\u4e00-\u9fff\u2e80-\u2eff\u31c0-\u31ef\uff00-\uffef]')
    return bool(pattern.search(text))

def filter_entries(dataset, fields):
    # Filter out entries that contain Chinese characters
    filtered_dataset = dataset.filter(lambda example: not any(contains_chinese(example[field]) for field in fields))
    return filtered_dataset

# remove entries with Chinese characters
fields_to_check = ['problem', 'solution']
cot_ds['train'] = filter_entries(cot_ds['train'], fields_to_check)
cot_ds['test'] = filter_entries(cot_ds['test'], fields_to_check)
print(cot_ds)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.68k [00:00<?, ?B/s]

train-00000-of-00005.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00001-of-00005.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00002-of-00005.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00003-of-00005.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00004-of-00005.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/166k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/859494 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/859494 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['problem', 'solution'],
        num_rows: 850151
    })
    test: Dataset({
        features: ['problem', 'solution'],
        num_rows: 100
    })
})


In [6]:
# Preprocess MATH dataset (load all json files into into Dataset object)

import kagglehub

# Download latest version
path = kagglehub.dataset_download("mathurinache/math-dataset")

print("Path to dataset files:", path)

import os
from datasets import Dataset, DatasetDict

def load_json_files(data_dir):
    """Loads JSON files from a directory into a Dataset."""
    all_data = []
    problems = 0
    for subdir in os.listdir(data_dir):
      subdir_path = os.path.join(data_dir, subdir)
      for filename in os.listdir(subdir_path):
        if filename.endswith(".json"):
          problems += 1
          filepath = os.path.join(subdir_path, filename)
          with open(filepath, "r") as f:
            all_data.append(json.load(f))
    # Create a Pandas DataFrame to easily convert into a Dataset\
    print(f"Loaded {problems} problems.")
    return all_data

# Assuming 'path' is from kagglehub.dataset_download
math_dir = os.path.join(path, "MATH")
train_dir = os.path.join(math_dir, "train")
test_dir = os.path.join(math_dir, "test")

train_data = load_json_files(train_dir)
test_data = load_json_files(test_dir)

# Convert the train and test data into Dataset objects
train_dataset = Dataset.from_dict({
    "problem": [item["problem"] for item in train_data],
    # "level": [item["level"] for item in train_data],
    # "type": [item["type"] for item in train_data],
    "solution": [item["solution"] for item in train_data]
})

test_dataset = Dataset.from_dict({
    "problem": [item["problem"] for item in test_data],
    # "level": [item["level"] for item in test_data],
    # "type": [item["type"] for item in test_data],
    "solution": [item["solution"] for item in test_data]
})

math_ds = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

print(math_ds)


Downloading from https://www.kaggle.com/api/v1/datasets/download/mathurinache/math-dataset?dataset_version_number=1...


100%|██████████| 7.07M/7.07M [00:00<00:00, 77.0MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/mathurinache/math-dataset/versions/1
Loaded 7500 problems.
Loaded 5000 problems.
DatasetDict({
    train: Dataset({
        features: ['problem', 'solution'],
        num_rows: 7500
    })
    test: Dataset({
        features: ['problem', 'solution'],
        num_rows: 5000
    })
})


In [7]:
# Concatenate and split datasets
from datasets import concatenate_datasets

# Make validation dataset
train_valid_split = cot_ds['train'].train_test_split(test_size=0.1)
cot_ds['train'] = train_valid_split['train']
cot_ds['test'] = train_valid_split['test']

train_valid_split = cot_ds['train'].train_test_split(test_size=0.12)
cot_ds['train'] = train_valid_split['train']
cot_ds['validation'] = train_valid_split['test']

# Add MATH dataset as test dataset
merged_math = concatenate_datasets([math_ds['train'], math_ds['test']])
cot_ds['test'] = concatenate_datasets([cot_ds['test'], merged_math])

ds = cot_ds
print(ds)

del cot_ds
del math_ds

DatasetDict({
    train: Dataset({
        features: ['problem', 'solution'],
        num_rows: 673318
    })
    test: Dataset({
        features: ['problem', 'solution'],
        num_rows: 97516
    })
    validation: Dataset({
        features: ['problem', 'solution'],
        num_rows: 91817
    })
})


In [8]:
print("Split")
print("train:", len(ds['train']) / ( len(ds['train']) + len(ds['validation']) + len(ds['test']) ))
print("test:", len(ds['test']) / ( len(ds['train']) + len(ds['validation']) + len(ds['test']) ))
print("validation:", len(ds['validation']) / ( len(ds['train']) + len(ds['validation']) + len(ds['test']) ))

Split
train: 0.7805219028320839
test: 0.11304223840232029
validation: 0.10643585876559582


# Embedding

In [9]:
ds32 = DatasetDict({
    split: dataset.select(range(32))
    for split, dataset in ds.items()
})
print(ds32)

DatasetDict({
    train: Dataset({
        features: ['problem', 'solution'],
        num_rows: 32
    })
    test: Dataset({
        features: ['problem', 'solution'],
        num_rows: 32
    })
    validation: Dataset({
        features: ['problem', 'solution'],
        num_rows: 32
    })
})


In [10]:
# Generate Embeddings

from transformers import AutoTokenizer, AutoModel
import torch

# Load MathBERT
model_name = "tbs17/MathBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True)
model = AutoModel.from_pretrained(model_name)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/441M [00:00<?, ?B/s]

In [11]:
sample_sentence = 'Let \\( T = 1 - 2 + 3 - 4 + \\cdots + 2022 - 2023 \\). What is the residue of \\( T \\), modulo 2024?'
sample_sentence_ids = tokenizer(sample_sentence, add_special_tokens=True,
        truncation=True,
        max_length=128,).input_ids
print(sample_sentence_ids)


[101, 2292, 1032, 1006, 1056, 1027, 1015, 1011, 1016, 1009, 1017, 1011, 1018, 1009, 1032, 3729, 12868, 1009, 16798, 2475, 1011, 16798, 2509, 1032, 1007, 1012, 2054, 2003, 1996, 21755, 1997, 1032, 1006, 1056, 1032, 1007, 1010, 16913, 18845, 16798, 2549, 1029, 102]


In [12]:
# Function to compute embeddings
def compute_embeddings(batch):
    # Tokenize the problem text
    inputs = tokenizer(
        batch["problem"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt")

    # Move input tensors to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Get embeddings
    with torch.no_grad():
        outputs = model(**inputs)

    # Use the [CLS] token embedding (typically the first token)
    cls_embeddings = outputs.last_hidden_state.squeeze(0).cpu().numpy()

    # Return embeddings
    return {"embeddings": cls_embeddings}

# Apply the function to each split of the dataset
def process_dataset(dataset_dict):
    for split in dataset_dict:
        dataset_dict[split] = dataset_dict[split].map(compute_embeddings, batched=True, batch_size=32)
        dataset_dict[split].set_format(type="torch", columns=["embeddings"], output_all_columns=True)
    return dataset_dict


# Add embeddings to the dataset
ds32 = process_dataset(ds32)

# Save embeddings or access them
torch.save(model.state_dict(), 'mathbert_weights.pth')
# model.load_state_dict(torch.load('mathbert_weights.pth'))

print(ds32)

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['problem', 'solution', 'embeddings'],
        num_rows: 32
    })
    test: Dataset({
        features: ['problem', 'solution', 'embeddings'],
        num_rows: 32
    })
    validation: Dataset({
        features: ['problem', 'solution', 'embeddings'],
        num_rows: 32
    })
})


In [13]:
vocab_size = len(tokenizer)
print(vocab_size)

30522


In [14]:
batch_embeddings = []
for i in range(len(ds32['train'])):
  p= torch.tensor(ds32['train'][i]["embeddings"]).unsqueeze(0)
  batch_embeddings.append(p)

print(len(batch_embeddings))
batch_embeddings = torch.cat(batch_embeddings, dim=0)
print(batch_embeddings.shape)


32
torch.Size([32, 128, 768])


  p= torch.tensor(ds32['train'][i]["embeddings"]).unsqueeze(0)


# Encoder


In [15]:
print(ds32['train'][0])

{'embeddings': tensor([[-1.0494, -0.0814, -0.1708,  ..., -1.1555,  1.3876, -0.3599],
        [ 0.3803,  0.7577, -0.3773,  ...,  0.3179,  0.4490, -1.0780],
        [ 0.1491,  0.4438, -0.6127,  ..., -2.3219,  0.8915, -1.9246],
        ...,
        [-1.4218, -1.1582, -0.2430,  ..., -2.1157,  1.2326,  0.1273],
        [-1.5021, -0.1316,  0.1135,  ..., -2.3079,  0.6984,  0.5811],
        [-1.4839, -1.3890, -0.0781,  ..., -1.5789,  0.8897,  0.7527]]), 'problem': "Let  $ABCD$  be a convex quadrilateral. Let  $E,F,G,H$  be points on segments  $AB$ ,  $BC$ ,  $CD$ ,  $DA$ , respectively, and let  $P$  be the intersection of  $EG$  and  $FH$ . Given that quadrilaterals  $HAEP$ ,  $EBFP$ ,  $FCGP$ ,  $GDHP$  all have inscribed circles, prove that  $ABCD$  also has an inscribed circle.\n\n*Evan O'Dorney.*", 'solution': "1. **Rename the points and define the incircles:**\n   Let us rename the points \\(E, F, G, H, P\\) as \\(P, Q, R, S, O\\). Let the incircles of quadrilaterals \\(APOS, BQOP, CROQ,

In [26]:
train_embeddings = ds32["train"]["embeddings"]

In [17]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from typing import Union, Tuple


In [42]:
from transformers import T5Config
class CustomConfig(T5Config):
    def __init__(
        self,
        d_model=768,      # Hidden size 768   16384
        num_heads=8,       # Number of attention heads
        d_kv = 96,
        d_ff= 2048,         # Feedforward network size
        num_encoder_layers=6,  # Number of encoder layers
        num_decoder_layers=6,  # Number of decoder layers
        dropout_rate=0.1,      # Dropout rate
        attention_dropout_rate=0.1,  # Attention dropout rate
        layer_norm_eps=1e-6,   # Layer norm epsilon for stability
        vocab_size=30522,     # Vocabulary size
        bos_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.pad_token_id,
        max_length=128,

    ):
        super().__init__(vocab_size=vocab_size,
                        d_model=d_model,
                        d_kv=d_kv,
                        d_ff=d_ff,
                        num_heads=num_heads,
                        num_encoder_layers=num_encoder_layers,
                        dropout_rate=dropout_rate,
                        layer_norm_eps=layer_norm_eps,
                        bos_token_id=bos_token_id,
                        eos_token_id=eos_token_id,
                        max_length=max_length
                  )
        self.num_encoder_layers = num_encoder_layers
        self.attention_dropout_rate = attention_dropout_rate
        self.to_json_string = True
        self.decoder_start_token_id = tokenizer.pad_token_id


        assert d_model % num_heads == 0, "d_model mfust be divisible by num_heads"


In [19]:
# train_embedding = torch.tensor(dataset['train'][1]["embeddings"])
# train_embedding = train_embedding.unsqueeze(0)

# print(train_embedding)
# print(train_embedding.shape)
# # need to convert embeddings into tensor-> add nn.embedding as the first layer

In [20]:
class FFNLayer(nn.Module):
    # T5LayerFF
    def __init__(self, config):
        super().__init__()
        self.linear1 = nn.Linear(config.d_model, config.d_ff)  # First projection
        self.activation = nn.GELU()  # Non-linear activation
        self.linear2 = nn.Linear(config.d_ff, config.d_model)  # Second projection
        self.dropout = nn.Dropout(config.dropout_rate)  # Dropout regularization
        self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)  # Layer normalization

    def forward(self, hidden_states):

        #initializing weights in the feed-forward layers to improve stability during training.
        nn.init.xavier_uniform_(self.linear1.weight)
        nn.init.xavier_uniform_(self.linear2.weight)

         # Input goes through the FFN
        forwarded_states = self.linear1(hidden_states)
        forwarded_states = self.activation(forwarded_states)
        forwarded_states = self.dropout(forwarded_states)
        forwarded_states = self.linear2(forwarded_states)

        # Add residual connection and layer norm
        hidden_states = self.layer_norm(hidden_states + forwarded_states)
        return hidden_states


In [21]:
class AttentionLayer(nn.Module):
    # T5Attention
    def __init__(self, config):
        super().__init__()
        self.num_heads = config.num_heads  # Number of attention heads
        self.d_model = config.d_model
        self.dropout = config.attention_dropout_rate

        self.self_attention = nn.MultiheadAttention(
            embed_dim=self.d_model,
            num_heads=self.num_heads,
            dropout=self.dropout
        )

        # skipping Relative positional bias

        self.layer_norm = nn.LayerNorm(self.d_model, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states, attention_mask=None, layer_head_mask=None, output_attentions=False):
        """
        Forward pass for the attention layer.

        Args:
            hidden_states (torch.FloatTensor): Input tensor of shape [seq_len, batch_size, d_model].
            attention_mask (torch.FloatTensor, optional): Mask for attention mechanism of shape [batch_size, seq_len].
            layer_head_mask (torch.FloatTensor, optional): Mask for specific attention heads.
            output_attentions (bool, optional): Whether to return attention scores.

        Returns:
            torch.FloatTensor: Updated hidden states after attention.
            Optional[torch.FloatTensor]: Attention weights if `output_attentions=True`.
        """

        # Transpose hidden_states to [seq_len, batch_size, d_model] for nn.MultiheadAttention
        hidden_states = hidden_states.transpose(0, 1)  # [seq_len, batch_size, d_model]

        # Apply multi-head self-attention
        attention_output, attention_weights = self.self_attention(
            query=hidden_states,
            key=hidden_states,
            value=hidden_states,
            attn_mask=attention_mask
        )


        # Residual connection and layer normalization
        hidden_states = self.layer_norm(hidden_states + self.dropout(attention_output))

        # Transpose back to [batch_size, seq_len, d_model]
        hidden_states = hidden_states.transpose(0, 1)


        if output_attentions:
            return hidden_states, attention_weights
        return hidden_states

In [22]:
class EncoderBlock(nn.Module):
   # a single Transformer encoder layer
   # similar to T5 Block
    def __init__(self, config):
        super(EncoderBlock, self).__init__()
        self.layer = nn.ModuleList()
        self.layer.append(AttentionLayer(config))
        self.layer.append(FFNLayer(config))


    def forward(
        self,
        hidden_states,
        attention_mask=None,
        # position_bias=None,
        layer_head_mask=None,
        #cross_attn_layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        return_dict=True,
        cache_position=None,
    ):
        """
        Forward pass for the encoder block.
        Args:
            hidden_states: Tensor of shape [?].
            attention_mask: Mask to prevent attention to certain positions.
            layer_head_mask: Mask for specific attention heads.
            use_cache: Whether to use caching for inference.
            output_attentions: Whether to return attention outputs.
            cache_position: Cache tracking the current position for inference.
        Returns:
            hidden_states: Updated hidden states after attention and FFN.
            outputs: A tuple containing additional optional outputs.
        """

        self_attention_outputs = self.layer[0](
            hidden_states,
            attention_mask=attention_mask,
           # layer_head_mask=layer_head_mask,
            #use_cache=use_cache,
            #output_attentions=output_attentions,
           # cache_position=cache_position,
        )
        if output_attentions:
            attention_outputs = self_attention_outputs[1:]
            hidden_states = self_attention_outputs[0]
        else:
            hidden_states = self_attention_outputs

        hidden_states = self.layer[1](hidden_states)

        # Clamp to handle FP16 training again
        if hidden_states.dtype == torch.float16:
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        # Prepare outputs
        #outputs = (hidden_states,) + attention_outputs

        return hidden_states

# Testing
# # Initialize EncoderBlock
# config = CustomConfig(d_model=768, num_heads=8, d_ff=2048)
# encoder_block = EncoderBlock(config)

# # Pass through EncoderBlock
# block_output = encoder_block(hidden_states=train_embedding)
# hidden_states= block_output  # Extract the hidden states
# print(f"Output Shape from EncoderBlock: {hidden_states.shape}")  # Expected: [1, 1, 768]
# print(f"Output from EncoderBlock: {hidden_states}")


In [23]:
class TransformerEncoder(nn.Module):
  # stacks multiple encoder layers
    def __init__(self, config):
        super(TransformerEncoder, self).__init__()
        #self.embed_tokens = embed_tokens

        self.block = nn.ModuleList(
            [EncoderBlock(config) for i in range(config.num_encoder_layers)]
        )
        self.final_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.dropout_rate)

        # Initialize weights and apply final processing
        # self.post_init()
        # # Model parallel
        # self.model_parallel = False
        # self.device_map = None
        # self.gradient_checkpointing = False

    def forward(
          self,
          hidden_states,
          attention_mask=None,
          head_mask=None,
          output_attentions=False,
          output_hidden_states=False,
          return_dict=True,
          ):
          """
          Forward pass for the TransformerEncoder.

          Args:
              hidden_states (Tensor): Input embeddings of shape [batch_size, seq_len, d_model].
              attention_mask (Tensor, optional): Mask to prevent attention to certain positions (e.g., padding).
              head_mask (Tensor, optional): Mask for specific attention heads.
              output_attentions (bool, optional): Whether to return attention weights.
              output_hidden_states (bool, optional): Whether to return intermediate hidden states.
              return_dict (bool, optional): Whether to return outputs as a dictionary.

          Returns:
              dict or tuple: Updated hidden states and optional outputs.
          """

          # Prepare outputs if needed
          all_hidden_states = () if output_hidden_states else None
          all_attentions = () if output_attentions else None

          # Apply dropout to input hidden states
          hidden_states = self.dropout(hidden_states)

          # Pass through each layer in the encoder block
          for i, layer_module in enumerate(self.block):
              if output_hidden_states:
                  all_hidden_states = all_hidden_states + (hidden_states,)

              # Forward pass through the encoder block
              # print(hidden_states.shape)
              layer_outputs = layer_module(
                  hidden_states,
                  attention_mask=attention_mask,
                  layer_head_mask=head_mask[i] if head_mask is not None else None,
                  output_attentions=output_attentions,
              )

              # Collect attention weights if requested
              if output_attentions:
                  hidden_states = layer_outputs[0]
                  all_attentions = all_attentions + (layer_outputs[1],)
              else:
                  hidden_states = layer_outputs

          # Apply final layer normalization
          hidden_states = self.final_layer_norm(hidden_states)

          # Apply final dropout
          hidden_states = self.dropout(hidden_states)

          # Add the final hidden state to outputs if requested
          if output_hidden_states:
              all_hidden_states = all_hidden_states + (hidden_states,)

          # Prepare return values
          if not return_dict:
              return tuple(
                  v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None
              )

          return {
              "last_hidden_state": hidden_states,
              "hidden_states": all_hidden_states,
              "attentions": all_attentions,
          }

# Testing
# # Initialize TransformerEncoder
# config = CustomConfig(d_model=768, num_heads=8, d_ff=2048, num_encoder_layers=6)
# transformer_encoder = TransformerEncoder(config)
# attention_mask = torch.ones(batch_embeddings.shape[1], batch_embeddings.shape[1])

# # Pass through TransformerEncoder
# encoder_output = transformer_encoder(hidden_states=train_embedding, attention_mask=attention_mask)

# # Print the last hidden state
# print(f"Output Shape from TransformerEncoder: {encoder_output['last_hidden_state'].shape}")
# print(f"Last Hidden State: {encoder_output['last_hidden_state']}")



In [24]:
class EncoderModel(nn.Module):
  # wraps the transformer encoder
  # similar to T5EncoderModel


    def __init__(self, config):
      super(EncoderModel, self).__init__()
      # Embedding layer (token + positional)
      # self.token_embedding = nn.Embedding(config., hidden_size)
      # self.position_embedding = nn.Embedding(max_seq_len, hidden_size)
      # self.dropout = nn.Dropout(dropout)
      self.encoder = TransformerEncoder(config)
      # self.hidden_size = config.h


    def forward(
        self,
        hidden_states: torch.FloatTensor,  # Precomputed embeddings
        attention_mask = None,
        output_attentions = False,
        output_hidden_states = False,
        return_dict = True,
    ) -> Union[Tuple[torch.FloatTensor], dict]:
      """
      Forward pass for the EncoderModel.

      Args:
          hidden_states (torch.FloatTensor): Precomputed embeddings of shape [batch_size, seq_len, d_model].
          attention_mask (Optional[torch.FloatTensor]): Mask of shape [batch_size, seq_len] to prevent attention to padding tokens.
          output_attentions (Optional[bool]): Whether to return attention weights.
          output_hidden_states (Optional[bool]): Whether to return hidden states of all layers.
          return_dict (Optional[bool]): Whether to return outputs as a dictionary.

      Returns:
          Union[Tuple, dict]: Final hidden states and optional outputs (attention weights, hidden states).
      """

      # Forward pass through the transformer encoder
      # print(hidden_states.shape) # torch.Size([batch_size, 128, 768])
      encoder_outputs = self.encoder(
          hidden_states=hidden_states,
          attention_mask=attention_mask,
          output_attentions=output_attentions,
          output_hidden_states=output_hidden_states,
          return_dict=return_dict,
      )

      return encoder_outputs


In [27]:
# Pass the embedding through the encoder
config = CustomConfig()
encoder_model = EncoderModel(config)
output = encoder_model(train_embeddings)
print(output)

96
{'last_hidden_state': tensor([[[-1.0860,  1.4327, -0.0000,  ..., -0.4710, -0.4203, -0.4113],
         [-1.0895,  2.2814, -0.3556,  ..., -2.1833, -0.0294, -0.6009],
         [ 0.0000,  1.4433, -1.2642,  ..., -1.8588,  1.0239, -0.9441],
         ...,
         [ 0.3328,  1.0296, -1.4296,  ..., -0.0000,  0.7389,  0.6875],
         [ 0.8180,  1.6125, -0.8876,  ..., -1.5931,  0.7877, -0.4340],
         [ 0.2044,  1.1558, -0.9848,  ..., -1.4568,  0.0000,  1.0560]],

        [[ 0.1051,  0.6714, -0.5032,  ..., -0.6460,  1.2869, -0.0142],
         [-0.1512,  0.5413,  0.0000,  ..., -0.1549, -0.5124, -1.1205],
         [-0.0406,  0.0000, -2.4281,  ..., -0.5030,  0.3428, -0.2800],
         ...,
         [-1.9587,  1.5479, -0.0000,  ..., -0.6264,  0.0000, -0.9901],
         [ 0.9937,  1.7372,  0.4466,  ..., -0.8230,  1.3308, -1.5645],
         [-0.4850,  1.6359,  0.7477,  ..., -0.3225,  1.0363, -1.7974]],

        [[ 1.4187,  1.3657,  0.8535,  ..., -0.0000,  0.9871, -0.9897],
         [ 2.0190,  

In [28]:
encoder_last_hidden_states = output['last_hidden_state']
print(encoder_last_hidden_states.shape)

torch.Size([32, 128, 768])


# Decoder



In [29]:
class CrossAttentionLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_heads = config.num_heads
        self.d_model = config.d_model
        self.dropout = config.attention_dropout_rate

        self.cross_attention = nn.MultiheadAttention(
            embed_dim=self.d_model,
            num_heads=self.num_heads,
            dropout=self.dropout
        )

        self.layer_norm = nn.LayerNorm(self.d_model, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states, encoder_hidden_states, attention_mask=None):
        # Transpose from [batch_size, seq_len, hidden_dim] to [seq_len, batch_size, hidden_dim]
        hidden_states = hidden_states.transpose(0, 1)
        encoder_hidden_states = encoder_hidden_states.transpose(0, 1)

        # Adjust attention mask if provided
        if attention_mask is not None:
            # Convert attention mask to the correct shape and dtype
            attention_mask = attention_mask.to(dtype=hidden_states.dtype)

            # Ensure the mask has the correct shape (tgt_len, src_len)
            if attention_mask.dim() == 2:
                attention_mask = attention_mask.transpose(0, 1)

            # Convert mask to additive attention mask
            attention_mask = attention_mask.masked_fill(attention_mask == 0, float('-inf'))
            attention_mask = attention_mask.masked_fill(attention_mask == 1, float(0.0))

        attention_output, attention_weights = self.cross_attention(
            query=hidden_states,
            key=encoder_hidden_states,
            value=encoder_hidden_states,
            attn_mask=attention_mask
        )

        hidden_states = self.layer_norm(hidden_states + self.dropout(attention_output))
        # Transpose back to [batch_size, seq_len, hidden_dim]
        hidden_states = hidden_states.transpose(0, 1)

        return hidden_states

class DecoderBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.self_attention = AttentionLayer(config)
        self.cross_attention = CrossAttentionLayer(config)
        self.ffn = FFNLayer(config)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states,
        attention_mask=None,
        encoder_attention_mask=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        # Self Attention
        self_attention_outputs = self.self_attention(
            hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions
        )
        hidden_states = self_attention_outputs if not output_attentions else self_attention_outputs[0]

        # Cross Attention
        cross_attention_outputs = self.cross_attention(
            hidden_states,
            encoder_hidden_states,
            attention_mask=encoder_attention_mask
        )
        hidden_states = cross_attention_outputs

        # Feed Forward
        hidden_states = self.ffn(hidden_states)

        return hidden_states

In [30]:
class TransformerDecoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.block = nn.ModuleList(
            [DecoderBlock(config) for _ in range(config.num_decoder_layers)]
        )
        self.final_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.output_projection = nn.Linear(config.d_model, config.vocab_size, bias=False)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states,
        attention_mask=None,
        encoder_attention_mask=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attns = () if output_attentions else None

        hidden_states = self.dropout(hidden_states)

        for i, layer_module in enumerate(self.block):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            cross_attn_layer_head_mask = (
                cross_attn_head_mask[i] if cross_attn_head_mask is not None else None
            )

            hidden_states = layer_module(
                hidden_states,
                encoder_hidden_states,
                attention_mask=attention_mask,
                encoder_attention_mask=encoder_attention_mask,
                layer_head_mask=layer_head_mask,
                cross_attn_layer_head_mask=cross_attn_layer_head_mask,
                output_attentions=output_attentions,
            )

        # Final layer norm
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        # Project to vocabulary
        logits = self.output_projection(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [
                logits,
                all_hidden_states,
                all_self_attns,
                all_cross_attns
            ] if v is not None)

        return {
            "logits": logits,
            "hidden_states": all_hidden_states,
            "self_attentions": all_self_attns,
            "cross_attentions": all_cross_attns,
        }


In [43]:
from transformers.models.t5 import T5ForQuestionAnswering

class T5Model(T5ForQuestionAnswering):
  def __init__(self, config: CustomConfig):
    super().__init__(config)
    self.encoder = EncoderModel(config)
    self.decoder = TransformerDecoder(config)
    self.config = config

# Model

In [37]:
from transformers.modeling_outputs import Seq2SeqQuestionAnsweringModelOutput
from torch.nn import CrossEntropyLoss
class T5Model(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.encoder = TransformerEncoder(config)
        self.decoder = TransformerDecoder(config)
        self.config = config


    def loss_func(self, inputs, logits):
        # Shift so that tokens < n predict n
        shift_labels = inputs[..., 1:].contiguous()
        shift_logits = logits[..., :-1, :].contiguous()
        # Calculate per-token loss
        loss_fct = CrossEntropyLoss(reduce=False)
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        # Resize and average loss per sample
        loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)
        return loss_per_sample.mean()

    def forward(
        self,
        encoder_input,
        decoder_input,
        attention_mask=None,
        decoder_attention_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        # Encode
        encoder_outputs = self.encoder(
            hidden_states=encoder_input,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Decode
        decoder_outputs = self.decoder(
            hidden_states=decoder_input,
            encoder_hidden_states=encoder_outputs["last_hidden_state"],
            attention_mask=decoder_attention_mask,
            encoder_attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return (decoder_outputs["logits"],)

        logits = decoder_outputs['logits']
        # print(logits.shape)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        loss = self.loss_func(decoder_input, logits)
        outputs = Seq2SeqQuestionAnsweringModelOutput(
            start_logits=start_logits,
            end_logits=end_logits,
        )

        return outputs

        # return {
        #     "logits": decoder_outputs["logits"],
        #     "encoder_hidden_states": encoder_outputs["hidden_states"],
        #     "decoder_hidden_states": decoder_outputs["hidden_states"],
        #     "encoder_attentions": encoder_outputs["attentions"],
        #     "decoder_attentions": decoder_outputs["self_attentions"],
        #     "cross_attentions": decoder_outputs["cross_attentions"],
        # }

In [32]:
config = CustomConfig(d_model=768, num_heads=8, d_ff=2048,
                         num_encoder_layers=6, num_decoder_layers=6)

    # Initialize decoder
decoder = TransformerDecoder(config)

# Create sample inputs
decoder_input = batch_embeddings
encoder_hidden_states = batch_embeddings  # Simulating encoder output

# Create attention masks
attention_mask = torch.ones(batch_embeddings.size(1), batch_embeddings.size(1))
encoder_attention_mask = torch.ones(batch_embeddings.size(1), batch_embeddings.size(1))

# Forward pass
decoder_output = decoder(
    hidden_states=train_embeddings,
    encoder_hidden_states=encoder_last_hidden_states,
    attention_mask=attention_mask,
    encoder_attention_mask=encoder_attention_mask,
)


96


In [33]:
from torch.nn import CrossEntropyLoss
import torch

def loss_function(inputs, logits):
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Calculate per-token loss
    loss_fct = CrossEntropyLoss(reduce=False)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)
    return loss_per_sample.mean()

In [44]:
t5_model = T5Model(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"T5 size: {model_size/1000**2:.1f}M parameters")

T5 size: 109.5M parameters


In [45]:
t5_model(train_embeddings, train_embeddings)

TypeError: EncoderModel.forward() got an unexpected keyword argument 'input_ids'

# **Set Up Training**

In [None]:
# Evaluation Metrics

def get_grouped_params(model, no_decay=['bias', 'LayerNorm.weight'], weight_decay=0.01):
  params_with_wd, params_without_wd = [], []
  for n, p in model.named_parameters():
      if any(nd in n for nd in no_decay):
          params_without_wd.append(p)
      else:
          params_with_wd.append(p)
  return [
      {"params": params_with_wd, "weight_decay": weight_decay},
      {"params": params_without_wd, "weight_decay": 0.0},
  ]

def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])
        losses.append(accelerator.gather(outputs.loss))
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = float("inf")

    print(f"loss: {loss.item()}, perplexity: {perplexity.item()}")

    return loss.item(), perplexity.item()

In [None]:
# DataLoader
from torch.utils.data.dataloader import DataLoader

ds32.set_format('torch')
train_dataloader = DataLoader(ds32["train"], batch_size=32, shuffle=True)
eval_dataloader = DataLoader(ds32["validation"], batch_size=32)

In [None]:
# Optimizer
from torch.optim import AdamW

optimizer = AdamW(get_grouped_params(t5_model), lr=5e-4)

In [None]:
# Distributed training setup
from accelerate import Accelerator

accelerator = Accelerator(mixed_precision='fp16')

t5_model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader)

In [None]:
from transformers import get_scheduler

num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=500,
    num_training_steps=num_training_steps,
)


In [None]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
out = data_collator([batch_embeddings])
for key in out:
    print(f"{key} shape: {out[key].shape}")

In [None]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "mathbert_t5"
repo_name = get_full_repo_name(model_name)
output_dir = model_name
repo = Repository(output_dir, clone_from=repo_name)

In [None]:
from tqdm.notebook import tqdm

gradient_accumulation_steps = 8
eval_steps = 5000

model.train()
completed_steps = 0
for epoch in range(num_train_epochs):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1), total=num_training_steps
    ):
        logits = model(batch["embeddings"]).logits
        loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
        if step % 100 == 0:
            accelerator.print(
                {
                    "lr": get_lr(),
                    "samples": step * samples_per_step,
                    "steps": completed_steps,
                    "loss/train": loss.item() * gradient_accumulation_steps,
                }
            )
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
            model.train()
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress step {step}", blocking=False
                )