In [1]:
import os
import sys
from typing import Optional, cast

import torch
import transformers
from composer import Trainer, algorithms, Evaluator
from composer.callbacks import LRMonitor, MemoryMonitor, OptimizerMonitor, RuntimeEstimator, SpeedMonitor
from composer.core.types import Dataset
from composer.loggers import WandBLogger
from composer.optim import DecoupledAdamW
from composer.optim.scheduler import (
    ConstantWithWarmupScheduler,
    CosineAnnealingWithWarmupScheduler,
    LinearWithWarmupScheduler,
)
from composer.utils import dist, reproducibility
from omegaconf import DictConfig
from omegaconf import OmegaConf as om
from torch.utils.data import DataLoader

import src.evals.data as data_module
import src.flex_bert as flex_bert_module
import src.hf_bert as hf_bert_module
import src.mosaic_bert as mosaic_bert_module
from src.scheduler import WarmupStableDecayScheduler

  from .autonotebook import tqdm as notebook_tqdm
  @custom_fwd
  @custom_bwd


In [2]:
def build_my_dataloader(cfg: DictConfig, device_batch_size: int) -> DataLoader:
    """Create a dataloader for classification.

    **Modify this function to train on your own dataset!**

    This function is provided as a starter code to simplify fine-tuning a BERT
    classifier on your dataset. We'll use the dataset for QNLI (one of the
    GLUE tasks) as a demonstration.

    Args:
        cfg (DictConfig): An omegaconf config that houses all the configuration
            variables needed to instruct dataset/dataloader creation.
            Required keys: ``split``, ``tokenizer_name``, ``max_seq_len``,
            ``drop_last``, ``shuffle`` and ``num_workers``. Optional keys
            (with the defaults used here): ``pin_memory`` (True),
            ``prefetch_factor`` (2), ``persistent_workers`` (True) and
            ``timeout`` (0).
        device_batch_size (int): The size of the batches that the dataloader
            should produce.

    Returns:
        dataloader: A dataloader set up for use of the Composer Trainer.
    """
    # As a demonstration, we're using the QNLI dataset from the GLUE suite
    # of tasks.
    #
    # Note: We create our dataset using the `create_glue_dataset` utility of
    #   the `src.evals.data` module (imported as `data_module`). If you
    #   inspect that code, you'll see that we're taking some extra steps so
    #   that our dataset yields examples that follow a particular format. In
    #   particular, the raw text is tokenized and some of the data columns are
    #   removed. The result is that each example is a dictionary with the
    #   following:
    #
    #     - 'input_ids': the tokenized raw text
    #     - 'label': the target class that the text belongs to
    #     - 'attention_mask': a list of 1s and 0s to indicate padding
    #
    # When you set up your own dataset, it should handle tokenization to yield
    # examples with a similar structure!
    #
    # REPLACE THIS WITH YOUR OWN DATASET:
    dataset = data_module.create_glue_dataset(
        task="qnli",
        split=cfg.split,
        tokenizer_name=cfg.tokenizer_name,
        max_seq_length=cfg.max_seq_len,
    )
    dataset = cast(Dataset, dataset)

    num_workers = cfg.num_workers

    # `prefetch_factor` and `persistent_workers` only make sense when worker
    # processes exist; DataLoader raises a ValueError if they are supplied
    # together with `num_workers=0`. Forward them only in the multi-worker
    # case so that single-process loading (handy for debugging) works too.
    worker_kwargs = {}
    if num_workers > 0:
        worker_kwargs["prefetch_factor"] = cfg.get("prefetch_factor", 2)
        worker_kwargs["persistent_workers"] = cfg.get("persistent_workers", True)

    dataloader = DataLoader(
        dataset,
        # As an alternative to formatting the examples inside the dataloader,
        # you can write a custom data collator to do that instead.
        collate_fn=transformers.default_data_collator,
        batch_size=device_batch_size,
        # The sampler (not the DataLoader) owns shuffling / drop_last.
        sampler=dist.get_sampler(dataset, drop_last=cfg.drop_last, shuffle=cfg.shuffle),
        num_workers=num_workers,
        pin_memory=cfg.get("pin_memory", True),
        timeout=cfg.get("timeout", 0),
        **worker_kwargs,
    )

    return dataloader

yaml_path = "yamls/test/sequence_classification.yaml"

# Inside a Jupyter kernel, sys.argv carries the kernel's own launch arguments
# (e.g. `-f <connection-file>.json`) rather than user overrides. Feeding those
# to `om.from_cli` would inject meaningless keys into the config, so keep only
# `key=value` style tokens that are not option flags.
args_list = [arg for arg in sys.argv[2:] if "=" in arg and not arg.startswith("-")]

# Precedence (lowest to highest): project defaults < task YAML < CLI overrides.
with open("yamls/defaults.yaml") as f:
    default_cfg = om.load(f)
with open(yaml_path) as f:
    yaml_cfg = om.load(f)
cli_cfg = om.from_cli(args_list)
cfg = om.merge(default_cfg, yaml_cfg, cli_cfg)
cfg = cast(DictConfig, cfg)  # for type checking

# The global batch is split evenly across the participating processes.
train_loader = build_my_dataloader(
    cfg.train_loader,
    cfg.global_train_batch_size // dist.get_world_size(),
)

# Get one data sample (a single collated batch) from the train_loader
data = next(iter(train_loader))
# Print data keys only
print(data.keys())
print(data)



Example from glue dataset:
{'question': 'When did the third Digimon series begin?', 'sentence': 'Unlike the two seasons before it and most of the seasons that followed, Digimon Tamers takes a darker and more realistic approach to its story featuring Digimon who do not reincarnate after their deaths and more complex character development in the original Japanese.', 'label': 1, 'idx': 0}
{'question': 'Which missile batteries often have individual launchers several kilometres from one another?', 'sentence': 'When MANPADS is operated by specialists, batteries may have several dozen teams deploying separately in small sections; self-propelled air defence guns may deploy in pairs.', 'label': 1, 'idx': 1}
{'question': "What two things does Popper argue Tarski's theory involves in an evaluation of truth?", 'sentence': 'He bases this interpretation on the fact that examples such as the one described above refer to two things: assertions and the facts to which they refer.', 'label': 0, 'idx': 2}

In [7]:
import transformers
from generate_dataset import generate_synthetic_dataset

# Configuration parameters
decimal_token = "<DEC>"  # replaces "." in numbers so the tokenizer sees one dedicated token
# When True, every sentence is a fixed, label-dependent number (a trivially
# separable sanity-check dataset). When False, the sentence is built from the
# generated feature columns instead.
use_dummy_sentences = True

# NOTE(review): this rebinds `cfg`, replacing the training config loaded in an
# earlier cell. With an empty dict every `.get` below falls back to its default.
cfg = {}

# Generate the synthetic dataset
df = generate_synthetic_dataset(
    n_samples=cfg.get("n_samples", 100),
    n_continuous_features=cfg.get("n_continuous_features", 15),
    n_discrete_features=cfg.get("n_discrete_features", 15),
    n_classes=cfg.get("n_classes", 2),
    class_distribution=cfg.get("class_distribution", [0.8, 0.2]),
    n_bins=cfg.get("n_bins", 10),
    n_redundant=cfg.get("n_redundant", 5),
    n_noisy=cfg.get("n_noisy", 20),
    class_sep=cfg.get("class_sep", 0.1),
)

# Change structure to "sentence", "label" and "idx"
if use_dummy_sentences:
    # Create dummy sentence based on label: if 1 then "4.23245", if 0 then "5.7655"
    df['sentence'] = df['label'].apply(lambda x: f"4{decimal_token}23245" if x == 1 else f"5{decimal_token}7655")
else:
    # All columns except the label are features; they are concatenated
    # (space-separated, with "." swapped for the decimal token) to form a sentence
    df['sentence'] = df.drop(columns=['label']).apply(lambda x: ' '.join([f"{val}".replace('.', decimal_token) for val in x]), axis=1)

# Reorder columns and add index. The explicit copy makes `df` an independent
# frame, so the column assignment below does not write into a slice of the
# original (which triggers pandas' SettingWithCopyWarning).
df = df[['sentence', 'label']].copy()
df['idx'] = df.index

# Tokenize the dataset
tokenizer = transformers.AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")

# Register the decimal marker as an additional token.
# NOTE(review): any model used with this tokenizer presumably needs its
# embedding matrix resized to cover the new id - confirm before training.
tokenizer.add_tokens([decimal_token])

# Make sure a padding token exists, since batching below uses padding=True.
if tokenizer.pad_token is None:
    if tokenizer.eos_token is not None:
        # GPT-2 style tokenizers: reuse the EOS token for padding.
        tokenizer.pad_token = tokenizer.eos_token
    else:
        # No EOS token to fall back on: register a dedicated padding token.
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize sentences
tokenized_dataset = tokenizer(
    df['sentence'].tolist(),  # Ensure this is a list of strings
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt'
)

# Print input sentence and tokenization results
print("\nTokenizer Debug Info:")
print("-" * 50)
# Print first 3 examples
for i in range(min(3, len(df))):
    print(f"\nExample {i+1}:")
    print(f"Input sentence: {df['sentence'].iloc[i]}")
    print(f"Label: {df['label'].iloc[i]}")

    # Get tokenized ids for this example
    tokens = tokenizer.encode(df['sentence'].iloc[i])
    print(f"Token IDs: {tokens}")

    # Decode back to string to verify tokenization
    decoded = tokenizer.decode(tokens)
    print(f"Decoded text: {decoded}")

    # Print individual tokens
    tokens_list = tokenizer.convert_ids_to_tokens(tokens)
    print(f"Individual tokens: {tokens_list}")
print("-" * 50)

# Create a PyTorch dataset
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer output with per-example labels.

    Args:
        encodings: Mapping from field name to an indexable container holding
            one entry per example (anything exposing ``.items()``).
        labels: Indexable sequence with one label per example; its length
            defines the length of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of its encoding fields plus ``'labels'``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        # as_tensor accepts plain numbers as well as labels that are already
        # tensors (torch.tensor() would warn about copy-constructing those).
        item['labels'] = torch.as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

# Pull the labels out as a plain Python list and wrap them, together with the
# tokenizer output, in the dataset class defined above.
labels = df['label'].to_list()
custom_dataset = CustomDataset(encodings=tokenized_dataset, labels=labels)


Tokenizer Debug Info:
--------------------------------------------------

Example 1:
Input sentence: 5<DEC>7655
Label: 0
Token IDs: [101, 1019, 30522, 6146, 24087, 102]
Decoded text: [CLS] 5 <DEC> 7655 [SEP]
Individual tokens: ['[CLS]', '5', '<DEC>', '76', '##55', '[SEP]']

Example 2:
Input sentence: 4<DEC>23245
Label: 1
Token IDs: [101, 1018, 30522, 20666, 19961, 102]
Decoded text: [CLS] 4 <DEC> 23245 [SEP]
Individual tokens: ['[CLS]', '4', '<DEC>', '232', '##45', '[SEP]']

Example 3:
Input sentence: 5<DEC>7655
Label: 0
Token IDs: [101, 1019, 30522, 6146, 24087, 102]
Decoded text: [CLS] 5 <DEC> 7655 [SEP]
Individual tokens: ['[CLS]', '5', '<DEC>', '76', '##55', '[SEP]']
--------------------------------------------------




In [8]:
import torch
import torch.nn as nn

class NumericalEmbedding(nn.Module):
    """Embed real numbers through their decimal digit strings.

    Each number is formatted with 10 fractional digits, the decimal point is
    removed, and the resulting character string is truncated or right-padded
    with ``'0'`` to exactly ``max_digits`` characters. Every character is
    looked up in a value embedding, a learned embedding of its position is
    added, and the per-character vectors are summed into one vector.

    NOTE(review): because the per-character vectors are summed, the positional
    term contributes the same constant to every number, so the output depends
    only on which characters occur, not on their order (e.g. 12.0 and 21.0
    map to the same vector). Revisit the pooling if order/magnitude matters.

    Args:
        max_digits: Number of characters kept per number.
        embedding_dim: Size of the returned embedding vectors.
    """

    def __init__(self, max_digits=10, embedding_dim=16):
        super().__init__()
        self.max_digits = max_digits
        self.embedding_dim = embedding_dim

        # Value table: indices 0-9 are the digits; index 10 is shared by every
        # non-digit character. Since '.' is stripped in forward(), that is in
        # practice the minus sign of negative inputs.
        self.value_embedding = nn.Embedding(11, embedding_dim)

        # One learned vector per character position.
        self.position_embedding = nn.Embedding(max_digits, embedding_dim)

    def forward(self, numbers):
        """Return a ``(len(numbers), embedding_dim)`` tensor of embeddings.

        Args:
            numbers: Iterable of values that support the ``.10f`` format spec.
        """
        numbers = list(numbers)
        # Build index tensors on the module's device so lookups keep working
        # after the module has been moved with `.to(...)`.
        device = self.value_embedding.weight.device

        # Format, drop the decimal point, then truncate AND right-pad so every
        # row has exactly max_digits characters (otherwise rows are ragged or
        # shorter than the position index whenever max_digits exceeds the
        # formatted length).
        number_strings = [
            f"{num:.10f}".replace('.', '')[:self.max_digits].ljust(self.max_digits, '0')
            for num in numbers
        ]

        # Convert to a (batch, max_digits) tensor of value indices. The
        # explicit reshape keeps the 2-D shape for an empty batch as well.
        digit_values = [[int(char) if char.isdigit() else 10 for char in num_str] for num_str in number_strings]
        digit_values = torch.tensor(digit_values, dtype=torch.long, device=device)
        digit_values = digit_values.reshape(len(numbers), self.max_digits)

        # Position indices 0..max_digits-1, repeated for every number.
        positions = torch.arange(self.max_digits, device=device).unsqueeze(0).expand(len(numbers), -1)

        # Get value and position embeddings
        value_embeds = self.value_embedding(digit_values)
        position_embeds = self.position_embedding(positions)

        # Combine value and position embeddings
        embeddings = value_embeds + position_embeds

        # Sum embeddings along the digit dimension to get a single embedding per number
        embeddings = embeddings.sum(dim=1)

        return embeddings

# Example usage
# Demo: embed the two values used for the dummy sentences and print the
# resulting tensor (one 16-dimensional row per input number).
numbers = [4.23245, 5.7655]
embedding_layer = NumericalEmbedding(embedding_dim=16, max_digits=10)
embeddings = embedding_layer(numbers)
print(embeddings)

tensor([[  3.9211,   3.2069,  -8.7426,  -1.4919,  -8.4208,   2.9638,  -2.1228,
          -0.6522, -17.6901, -11.3840,  -0.4243,  13.0068,   1.2750,  -2.6411,
          -7.6628,  -0.1946],
        [  2.8334,   1.6331,  -8.1017, -12.3891,  -8.4939,   9.4537,  -1.3971,
           3.2787, -12.8249,  -7.0314,  -1.8222,  17.1801,  -0.5544,  -0.8752,
          -1.7109,  -0.7451]], grad_fn=<SumBackward1>)
