# Setup

## Imports

In [2]:
# Import importlib to reload modules and sys and os to add the path for other imports
import importlib
import sys
import os
import torch
from accelerate import Accelerator

# Append the parent directory to the path to import the necessary modules
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Import the utilities and the dataloader
from utils import trainutil, inferutil, setuputil
from classes import GeluAvgEmbed

# Now reload the modules to ensure they are up-to-date
importlib.reload(setuputil)
importlib.reload(trainutil)
importlib.reload(inferutil)
#importlib.reload(GeluAvgEmbed)

# Import the funcs needed from utils
from utils.setuputil import setup_bert_config, display_bert_config
from utils.trainutil import train_model
from utils.inferutil import infer_one, infer_full

# Import the model class
#from classes.GeluAvgEmbed import GeluAvgEmbed

## Configuration

In [3]:
# Define the input config file
setup_config = {
    "model_name": "prajjwal1/bert-tiny",
    "device": "cuda:0",
    "threads": 8,
    "seed": 0,
    "data_dir": "../../data/farzan/",
    "data_ds": "manual",
    "rows": 100,
    "cols": 100,
    "tokens": 32
}

# Get the actual to use config file and view
config = setup_bert_config(setup_config)
display_bert_config(config)

# Define local variables as per the variables from config
DEVICE = config['DEVICE']
THREADS = config['THREADS']
train_loader = config['train_loader']
val_loader = config['val_loader']
test_loader = config['test_loader']
model_name = config['model_name']
tokenizer = config['tokenizer']

config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

  
Processing files: 100%|████████████████████████████████████████| 40/40 [00:23<00:00,  1.68it/s]



40(P) = 40(G) + 0(E)


Processing files: 100%|██████████████████████████████████████████| 5/5 [00:00<00:00,  8.84it/s]



5(P) = 5(G) + 0(E)


Processing files: 100%|██████████████████████████████████████████| 5/5 [00:00<00:00, 11.72it/s]



5(P) = 5(G) + 0(E)

Final BERT configuration:
{
  "model_name": "prajjwal1/bert-tiny",
  "data_dir": "../../data/farzan/",
  "DEVICE": "cuda:0",
  "THREADS": 8,
  "data_ds": "manual",
  "train_dir": "../../data/farzan/manual_train",
  "val_dir": "../../data/farzan/manual_val",
  "test_dir": "../../data/farzan/manual_test",
  "rows": 100,
  "cols": 100,
  "tokens": 32,
  "tokenizer": "<ModernBert Tokenizer Object>",
  "train_loader": "<Train BertLoader Object>",
  "val_loader": "<Validation BertLoader Object>",
  "test_loader": "<Test BertLoader Object>"
}


# Model Creation

In [4]:
import torch
import torch.nn as nn
from transformers import AutoModel
from tqdm import tqdm


class TestBERT(nn.Module):
    def __init__(self, model_name="bert-base-cased", dropout_rate=0.05):
        super(TestBERT, self).__init__()

        # 1. Load pretrained BERT
        self.bert = AutoModel.from_pretrained(model_name)

        # 2. Define a dropout
        self.dropout = nn.Dropout(dropout_rate)

        # 3. Non-linear activation (GELU)
        self.gelu = nn.GELU()

        # 4. Final predictor (1-dim output per cell)
        self.classifier = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):

        # 1) Allocate the (batch_size, rows, cols) S_cube
        S_cube = torch.zeros(
            (input_ids.shape[0], input_ids.shape[1], input_ids.shape[2]),
            device=input_ids.device,
        )

        # 2) Loop over cells in row-major order
        for cell in tqdm(
            range(input_ids.shape[1] * input_ids.shape[2]), desc="Forward"
        ):

            # In one shot, store logits → S_cube
            # cell // input_ids.shape[2] = row, cell % input_ids.shape[2] = col
            S_cube[
                :, cell // input_ids.shape[2], cell % input_ids.shape[2]
            ] = self.classifier(
                self.gelu(
                    self.dropout(
                        self.bert(
                            input_ids[
                                :,
                                cell // input_ids.shape[2],
                                cell % input_ids.shape[2],
                                :,
                            ],
                            attention_mask=attention_mask[
                                :,
                                cell // input_ids.shape[2],
                                cell % input_ids.shape[2],
                                :,
                            ],
                        ).pooler_output
                    )
                )
            ).view(
                -1
            )

        return S_cube


# class TestBERT(nn.Module):
#     def __init__(self, model_name="bert-base-cased", dropout_rate=0.05):
#         super(TestBERT, self).__init__()

#         # 1. Load pretrained BERT
#         self.bert = AutoModel.from_pretrained(model_name)

#         # 2. Define a dropout
#         self.dropout = nn.Dropout(dropout_rate)

#         # 3. Non-linear activation (GELU)
#         self.gelu = nn.GELU()

#         # 4. Final predictor (1-dim output per cell)
#         self.classifier = nn.Linear(self.bert.config.hidden_size, 1)

#     def forward(self, input_ids, attention_mask):

#         # 1. Print the overall shapes
#         # print("batch_size:", input_ids.shape[0])
#         # print("rows:",      input_ids.shape[1])
#         # print("cols:",      input_ids.shape[2])
#         # print("tokens:",    input_ids.shape[3])

#         # 2. Initialize S_cube => (batch_size, rows, cols)
#         S_cube = torch.zeros(
#             (input_ids.shape[0], input_ids.shape[1], input_ids.shape[2]),
#             device=input_ids.device
#         )

#         # 3. Loop over all cells
#         for cell in tqdm(range(input_ids.shape[1] * input_ids.shape[2]), desc = 'Forward'):

#             r = cell // input_ids.shape[2]
#             c = cell %  input_ids.shape[2]

#             # Extract the slice for current cell (batch_size x tokens)
#             cell_input_ids  = input_ids[:, r, c, :]
#             cell_attn_mask  = attention_mask[:, r, c, :]

#             # Pass them through the BERT model
#             outputs = self.bert(cell_input_ids, attention_mask=cell_attn_mask)

#             # pooler_out => (batch_size, hidden_dim)
#             pooler_out = outputs.pooler_output

#             # Inlined pipeline: dropout -> GELU -> classifier => (batch_size, 1)
#             logits = self.classifier(self.gelu(self.dropout(pooler_out)))

#             # Flatten (batch_size, 1) => (batch_size,)
#             logits_flat = logits.view(-1)

#             # Populate S_cube => shape: (batch_size, rows, cols)
#             S_cube[:, r, c] = logits_flat

#             # If this is the first cell, do some prints and break
#             if r == 0 and c == 0:
#                 print(f"\nFirst cell => row={r}, col={c}")
#                 print(f"cell_input_ids.shape: {cell_input_ids.shape}")
#                 print(f"cell_attn_mask.shape: {cell_attn_mask.shape}")
#                 print(f"logits.shape: {logits.shape}")
#                 print(f"logits_flat.shape: {logits_flat.shape}")
#                 print(f"S_cube[:, {r}, {c}].shape: {S_cube[:, r, c].shape}")

#                 break  # Stop after the first cell

#         # 4. Print the shape of S_cube
#         # print(f"\nS_cube.shape: {S_cube.shape}")

#         # Return S_cube or None, depending on your use case
#         return S_cube

In [4]:
# ---------------------------------------------------
#  Full Notebook Code with ONLY Mixed Precision (fp16), No DeepSpeed
# ---------------------------------------------------

# 1) Standard imports
import importlib
import sys
import os
import torch
from torch.cuda.amp import autocast

# Append the parent directory to the path to import the necessary modules
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Import the utilities and the dataloader
from utils import trainutil, inferutil, setuputil

# Now reload the modules to ensure they are up-to-date
importlib.reload(setuputil)
importlib.reload(trainutil)
importlib.reload(inferutil)

# Import the funcs needed from utils
from utils.setuputil import setup_bert_config, display_bert_config
from utils.trainutil import train_model
from utils.inferutil import infer_one, infer_full

# Define the input config file
setup_config = {
    "model_name": "prajjwal1/bert-tiny",
    "device": "cuda:0",
    "threads": 8,
    "seed": 0,
    "data_dir": "../../data/farzan/",
    "data_ds": "manual",
    "rows": 100,
    "cols": 100,
    "tokens": 32
}

# Get the actual to use config file and view
config = setup_bert_config(setup_config)
display_bert_config(config)

# Define local variables as per the variables from config
DEVICE = config['DEVICE']
THREADS = config['THREADS']
train_loader = config['train_loader']
val_loader = config['val_loader']
test_loader = config['test_loader']
model_name = config['model_name']
tokenizer = config['tokenizer']

import torch
import torch.nn as nn
from transformers import AutoModel
from tqdm import tqdm


class TestBERT(nn.Module):
    def __init__(self, model_name="bert-base-cased", dropout_rate=0.05):
        super(TestBERT, self).__init__()

        # 1. Load pretrained BERT
        self.bert = AutoModel.from_pretrained(model_name)

        # Enable gradient checkpointing if desired
        self.bert.gradient_checkpointing_enable()

        # 2. Define a dropout
        self.dropout = nn.Dropout(dropout_rate)

        # 3. Non-linear activation (GELU)
        self.gelu = nn.GELU()

        # 4. Final predictor (1-dim output per cell)
        self.classifier = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):

        # 1) Allocate the (batch_size, rows, cols) S_cube
        S_cube = torch.zeros(
            (input_ids.shape[0], input_ids.shape[1], input_ids.shape[2]),
            device=input_ids.device,
        )

        # 2) Loop over cells in row-major order
        for cell in tqdm(
            range(input_ids.shape[1] * input_ids.shape[2]), desc="Forward"
        ):

            # cell // input_ids.shape[2] = row, cell % input_ids.shape[2] = col
            S_cube[
                :, cell // input_ids.shape[2], cell % input_ids.shape[2]
            ] = (
                self.classifier(
                    self.gelu(
                        self.dropout(
                            self.bert(
                                input_ids[:, cell // input_ids.shape[2],
                                          cell % input_ids.shape[2], :],
                                attention_mask=attention_mask[:, cell // input_ids.shape[2],
                                                              cell % input_ids.shape[2], :]
                            ).pooler_output
                        )
                    )
                )
                .view(-1)
            )

        return S_cube


# 1) Create model and move to GPU
untrained_model = TestBERT(model_name=model_name).to(DEVICE)

# 2) Single-batch DataLoader
check_loader = torch.utils.data.DataLoader(train_loader, batch_size=1, shuffle=False)
batch = next(iter(check_loader))

ex_xtok = batch["x_tok"].to(DEVICE)
ex_xmask = batch["x_masks"].to(DEVICE)

# 3) FP16 forward pass with torch.cuda.amp
with autocast():
    out = untrained_model.forward(ex_xtok, ex_xmask)

print(out.shape)

Processing files: 100%|████████████████████████████████████████| 40/40 [00:22<00:00,  1.75it/s]



40(P) = 40(G) + 0(E)


Processing files: 100%|██████████████████████████████████████████| 5/5 [00:00<00:00,  9.99it/s]



5(P) = 5(G) + 0(E)


Processing files: 100%|██████████████████████████████████████████| 5/5 [00:00<00:00, 11.98it/s]



5(P) = 5(G) + 0(E)

Final BERT configuration:
{
  "model_name": "prajjwal1/bert-tiny",
  "data_dir": "../../data/farzan/",
  "DEVICE": "cuda:0",
  "THREADS": 8,
  "data_ds": "manual",
  "train_dir": "../../data/farzan/manual_train",
  "val_dir": "../../data/farzan/manual_val",
  "test_dir": "../../data/farzan/manual_test",
  "rows": 100,
  "cols": 100,
  "tokens": 32,
  "tokenizer": "<ModernBert Tokenizer Object>",
  "train_loader": "<Train BertLoader Object>",
  "val_loader": "<Validation BertLoader Object>",
  "test_loader": "<Test BertLoader Object>"
}


RuntimeError: Failed to import transformers.models.bert.modeling_bert because of the following error (look up to see its traceback):
Artifact name: 'trace_shape_events' not registered,please call register_artifact('trace_shape_events') in torch._logging.registrations.

Forward: 100%|███████████████████████████| 10000/10000 [00:24<00:00, 409.54it/s]

torch.Size([1, 100, 100])



