# Inference Optimizations
Note: used google collab for GPU because of time constraint. \
In real pipeline this would be done after training on GPU instance.

## Pre-process Data

In [4]:
# Upload kaggle.json
from google.colab import files
files.upload()

# Move to the correct path
!mkdir -p /content/.kaggle
!cp kaggle.json /content/.kaggle/
!chmod 600 /content/.kaggle/kaggle.json

# Set the environment variable so the API knows where to look
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content/.kaggle"

# Test
!kaggle datasets list

Saving kaggle.json to kaggle (1).json
ref                                                          title                                                      size  lastUpdated                 downloadCount  voteCount  usabilityRating  
-----------------------------------------------------------  --------------------------------------------------  -----------  --------------------------  -------------  ---------  ---------------  
jayaantanaath/student-habits-vs-academic-performance         Student Habits vs Academic Performance                    19512  2025-04-12 10:49:08.663000          23311        401  1.0              
adilshamim8/cost-of-international-education                  Cost of International Education                           18950  2025-05-07 15:41:53.213000           4630         78  1.0              
adilshamim8/social-media-addiction-vs-relationships          Students' Social Media Addiction                           7851  2025-05-10 14:38:02.713000           2137   

In [5]:
import argparse
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import zipfile

# os.environ["KAGGLE_CONFIG_DIR"] = os.path.abspath(".kaggle") # Use local .kaggle directory
from kaggle.api.kaggle_api_extended import KaggleApi

In [6]:
def download_jigsaw(kaggle_dir):
    os.makedirs(kaggle_dir, exist_ok=True)

    api = KaggleApi()
    api.authenticate()

    # Download competition data
    api.competition_download_files(
        "jigsaw-unintended-bias-in-toxicity-classification",
        path=kaggle_dir
    )

    # Unzip
    zip_path = os.path.join(kaggle_dir, "jigsaw-unintended-bias-in-toxicity-classification.zip")
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(kaggle_dir)

    print("Downloaded and extracted Jigsaw dataset.")


def preprocess(kaggle_dir, output_dir, split_ratio=0.2):
    # Create output directory if missing
    os.makedirs(output_dir, exist_ok=True)

    # Ensure input file exists, create parent dir if needed (just in case)
    os.makedirs(kaggle_dir, exist_ok=True)
    input_path = os.path.join(kaggle_dir, "train.csv")
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"train.csv not found in {kaggle_dir}")

    df = pd.read_csv(input_path).dropna(subset=["comment_text"])

    # Keep only the needed columns
    df = df[["comment_text", "target"]]

    # Binarize target (optional: uncomment if needed)
    # df["target"] = (df["target"] >= 0.5).astype(int)

    # Split
    train_df, val_df = train_test_split(df, test_size=split_ratio, random_state=42)

    os.makedirs(output_dir, exist_ok=True)
    train_df.to_csv(os.path.join(output_dir, "train.csv"), index=False)
    val_df.to_csv(os.path.join(output_dir, "val.csv"), index=False)

    print(f"Saved {len(train_df)} training and {len(val_df)} validation samples to {output_dir}")

In [8]:
kaggle_dir = "data/jigsaw/raw/"
output_dir = "data/jigsaw/processed/"
val_split = 0.2

download_jigsaw(kaggle_dir)
preprocess(kaggle_dir, output_dir, val_split)

## Training

In [9]:
import os
import time
import argparse
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [19]:
config = {
    "initial_epochs": 2,
    "total_epochs": 1,
    "patience": 2,
    "batch_size": 128,
    "lr": 2e-5,
    "fine_tune_lr": 1e-5,
    "max_len": 128,
    "dropout_probability": 0.3,
    "model_name": "distilbert-base-uncased"
}

In [11]:
# ---------------------------
# Dataset
# ---------------------------
class JigsawDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.texts = df["comment_text"].tolist()
        self.labels = (df["target"] >= 0.5).astype(int).tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [26]:
# ---------------------------
# Training + Evaluation Functions
# ---------------------------
from tqdm import tqdm

def train_epoch(model, loader, criterion, optimizer, device):
    model.train()
    total_loss, correct, total = 0, 0, 0

    num_batches = int(0.01 * len(loader)) # Doing part of the training because my part is inference and monitoring
    print(f"Training for {num_batches} batches")

    for i, batch in enumerate(tqdm(loader, desc="Training", leave=False)):
        if i >= num_batches:
            break

        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = outputs.logits.argmax(dim=1)
        correct += (preds == batch["labels"]).sum().item()
        total += batch["labels"].size(0)

    avg_loss = total_loss / num_batches
    avg_acc = correct / total
    print(f"Partial Epoch Summary - Avg Loss: {avg_loss:.4f}, Avg Accuracy: {avg_acc:.4f}\n")

    return avg_loss, avg_acc

def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, correct, total = 0, 0, 0

    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss

            total_loss += loss.item()
            preds = outputs.logits.argmax(dim=1)
            correct += (preds == batch["labels"]).sum().item()
            total += batch["labels"].size(0)

    return total_loss / len(loader), correct / total

In [27]:
# ---------------------------
# Main Training Pipeline
# ---------------------------
def main(args):
    # made to run in command line originally
    # parser = argparse.ArgumentParser()
    # parser.add_argument("--data-dir", type=str, required=True, help="Directory with train.csv and val.csv")
    # parser.add_argument("--save-path", type=str, required=True, help="Path to save the trained model")
    # parser.add_argument("--dry-run", action="store_true", help="Run a quick test on a small sample")
    # args = parser.parse_args()

    os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
    device = torch.device("cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu"))
    print(f"Using device: {device}")

    tokenizer = DistilBertTokenizer.from_pretrained(config["model_name"])
    train_df = pd.read_csv(os.path.join(args.data_dir, "train.csv"))
    if args.dry_run:
        train_df = train_df.sample(n=32, random_state=42)
    val_df = pd.read_csv(os.path.join(args.data_dir, "val.csv"))
    if args.dry_run:
        val_df = val_df.sample(n=32, random_state=42)

    train_loader = DataLoader(JigsawDataset(train_df, tokenizer, config["max_len"]),
                              batch_size=config["batch_size"], shuffle=True)
    val_loader = DataLoader(JigsawDataset(val_df, tokenizer, config["max_len"]),
                            batch_size=config["batch_size"])

    model = DistilBertForSequenceClassification.from_pretrained(config["model_name"])
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config["lr"])

    best_val_loss = float("inf")
    patience_counter = 0

    for epoch in range(config["total_epochs"]):
        start = time.time()

        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = evaluate(model, val_loader, criterion, device)

        print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f} Acc={train_acc:.4f} | Val Loss={val_loss:.4f} Acc={val_acc:.4f} | Time={time.time() - start:.2f}s")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), args.save_path)
            patience_counter = 0
            print("  Validation loss improved. Model saved.")
        else:
            patience_counter += 1
            print(f"  No improvement. Patience: {patience_counter}")
            if patience_counter >= config["patience"]:
                print("  Early stopping.")
                break

In [None]:
# simulate arguments
class args:
  data_dir = "data/jigsaw/processed/"
  save_path = "models/model.pth"
  dry_run = False

main(args)

Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training for 564 batches


Training:   0%|          | 23/11281 [00:09<1:17:57,  2.41it/s]

## Inference Optimization

### Measure inference performance of PyTorch model on CPU

In [None]:
!pip install torchinfo

In [None]:
import os
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torchinfo import summary
import time
import numpy as np
import pandas as pd

### Measure inference performance of ONNX model on CPUÂ¶