In [1]:
# Colab Prerequisites
!pip install -q transformers scikit-learn torch
!pip install huggingface_hub[hf_xet]

import os
import random
import numpy as np
import pandas as pd
import torch

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.base import ClassifierMixin
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from typing import Dict, Tuple, List



### DistilBERT Implementation for Text Classification
*The fine-tuned model can be downloaded here: https://drive.google.com/drive/folders/1Onufz7H3MInt2gra_lLhGlUTbKw8nhfv?usp=drive_link*

This analysis was conducted on Google Colab with access to an A100 with 40GB GPU RAM

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
train_df = pd.read_csv('/content/drive/MyDrive/Harvard BPH/Spring 2025/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Harvard BPH/Spring 2025/test.csv')
val_df = pd.read_csv('/content/drive/MyDrive/Harvard BPH/Spring 2025/val.csv')

In [4]:
# 1. GPU / Device setup
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
# If this prints "cpu", go to Runtime → Change runtime type → GPU

Using device: cuda


In [5]:

# 2. Load & label-encode
le = LabelEncoder().fit(train_df["channel"])
train_df["label"] = le.transform(train_df["channel"])
num_labels = len(le.classes_)

In [None]:
# See label encoding
train_df.head()

The initial thought was to train three separate models on different partitions of the data, the half of channels with the most snips and another half with channels that had fewer snips. To overcome this class imbalance, we considered creating a meta-feature, which was deried from the classification assignment of a given validation snip across all three models, and then use an additional classificatio model (such as linear regression), to classify the meta-feature. We did not go on to implement this strategy, instead we evaluated the validation set directly on the fine-tuned BERT model trained with all the training data. However, the strucutre of the code was maintained as if going through with the former idea. 

In [None]:
# 3. Partition classes by count
counts = train_df["label"].value_counts()
HIGH = counts[counts > 400].index
LOW  = counts[counts <= 400].index

splits = {
    "high": train_df[train_df["label"].isin(HIGH)],
    "low":  train_df[train_df["label"].isin(LOW)],
    "all":  train_df
}

In [None]:
splits["high"].head()

In [None]:
splits["low"].head()

In [None]:
splits["all"].head()

In [7]:
# 4. Tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Create a custom dataset class that supports integer indexing
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx.item()]) if isinstance(idx, torch.Tensor) else torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx.item()]) if isinstance(idx, torch.Tensor) else torch.tensor(self.labels[idx])
            return item

    def __len__(self):
        return len(self.labels)

def tokenize_df(df_sub):
    enc = tokenizer(
        df_sub["snip"].tolist(),
        padding="max_length",
        truncation=True,
        max_length=256,  # 256 tokens covers about 1120 characters
        return_tensors="pt"
    )
    # Create a custom Dataset instance
    ds = MyDataset({
        "input_ids": enc.input_ids.cpu().numpy(), # Ensure encodings and labels are on CPU
        "attention_mask": enc.attention_mask.cpu().numpy()
    }, df_sub["label"].values)
    return ds # return the Dataset object directly

# 5. Helper to train a base model on a split
def train_base(name, df_split, save_path=None):
    # 1) Prepare data
    ds = tokenize_df(df_split)   # your existing function, returns a Dataset with .labels

    # 2) Initialize model & push to GPU
    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=num_labels,
    ).to(device)

    # 3) Training args
    args = TrainingArguments(
        output_dir=f"./models/{name}",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        num_train_epochs=3,
        logging_steps=100,
        save_strategy="no",
        seed=42,
    )

    # 4) Trainer & train
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=ds,
    )
    trainer.train()

    # 5) Compute training‐set predictions & metrics
    trainer.model.eval()
    with torch.no_grad():
        pred_output = trainer.predict(ds)
    train_preds  = np.argmax(pred_output.predictions, axis=-1)
    train_labels = pred_output.label_ids

    train_acc  = accuracy_score(train_labels, train_preds)
    train_prec, train_rec, train_f1, _ = precision_recall_fscore_support(
        train_labels, train_preds, average="weighted"
    )

    print(f"[{name}] TRAIN → acc: {train_acc:.4f}, prec: {train_prec:.4f}, "
          f"rec: {train_rec:.4f}, f1: {train_f1:.4f}")

    # 6) Optional: save model + metrics
    if save_path is not None:
        save_dir = os.path.join(save_path, name)
        trainer.save_model(save_dir)

        # save metrics to CSV
        met_df = pd.DataFrame([{
            "model": name,
            "train_accuracy": train_acc,
            "train_precision": train_prec,
            "train_recall": train_rec,
            "train_f1": train_f1
        }])
        met_df.to_csv(os.path.join(save_path, f"{name}_train_metrics.csv"), index=False)
        print(f"→ Saved model to {save_dir}")
        print(f"→ Saved training metrics to {save_path}/{name}_train_metrics.csv")

    return model

In [8]:
all_model = train_base("all", splits["all"], save_path="/content/drive/MyDrive/models")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mecdyer[0m ([33mecdyer-harvard-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,3.062
200,2.6159
300,2.2126
400,1.9214
500,1.7211
600,1.5659
700,1.4663
800,1.2836
900,1.183
1000,1.1722


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[all] TRAIN → acc: 0.8499, prec: 0.8468, rec: 0.8499, f1: 0.8395
→ Saved model to /content/drive/MyDrive/models/all
→ Saved training metrics to /content/drive/MyDrive/models/all_train_metrics.csv


In [None]:
low_model = train_base("low", splits["low"], save_path="/content/drive/MyDrive/models")

In [None]:
high_model = train_base("high", splits["high"])

In [None]:
high_model.save_pretrained("/content/drive/MyDrive/models")

In [None]:
base_models = {
    "high": high_model,
    "low": low_model,
    "all": all_model
}

In [9]:
# Load the models
high_model = DistilBertForSequenceClassification.from_pretrained("/content/drive/MyDrive/models/high")
low_model = DistilBertForSequenceClassification.from_pretrained("/content/drive/MyDrive/models/low")
all_model = DistilBertForSequenceClassification.from_pretrained("/content/drive/MyDrive/models/all")


# Move models to device (if using GPU)
high_model.to(device)
low_model.to(device)
all_model.to(device)

# Update base_models dictionary
base_models = {
    "high": high_model,
    "low": low_model,
    "all": all_model
}

In [None]:
# 6. Train the three base models
#base_models = {name: train_base(name, split) for name, split in splits.items()}

In [10]:
# 1) Load & encode
val_df["label"] = le.transform(val_df["channel"])
y_true = val_df["label"].values

torch.cuda.empty_cache()  # Clear the CUDA cache

# 2) Batch‐tokenize
enc = tokenizer(
    val_df["snip"].tolist(),
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors="pt"
)
input_ids      = enc["input_ids"].to(device)
attention_mask = enc["attention_mask"].to(device)

# 3) Loop over each base model
metrics_summary = []

for name, model in base_models.items():
    model.eval()
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        probs  = torch.softmax(logits, dim=-1).cpu().numpy()

    # preds and confidence
    preds       = np.argmax(probs, axis=-1)
    confidences = probs[np.arange(len(probs)), preds]

    # decode to network names
    pred_chan = le.inverse_transform(preds)
    true_chan = val_df["channel"].values

    # 4a) Compute overall metrics
    acc = accuracy_score(y_true, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true, preds, average="weighted"
    )
    metrics_summary.append({
        "model":    name,
        "accuracy": acc,
        "precision": prec,
        "recall":    rec,
        "f1":        f1
    })

    # 4b) Compute per-class metrics
    class_prec, class_rec, class_f1, support = precision_recall_fscore_support(
        y_true, preds, average=None, labels=range(len(le.classes_))
    )
    per_class_df = pd.DataFrame({
        "channel":  le.classes_,
        "precision": class_prec,
        "recall":    class_rec,
        "f1":        class_f1,
        "support":   support
    })
    per_class_df.to_csv(f"{name}_per_class_metrics.csv", index=False)
    print(f"→ Saved per-class metrics to {name}_per_class_metrics.csv")

    # 5) Save per-example CSV
    out_df = pd.DataFrame({
        "snip":          val_df["snip"],
        "true_channel":  true_chan,
        "pred_channel":  pred_chan,
        "confidence":    confidences
    })
    out_df.to_csv(f"{name}_predictions.csv", index=False)
    print(f"→ Saved predictions to      {name}_predictions.csv")

    # 6) Save overall metrics CSV
    met_df = pd.DataFrame([metrics_summary[-1]])
    met_df.to_csv(f"{name}_metrics.csv", index=False)
    print(f"→ Saved overall metrics to  {name}_metrics.csv")
    print(f"[{name}] acc={acc:.4f} prec={prec:.4f} rec={rec:.4f} f1={f1:.4f}\n")

# 7) Aggregate all overall metrics into one table
all_met = pd.DataFrame(metrics_summary)
all_met.to_csv("all_models_metrics.csv", index=False)
print("→ Saved aggregate metrics to all_models_metrics.csv")
print(all_met)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


→ Saved per-class metrics to high_per_class_metrics.csv
→ Saved predictions to      high_predictions.csv
→ Saved overall metrics to  high_metrics.csv
[high] acc=0.6242 prec=0.5664 rec=0.6242 f1=0.5896



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


→ Saved per-class metrics to low_per_class_metrics.csv
→ Saved predictions to      low_predictions.csv
→ Saved overall metrics to  low_metrics.csv
[low] acc=0.0579 prec=0.0192 rec=0.0579 f1=0.0193

→ Saved per-class metrics to all_per_class_metrics.csv
→ Saved predictions to      all_predictions.csv
→ Saved overall metrics to  all_metrics.csv
[all] acc=0.6581 prec=0.6558 rec=0.6581 f1=0.6472

→ Saved aggregate metrics to all_models_metrics.csv
  model  accuracy  precision    recall        f1
0  high  0.624218   0.566439  0.624218  0.589578
1   low  0.057914   0.019214  0.057914  0.019274
2   all  0.658111   0.655796  0.658111  0.647186


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Create a Meta-Encoder (Did not go on to finish this portion)

In [None]:
# 7. Out‐of‐Fold (OOF) features for meta-learner
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
N = len(train_df)
oof_feats = np.zeros((N, 3 * num_labels), dtype=np.float32)
oof_labels = train_df["label"].values

for fold, (train_idx, val_idx) in enumerate(kf.split(train_df, train_df["label"])):
    print(f"Generating OOF for fold {fold}")
    fold_model_dir = os.path.join("/content/drive/MyDrive/models", f"fold_{fold}")
    if os.path.exists(fold_model_dir):
      fold_models[fold] = {
      name: DistilBertForSequenceClassification.from_pretrained(os.path.join("/content/drive/MyDrive/models", f"{name}_fold{fold}"))
      for name in splits.keys()}
      # Move models to device if needed
      for model in fold_models[fold].values():
        model.to(device)
    else:
      # Split data
      df_tr, df_val = train_df.iloc[train_idx], train_df.iloc[val_idx]

      # Create a directory for saving fold models if it doesn't exist
      fold_model_dir = os.path.join("/content/drive/MyDrive/models", f"fold_{fold}")
      os.makedirs(fold_model_dir, exist_ok=True)

      # retrain base models on this fold and save them
      fold_models = {
          name: train_base(f"{name}_fold{fold}", df_tr[df_tr["label"].isin(split_idx.index)], save_path=fold_model_dir)
          for name, split_idx in splits.items()
      }
    # get predictions on val set
    tok_val = tokenize_df(df_val)
    for i, name in enumerate(["high", "low", "all"]):
        model = fold_models[name]
        with torch.no_grad():
            logits = model(
                input_ids=tok_val["input_ids"],
                attention_mask=tok_val["attention_mask"],
            ).logits
        probs = torch.softmax(logits, dim=-1).cpu().numpy()
        oof_feats[val_idx, i*num_labels:(i+1)*num_labels] = probs

Generating OOF for fold 0


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mecdyer[0m ([33mecdyer-harvard-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,3.0115
200,2.496
300,2.0057
400,1.691
500,1.4902
600,1.3263
700,1.2517
800,1.1485


KeyboardInterrupt: 

In [None]:
# 8. Train meta‐model (on CPU is fine)
meta = LogisticRegression(
    multi_class="multinomial", max_iter=1000, class_weight="balanced", random_state=42
)
meta.fit(oof_feats, oof_labels)

In [None]:
# 9. Stacked predict function
def stacked_predict(texts):
    enc = tokenizer(texts, padding=True, truncation=True, max_length=256, return_tensors="pt")
    enc = {k: v.to(device) for k, v in enc.items()}
    all_probs = []
    for model in base_models.values():
        with torch.no_grad():
            logits = model(**enc).logits
        all_probs.append(torch.softmax(logits, dim=-1).cpu().numpy())
    feat = np.concatenate(all_probs, axis=1)  # shape: (n, 3*K)
    preds = meta.predict(feat)
    return le.inverse_transform(preds)

In [None]:
# 10. Test inference
examples = [
    "Tonight's political debate heats up on Network A.",
    "Breaking sports update from Channel 7."
]
print(stacked_predict(examples))