In [3]:
# Attach Google Drive to the Colab runtime at /content/drive; the data and
# output paths configured later in this notebook live under that mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
!pip install -q transformers
!pip install -q torch
!pip install -q datasets
!pip install -q sklearn.utils
!pip install -q accelerate
!pip install -q numpy
!pip install -q pandas

In [8]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
import os
import json

# --- Configuration ---
# Checkpoint name passed to the tokenizer/model loaders below.
MODEL_NAME = "distilbert-base-uncased"

# Root folder on the mounted Google Drive; every input/output path hangs off it.
DRIVE_PREFIX = "/content/drive/MyDrive/266-final-project-data"

# One CSV per dataset split.
TRAIN_FILE = os.path.join(DRIVE_PREFIX, "train_dataset.csv")
VAL_FILE = os.path.join(DRIVE_PREFIX, "val_dataset.csv")
TEST_FILE = os.path.join(DRIVE_PREFIX, "test_dataset.csv")

# Destination for the final trained model, with a nested folder for logs.
MODEL_OUTPUT_DIR = os.path.join(DRIVE_PREFIX, "guardrail_model_v1")
LOGGING_DIR = os.path.join(MODEL_OUTPUT_DIR, "logs")

# Label name -> integer class id.
LABEL_MAP = {"Benign": 0, "Malicious": 1}

# Reversed map (integer class id -> label name) for the model config.
ID2LABEL = dict(zip(LABEL_MAP.values(), LABEL_MAP.keys()))

# --- 1. Load Data ---

def load_data(filepath):
    """Load one dataset split from a CSV file into a Hugging Face Dataset.

    The CSV must contain the columns 'Obfuscated_Prompt' and 'Final_Label'.
    They are renamed to 'text' and 'label_str', and 'label_str' is mapped
    through LABEL_MAP into an integer 'label' column. Rows with a missing
    prompt or a label that is not a key of LABEL_MAP are dropped, and the
    number of dropped rows is reported.

    Args:
        filepath: Path to the CSV file.

    Returns:
        A Dataset with the columns 'text', 'label_str' and 'label', or None
        when the file is missing, lacks a required column, contains no usable
        rows, or cannot be read/converted (a message is printed in each case).
    """
    if not os.path.exists(filepath):
        print(f"Error: File not found: {filepath}")
        return None

    try:
        raw_df = pd.read_csv(filepath)

        # Check for the columns created by prompt_gen.py
        if "Obfuscated_Prompt" not in raw_df.columns or "Final_Label" not in raw_df.columns:
            print(f"Error: CSV {filepath} is missing required columns 'Obfuscated_Prompt' or 'Final_Label'.")
            return None

        # Keep only the two needed columns under shorter names.
        df = raw_df[["Obfuscated_Prompt", "Final_Label"]].rename(
            columns={"Obfuscated_Prompt": "text", "Final_Label": "label_str"}
        )

        # Map string labels to integer IDs; unknown labels become NaN.
        df["label"] = df["label_str"].map(LABEL_MAP)

        # Filter out rows that have no prompt or whose label did not map, and
        # rebuild a contiguous index so the gaps left by dropped rows are not
        # carried into the Dataset.
        rows_before = len(df)
        df = df.dropna(subset=["text", "label"]).reset_index(drop=True)
        rows_dropped = rows_before - len(df)
        if rows_dropped:
            print(
                f"Warning: dropped {rows_dropped} of {rows_before} rows with a missing prompt "
                f"or unrecognized label in {filepath}"
            )

        if df.empty:
            print(f"Error: CSV {filepath} contains no usable rows after filtering.")
            return None

        # Cast only after the NaN rows are gone: prompts must be strings for
        # the tokenizer, and labels must be integers for the classifier.
        df = df.assign(text=df["text"].astype(str), label=df["label"].astype(int))

        print(f"Loaded and processed {len(df)} records from {filepath}")

        # preserve_index=False keeps the pandas index from being stored as an
        # extra '__index_level_0__' column in the Dataset.
        return Dataset.from_pandas(df, preserve_index=False)

    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return None

# --- 2. Initialize Tokenizer and Model ---

# The fast tokenizer implementation is requested explicitly (use_fast=True).
print(f"Loading tokenizer for '{MODEL_NAME}'...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Sequence-classification model with one output per entry in LABEL_MAP; the
# name<->id label maps are handed to the loader alongside the label count.
print(f"Loading model '{MODEL_NAME}' for sequence classification...")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    label2id=LABEL_MAP,
    id2label=ID2LABEL,
    num_labels=len(LABEL_MAP),
)

# --- 3. Preprocessing (Tokenization) ---

def tokenize_function(examples):
    """Tokenize the 'text' column of a batch of dataset examples.

    Prompts longer than 512 tokens are truncated. No padding is applied here,
    so each sequence keeps its own length; padding is left to collation time,
    where the Trainer (which is given the tokenizer in main()) pads every batch
    only up to that batch's longest sequence. Padding every example to 512
    tokens up front, as this function previously did, makes each batch pay for
    the full 512-token length regardless of how short the prompts are.

    Args:
        examples: A batch mapping that provides the prompts under "text".

    Returns:
        The tokenizer output for the batch (unpadded, truncated to 512 tokens).
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512
    )

# --- 4. Define Metrics ---

def compute_metrics(eval_pred):
    """Turn the Trainer's evaluation output into guardrail metrics.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        A dict with the keys "accuracy", "f1", "precision" and "recall"; the
        last three are computed with label id 1 ("Malicious") as the positive
        class.
    """
    raw_scores, gold = eval_pred

    # The predicted class is the index of the highest logit along the last axis.
    predicted = np.argmax(raw_scores, axis=-1)

    metrics = {"accuracy": accuracy_score(gold, predicted)}

    # F1 / precision / recall all treat "Malicious" (id 1) as the positive label.
    positive_class_scorers = (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    for metric_name, scorer in positive_class_scorers:
        metrics[metric_name] = scorer(gold, predicted, pos_label=1)

    return metrics

# --- 5. Main Training Function ---

def main():
    """Fine-tune the classifier, evaluate it on the test split, and save artifacts.

    Steps:
      1. Load the train/validation/test CSVs via load_data(); abort (after
         printing a message) if any of them fails to load.
      2. Tokenize every split and drop the raw string columns.
      3. Train with validation every 50 steps, checkpointing on the same
         schedule, early stopping on F1, and best-checkpoint reloading.
      4. Evaluate on the test split and print the metrics.
      5. Save the model, the test metrics, and the filtered log history under
         MODEL_OUTPUT_DIR.

    Returns:
        None.
    """
    import inspect  # local import: only needed for the Trainer signature check below

    def _tokenize_split(dataset):
        """Tokenize one split and remove the text columns to speed up training."""
        return dataset.map(tokenize_function, batched=True).remove_columns(["text", "label_str"])

    # Load all datasets
    train_dataset = load_data(TRAIN_FILE)
    val_dataset = load_data(VAL_FILE)
    test_dataset = load_data(TEST_FILE)

    if train_dataset is None or val_dataset is None or test_dataset is None:
        print("\nAborting training due to missing or invalid data files.")
        return

    # Tokenize all datasets
    print("\nTokenizing all datasets (this may take a moment)...")
    tokenized_train = _tokenize_split(train_dataset)
    tokenized_val = _tokenize_split(val_dataset)
    tokenized_test = _tokenize_split(test_dataset)

    # Instantiate the EarlyStoppingCallback
    # This will stop training if the 'f1' score doesn't improve for 3
    # consecutive evaluation runs (3 * 50 steps = 150 steps).
    # NOTE(review): in the run recorded with this notebook, training ended at
    # step 450 while F1 was still rising at every evaluation, each time by less
    # than the 0.005 threshold, and before warmup_steps=500 had elapsed.
    # Presumably early stopping fired; confirm the threshold/patience are intended.
    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=3,
        early_stopping_threshold=0.005  # new F1 must be at least 0.005 better
    )

    # Define Training Arguments
    training_args = TrainingArguments(
        output_dir=MODEL_OUTPUT_DIR,
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=16,   # Batch size for training
        per_device_eval_batch_size=64,    # Batch size for evaluation
        warmup_steps=500,                 # Number of steps for warmup
        weight_decay=0.01,                # Strength of weight decay

        # Evaluation and Logging
        eval_strategy="steps",          # Corrected parameter name
        eval_steps=50,                  # Run validation every 50 steps
        logging_dir=LOGGING_DIR,
        logging_steps=50,

        # Checkpoint Saving & Loading
        save_strategy="steps",          # Aligns with eval_strategy
        save_steps=50,
        load_best_model_at_end=True,    # This is the key to preventing overfitting
        metric_for_best_model="f1",     # We care most about F1 score
        greater_is_better=True,
        save_total_limit=2,             # Only keep the best and the latest checkpoint
        report_to="none"                # Disable W&B/etc. by default
    )

    # Newer transformers releases accept the tokenizer as `processing_class`
    # and deprecate the `tokenizer` keyword; older releases only know
    # `tokenizer`. Use whichever keyword the installed Trainer exposes.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    # Initialize the Trainer
    # NOTE(review): `model` is the module-level instance, so calling main()
    # again in the same kernel trains the same object further rather than
    # starting from the pretrained checkpoint.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping_callback],
        **{tokenizer_kwarg: tokenizer}
    )

    # --- 6. Train the Model ---
    print("\n--- Starting Model Training ---")
    trainer.train()
    print("--- Training Complete ---")

    # --- 7. Evaluate on Test Set ---
    print("\n--- Evaluating on Hold-Out Test Set ---")

    # Use the best model (loaded automatically) for the final test evaluation
    test_results = trainer.evaluate(eval_dataset=tokenized_test)

    print("\n--- Final Test Results ---")
    print(f"Accuracy:  {test_results['eval_accuracy']:.4f}")
    print(f"F1 Score:  {test_results['eval_f1']:.4f}")
    print(f"Precision: {test_results['eval_precision']:.4f}")
    print(f"Recall:    {test_results['eval_recall']:.4f}")

    # --- Save Model & Results ---
    print(f"\nSaving best model to {MODEL_OUTPUT_DIR}...")
    trainer.save_model(MODEL_OUTPUT_DIR)
    print("Model saved successfully.")

    # Save test results and training history to JSON files.
    # Best-effort: a failure here is reported but does not undo the saved model.
    try:
        results_file = os.path.join(MODEL_OUTPUT_DIR, "test_results.json")
        with open(results_file, 'w') as f:
            json.dump(test_results, f, indent=4)
        print(f"Test results saved to {results_file}")

        # Filter history to only include log entries (training-loss and
        # evaluation records); built before the file is opened.
        log_history = [entry for entry in trainer.state.log_history if 'loss' in entry or 'eval_loss' in entry]
        history_file = os.path.join(MODEL_OUTPUT_DIR, "training_log_history.json")
        with open(history_file, 'w') as f:
            json.dump(log_history, f, indent=4)
        print(f"Training history saved to {history_file}")

    except Exception as e:
        print(f"Error saving JSON results: {e}")

    print("\n--- Script Finished ---")


if __name__ == "__main__":
    main()

Loading tokenizer for 'distilbert-base-uncased'...
Loading model 'distilbert-base-uncased' for sequence classification...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded and processed 27000 records from /content/drive/MyDrive/266-final-project-data/train_dataset.csv
Loaded and processed 4500 records from /content/drive/MyDrive/266-final-project-data/val_dataset.csv
Loaded and processed 4500 records from /content/drive/MyDrive/266-final-project-data/test_dataset.csv

Tokenizing all datasets (this may take a moment)...


Map:   0%|          | 0/27000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

  trainer = Trainer(



--- Starting Model Training ---


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.6735,0.634362,0.704,0.777704,0.667239,0.932
100,0.5799,0.527228,0.714667,0.789024,0.669548,0.9604
150,0.44,0.316988,0.862889,0.884781,0.829772,0.9476
200,0.2379,0.150823,0.942889,0.948342,0.953131,0.9436
250,0.137,0.098372,0.968667,0.971148,0.994135,0.9492
300,0.1025,0.078718,0.976667,0.978672,0.994222,0.9636
350,0.0819,0.06702,0.978667,0.981043,0.968799,0.9936
400,0.0685,0.061389,0.981778,0.983494,0.98987,0.9772
450,0.0593,0.058877,0.984889,0.986312,0.992707,0.98


--- Training Complete ---

--- Evaluating on Hold-Out Test Set ---



--- Final Test Results ---
Accuracy:  0.9880
F1 Score:  0.9892
Precision: 0.9920
Recall:    0.9864

Saving best model to /content/drive/MyDrive/266-final-project-data/guardrail_model_v1...
Model saved successfully.
Test results saved to /content/drive/MyDrive/266-final-project-data/guardrail_model_v1/test_results.json
Training history saved to /content/drive/MyDrive/266-final-project-data/guardrail_model_v1/training_log_history.json

--- Script Finished ---
