# Instruction Fine-tuning: Llama-Guard-3-8B for Polarization Detection

This notebook demonstrates instruction fine-tuning of the `meta-llama/Llama-Guard-3-8B` model on the polarization detection dataset.

## Approach
- Use QLoRA (Quantized Low-Rank Adaptation) for efficient fine-tuning
- Format data as instruction-following conversations
- Train on a multilingual polarization detection task
- Evaluate on a held-out test split carved out of the training data

## 1. Install Dependencies

In [None]:
%%time
%%capture
!pip install -q transformers datasets accelerate peft bitsandbytes trl evaluate scikit-learn

## 2. Import Libraries

In [None]:
import os

import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    EarlyStoppingCallback,
    TrainingArguments,
)
from trl import SFTTrainer

In [None]:
# Report the CUDA environment; device details are only queried when a GPU exists.
cuda_ready = torch.cuda.is_available()
print(f"CUDA Available: {cuda_ready}")
print(f"GPU Count: {torch.cuda.device_count()}")
if cuda_ready:
    # total_memory is reported in bytes; show it in decimal gigabytes.
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {total_gb:.2f} GB")

## 3. Configuration

In [None]:
# Model configuration
MODEL_NAME = "meta-llama/Llama-Guard-3-8B"  # base checkpoint for both tokenizer and model
OUTPUT_DIR = "./llama-guard-3-8b-polarization"  # training checkpoints and the final save go here

# Data paths
# Each directory is expected to hold one CSV per language; the file name
# (minus ".csv") becomes the "lang" column when the split is loaded.
TRAIN_DATA_PATH = "../../subtask1/train"
DEV_DATA_PATH = "../../subtask1/dev"

# Training hyperparameters
MAX_SEQ_LENGTH = 512  # passed to the trainer as max_seq_length
BATCH_SIZE = 4  # per-device batch size, used for both training and evaluation
GRADIENT_ACCUMULATION_STEPS = 4  # 4 x 4 = 16 sequences per device per optimizer step
LEARNING_RATE = 2e-4
NUM_EPOCHS = 3
WARMUP_RATIO = 0.03

# LoRA configuration
LORA_R = 16  # adapter rank
LORA_ALPHA = 32  # twice the rank
LORA_DROPOUT = 0.05

## 4. Load and Prepare Dataset

In [None]:
def load_split(split_dir):
    """Load all CSV files from a directory and combine them.

    Every ``*.csv`` file in ``split_dir`` is read into a DataFrame and tagged
    with a ``lang`` column holding the file name without its extension.

    Args:
        split_dir: Path of the directory containing the per-language CSVs.

    Returns:
        A single DataFrame with the rows of all CSV files and a fresh
        0..n-1 index.

    Raises:
        ValueError: If the directory contains no ``.csv`` file.
    """
    # os.listdir returns entries in arbitrary order; sort so the row order
    # (and therefore any seeded shuffle/split applied later) is reproducible.
    csv_files = sorted(
        name for name in os.listdir(split_dir) if name.endswith(".csv")
    )
    if not csv_files:
        # pd.concat([]) would raise a ValueError anyway, just a cryptic one.
        raise ValueError(f"No CSV files found in {split_dir!r}")

    frames = []
    for filename in csv_files:
        frame = pd.read_csv(os.path.join(split_dir, filename))
        # Strip only the trailing extension, not every ".csv" in the name.
        frame["lang"] = os.path.splitext(filename)[0]
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)


# Read both official splits with the same loader
train_df, dev_df = (load_split(path) for path in (TRAIN_DATA_PATH, DEV_DATA_PATH))

# Quick sanity summary: sizes, language coverage and label balance
language_count = train_df["lang"].nunique()
label_shares = train_df["polarization"].value_counts(normalize=True)

print(f"Train size: {train_df.shape}")
print(f"Dev size: {dev_df.shape}")
print(f"\nLanguages: {language_count}")
print("\nPolarization distribution (train):")
print(label_shares)

In [None]:
# Create stratified train/val/test splits
# The stratification key combines language and label so that every
# (language, label) pair keeps its share in each split.
train_df["lang_label"] = (
    train_df["lang"]
    .astype(str)
    .str.cat(train_df["polarization"].astype(str), sep="_")
)

# Options shared by both cuts
split_options = {"random_state": 42, "shuffle": True}

# First cut: 90% for training, 10% held out.
train_data, temp_data = train_test_split(
    train_df, test_size=0.10, stratify=train_df["lang_label"], **split_options
)

# Second cut: halve the held-out pool into validation and test.
val_data, test_data = train_test_split(
    temp_data, test_size=0.50, stratify=temp_data["lang_label"], **split_options
)

print(f"Train: {len(train_data)}")
print(f"Val: {len(val_data)}")
print(f"Test: {len(test_data)}")

## 5. Format Data as Instructions

We'll format the data using a chat template suitable for Llama-Guard-3-8B.

In [None]:
def format_instruction(text, lang, label=None):
    """Build the chat-style record for a single post.

    Args:
        text: Raw post text to classify.
        lang: Language identifier shown to the model in the user turn.
        label: Gold label. 1 maps to the answer "Yes", any other non-None
            value to "No". When None, no assistant turn is added, which
            yields a prompt suitable for inference.

    Returns:
        A dict with a single "messages" key holding the system and user
        turns, followed by the assistant answer when a label is given.
    """
    moderator_brief = """You are an expert content moderator specializing in detecting polarized content in social media posts.

Polarized content includes:
- Hate speech
- Toxicity
- Misogyny or gender-based violence
- Sarcastic or offensive speech
- Strong us-vs-them divisions
- Extreme opinions that create hostility between groups

Your task is to classify whether the given text contains polarized content. Respond with only 'Yes' or 'No'."""

    question = f"""Language: {lang}
Text: {text}

Does this text contain polarized content? Answer with only 'Yes' or 'No'."""

    conversation = [
        {"role": "system", "content": moderator_brief},
        {"role": "user", "content": question},
    ]

    # Inference prompt: stop after the user turn.
    if label is None:
        return {"messages": conversation}

    # Training example: append the gold answer as the assistant turn.
    if label == 1:
        gold_answer = "Yes"
    else:
        gold_answer = "No"
    conversation.append({"role": "assistant", "content": gold_answer})
    return {"messages": conversation}


# Format the datasets
def prepare_dataset(df):
    """Convert a labelled DataFrame into a chat-formatted ``Dataset``.

    Args:
        df: DataFrame with "text", "lang" and "polarization" columns.

    Returns:
        A ``Dataset`` with one "messages" record per input row, in row order.
    """
    records = [
        format_instruction(row["text"], row["lang"], row["polarization"])
        for _, row in df.iterrows()
    ]
    return Dataset.from_list(records)


# Convert the three splits with the same formatter, in train/val/test order.
train_dataset, val_dataset, test_dataset = (
    prepare_dataset(split) for split in (train_data, val_data, test_data)
)

print("Dataset prepared!")
print("\nExample formatted instruction:")
first_example = train_dataset[0]
print(first_example)

## 6. Load Model and Tokenizer with Quantization

In [None]:
%%time
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the EOS token for padding.
# NOTE(review): if the data collator masks pad positions in the labels, real
# EOS tokens may be masked as well — confirm the model still learns to stop.
tokenizer.pad_token = tokenizer.eos_token
# Right-side padding for training (the training arguments use bf16, not fp16).
tokenizer.padding_side = "right"  # Pad on the right during training

print(f"Tokenizer loaded: {tokenizer.__class__.__name__}")
print(f"Vocab size: {len(tokenizer)}")

In [None]:
%%time
# Configure 4-bit quantization for QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # store the base weights in 4 bits
    bnb_4bit_quant_type="nf4",  # NF4 quantization data type
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute dtype; matches bf16=True in the training arguments
    bnb_4bit_use_double_quant=True,  # double (nested) quantization enabled
)

# Load model with quantization
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run locally — confirm it is actually required for this model.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",  # let the loader decide device placement
    trust_remote_code=True,
)

# Prepare model for k-bit training
# Must happen before the LoRA adapters are attached in the next section.
model = prepare_model_for_kbit_training(model)

print(f"Model loaded: {model.__class__.__name__}")
print(f"Model device: {model.device}")

## 7. Configure LoRA

In [None]:
# LoRA configuration
peft_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",  # bias terms are not trained
    task_type="CAUSAL_LM",
    # Adapters are attached to modules with these names; presumably the
    # attention projections (q/k/v/o) and MLP projections (gate/up/down) of
    # the base model — verify against the model's module names.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Apply LoRA to model
# `model` is rebound to the PEFT-wrapped model; this wrapped object is what
# the trainer receives and what gets saved later.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

## 8. Configure Training Arguments

In [None]:
# Training configuration; the values come from the constants in section 3.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",
    warmup_ratio=WARMUP_RATIO,
    logging_steps=50,
    # Checkpointing and evaluation run on the same 500-step cadence, so each
    # saved checkpoint has an eval_loss for best-model selection.
    save_strategy="steps",
    save_steps=500,
    eval_strategy="steps",
    eval_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    bf16=True,  # Use bfloat16 precision
    tf32=True,  # Use TF32 for faster training on Ampere GPUs
    max_grad_norm=0.3,
    weight_decay=0.001,
    report_to="none",  # no external experiment tracker
    seed=42,
)

print("Training arguments configured!")

## 9. Initialize Trainer

In [None]:
def formatting_prompts_func(example):
    """Render a batch of conversations into training strings.

    Args:
        example: Batch mapping whose "messages" entry is a sequence of
            conversations (each a list of role/content dicts).

    Returns:
        A list with one chat-templated string per conversation, in order.
        No generation prompt is appended, since every training conversation
        already ends with the assistant turn.
    """
    return [
        tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=False
        )
        for conversation in example["messages"]
    ]

In [None]:
# Initialize SFTTrainer (Supervised Fine-Tuning Trainer)
# `model` is already LoRA-wrapped, so no peft_config is passed here.
# NOTE(review): `tokenizer=`, `max_seq_length=` and `packing=` as direct
# SFTTrainer keyword arguments appear to be accepted only by older TRL
# releases, and the install cell does not pin a version — confirm against the
# installed TRL.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # drives eval_loss for best-model selection and early stopping
    tokenizer=tokenizer,
    formatting_func=formatting_prompts_func,
    max_seq_length=MAX_SEQ_LENGTH,
    packing=False,  # Don't pack multiple examples into one sequence
    # Stop after 3 consecutive evaluations without improvement in eval_loss.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

print("Trainer initialized!")

## 10. Train the Model

In [None]:
%%time
# Start training
# Runs the full fine-tuning loop. With load_best_model_at_end=True in the
# training arguments, the best checkpoint by eval_loss is restored at the end.
print("Starting training...")
trainer.train()

print("\nTraining completed!")

## 11. Save the Fine-tuned Model

In [None]:
# Save the fine-tuned model and tokenizer
# trainer.model is the LoRA-wrapped model; presumably only the adapter
# weights are written, so the base checkpoint is still needed to reload it —
# verify the contents of OUTPUT_DIR.
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Model saved to {OUTPUT_DIR}")

## 12. Evaluation and Inference

In [None]:
def predict_polarization(text, lang, model, tokenizer):
    """Predict polarization for a single text.

    Builds the label-free prompt with ``format_instruction``, greedily
    generates a short completion and maps it to a binary label.

    Args:
        text: The post to classify.
        lang: Language identifier inserted into the prompt.
        model: Causal language model used for generation.
        tokenizer: Tokenizer (with chat template) matching ``model``.

    Returns:
        1 if the generated answer says "yes", 0 if it says "no". When neither
        word is found, the raw answer is printed and 0 is returned.
    """
    import re  # local import keeps this cell self-contained

    # Format the input (no label -> prompt ends after the user turn)
    formatted = format_instruction(text, lang)

    # Apply chat template
    prompt = tokenizer.apply_chat_template(
        formatted["messages"], tokenize=False, add_generation_prompt=True
    )

    # Tokenize
    # NOTE(review): the chat template may already emit a BOS token, in which
    # case tokenizing again prepends a second one — confirm, and keep it
    # consistent with how the training text was tokenized.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Generate in eval mode so dropout (including LoRA dropout) is disabled,
    # then restore the previous mode so training can continue afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=10,
                do_sample=False,  # greedy decoding; temperature would be ignored
                pad_token_id=tokenizer.eos_token_id,
            )
    finally:
        if was_training:
            model.train()

    # Decode only the newly generated tokens; the output starts with the
    # prompt, and searching it for "yes"/"no" could hit the instructions.
    answer = (
        tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        .strip()
        .lower()
    )

    # Parse answer: first whole-word "yes"/"no", so that words such as "not"
    # or "know" are not mistaken for an answer.
    match = re.search(r"\b(yes|no)\b", answer)
    if match is None:
        print(f"Unclear response: {answer}")
        return 0
    return 1 if match.group(1) == "yes" else 0

In [None]:
# Smoke test: classify one hand-written English sentence
test_text = "I hate the way you talk, I hate the way you walk"
test_lang = "eng"

prediction = predict_polarization(test_text, test_lang, model, tokenizer)
if prediction == 1:
    verdict = "Polarized"
else:
    verdict = "Not Polarized"

print(f"Text: {test_text}")
print(f"Language: {test_lang}")
print(f"Prediction: {verdict}")

In [None]:
%%time
# Evaluate on test set (sample for speed)
# At most 500 rows are scored; the sample is seeded so runs are comparable.
test_sample = test_data.sample(n=min(500, len(test_data)), random_state=42)

predictions = []
true_labels = []

print(f"Evaluating on {len(test_sample)} test examples...")
# Count processed rows explicitly: the index labels of test_sample are the
# original (shuffled) row labels from train_df, not positions 0..n-1, so they
# cannot be used as a progress counter.
for n_done, (_, row) in enumerate(test_sample.iterrows(), start=1):
    pred = predict_polarization(row["text"], row["lang"], model, tokenizer)
    predictions.append(pred)
    true_labels.append(row["polarization"])

    if n_done % 50 == 0:
        print(f"Processed {n_done}/{len(test_sample)} examples")

# Calculate metrics
accuracy = accuracy_score(true_labels, predictions)
# Binary F1 scores the positive ("polarized") class only.
f1 = f1_score(true_labels, predictions, average="binary")

print(f"\n{'=' * 50}")
print("Test Set Evaluation Results")
print(f"{'=' * 50}")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:")
print(
    classification_report(
        true_labels, predictions, target_names=["Not Polarized", "Polarized"]
    )
)

## 13. Generate Predictions for Dev Set

In [None]:
%%time
# Score every dev row and collect (id, predicted label) records
dev_predictions = []
dev_total = len(dev_df)

print(f"Generating predictions for {dev_total} dev examples...")
for processed, (_, row) in enumerate(dev_df.iterrows(), start=1):
    pred = predict_polarization(row["text"], row["lang"], model, tokenizer)
    dev_predictions.append({"id": row["id"], "polarization": pred})

    # Progress line every 100 rows
    if processed % 100 == 0:
        print(f"Processed {processed}/{dev_total} examples")

# Write the predictions as a two-column CSV (id, polarization)
pred_df = pd.DataFrame(dev_predictions)
pred_df.to_csv("dev_predictions.csv", index=False)
print("\nPredictions saved to dev_predictions.csv")

## Conclusion

This notebook demonstrated:
1. Loading and preparing multilingual polarization detection data
2. Formatting data as instruction-following conversations
3. Fine-tuning Llama-Guard-3-8B using QLoRA for efficiency
4. Evaluating the fine-tuned model on test data
5. Generating predictions for the dev set

### Next Steps:
- Experiment with different instruction formats
- Try different LoRA configurations (r, alpha, target modules)
- Adjust training hyperparameters (learning rate, batch size, epochs)
- Implement few-shot prompting in the instruction format
- Compare performance with encoder-only models