In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.nn.functional import softmax
import matplotlib.pyplot as plt
import torch
from contextlib import nullcontext
from torch.cuda.amp import autocast, GradScaler

# Select the compute device once; mixed precision is only switched on for CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
use_amp = (device.type == "cuda")


def _no_amp(*args, **kwargs):
    """Stand-in for autocast when AMP is off: ignores autocast's arguments.

    contextlib.nullcontext only accepts `enter_result`, so calling it with
    autocast keywords such as `dtype=` raises TypeError. This wrapper lets
    callers use the same `amp_ctx(dtype=...)` call on both CPU and CUDA.
    """
    return nullcontext()


# Callable that returns the per-step context manager (autocast on CUDA, no-op otherwise).
amp_ctx = (autocast if use_amp else _no_amp)
# Gradient scaler is only needed on the AMP path; None signals "plain fp32 training".
scaler = (GradScaler() if use_amp else None)

# NOTE(review): presumably silences tokenizer fork warnings when DataLoader workers are used — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Seed NumPy and PyTorch so weight init and batch shuffling are repeatable across runs.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load dataset
train_path = "data/train/train.csv"
test_path  = "data/test/test.csv"

df = pd.read_csv(train_path)
# Keep 'id' when the file provides it so validation predictions can later be
# keyed by the real row ids; 'text' and 'author' remain mandatory.
keep_cols = (['id'] if 'id' in df.columns else []) + ['text', 'author']
df = df[keep_cols]
# Restrict to the three authors the classifier is trained on.
df = df[df['author'].isin(['EAP', 'HPL', 'MWS'])].reset_index(drop=True)

# Integer-encode the author names; le.classes_ gives the column order of the probabilities.
le = LabelEncoder()
df['label'] = le.fit_transform(df['author'])
print(f" Data loaded: {len(df)} labeled training samples")

test_df = pd.read_csv(test_path)

#  Roberta-Base setup
MODEL_NAME = 'roberta-base'
MAX_LEN = 256            # max tokens per sample passed to the tokenizer
BATCH_SIZE = 32
EPOCHS = 5
LR = 2e-5
WEIGHT_DECAY = 0.01
LABEL_SMOOTHING = 0.1
NUM_WORKERS = 2          # DataLoader worker processes
PATIENCE = 2             # epochs without improvement before early stopping

tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)

# Dataset class
class AuthorDataset(Dataset):
    """Torch dataset that tokenizes all texts up front and serves them as tensors.

    Args:
        texts: list of raw strings, tokenized once in the constructor with the
            module-level `tokenizer` (truncated to MAX_LEN, padded).
        labels: optional sequence of labels aligned with `texts`; when omitted
            the returned samples carry no 'labels' entry.
    """

    def __init__(self, texts, labels=None):
        self.labels = labels
        self.encodings = tokenizer(
            texts,
            max_length=MAX_LEN,
            padding=True,
            truncation=True,
        )

    def __len__(self):
        # Number of samples equals the number of tokenized sequences.
        token_rows = self.encodings['input_ids']
        return len(token_rows)

    def __getitem__(self, idx):
        # Convert every encoded field of sample `idx` into a tensor.
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Hold out 10% of the labeled rows for validation, stratified on the encoded label.
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    stratify=df['label'],
    random_state=42,
)

# Tokenize each split once; the test split has no labels.
train_dataset = AuthorDataset(train_df['text'].tolist(), train_df['label'].tolist())
val_dataset = AuthorDataset(val_df['text'].tolist(), val_df['label'].tolist())
test_dataset = AuthorDataset(test_df['text'].tolist())

# Only the training loader shuffles; validation/test keep row order.
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, **loader_kwargs)
test_loader = DataLoader(test_dataset, **loader_kwargs)

# Model setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
model.to(device)

# AdamW with a linear schedule: warm up over the first 10% of steps, then decay.
optimizer = AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
total_steps = len(train_loader) * EPOCHS
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)
loss_fn = nn.CrossEntropyLoss(label_smoothing=LABEL_SMOOTHING)

# Meta-model preparation: buffers for per-class probabilities on val and test rows.
val_oof_df = val_df.copy()
val_preds_all = np.zeros((len(val_df), 3))
test_preds_all = np.zeros((len(test_df), 3))

#  Training loop bookkeeping
print(f"\n Starting training for {EPOCHS} epochs ({MODEL_NAME})...")
best_val_loss = float('inf')
early_stop_counter = 0
train_losses = []
val_losses = []
lrs = []

# Fine-tune for up to EPOCHS epochs. After each epoch the model is scored on the
# validation split; the best checkpoint (lowest validation log loss) is written
# to best_model.pt and training stops after PATIENCE epochs without improvement.
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    model.train()
    total_loss = 0
    for i, batch in enumerate(train_loader):
        print(f" Epoch {epoch+1} | Batch {i+1}/{len(train_loader)}", end='\r')
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Only build the fp16 autocast context on the AMP path: on CPU the
        # fallback is nullcontext, which does not accept a `dtype` argument.
        step_ctx = amp_ctx(dtype=torch.float16) if scaler is not None else nullcontext()
        with step_ctx:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)

        if scaler is not None:
            # AMP path (CUDA)
            scaler.scale(loss).backward()
            # Unscale before clipping so clipping uses real grads
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
        else:
            # CPU (no AMP)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

        scheduler.step()
        total_loss += loss.item()
        # Record the learning rate after every step for the LR-schedule plot.
        lrs.append(scheduler.get_last_lr()[0])

    train_loss = total_loss / len(train_loader)
    train_losses.append(train_loss)
    print(f"\n Epoch {epoch+1} Train Loss: {train_loss:.4f}")

    #  Validation
    model.eval()
    val_preds, val_targets = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probs = softmax(outputs.logits, dim=1)

            # One probability row per validation sample, in loader order.
            val_preds.extend(probs.cpu().numpy())
            val_targets.extend(labels.cpu().numpy())

    val_loss = log_loss(val_targets, val_preds)
    val_losses.append(val_loss)
    print(f" Epoch {epoch+1}: Val Log Loss = {val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_model.pt")
        # Save for meta model: keep the validation probabilities of the best
        # epoch only, so they come from the same checkpoint (best_model.pt)
        # that is later used for the test-set predictions.
        val_preds_all[:] = np.asarray(val_preds)
        print( "Best model saved")
        early_stop_counter = 0
    else:
        early_stop_counter += 1
        print(f" No improvement. Early Stop Counter: {early_stop_counter}/{PATIENCE}")
        if early_stop_counter >= PATIENCE:
            print("Early stopping triggered.")
            break

#  Plot training curves: per-epoch losses on the left, per-step learning rate on the right
fig, (loss_ax, lr_ax) = plt.subplots(1, 2, figsize=(10, 4))

loss_ax.plot(train_losses, label="Train Loss")
loss_ax.plot(val_losses, label="Val Log Loss")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Training Loss")
loss_ax.legend()

lr_ax.plot(lrs)
lr_ax.set_xlabel("Training Step")
lr_ax.set_ylabel("Learning Rate")
lr_ax.set_title("LR Schedule")

fig.tight_layout()
plt.show()

print(f"\n Training complete. Best Val Log Loss: {best_val_loss:.4f}")

#  Inference on test set for meta-model
# Reload the best checkpoint; map_location keeps the load valid even if the
# weights were saved from a different device than the current one.
model.load_state_dict(torch.load("best_model.pt", map_location=device))
model.eval()

all_test_probs = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = softmax(outputs.logits, dim=1)
        all_test_probs.extend(probs.cpu().numpy())

# One row of class probabilities per test sample, in test_df order.
test_preds_all = np.array(all_test_probs)

#  Save meta-model CSVs
# Assign a single 'id' column. Using val_oof_df[['id']] = <1-D array> raises
# "Columns must be same length as key", so assign to the plain column label.
if 'id' in val_df.columns:
    val_oof_df['id'] = val_df['id'].values
else:
    val_oof_df['id'] = np.arange(len(val_oof_df))

# Probability columns follow the label encoder's class order.
pred_cols = [f'roberta_base_{cls}' for cls in le.classes_]

for i, col in enumerate(pred_cols):
    val_oof_df[col] = val_preds_all[:, i]
val_oof_df[['id'] + pred_cols].to_csv("oof_preds.csv", index=False)

test_df_preds = test_df.copy()
for i, col in enumerate(pred_cols):
    test_df_preds[col] = test_preds_all[:, i]
test_df_preds[['id'] + pred_cols].to_csv("roberta_base_test_preds.csv", index=False)

print("\n Meta-model prediction files saved:")
print("- oof_preds.csv")
print("- roberta_base_test_preds.csv")

In [None]:
#  Attach an 'id' column: reuse the validation ids when present, otherwise number the rows
has_ids = 'id' in val_df.columns
val_oof_df['id'] = val_df['id'].values if has_ids else np.arange(len(val_oof_df))

#  Write one probability column per author class, then export for the meta-model
oof_columns = [f'roberta_base_{cls}' for cls in le.classes_]
for position, column in enumerate(oof_columns):
    val_oof_df[column] = val_preds_all[:, position]

val_oof_df[['id'] + oof_columns].to_csv("roberta_oof_preds.csv", index=False)
print(" roberta_oof_preds.csv saved")

In [None]:
# Make sure the validation frame has an 'id' column before exporting
if 'id' not in val_df.columns:
    val_oof_df['id'] = np.arange(len(val_oof_df))
else:
    val_oof_df['id'] = val_df['id'].values

# Column names for the per-class probabilities, in label-encoder order
prob_columns = [f'roberta_base_{cls}' for cls in le.classes_]

# Save oof preds
for class_idx, column in enumerate(prob_columns):
    val_oof_df[column] = val_preds_all[:, class_idx]
val_oof_df[['id'] + prob_columns].to_csv("oof_preds.csv", index=False)

# Save test preds on a copy so test_df itself is left untouched
test_df_preds = test_df.copy()
for class_idx, column in enumerate(prob_columns):
    test_df_preds[column] = test_preds_all[:, class_idx]
test_df_preds[['id'] + prob_columns].to_csv("roberta_base_test_preds.csv", index=False)

In [None]:
#  Build the submission file from the roberta-base test predictions
author_codes = ['EAP', 'HPL', 'MWS']

# Strip the model prefix so the columns carry the bare author codes.
column_map = {f'roberta_base_{code}': code for code in author_codes}

submission = test_df_preds.copy().rename(columns=column_map)[['id'] + author_codes]
submission.to_csv("roBERTa_submission.csv", index=False)
print(" roBERTa_submission.csv created successfully!")