In [None]:
# === Standardized path configuration (added automatically) ===
from pathlib import Path

# Working directory at the time this cell runs; treated as the project root.
BASE_DIR = Path.cwd()
# All dataset files live under <BASE_DIR>/data.
DATA_DIR = BASE_DIR.joinpath('data')
# Canonical train/test CSV locations -- refer to these names rather than
# repeating hardcoded path strings elsewhere in the notebook.
TRAIN_CSV = DATA_DIR.joinpath('train', 'train.csv')
TEST_CSV = DATA_DIR.joinpath('test', 'test.csv')


In [1]:
# === Inference + Submission from DeBERTa best_model.pt ===

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax

# === ✅ Config
# Class labels; their order defines the column order of the predicted probabilities.
LABELS = ['EAP', 'HPL', 'MWS']
# Tokenizer truncation limit (in tokens) and DataLoader batch size.
MAX_LEN = 256
BATCH_SIZE = 32
# Hub checkpoint that supplies the architecture and tokenizer.
MODEL_NAME = 'microsoft/deberta-v3-base'
# Fine-tuned state dict to load on top of MODEL_NAME (update path if needed).
MODEL_PATH = '/kaggle/input/debertav3-base/pytorch/default/1/best_model.pt'
# NOTE: this rebinds TEST_CSV to a Kaggle input path (a plain string), replacing
# the Path object assigned by the path-configuration cell at the top.
TEST_CSV = '/kaggle/input/identify-the-author/test/test.csv'
# Prefer the GPU when one is visible to torch, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# === ✅ Load test data
test_df = pd.read_csv(TEST_CSV)

# === ✅ Tokenizer + Dataset
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class AuthorDataset(Dataset):
    """Pre-tokenized text dataset for batched inference.

    Every text is tokenized once, in a single tokenizer call made at
    construction time, with ``truncation=True``, ``padding=True`` and the
    given ``max_length``. Items are returned as dicts of tensors, one entry
    per field the tokenizer produced.

    Args:
        texts: The texts to encode. A single ``str`` is treated as a
            one-item dataset; any other iterable of strings is materialised
            into a list before tokenizing.
        text_tokenizer: Callable invoked as
            ``text_tokenizer(texts, truncation=True, padding=True, max_length=...)``
            that returns a mapping of field name -> per-example sequences.
            Defaults to the module-level ``tokenizer``.
        max_length: Truncation limit forwarded to the tokenizer. Defaults to
            the module-level ``MAX_LEN``.
    """

    def __init__(self, texts, text_tokenizer=None, max_length=None):
        # Resolve the module-level defaults lazily, so the class can also be
        # used (and tested) without the notebook globals being defined.
        if text_tokenizer is None:
            text_tokenizer = tokenizer
        if max_length is None:
            max_length = MAX_LEN
        # A bare string would otherwise be encoded as one un-batched example,
        # making __len__ report the token count instead of the number of texts.
        texts = [texts] if isinstance(texts, str) else list(texts)
        self.encodings = text_tokenizer(
            texts, truncation=True, padding=True, max_length=max_length
        )

    def __len__(self):
        """Return the number of encoded texts."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of field name -> tensor."""
        return {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}

# Tokenize every test text up front and serve it in fixed-size batches.
# The loader is built without shuffling, so predictions come back in
# test_df row order.
test_dataset = AuthorDataset(test_df['text'].tolist())
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# === ✅ Load model
# Build the architecture from the hub checkpoint with one output per label;
# the fine-tuned weights loaded below then replace its parameters.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(LABELS))
# torch.load unpickles the file. weights_only=True restricts unpickling to
# tensors and plain containers -- all a state dict needs -- so a tampered
# checkpoint cannot execute arbitrary code on load (requires torch >= 1.13).
state_dict = torch.load(MODEL_PATH, map_location=DEVICE, weights_only=True)
model.load_state_dict(state_dict)
model.to(DEVICE)
# Inference mode: disables dropout and other train-only behaviour.
model.eval()

# === ✅ Run inference
# Collect one probability row per test example, in loader order.
all_test_probs = []
with torch.no_grad():  # no gradients needed for prediction
    for batch in test_loader:
        # Only input_ids and attention_mask are fed to the model; any other
        # field produced by the tokenizer is left unused.
        logits = model(
            input_ids=batch['input_ids'].to(DEVICE),
            attention_mask=batch['attention_mask'].to(DEVICE),
        ).logits
        # Softmax over the class dimension, then move to host memory;
        # extend() appends each row of the batch as its own entry.
        all_test_probs.extend(softmax(logits, dim=1).cpu().numpy())

# === ✅ Build prediction DataFrame
# Stack the per-example probability rows into one 2-D array
# (rows = examples, columns = classes in LABELS order).
test_preds_all = np.array(all_test_probs)

# Meta-model format (optional): the test ids plus one
# `deberta_base_<label>` probability column per class.
prob_columns = {
    f'deberta_base_{cls}': test_preds_all[:, i]
    for i, cls in enumerate(LABELS)
}
test_df_preds = test_df[['id']].assign(**prob_columns)
test_df_preds.to_csv("deberta_base_test_preds.csv", index=False)

# Submission format: same data, with the model prefix stripped from the
# probability columns and the column order fixed explicitly.
submission_names = {
    'deberta_base_EAP': 'EAP',
    'deberta_base_HPL': 'HPL',
    'deberta_base_MWS': 'MWS',
}
submission = test_df_preds.rename(columns=submission_names)[['id', 'EAP', 'HPL', 'MWS']]
submission.to_csv("submission.csv", index=False)

print(
    "✅ Files saved:",
    "- submission.csv (for Kaggle)",
    "- deberta_base_test_preds.csv (for ensembling)",
    sep="\n",
)


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

2025-07-23 19:34:49.317530: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753299289.649333      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753299289.754167      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

✅ Files saved:
- submission.csv (for Kaggle)
- deberta_base_test_preds.csv (for ensembling)
