In [6]:
import os
import re
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score
import numpy as np
import torch

# 0. CONFIGURE YOUR PATHS HERE
# Input CSVs. The code below reads the "review", "all_topics" and
# "selected_topics" columns from each of them.
TRAIN_CSV = "../csv/train.csv"
TEST_CSV  = "../csv/test.csv"  # loaded as the evaluation split
MODEL_NAME  = "facebook/bart-large-mnli"  # checkpoint used for both the tokenizer and the model
OUTPUT_DIR  = "./bart-mnli-finetuned"  # Trainer output_dir and final save location
BATCH_SIZE  = 8  # per-device batch size for both training and evaluation
MAX_LEN     = 128  # every premise/hypothesis pair is truncated/padded to this many tokens
EPOCHS      = 3  # passed as num_train_epochs
SEED        = 42  # passed as the TrainingArguments seed

# 1. PARSER FOR "['A', 'B', 'C']" STRINGS
def parse_list_field(s: str) -> list[str]:
    """Turn the string form of a list of strings back into a real list.

    The value is first read as a Python literal, which copes with items
    written in double quotes (how Python renders a string containing an
    apostrophe, e.g. ``"Kids' room"``) and with escape sequences. If the
    text is not a well-formed list/tuple of strings, the function falls back
    to collecting every run of characters found between single quotes.

    Args:
        s: Raw cell value, e.g. ``"['Size', 'Price']"``.

    Returns:
        The list of items; ``[]`` when ``s`` is not a string (for example a
        missing value) or contains no items.
    """
    import ast  # local import so the block does not rely on a new file-level import

    if not isinstance(s, str):
        return []

    try:
        parsed = ast.literal_eval(s.strip())
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a valid Python literal; handled by the regex fallback below.
        parsed = None

    if isinstance(parsed, (list, tuple)) and all(
        isinstance(item, str) for item in parsed
    ):
        return list(parsed)

    # Fallback: find everything between single quotes.
    return re.findall(r"'([^']*)'", s)

# 2. LOAD AND PARSE TRAIN & TEST
def load_and_parse(csv_path):
    """Read ``csv_path`` with pandas and decode its two topic columns.

    The ``all_topics`` and ``selected_topics`` columns are each passed
    through ``parse_list_field``; the resulting DataFrame is returned.
    """
    frame = pd.read_csv(csv_path)
    # Both list-valued columns go through the same parser, in this order.
    for column in ("all_topics", "selected_topics"):
        frame[column] = frame[column].apply(parse_list_field)
    return frame

# Load and parse both CSVs up front; the test CSV becomes the evaluation frame.
print("Loading training data from", TRAIN_CSV)
train_df = load_and_parse(TRAIN_CSV)
print("Loading evaluation data from", TEST_CSV)
eval_df  = load_and_parse(TEST_CSV)

# DEBUG: print a couple of rows
# The first two rows of each frame are shown as dicts so the parsed topic
# lists can be checked by eye.
print("\nTRAIN sample after parsing:")
print(train_df.head(2).to_dict(orient="records"))
print("\nTEST sample after parsing:")
print(eval_df.head(2).to_dict(orient="records"))

# 3. BUILD NLI EXAMPLES
def make_nli_records(df: pd.DataFrame):
    """Expand each review into one premise/hypothesis pair per candidate topic.

    For every row, the ``review`` text is the premise and each entry of
    ``all_topics`` yields a hypothesis of the form
    ``"This review is about <topic>."``. The label is 2 (entailment) when the
    topic also appears in the row's ``selected_topics`` and 0 (contradiction)
    otherwise.

    Returns:
        A list of dicts with the keys ``premise``, ``hypothesis`` and
        ``label``, in row order and then topic order.
    """
    pairs = []
    for _idx, entry in df.iterrows():
        review_text = entry["review"]
        pairs.extend(
            {
                "premise": review_text,
                "hypothesis": f"This review is about {candidate}.",
                # 2 = entailment, 0 = contradiction
                "label": 0 if candidate not in entry["selected_topics"] else 2,
            }
            for candidate in entry["all_topics"]
        )
    return pairs

# Expand both frames into flat lists of NLI pairs.
train_records = make_nli_records(train_df)
eval_records  = make_nli_records(eval_df)
# Each preview names its split; previously both were printed under the same
# heading, so the two samples could not be told apart in the output.
print("\nSample train NLI records:")
print(train_records[:2])
print("\nSample eval NLI records:")
print(eval_records[:2])
print(f"\nTotal train NLI pairs: {len(train_records)}")
print(f"Total eval  NLI pairs: {len(eval_records)}")

# 4. CONVERT TO HUGGINGFACE DATASETS
# Each list of record dicts goes through a DataFrame, giving the columns
# "premise", "hypothesis" and "label".
train_ds = Dataset.from_pandas(pd.DataFrame(train_records))
eval_ds  = Dataset.from_pandas(pd.DataFrame(eval_records))

# 5. TOKENIZE
# The tokenizer is loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def tokenize_batch(batch):
    """Tokenize a batch of premise/hypothesis pairs with the module-level tokenizer.

    Premises and hypotheses are passed as the two text arguments of a single
    tokenizer call; each pair is truncated and padded to ``MAX_LEN`` tokens.
    """
    premises = batch["premise"]
    hypotheses = batch["hypothesis"]
    encoded = tokenizer(
        premises,
        hypotheses,
        max_length=MAX_LEN,
        padding="max_length",
        truncation=True,
    )
    return encoded

print("\nTokenizing...")
train_tok = train_ds.map(tokenize_batch, batched=True)
eval_tok  = eval_ds.map(tokenize_batch, batched=True)

# remove_columns() and rename_column() return NEW datasets, so their results
# must be bound back to train_tok / eval_tok. The previous loop only rebound
# its own loop variable, which left both datasets with the raw text columns
# and a "label" column instead of "labels".
train_tok = train_tok.remove_columns(["premise", "hypothesis"]).rename_column("label", "labels")
eval_tok  = eval_tok.remove_columns(["premise", "hypothesis"]).rename_column("label", "labels")

# set_format() works in place, so a plain loop is enough here.
for ds in (train_tok, eval_tok):
    ds.set_format("torch")

# 6. LOAD MODEL
# No num_labels/label-map override is passed, so the classification head
# configuration comes from the checkpoint itself.
# NOTE(review): the labels built above use 2 = entailment and 0 = contradiction;
# confirm that this matches the checkpoint's id2label mapping.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# 7. METRICS
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is either
            the logits array itself or a tuple whose first element is the
            logits array.

    Returns:
        ``{"accuracy": <fraction of rows whose argmax equals the label>}``.
    """
    logits, labels = eval_pred
    # Sequence-to-sequence classifiers such as BART return additional tensors
    # next to the logits (e.g. the encoder's last hidden state), in which case
    # the predictions arrive as a tuple. Only the first entry holds the logits;
    # calling argmax on the whole tuple would fail or give meaningless results.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=1)
    return {"accuracy": accuracy_score(labels, preds)}

# 8. TRAINING ARGUMENTS
# Evaluation and checkpointing both happen once per epoch; anything already
# in OUTPUT_DIR may be overwritten.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,  # log every 50 steps
    seed=SEED,
)

# 9. TRAINER SETUP
# Wires together the model, the tokenized train/eval datasets and the
# accuracy metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    # NOTE(review): newer transformers releases appear to prefer
    # `processing_class=` over `tokenizer=` - confirm against the installed version.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 10. TRAIN & SAVE
# After training, the model and the tokenizer are both written to OUTPUT_DIR.
print("\nStarting fine‑tuning...")
trainer.train()
print("Saving model to", OUTPUT_DIR)
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Done.")


Loading training data from ../csv/train.csv
Loading evaluation data from ../csv/test.csv

TRAIN sample after parsing:
[{'article_id': 30258060, 'review': 'Perfect for my kitchen window. Perfect for my kitchen window', 'all_topics': ['Size', 'Installation', 'Material', 'Price', 'Design', 'Magnet', 'Light', 'Quality'], 'selected_topics': ['Design']}, {'article_id': 20336087, 'review': 'I LOVE THEM. Simply I love the design. I have all the sizes.', 'all_topics': ['Size', 'Shape', 'Design', 'Quality', 'Replacement', 'Price', 'Durability', 'Functionality'], 'selected_topics': ['Design', 'Size']}]

TEST sample after parsing:
[{'article_id': 40349946, 'review': 'Good strong quality and absorbency. Good strong quality and absorbency', 'all_topics': ['Quality', 'Size', 'Appearance', 'Thickness', 'Value', 'Colors', 'Cloth-like', 'Absorbency'], 'selected_topics': ['Quality', 'Absorbency']}, {'article_id': 90610209, 'review': 'Pillow is as advertised.  Good quality.. Reasonable purchase price and 

Map:  99%|█████████▊| 78000/78996 [00:03<00:00, 24448.73 examples/s]


KeyboardInterrupt: 