<a href="https://colab.research.google.com/github/eldananyss21/assignment-3-classification/blob/main/code/assignment3_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 3: Text Classification

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import Dataset
import torch
from sentence_transformers import SentenceTransformer
from sklearn import svm

# Report whether PyTorch can see a CUDA device.
print("Using GPU:", torch.cuda.is_available())

# IMPORT DATA
train = pd.read_csv("./data/train.csv")
dev = pd.read_csv("./data/dev.csv")
test = pd.read_csv("./data/test.csv")

# ENCODE LABELS
# The encoder is fitted on the labels of both labelled splits, so a class that
# occurs in only one of them still receives an id.
seen_labels = train["label"].tolist()
seen_labels.extend(dev["label"].tolist())
label_encoder = LabelEncoder().fit(seen_labels)
label_names = label_encoder.classes_.tolist()

# Lookup tables in both directions; ids follow the order of `label_names`.
label2id = {name: idx for idx, name in enumerate(label_names)}
id2label = dict(enumerate(label_names))

# TOKENIZE + CREATE DATASETS FOR TRACK 2 & TRACK 3
def tokenize_dataframes(model_name, train, dev, test, train_dev):
    """Tokenize every data split for a Hugging Face sequence classifier.

    Labelled splits have their ``label`` column converted to integer ids with
    the notebook-level ``label_encoder`` before the column is renamed to
    ``labels``. This keeps the ids consistent with ``label2id``/``id2label``
    (built from the same encoder) and with Track 1, which encodes its targets
    the same way. When the labels are already the ids 0..k-1 the encoding is
    a no-op.

    Args:
        model_name: Checkpoint name passed to ``AutoTokenizer.from_pretrained``.
        train: DataFrame with ``text`` and ``label`` columns.
        dev: DataFrame with ``text`` and ``label`` columns.
        test: DataFrame with a ``text`` column; its labels are not touched.
        train_dev: DataFrame with ``text`` and ``label`` columns (train + dev).

    Returns:
        Tuple ``(tokenizer, train_ds, dev_ds, test_ds, train_dev_ds)``.

    Raises:
        ValueError: If a labelled split contains a label the encoder was not
            fitted on (raised by ``label_encoder.transform``).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_fn(examples):
        return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

    def to_dataset(frame, labelled):
        if labelled:
            # `assign` returns a copy, so the caller's DataFrame is not modified.
            frame = frame.assign(label=label_encoder.transform(frame["label"]))
        # preserve_index=False keeps the pandas index (non-trivial after
        # pd.concat) from becoming an extra `__index_level_0__` column.
        dataset = Dataset.from_pandas(frame, preserve_index=False).map(tokenize_fn, batched=True)
        if labelled:
            dataset = dataset.rename_column("label", "labels")
        return dataset

    train_ds = to_dataset(train, labelled=True)
    dev_ds = to_dataset(dev, labelled=True)
    test_ds = to_dataset(test, labelled=False)
    train_dev_ds = to_dataset(train_dev, labelled=True)

    return tokenizer, train_ds, dev_ds, test_ds, train_dev_ds

## Track 1: Machine Learning Technique

In [None]:
# Sentence-embedding model used to turn each text into a feature vector.
model = SentenceTransformer("all-MiniLM-L6-v2")

# 1. TRAIN ON TRAIN SET → EVALUATE ON DEV SET
# Features: one embedding per text, computed for train first and then dev.
X_train = model.encode(train["text"].tolist(), show_progress_bar=True)
X_dev = model.encode(dev["text"].tolist(), show_progress_bar=True)

# Targets: integer ids from the shared label encoder.
y_train = label_encoder.transform(train["label"])
y_dev = label_encoder.transform(dev["label"])

# Support-vector classifier with scikit-learn's default settings.
clf = svm.SVC()
clf.fit(X_train, y_train)

dev_preds = clf.predict(X_dev)
dev_macro_f1 = f1_score(y_dev, dev_preds, average="macro")
print("Track 1 Dev Macro F1:", dev_macro_f1)

In [None]:
# 2. TRAIN ON TRAIN + DEV SETS → PREDICT ON TEST SET
# Stack the two labelled splits into one training frame.
train_dev = pd.concat([train, dev])

# Embed the combined training texts and encode their labels.
train_dev_texts = train_dev["text"].tolist()
X = model.encode(train_dev_texts, show_progress_bar=True)
y = label_encoder.transform(train_dev["label"])

# Embed the unlabelled test texts with the same model.
test_texts = test["text"].tolist()
X_test = model.encode(test_texts, show_progress_bar=True)

In [None]:
# Refit the classifier on train + dev, then label the test set.
clf.fit(X, y)
test_preds = clf.predict(X_test)

# Map the predicted ids back to the original label values.
test_labels = label_encoder.inverse_transform(test_preds)

# Write the predictions: one row per test id.
track_1_submission = pd.DataFrame({"id": test["id"], "label": test_labels})
track_1_submission.to_csv("track_1_test.csv", index=False)

## Track 2: RoBERTa

In [None]:
# Checkpoint to fine-tune, plus tokenized versions of every split.
model_name = "roberta-base"
tokenizer, train_ds, dev_ds, test_ds, train_dev_ds = tokenize_dataframes(model_name, train, dev, test, pd.concat([train, dev]))

# Classification head sized to the label set; the id/label maps are stored in
# the model config.
model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                           num_labels=len(label2id),
                                                           id2label=id2label,
                                                           label2id=label2id)

# Evaluate and checkpoint once per epoch, and restore the checkpoint with the
# best dev macro-F1 at the end of training.
# `eval_strategy` replaces the deprecated `evaluation_strategy` spelling, which
# newer transformers releases reject; the XLNet cell already uses it.
# Mixed precision is enabled only when a CUDA device is present, because
# requesting fp16 without one makes TrainingArguments raise.
args = TrainingArguments(output_dir="./code/results_roberta",
                         eval_strategy="epoch",
                         save_strategy="epoch",
                         learning_rate=1e-5,
                         per_device_train_batch_size=16,
                         per_device_eval_batch_size=16,
                         num_train_epochs=3,
                         weight_decay=0.01,
                         load_best_model_at_end=True,
                         metric_for_best_model="macro_f1",
                         fp16=torch.cuda.is_available(),
                         logging_steps=50,
                         save_total_limit=1)

def compute_metrics(eval_pred):
    """Return the macro-averaged F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        Dict with the single key ``"macro_f1"``.
    """
    scores, gold = eval_pred
    predicted = np.asarray(scores).argmax(axis=-1)
    macro = f1_score(gold, predicted, average="macro")
    return {"macro_f1": macro}

# Stop training once the monitored metric has failed to improve for two
# consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Trainer for the dev-evaluation run: fits on train, scores on dev.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# 1. TRAIN ON TRAIN SET → EVALUATE ON DEV SET
# Fine-tune with the trainer built above (train_ds for training, dev_ds for
# the per-epoch evaluation).
trainer.train()
# Score the resulting model on dev_ds; as the last expression of the cell, the
# returned metrics dict is what the notebook displays.
trainer.evaluate()

In [None]:
# 2. TRAIN ON TRAIN + DEV SETS → PREDICT ON TEST SET
# With dev folded into the training data there is nothing left to evaluate on.
# Clearing `eval_dataset` on the existing trainer is not enough: its arguments
# still request an evaluation every epoch (needed by load_best_model_at_end and
# the early-stopping callback), so training would abort at the end of the first
# epoch with "evaluation requires an eval_dataset". Build a trainer whose
# arguments disable evaluation instead, copying the optimisation settings from
# `args` so both phases stay in sync.
final_args = TrainingArguments(output_dir=args.output_dir,
                               eval_strategy="no",
                               save_strategy=args.save_strategy,
                               learning_rate=args.learning_rate,
                               per_device_train_batch_size=args.per_device_train_batch_size,
                               per_device_eval_batch_size=args.per_device_eval_batch_size,
                               num_train_epochs=args.num_train_epochs,
                               weight_decay=args.weight_decay,
                               fp16=args.fp16,
                               logging_steps=args.logging_steps,
                               save_total_limit=args.save_total_limit)

# NOTE: as before, training continues from the model fine-tuned in step 1
# rather than from the pretrained checkpoint.
trainer = Trainer(model=trainer.model,
                  args=final_args,
                  train_dataset=train_dev_ds,
                  tokenizer=tokenizer)
trainer.train()

In [None]:
# Predict on the test split and keep the highest-scoring class id per example.
test_output = trainer.predict(test_ds)
roberta_preds = test_output.predictions.argmax(-1)

# Translate class ids back to label names.
labels = list(map(id2label.__getitem__, roberta_preds))

# Write the predictions: one row per test id.
track_2_submission = pd.DataFrame({"id": test["id"], "label": labels})
track_2_submission.to_csv("track_2_test.csv", index=False)

## Track 3: XLNet

In [None]:
# Checkpoint to fine-tune, plus tokenized versions of every split.
model_name = "xlnet-base-cased"
combined_frame = pd.concat([train, dev])
tokenizer, train_ds, dev_ds, test_ds, train_dev_ds = tokenize_dataframes(
    model_name, train, dev, test, combined_frame
)

# Classification head sized to the label set; the id/label maps are stored in
# the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# Evaluate and checkpoint once per epoch, and restore the checkpoint with the
# best dev macro-F1 at the end of training. Mixed precision is off for this model.
training_config = {
    "output_dir": "./code/results_xlnet",
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "learning_rate": 1e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "load_best_model_at_end": True,
    "metric_for_best_model": "macro_f1",
    "fp16": False,
    "logging_steps": 50,
    "save_total_limit": 1,
}
args = TrainingArguments(**training_config)

# Trainer for the dev-evaluation run: fits on train, scores on dev, and stops
# once the monitored metric has failed to improve for two consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# 1. TRAIN ON TRAIN SET → EVALUATE ON DEV SET
# Fine-tune with the trainer built above (train_ds for training, dev_ds for
# the per-epoch evaluation).
trainer.train()
# Score the resulting model on dev_ds; as the last expression of the cell, the
# returned metrics dict is what the notebook displays.
trainer.evaluate()

In [None]:
# 2. TRAIN ON TRAIN + DEV SETS → PREDICT ON TEST SET
# With dev folded into the training data there is nothing left to evaluate on.
# Clearing `eval_dataset` on the existing trainer is not enough: its arguments
# still request an evaluation every epoch (needed by load_best_model_at_end and
# the early-stopping callback), so training would abort at the end of the first
# epoch with "evaluation requires an eval_dataset". Build a trainer whose
# arguments disable evaluation instead, copying the optimisation settings from
# `args` so both phases stay in sync.
final_args = TrainingArguments(output_dir=args.output_dir,
                               eval_strategy="no",
                               save_strategy=args.save_strategy,
                               learning_rate=args.learning_rate,
                               per_device_train_batch_size=args.per_device_train_batch_size,
                               per_device_eval_batch_size=args.per_device_eval_batch_size,
                               num_train_epochs=args.num_train_epochs,
                               weight_decay=args.weight_decay,
                               fp16=args.fp16,
                               logging_steps=args.logging_steps,
                               save_total_limit=args.save_total_limit)

# NOTE: as before, training continues from the model fine-tuned in step 1
# rather than from the pretrained checkpoint.
trainer = Trainer(model=trainer.model,
                  args=final_args,
                  train_dataset=train_dev_ds,
                  tokenizer=tokenizer)
trainer.train()

In [None]:
# Predict on the test split and keep the highest-scoring class id per example.
test_output = trainer.predict(test_ds)
xlnet_preds = test_output.predictions.argmax(-1)

# Translate class ids back to label names.
xlnet_labels = list(map(id2label.__getitem__, xlnet_preds))

# Write the predictions: one row per test id.
track_3_submission = pd.DataFrame({"id": test["id"], "label": xlnet_labels})
track_3_submission.to_csv("track_3_test.csv", index=False)