In [1]:
%%time
%%capture
!pip install peft
!pip install evaluate
!pip install datasets
!pip install --upgrade transformers
!pip install sentencepiece

CPU times: user 10.3 ms, sys: 13.6 ms, total: 23.9 ms
Wall time: 14.5 s


In [2]:
%%time
import os
import shutil

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

CPU times: user 4.22 s, sys: 452 ms, total: 4.67 s
Wall time: 3.54 s


In [3]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())

True
1


In [4]:
%%time
tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/mmBERT-base")
base_model = AutoModelForSequenceClassification.from_pretrained(
    "jhu-clsp/mmBERT-base"
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.23G [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/mmBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CPU times: user 2.41 s, sys: 2.03 s, total: 4.44 s
Wall time: 8.33 s


In [5]:
TRAIN_DIR = "subtask1/train"
DEV_DIR = "subtask1/dev"


def load_split(split_dir):
    dfs = []
    for file in os.listdir(split_dir):
        if file.endswith(".csv"):
            lang = file.replace(".csv", "")
            df = pd.read_csv(os.path.join(split_dir, file))
            df["lang"] = lang
            dfs.append(df)
    return pd.concat(dfs, ignore_index=True)


main_df = load_split(TRAIN_DIR)
dev_df = load_split(DEV_DIR)

print("Train size:", main_df.shape)
print("Dev size:", dev_df.shape)

model.safetensors:   0%|          | 0.00/1.23G [00:00<?, ?B/s]

Train size: (73681, 4)
Dev size: (3687, 4)


In [6]:
main_df["lang"].nunique()

22

In [7]:
print("=== Dataset Shapes ===")
print("Train size:", main_df.shape)
print("Dev size:", dev_df.shape)
print()

# --- Duplicate checks ---
print("=== Duplicate Checks (TRAIN) ===")
print("Duplicate IDs:", main_df["id"].duplicated().sum())
print("Duplicate texts:", main_df["text"].duplicated().sum())
print()

# --- Missing values ---
print("=== Missing Values (TRAIN) ===")
print(main_df.isna().sum())
print()

# --- Polarization distribution ---
print("=== Polarization Distribution (TRAIN) ===")
print(main_df["polarization"].value_counts())
print()

print("Polarization distribution (%):")
print(main_df["polarization"].value_counts(normalize=True) * 100)

=== Dataset Shapes ===
Train size: (73681, 4)
Dev size: (3687, 4)

=== Duplicate Checks (TRAIN) ===
Duplicate IDs: 0
Duplicate texts: 3

=== Missing Values (TRAIN) ===
id              0
text            0
polarization    0
lang            0
dtype: int64

=== Polarization Distribution (TRAIN) ===
polarization
1    39145
0    34536
Name: count, dtype: int64

Polarization distribution (%):
polarization
1    53.127672
0    46.872328
Name: proportion, dtype: float64


In [8]:
main_df["lang_label"] = (
    main_df["lang"].astype(str) + "_" + main_df["polarization"].astype(str)
)

train_df, temp_df = train_test_split(
    main_df,
    test_size=0.10,
    stratify=main_df["lang_label"],
    random_state=42,
    shuffle=True,
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df["lang_label"],
    random_state=42,
    shuffle=True,
)

# ---- KEEP ONLY REQUIRED COLUMNS ----
cols = ["text", "polarization"]

train_df = (
    train_df[cols].rename(columns={"polarization": "labels"}).reset_index(drop=True)
)
val_df = val_df[cols].rename(columns={"polarization": "labels"}).reset_index(drop=True)
test_df = (
    test_df[cols].rename(columns={"polarization": "labels"}).reset_index(drop=True)
)

# ---- PRINT SHAPES AND LABEL DISTRIBUTIONS ----
print("Train:", train_df.shape)
print("Val:", val_df.shape)
print("Test:", test_df.shape)

print("\nTrain distribution:\n", train_df["labels"].value_counts(normalize=True) * 100)
print("\nVal distribution:\n", val_df["labels"].value_counts(normalize=True) * 100)
print("\nTest distribution:\n", test_df["labels"].value_counts(normalize=True) * 100)

Train: (66312, 2)
Val: (3684, 2)
Test: (3685, 2)

Train distribution:
 labels
1    53.132163
0    46.867837
Name: proportion, dtype: float64

Val distribution:
 labels
1    53.121607
0    46.878393
Name: proportion, dtype: float64

Test distribution:
 labels
1    53.052917
0    46.947083
Name: proportion, dtype: float64


In [9]:
train_dataset = Dataset.from_pandas(train_df[["text", "labels"]])
val_dataset = Dataset.from_pandas(val_df[["text", "labels"]])
test_dataset = Dataset.from_pandas(test_df[["text", "labels"]])

In [10]:
dataset = DatasetDict(
    {"train": train_dataset, "validation": val_dataset, "test": test_dataset}
)

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 66312
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 3684
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 3685
    })
})

In [11]:
%%time


def tokenize_function(examples):
    return tokenizer(
        examples["text"], padding="max_length", truncation=True, max_length=256
    )


# Encode the input data
dataset = dataset.map(tokenize_function, batched=True)
# The transformers model expects the target class column to be named "labels"
# dataset = dataset.rename_column(original_column_name="label", new_column_name="labels")
# Transform to pytorch tensors and only output the required columns
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/66312 [00:00<?, ? examples/s]

Map:   0%|          | 0/3684 [00:00<?, ? examples/s]

Map:   0%|          | 0/3685 [00:00<?, ? examples/s]

CPU times: user 9.49 s, sys: 322 ms, total: 9.82 s
Wall time: 5.6 s


In [12]:
# for name, module in base_model.named_modules():
#     print(name, module)
#     # if "attention" in name.lower() and isinstance(module, torch.nn.Linear):
#     #     print(name)

In [13]:
lora_config = LoraConfig(
    r=4,
    lora_alpha=16,
    # target_modules=["query", "key", "value"],
    target_modules=["Wqkv", "Wo", "Wi"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS",
)

In [14]:
model = get_peft_model(base_model, lora_config)

In [15]:
%%capture
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

In [16]:
training_args = TrainingArguments(
    output_dir="./output_results",
    num_train_epochs=30,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    #     warmup_steps=500,
    logging_dir="./logs",
    logging_steps=500,
    learning_rate=2e-5,
    # weight_decay=0.01,
    optim="adamw_torch",
    gradient_accumulation_steps=2,
    load_best_model_at_end=True,  # needed for early stopping
    eval_strategy="steps",
    eval_steps=500,
    save_steps=5000,
    report_to="none",
)

metric = evaluate.load("accuracy")


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=2)
    ],  # Early stopping after 2 evaluations with no improvement
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.1)]
)

Downloading builder script: 0.00B [00:00, ?B/s]

In [17]:
%%time
trainer.train()



Step,Training Loss,Validation Loss,Accuracy
500,1.2871,0.545555,0.732356
1000,1.0499,0.505688,0.750543
1500,0.995,0.484978,0.771444
2000,0.9617,0.47811,0.777416
2500,0.9299,0.47564,0.77443
3000,0.9039,0.458245,0.786102
3500,0.8978,0.457956,0.780945
4000,0.8959,0.44547,0.790445
4500,0.8625,0.449668,0.788817
5000,0.8341,0.447443,0.788274


CPU times: user 1h 11min 43s, sys: 4min 14s, total: 1h 15min 57s
Wall time: 22min 20s


TrainOutput(global_step=5000, training_loss=0.9617911010742187, metrics={'train_runtime': 1340.6876, 'train_samples_per_second': 1483.836, 'train_steps_per_second': 46.387, 'total_flos': 2.746038312104755e+16, 'train_loss': 0.9617911010742187, 'epoch': 2.412062726176116})

In [19]:
%%time
trainer.evaluate()

CPU times: user 1min 11s, sys: 68.8 ms, total: 1min 11s
Wall time: 12.7 s


{'eval_loss': 0.44744250178337097,
 'eval_accuracy': 0.7882736156351792,
 'eval_runtime': 12.6636,
 'eval_samples_per_second': 290.912,
 'eval_steps_per_second': 18.241,
 'epoch': 2.412062726176116}

In [20]:
%%time
# eval now
predictions = trainer.predict(dataset["test"])

# Extract predicted labels and true labels
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids

# Compute metrics using sklearn's classification report
report = classification_report(
    labels, preds, target_names=["Not Polar (0)", "Polar (1)"], digits=4
)

print(report)

accuracy = accuracy_score(labels, preds)
f1_macro = f1_score(labels, preds, average="macro")

print(f"Accuracy: {accuracy:.4f}")
print(f"Macro F1 Score: {f1_macro:.4f}")

               precision    recall  f1-score   support

Not Polar (0)     0.7902    0.7335    0.7608      1730
    Polar (1)     0.7783    0.8276    0.8022      1955

     accuracy                         0.7834      3685
    macro avg     0.7842    0.7806    0.7815      3685
 weighted avg     0.7838    0.7834    0.7828      3685

Accuracy: 0.7834
Macro F1 Score: 0.7815
CPU times: user 1min 11s, sys: 35.4 ms, total: 1min 11s
Wall time: 12.8 s


## script for submission

In [21]:
def predict_df(df, tokenizer, model):
    dataset = Dataset.from_pandas(df[["text"]])
    dataset = dataset.map(
        lambda x: tokenizer(
            x["text"], truncation=True, padding="max_length", max_length=256
        )
    )
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

    preds = trainer.predict(dataset).predictions
    pred_labels = np.argmax(preds, axis=1)

    return pred_labels

In [22]:
%%time

OUTPUT_DIR = "subtask_1"
os.makedirs(OUTPUT_DIR, exist_ok=True)

languages = sorted(dev_df["lang"].unique())
print("Languages:", languages)

for lang in languages:
    lang_df = dev_df[dev_df["lang"] == lang].reset_index(drop=True)

    # Run predictions
    pred_labels = predict_df(lang_df, tokenizer, model)

    # Build Codabench format
    submission_df = pd.DataFrame({"id": lang_df["id"], "polarization": pred_labels})

    # Save file
    fname = f"pred_{lang}.csv"
    fpath = os.path.join(OUTPUT_DIR, fname)
    submission_df.to_csv(fpath, index=False)

    print(f"Saved: {fpath}")

Languages: ['amh', 'arb', 'ben', 'deu', 'eng', 'fas', 'hau', 'hin', 'ita', 'khm', 'mya', 'nep', 'ori', 'pan', 'pol', 'rus', 'spa', 'swa', 'tel', 'tur', 'urd', 'zho']


Map:   0%|          | 0/166 [00:00<?, ? examples/s]

Saved: subtask_1/pred_amh.csv


Map:   0%|          | 0/169 [00:00<?, ? examples/s]

Saved: subtask_1/pred_arb.csv


Map:   0%|          | 0/166 [00:00<?, ? examples/s]

Saved: subtask_1/pred_ben.csv


Map:   0%|          | 0/159 [00:00<?, ? examples/s]

Saved: subtask_1/pred_deu.csv


Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Saved: subtask_1/pred_eng.csv


Map:   0%|          | 0/164 [00:00<?, ? examples/s]

Saved: subtask_1/pred_fas.csv


Map:   0%|          | 0/182 [00:00<?, ? examples/s]

Saved: subtask_1/pred_hau.csv


Map:   0%|          | 0/137 [00:00<?, ? examples/s]

Saved: subtask_1/pred_hin.csv


Map:   0%|          | 0/166 [00:00<?, ? examples/s]

Saved: subtask_1/pred_ita.csv


Map:   0%|          | 0/332 [00:00<?, ? examples/s]

Saved: subtask_1/pred_khm.csv


Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Saved: subtask_1/pred_mya.csv


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Saved: subtask_1/pred_nep.csv


Map:   0%|          | 0/118 [00:00<?, ? examples/s]

Saved: subtask_1/pred_ori.csv


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Saved: subtask_1/pred_pan.csv


Map:   0%|          | 0/119 [00:00<?, ? examples/s]

Saved: subtask_1/pred_pol.csv


Map:   0%|          | 0/167 [00:00<?, ? examples/s]

Saved: subtask_1/pred_rus.csv


Map:   0%|          | 0/165 [00:00<?, ? examples/s]

Saved: subtask_1/pred_spa.csv


Map:   0%|          | 0/349 [00:00<?, ? examples/s]

Saved: subtask_1/pred_swa.csv


Map:   0%|          | 0/118 [00:00<?, ? examples/s]

Saved: subtask_1/pred_tel.csv


Map:   0%|          | 0/115 [00:00<?, ? examples/s]

Saved: subtask_1/pred_tur.csv


Map:   0%|          | 0/177 [00:00<?, ? examples/s]

Saved: subtask_1/pred_urd.csv


Map:   0%|          | 0/214 [00:00<?, ? examples/s]

Saved: subtask_1/pred_zho.csv
CPU times: user 1min 20s, sys: 387 ms, total: 1min 21s
Wall time: 18.6 s


In [23]:
shutil.make_archive(base_name="subtask_1", format="zip", root_dir="subtask_1")

'/home/jovyan/work/subtask_1.zip'

In [24]:
# test_eval_df["preds"] = preds
# for lang in sorted(test_eval_df["lang"].unique()):
#     print(f"\n===== Language: {lang} =====")
#     lang_df = test_eval_df[test_eval_df["lang"] == lang]

#     print(classification_report(
#         lang_df["labels"],
#         lang_df["preds"],
#         target_names=["Not Polar (0)", "Polar (1)"],
#         digits=4,
#         zero_division=0
#     ))