<a href="https://colab.research.google.com/github/efandresena/SemEval/blob/main/subtask_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SUBTASK 1

In [None]:
# Colab-only: mount Google Drive so the paths under /content/drive used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import requests
import time
from collections import defaultdict
import re
import json
import os
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    pipeline,
    AutoConfig
)
from torch.utils.data import Dataset
import warnings
warnings.filterwarnings('ignore')

# CONFIGURATION ENGLISH

In [None]:
# Root folder holding the task data and receiving the saved model / prediction files.
wokdir = "/content/drive/MyDrive/BLOC#4/NLP/SemEval" # CHANGE IT ACCORDING TO YOUR WORKING DIRECTORY

# Every tunable value for the English run lives in this dict and is read by the cells below.
CONFIG = {
    'language': 'eng',                     # selects train/<lang>.csv and dev/<lang>.csv
    'data_subfolder': "dev_phase/subtask1/",
    'model_name': 'cardiffnlp/twitter-roberta-base-sentiment-latest',
    'max_length': 300,                     # tokenizer truncation length
    'num_epochs': 8,
    'learning_rate': 2e-5,
    'batch_size': 16,                      # used for both train and eval per-device batch size
    'weight_decay': 0.01,
    'warmup_ratio': 0.2,
    'gradient_accumulation_steps': 4,
    # NOTE(review): 1% of the data left only 33 validation rows in the recorded run, and that
    # split drives early stopping, best-checkpoint selection and the reported F1 — confirm
    # this is intentional (e.g. a final fit on almost all data).
    'val_split': 0.01,
    'use_class_weights': True,             # selects WeightedTrainer and class-weighted loss
}

LANG = CONFIG['language']

print("="*60)
print(f"BINARY CLASSIFICATION TRAINING: {LANG.upper()}")
print("="*60)


BINARY CLASSIFICATION TRAINING: ENG


# DATA LOADER

In [None]:

class PolarizationDataset(Dataset):
    """Map-style dataset that tokenizes one text per access.

    Args:
        texts: indexable collection of raw texts (each item is passed through ``str``).
        labels: indexable collection of class ids, aligned with ``texts``.
        tokenizer: callable invoked with the text plus ``truncation``, ``max_length``,
            ``padding`` and ``return_tensors`` keyword arguments.
        max_length: truncation length forwarded to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of examples, taken from the text collection."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the un-padded encoding of example ``idx`` plus a ``labels`` tensor."""
        raw_text = str(self.texts[idx])
        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            max_length=self.max_length,
            padding=False,
            return_tensors='pt'
        )

        # The tokenizer output carries a leading dimension of size 1; drop it so the
        # collator receives per-example tensors.
        sample = {}
        for field, tensor in encoded.items():
            sample[field] = tensor.squeeze(0)

        label_id = int(self.labels[idx])
        sample['labels'] = torch.tensor(label_id, dtype=torch.long)
        return sample

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy, optionally re-weighted per class.

    Args:
        class_weights: optional 1-D float tensor with one weight per class id
            (index ``i`` is the weight of class ``i``). ``None`` gives plain,
            unweighted cross-entropy.
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the (optionally class-weighted) cross-entropy for one batch.

        Returns ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        ``num_items_in_batch`` is accepted for signature compatibility and not used.
        """
        # Labels are removed before the forward pass, so the model does not compute
        # its own loss; the loss is therefore always computed here. (Falling back to
        # ``outputs.loss`` would hand back None in the unweighted case.)
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        weight = None
        if self.class_weights is not None:
            # Keep the weights on the same device as the logits, wherever the
            # caller originally created them.
            weight = self.class_weights.to(logits.device)

        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

def compute_metrics(p):
    """Turn an evaluation prediction object into F1 scores.

    ``p.predictions`` holds per-class scores (argmax over axis 1 gives the predicted
    class) and ``p.label_ids`` the reference labels. Returns macro, binary
    (positive class = 1) and weighted F1 under the keys used by the trainer config.
    """
    predicted = np.argmax(p.predictions, axis=1)
    gold = p.label_ids

    scores = {}
    scores['f1_macro'] = f1_score(gold, predicted, average='macro')
    scores['f1_binary'] = f1_score(gold, predicted, pos_label=1)
    scores['f1_weighted'] = f1_score(gold, predicted, average='weighted')
    return scores


# LOADING DATA

In [None]:

# Read the training CSV for the configured language and report the label balance.
print(f"\nLoading training data for {LANG}...")
train_path = os.path.join(wokdir, CONFIG['data_subfolder'], f"train/{LANG}.csv")
df = pd.read_csv(train_path)
print(f"Total samples: {len(df)}")
print(f"Class distribution:\n{df['polarization'].value_counts()}")
print(f"Class distribution %:\n{df['polarization'].value_counts(normalize=True)*100}")

class_counts = df['polarization'].value_counts()
imbalance_ratio = class_counts.max() / class_counts.min()
print(f"\nClass imbalance ratio: {imbalance_ratio:.2f}:1")

class_weights = None
if CONFIG['use_class_weights']:
    # compute_class_weight returns one weight per entry of `classes`, in that order,
    # and the loss indexes the weight tensor by class id. Series.unique() lists labels
    # in order of first appearance, so a file starting with a positive row would swap
    # the two weights; np.unique returns them sorted so weights[i] belongs to class i.
    classes = np.unique(df['polarization'])
    weights = compute_class_weight(
        class_weight='balanced',
        classes=classes,
        y=df['polarization']
    )
    class_weights = torch.FloatTensor(weights)
    if torch.cuda.is_available():
        class_weights = class_weights.cuda()
    print(f"Class weights: {weights}")

# Stratified hold-out so the validation split keeps the class proportions.
train_df, val_df = train_test_split(
    df,
    test_size=CONFIG['val_split'],
    stratify=df['polarization'],
    random_state=42
)
print(f"\nTrain: {len(train_df)} | Val: {len(val_df)}")



Loading training data for eng...
Total samples: 3222
Class distribution:
polarization
0    2047
1    1175
Name: count, dtype: int64
Class distribution %:
polarization
0    63.531968
1    36.468032
Name: proportion, dtype: float64

Class imbalance ratio: 1.74:1
Class weights: [0.78700537 1.37106383]

Train: 3189 | Val: 33


# LOADING MODEL

In [None]:

# Tokenizer matching the pretrained checkpoint named in CONFIG.
print(f"\nLoading model: {CONFIG['model_name']}")
tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])

# Start from the checkpoint's config, then redefine the label space for the binary task.
config = AutoConfig.from_pretrained(CONFIG['model_name'])
config.num_labels = 2
config.id2label = {0: "not_polarized", 1: "polarized"}
config.label2id = {"not_polarized": 0, "polarized": 1}

# The checkpoint ships a 3-output classifier (see the shape-mismatch warning in the
# recorded output), so mismatched sizes are tolerated and the 2-way head is newly initialised.
model = AutoModelForSequenceClassification.from_pretrained(
    CONFIG['model_name'],
    config=config,
    ignore_mismatched_sizes=True
)

print(f"Model output labels: {model.config.num_labels}")



Loading model: cardiffnlp/twitter-roberta-base-sentiment-latest


pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Model output labels: 2


# TRAINING PARAMETERS

In [None]:

# Wrap the train / validation frames as tokenizing datasets.
train_dataset = PolarizationDataset(
    train_df['text'].tolist(),
    train_df['polarization'].tolist(),
    tokenizer,
    CONFIG['max_length']
)
val_dataset = PolarizationDataset(
    val_df['text'].tolist(),
    val_df['polarization'].tolist(),
    tokenizer,
    CONFIG['max_length']
)

# Evaluate and checkpoint every 50 optimizer steps; the checkpoint with the best
# macro-F1 is restored once training ends.
training_args = TrainingArguments(
    output_dir=f"./results_{LANG}",
    num_train_epochs=CONFIG['num_epochs'],
    learning_rate=CONFIG['learning_rate'],
    per_device_train_batch_size=CONFIG['batch_size'],
    per_device_eval_batch_size=CONFIG['batch_size'],
    gradient_accumulation_steps=CONFIG['gradient_accumulation_steps'],
    weight_decay=CONFIG['weight_decay'],
    warmup_ratio=CONFIG['warmup_ratio'],
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    logging_steps=25,
    # Mixed precision needs a GPU; enabling it unconditionally makes this cell fail
    # on a CPU-only runtime, so follow the same CUDA check used for the class weights.
    fp16=torch.cuda.is_available(),
    report_to="none",
    save_total_limit=2,
)

trainer_class = WeightedTrainer if CONFIG['use_class_weights'] else Trainer


# TRAINER

In [None]:

# Arguments shared by both trainer classes.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)
# Only WeightedTrainer accepts `class_weights`; passing it to the plain Trainer
# (selected when use_class_weights is False) would raise a TypeError.
if CONFIG['use_class_weights']:
    trainer_kwargs['class_weights'] = class_weights

trainer = trainer_class(**trainer_kwargs)

print("\n" + "="*60)
print("STARTING TRAINING")
print("="*60)
trainer.train()

# Evaluate the model held by the trainer after training on the validation split.
print("\n" + "="*60)
print("FINAL VALIDATION RESULTS")
print("="*60)
final_eval = trainer.evaluate()
print(final_eval)



STARTING TRAINING


Step,Training Loss,Validation Loss,F1 Macro,F1 Binary,F1 Weighted
50,0.4907,0.527099,0.75188,0.714286,0.762133
100,0.408,0.483994,0.839024,0.8,0.849667
150,0.331,0.450524,0.869048,0.833333,0.878788
200,0.233,0.533935,0.825397,0.761905,0.842713
250,0.143,0.581176,0.825397,0.761905,0.842713
300,0.1221,0.603933,0.825397,0.761905,0.842713
350,0.0704,0.63494,0.856522,0.8,0.871937
400,0.0693,0.731326,0.856522,0.8,0.871937



FINAL VALIDATION RESULTS


{'eval_loss': 0.45052361488342285, 'eval_f1_macro': 0.8690476190476191, 'eval_f1_binary': 0.8333333333333334, 'eval_f1_weighted': 0.8787878787878788, 'eval_runtime': 0.0838, 'eval_samples_per_second': 393.61, 'eval_steps_per_second': 35.783, 'epoch': 8.0}


# SAVING MODEL TO HUGGING FACE

In [None]:
# Hugging Face Hub helpers; login() shows an interactive token prompt.
# NOTE(review): `Repository` is not referenced by any cell visible here — confirm it is still needed.
from huggingface_hub import HfApi, Repository
from huggingface_hub import upload_folder
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Destination folder (under the working directory) that is uploaded to the Hub afterwards.
model_folder = os.path.join(wokdir, 'SUBTASK_1_ENG')
# Save model and tokenizer
model.save_pretrained(model_folder)
tokenizer.save_pretrained(model_folder)

('/content/drive/MyDrive/BLOC#4/NLP/SemEval/SUBTASK_1_ENG/tokenizer_config.json',
 '/content/drive/MyDrive/BLOC#4/NLP/SemEval/SUBTASK_1_ENG/special_tokens_map.json',
 '/content/drive/MyDrive/BLOC#4/NLP/SemEval/SUBTASK_1_ENG/vocab.json',
 '/content/drive/MyDrive/BLOC#4/NLP/SemEval/SUBTASK_1_ENG/merges.txt',
 '/content/drive/MyDrive/BLOC#4/NLP/SemEval/SUBTASK_1_ENG/added_tokens.json',
 '/content/drive/MyDrive/BLOC#4/NLP/SemEval/SUBTASK_1_ENG/tokenizer.json')

In [None]:

# Hub repository that receives the saved folder; also used below to reload the model.
HF_MODEL_NAME = "mirindraf/aims-sentiment-analysis"
api = HfApi()
# exist_ok=True lets this cell be re-run once the repository already exists.
api.create_repo(
    repo_id=HF_MODEL_NAME,
    repo_type="model",
    exist_ok=True
)
# Push every file in model_folder to the repository.
upload_folder(
    folder_path=model_folder,
    repo_id=HF_MODEL_NAME,
    repo_type="model"
)


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...K_1_ENG/model.safetensors:   0%|          |  552kB /  499MB            

CommitInfo(commit_url='https://huggingface.co/mirindraf/aims-sentiment-analysis/commit/b980b4f355111a5f8c0dbc8f8f3ad96358e8ec59', commit_message='Upload folder using huggingface_hub', commit_description='', oid='b980b4f355111a5f8c0dbc8f8f3ad96358e8ec59', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mirindraf/aims-sentiment-analysis', endpoint='https://huggingface.co', repo_type='model', repo_id='mirindraf/aims-sentiment-analysis'), pr_revision=None, pr_num=None)

In [None]:
# Load from hugging face
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(HF_MODEL_NAME)

# Inference-only arguments. Without explicit args the Trainer falls back to default
# TrainingArguments, whose reporting integrations caused the interactive wandb prompt
# seen in the recorded output; report_to="none" matches the training configuration.
inference_args = TrainingArguments(
    output_dir="tmp_trainer",  # scratch directory; nothing is trained here
    report_to="none",
)

# Create a new Trainer (no training needed, just for inference)
trainer = Trainer(
    model=model,
    args=inference_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer)
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [None]:
# Dev split for the configured language; it is read for its `id` and `text` columns.
dev_path = os.path.join(wokdir, CONFIG['data_subfolder'], f"dev/{LANG}.csv")
dev_df = pd.read_csv(dev_path)

def predict_dev_and_save(dev_df, trainer, tokenizer, wokdir, LANG, CONFIG):
    """Predict a label for every dev row and write the predictions to CSV.

    Args:
        dev_df: frame with at least ``id`` and ``text`` columns.
        trainer: object whose ``predict(dataset)`` result exposes ``.predictions``.
        tokenizer: tokenizer handed to ``PolarizationDataset``.
        wokdir: directory that receives ``pred_<LANG>.csv``.
        LANG: language code used in the output file name.
        CONFIG: settings dict; only ``max_length`` is read.

    Returns:
        ``(output_path, predictions)`` — the CSV path and the predicted class ids.
    """
    # The dataset class requires labels, so the unlabeled rows get a constant placeholder.
    texts = dev_df['text'].tolist()
    placeholder_labels = [0] * len(dev_df)
    dataset = PolarizationDataset(texts, placeholder_labels, tokenizer, CONFIG['max_length'])

    # Highest-scoring class per row.
    scores = trainer.predict(dataset).predictions
    predictions = np.argmax(scores, axis=1)

    # Two-column file: id, polarization.
    output_path = os.path.join(wokdir, f"pred_{LANG}.csv")
    submission = pd.DataFrame({"id": dev_df["id"], "polarization": predictions})
    submission.to_csv(output_path, index=False)

    return output_path, predictions

# Run dev-set inference with the Hub-reloaded model and write pred_<LANG>.csv.
output_path, predictions = predict_dev_and_save(dev_df, trainer, tokenizer, wokdir, LANG, CONFIG)


[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


 offline


# PREDICTION

In [None]:

# Per-class precision / recall / F1 on the validation split.
val_preds = trainer.predict(val_dataset)
val_pred_labels = np.argmax(val_preds.predictions, axis=1)
print("\nDetailed Classification Report:")
print(classification_report(val_df['polarization'], val_pred_labels,
                          target_names=['Not Polarized', 'Polarized']))

print("\n" + "="*60)
print("CREATING PREDICTIONS")
print("="*60)

dev_path = os.path.join(wokdir, CONFIG['data_subfolder'], f"dev/{LANG}.csv")
dev_df = pd.read_csv(dev_path)
print(f"Dev samples: {len(dev_df)}")

# The dataset class requires labels, so the unlabeled dev rows get a constant placeholder.
dev_dataset = PolarizationDataset(
    dev_df['text'].tolist(),
    [0] * len(dev_df),
    tokenizer,
    CONFIG['max_length']
)

dev_preds = trainer.predict(dev_dataset)
predictions = np.argmax(dev_preds.predictions, axis=1)

# Written to the same pred_<LANG>.csv path that predict_dev_and_save uses.
output_path = os.path.join(wokdir, f"pred_{LANG}.csv")
result_df = pd.DataFrame({
    "id": dev_df["id"],
    "polarization": predictions
})
result_df.to_csv(output_path, index=False)

print("\n" + "="*60)
print("COMPLETE")
print("="*60)
print(f"Saved: {output_path}")
print(f"Total predictions: {len(predictions)}")
print(f"Distribution:\n{pd.Series(predictions).value_counts()}")
print(f"Distribution %:\n{pd.Series(predictions).value_counts(normalize=True)*100}")
for metric_title, metric_key in (("F1 Binary", "eval_f1_binary"), ("F1 Macro", "eval_f1_macro")):
    metric_value = final_eval.get(metric_key)
    # Apply the float format only to a real number: formatting the 'N/A' fallback
    # string with ':.4f' raises ValueError instead of printing the placeholder.
    metric_text = f"{metric_value:.4f}" if metric_value is not None else "N/A"
    print(f"Final {metric_title}: {metric_text}")
print("="*60)


BINARY CLASSIFICATION TRAINING: ENG

Loading training data for eng...
Total samples: 3222
Class distribution:
polarization
0    2047
1    1175
Name: count, dtype: int64
Class distribution %:
polarization
0    63.531968
1    36.468032
Name: proportion, dtype: float64

Class imbalance ratio: 1.74:1
Class weights: [0.78700537 1.37106383]

Train: 2738 | Val: 484

Loading model: cardiffnlp/twitter-roberta-base-sentiment-latest


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Model output labels: 2

STARTING TRAINING


Step,Training Loss,Validation Loss,F1 Macro,F1 Binary,F1 Weighted
50,0.5028,0.450438,0.775455,0.740196,0.784926
100,0.3849,0.40444,0.821024,0.784416,0.830857
150,0.268,0.454897,0.830065,0.791444,0.840439
200,0.1931,0.535114,0.828056,0.789333,0.838457
250,0.1322,0.601426,0.823703,0.784,0.834367
300,0.0993,0.639861,0.836854,0.792023,0.848896
350,0.0706,0.688378,0.822043,0.783069,0.832511
400,0.0672,0.76727,0.813481,0.768392,0.825591



FINAL VALIDATION RESULTS


{'eval_loss': 0.6398609280586243, 'eval_f1_macro': 0.8368541836937299, 'eval_f1_binary': 0.792022792022792, 'eval_f1_weighted': 0.8488956731921222, 'eval_runtime': 0.6536, 'eval_samples_per_second': 740.507, 'eval_steps_per_second': 47.429, 'epoch': 10.0}

Detailed Classification Report:
               precision    recall  f1-score   support

Not Polarized       0.88      0.89      0.88       307
    Polarized       0.80      0.79      0.79       177

     accuracy                           0.85       484
    macro avg       0.84      0.84      0.84       484
 weighted avg       0.85      0.85      0.85       484


CREATING PREDICTIONS
Dev samples: 160



COMPLETE
Saved: /content/drive/MyDrive/BLOC#4/NLP/SemEval/pred_eng.csv
Total predictions: 160
Distribution:
0    104
1     56
Name: count, dtype: int64
Distribution %:
0    65.0
1    35.0
Name: proportion, dtype: float64
Final F1 Binary: 0.7920
Final F1 Macro: 0.8369


# HAUSA

In [None]:
# ================== CONFIG ==================
# Root folder holding the task data and receiving the saved model / prediction files.
wokdir = "/content/drive/MyDrive/BLOC#4/NLP/SemEval"

# Settings for the Hausa run; this rebinds the CONFIG used by the English section above.
CONFIG = {
    'language': 'hau',                     # selects train/<lang>.csv and dev/<lang>.csv
    'data_subfolder': "dev_phase/subtask1/",
    'model_name': 'xlm-roberta-base',

    'max_length': 300,                     # tokenizer truncation length
    'num_epochs': 10,
    'learning_rate': 3e-5,
    'batch_size': 16,                      # used for both train and eval per-device batch size
    'weight_decay': 0.01,
    'warmup_ratio': 0.2,
    'gradient_accumulation_steps': 4,
    # NOTE(review): 1% left 37 validation rows (4 positive) in the recorded run, so the
    # reported F1 of 1.0 rests on very few examples — confirm this split is intentional.
    'val_split': 0.01,
    'use_class_weights': True,             # selects WeightedTrainer and class-weighted loss
    # NOTE(review): no cell visible in this notebook reads the two k-fold keys below —
    # confirm whether k-fold training was dropped.
    'use_kfold': True,
    'n_folds': 5,
}

LANG = CONFIG['language']

print("="*60)
print(f"IMPROVED TRAINING FOR: {LANG.upper()}")
print("="*60)

IMPROVED TRAINING FOR: HAU


# DATASET LOADER

In [None]:

class PolarizationDataset(Dataset):
    """Dataset that tokenizes a single text lazily each time an item is requested.

    Args:
        texts: indexable collection of raw texts (each item is passed through ``str``).
        labels: indexable collection of class ids, aligned with ``texts``.
        tokenizer: callable invoked with the text plus ``truncation``, ``max_length``,
            ``padding`` and ``return_tensors`` keyword arguments.
        max_length: truncation length forwarded to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of examples, taken from the text collection."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the un-padded encoding of example ``idx`` plus a ``labels`` tensor."""
        tokenizer_options = dict(
            truncation=True,
            max_length=self.max_length,
            padding=False,
            return_tensors='pt',
        )
        encoded = self.tokenizer(str(self.texts[idx]), **tokenizer_options)

        # Drop the leading size-1 dimension the tokenizer adds, so the collator
        # receives per-example tensors.
        sample = dict((field, tensor.squeeze(0)) for field, tensor in encoded.items())
        sample['labels'] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return sample


# CUSTOM TRAINER WITH CLASS-WEIGHTED CROSS-ENTROPY LOSS

In [None]:

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy, optionally re-weighted per class.

    Args:
        class_weights: optional 1-D float tensor with one weight per class id
            (index ``i`` is the weight of class ``i``). ``None`` gives plain,
            unweighted cross-entropy.
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the (optionally class-weighted) cross-entropy for one batch.

        Returns ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        Extra keyword arguments supplied by the caller are accepted and ignored.
        """
        # Labels are removed before the forward pass, so the model does not compute
        # its own loss; the loss is therefore always computed here. (Falling back to
        # ``outputs.loss`` would hand back None in the unweighted case.)
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        weight = None
        if self.class_weights is not None:
            # Keep the weights on the same device as the logits, wherever the
            # caller originally created them.
            weight = self.class_weights.to(logits.device)

        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

# ================== METRICS ==================
def compute_metrics(p):
    """Turn an evaluation prediction object into F1 scores.

    ``p.predictions`` holds per-class scores (argmax over axis 1 gives the predicted
    class) and ``p.label_ids`` the reference labels. Returns macro, binary
    (positive class = 1) and weighted F1 under the keys used by the trainer config.
    """
    gold = p.label_ids
    predicted = np.argmax(p.predictions, axis=1)

    macro = f1_score(gold, predicted, average='macro')
    binary = f1_score(gold, predicted, pos_label=1)
    weighted = f1_score(gold, predicted, average='weighted')
    return dict(f1_macro=macro, f1_binary=binary, f1_weighted=weighted)


In [None]:
# ================== LOAD DATA ==================
# Read the training CSV for the configured language and print the label balance
# (absolute counts, then percentages).
print(f"\nLoading training data for {LANG}...")
train_path = os.path.join(wokdir, CONFIG['data_subfolder'], f"train/{LANG}.csv")
df = pd.read_csv(train_path)
print(f"Total samples: {len(df)}")
print(f"Class distribution:\n{df['polarization'].value_counts()}")
print(f"Class distribution %:\n{df['polarization'].value_counts(normalize=True)*100}")



Loading training data for hau...
Total samples: 3651
Class distribution:
polarization
0    3259
1     392
Name: count, dtype: int64
Class distribution %:
polarization
0    89.263216
1    10.736784
Name: proportion, dtype: float64


# DATA IMBALANCE TREATMENT

In [None]:
# Check for class imbalance
class_counts = df['polarization'].value_counts()
imbalance_ratio = class_counts.max() / class_counts.min()
print(f"\n  Class imbalance ratio: {imbalance_ratio:.2f}:1")
if imbalance_ratio > 2:
    # A strong imbalance forces class weighting on, overriding the configured value.
    print(" HIGH IMBALANCE DETECTED - Using class weights!")
    CONFIG['use_class_weights'] = True

# ================== COMPUTE CLASS WEIGHTS ==================
class_weights = None
if CONFIG['use_class_weights']:
    # compute_class_weight returns one weight per entry of `classes`, in that order,
    # and the loss indexes the weight tensor by class id. Series.unique() lists labels
    # in order of first appearance, so a file starting with a positive row would swap
    # the two weights; np.unique returns them sorted so weights[i] belongs to class i.
    classes = np.unique(df['polarization'])
    weights = compute_class_weight(
        class_weight='balanced',
        classes=classes,
        y=df['polarization']
    )
    class_weights = torch.FloatTensor(weights)
    if torch.cuda.is_available():
        class_weights = class_weights.cuda()
    print(f"\nClass weights: {weights}")

# Split data (stratified, so the validation split keeps the class proportions)
train_df, val_df = train_test_split(
    df,
    test_size=CONFIG['val_split'],
    stratify=df['polarization'],
    random_state=42
)
print(f"\nTrain: {len(train_df)} | Val: {len(val_df)}")

# ================== TOKENIZER & MODEL ==================
print(f"\nLoading tokenizer and model: {CONFIG['model_name']}")
tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])
model = AutoModelForSequenceClassification.from_pretrained(
    CONFIG['model_name'],
    num_labels=2
)

# ================== CREATE DATASETS ==================
train_dataset = PolarizationDataset(
    train_df['text'].tolist(),
    train_df['polarization'].tolist(),
    tokenizer,
    CONFIG['max_length']
)
val_dataset = PolarizationDataset(
    val_df['text'].tolist(),
    val_df['polarization'].tolist(),
    tokenizer,
    CONFIG['max_length']
)



  Class imbalance ratio: 8.31:1
 HIGH IMBALANCE DETECTED - Using class weights!

Class weights: [0.56014115 4.65688776]

Train: 3614 | Val: 37

Loading tokenizer and model: xlm-roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# TRAINING PARAMETERS

In [None]:

# ================== TRAINING ARGUMENTS ==================
# Evaluate and checkpoint every 50 optimizer steps; the checkpoint with the best
# macro-F1 is restored once training ends.
training_args = TrainingArguments(
    output_dir=f"./results_{LANG}",
    num_train_epochs=CONFIG['num_epochs'],
    learning_rate=CONFIG['learning_rate'],
    per_device_train_batch_size=CONFIG['batch_size'],
    per_device_eval_batch_size=CONFIG['batch_size'],
    gradient_accumulation_steps=CONFIG['gradient_accumulation_steps'],
    weight_decay=CONFIG['weight_decay'],
    warmup_ratio=CONFIG['warmup_ratio'],
    eval_strategy="steps",
    eval_steps=50,  # More frequent evaluation
    save_strategy="steps",
    save_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    logging_steps=25,
    # Mixed precision needs a GPU; enabling it unconditionally makes this cell fail
    # on a CPU-only runtime, so follow the same CUDA check used for the class weights.
    fp16=torch.cuda.is_available(),
    report_to="none",
    save_total_limit=2,
)

# ================== TRAINER ==================
trainer_class = WeightedTrainer if CONFIG['use_class_weights'] else Trainer

# Arguments shared by both trainer classes.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)
# Only WeightedTrainer accepts `class_weights`; passing it to the plain Trainer
# would raise a TypeError.
if CONFIG['use_class_weights']:
    trainer_kwargs['class_weights'] = class_weights

trainer = trainer_class(**trainer_kwargs)


# TRAIN

In [None]:
# Fine-tune; evaluation, checkpointing and early stopping follow training_args / callbacks.
print("\n" + "="*60)
print("STARTING TRAINING")
print("="*60)
trainer.train()

# Metrics of the model held by the trainer after training, on the validation split.
print("\n" + "="*60)
print("FINAL VALIDATION RESULTS")
print("="*60)
final_eval = trainer.evaluate()
print(final_eval)

# Detailed classification report on validation set
val_preds = trainer.predict(val_dataset)
val_pred_labels = np.argmax(val_preds.predictions, axis=1)
print("\nDetailed Classification Report:")
print(classification_report(val_df['polarization'], val_pred_labels,
                          target_names=['Not Polarized', 'Polarized']))



STARTING TRAINING


Step,Training Loss,Validation Loss,F1 Macro,F1 Binary,F1 Weighted
50,0.6453,0.45029,0.766709,0.615385,0.885314
100,0.5338,0.27298,0.839827,0.727273,0.928045
150,0.5114,0.247093,0.839827,0.727273,0.928045
200,0.4222,0.081711,0.936752,0.888889,0.974267
250,0.4148,0.134437,0.936752,0.888889,0.974267
300,0.3895,0.117669,0.936752,0.888889,0.974267
350,0.3037,0.090078,0.936752,0.888889,0.974267
400,0.2502,0.035182,1.0,1.0,1.0
450,0.2435,0.047394,1.0,1.0,1.0
500,0.2174,0.039491,1.0,1.0,1.0



FINAL VALIDATION RESULTS


{'eval_loss': 0.03518204763531685, 'eval_f1_macro': 1.0, 'eval_f1_binary': 1.0, 'eval_f1_weighted': 1.0, 'eval_runtime': 0.0914, 'eval_samples_per_second': 404.911, 'eval_steps_per_second': 32.831, 'epoch': 10.0}

Detailed Classification Report:
               precision    recall  f1-score   support

Not Polarized       1.00      1.00      1.00        33
    Polarized       1.00      1.00      1.00         4

     accuracy                           1.00        37
    macro avg       1.00      1.00      1.00        37
 weighted avg       1.00      1.00      1.00        37



# SAVING MODEL TO HUGGING FACE

In [None]:
# Hugging Face Hub helpers; login() shows an interactive token prompt.
# NOTE(review): these names are already imported in the English section, and
# `Repository` is not referenced by any cell visible here — confirm it is still needed.
from huggingface_hub import HfApi, Repository
from huggingface_hub import upload_folder
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Destination folder (under the working directory) that is uploaded to the Hub afterwards.
model_folder = os.path.join(wokdir, 'SUBTASK_1_HAU')
# Save model and tokenizer
model.save_pretrained(model_folder)
tokenizer.save_pretrained(model_folder)

('/content/drive/MyDrive/BLOC#4/NLP/SemEval/SUBTASK_1_HAU/tokenizer_config.json',
 '/content/drive/MyDrive/BLOC#4/NLP/SemEval/SUBTASK_1_HAU/special_tokens_map.json',
 '/content/drive/MyDrive/BLOC#4/NLP/SemEval/SUBTASK_1_HAU/sentencepiece.bpe.model',
 '/content/drive/MyDrive/BLOC#4/NLP/SemEval/SUBTASK_1_HAU/added_tokens.json',
 '/content/drive/MyDrive/BLOC#4/NLP/SemEval/SUBTASK_1_HAU/tokenizer.json')

In [None]:
# Hub repository that receives the saved folder; also used below to reload the model.
HF_MODEL_NAME = "mirindraf/aims-sentiment-analysis-hau"
api = HfApi()
# exist_ok=True lets this cell be re-run once the repository already exists.
api.create_repo(
    repo_id=HF_MODEL_NAME,
    repo_type="model",
    exist_ok=True
)
# Push every file in model_folder to the repository.
upload_folder(
    folder_path=model_folder,
    repo_id=HF_MODEL_NAME,
    repo_type="model"
)


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...TASK_1_HAU/tokenizer.json:  97%|#########6| 16.6MB / 17.1MB            

  ...U/sentencepiece.bpe.model: 100%|##########| 5.07MB / 5.07MB            

  ...K_1_HAU/model.safetensors:   0%|          |  560kB / 1.11GB            

CommitInfo(commit_url='https://huggingface.co/mirindraf/aims-sentiment-analysis-hau/commit/45abefe5d383a4d6869b1b50ebf712ff70d8cd4d', commit_message='Upload folder using huggingface_hub', commit_description='', oid='45abefe5d383a4d6869b1b50ebf712ff70d8cd4d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mirindraf/aims-sentiment-analysis-hau', endpoint='https://huggingface.co', repo_type='model', repo_id='mirindraf/aims-sentiment-analysis-hau'), pr_revision=None, pr_num=None)

In [None]:
# Load from hugging face
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(HF_MODEL_NAME)

# Inference-only arguments. Without explicit args the Trainer falls back to default
# TrainingArguments, which leaves the experiment-tracking integrations enabled;
# report_to="none" matches the training configuration and avoids interactive prompts.
inference_args = TrainingArguments(
    output_dir="tmp_trainer",  # scratch directory; nothing is trained here
    report_to="none",
)

# Create a new Trainer (no training needed, just for inference)
trainer = Trainer(
    model=model,
    args=inference_args,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer)
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/676 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [None]:
# Dev split for the configured language; it is read for its `id` and `text` columns.
dev_path = os.path.join(wokdir, CONFIG['data_subfolder'], f"dev/{LANG}.csv")
dev_df = pd.read_csv(dev_path)

def predict_dev_and_save(dev_df, trainer, tokenizer, wokdir, LANG, CONFIG):
    """Predict a label for every dev row and write the predictions to CSV.

    Args:
        dev_df: frame with at least ``id`` and ``text`` columns.
        trainer: object whose ``predict(dataset)`` result exposes ``.predictions``.
        tokenizer: tokenizer handed to ``PolarizationDataset``.
        wokdir: directory that receives ``pred_<LANG>.csv``.
        LANG: language code used in the output file name.
        CONFIG: settings dict; only ``max_length`` is read.

    Returns:
        ``(output_path, predictions)`` — the CSV path and the predicted class ids.
    """
    # The dataset class requires labels, so the unlabeled rows get a constant placeholder.
    n_rows = len(dev_df)
    dataset = PolarizationDataset(
        dev_df['text'].tolist(),
        [0] * n_rows,
        tokenizer,
        CONFIG['max_length'],
    )

    # Highest-scoring class per row.
    prediction_output = trainer.predict(dataset)
    predictions = np.argmax(prediction_output.predictions, axis=1)

    # Two-column file: id, polarization.
    output_path = os.path.join(wokdir, f"pred_{LANG}.csv")
    columns = {"id": dev_df["id"], "polarization": predictions}
    pd.DataFrame(columns).to_csv(output_path, index=False)

    return output_path, predictions

# Run dev-set inference with the Hub-reloaded model and write pred_<LANG>.csv.
output_path, predictions = predict_dev_and_save(dev_df, trainer, tokenizer, wokdir, LANG, CONFIG)


# PREDICTION PIPELINE

In [None]:

print("\n" + "="*60)
print("CREATING PREDICTION PIPELINE")
print("="*60)

# Create pipeline from trained model
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline(
    "text-classification",
    model=trainer.model,
    tokenizer=tokenizer,
    device=device,
    max_length=CONFIG['max_length'],
    truncation=True
)

# Invert the model's own id -> label-name table so the pipeline's label strings map
# back to class ids whatever the names are. Parsing "LABEL_<n>" by splitting on '_'
# only works for default names and breaks on custom ones such as "not_polarized".
label_name_to_id = {
    name: int(class_id)
    for class_id, name in classifier.model.config.id2label.items()
}

# Load dev data
print(f"\nLoading dev data for {LANG}...")
dev_path = os.path.join(wokdir, CONFIG['data_subfolder'], f"dev/{LANG}.csv")
dev_df = pd.read_csv(dev_path)
print(f"Dev samples: {len(dev_df)}")

# Make predictions
print("\nGenerating predictions...")
predictions = []
batch_size = 32

for i in tqdm(range(0, len(dev_df), batch_size)):
    batch_texts = dev_df['text'].iloc[i:i+batch_size].tolist()
    batch_preds = classifier(batch_texts)

    # Convert each predicted label name to its class id
    batch_labels = [label_name_to_id[pred['label']] for pred in batch_preds]
    predictions.extend(batch_labels)

# ================== SAVE PREDICTIONS ==================
# Written to the same pred_<LANG>.csv path that predict_dev_and_save uses.
output_path = os.path.join(wokdir, f"pred_{LANG}.csv")
result_df = pd.DataFrame({
    "id": dev_df["id"],
    "polarization": predictions
})
result_df.to_csv(output_path, index=False)

print("\n" + "="*60)
print("PREDICTION COMPLETE!")
print("="*60)
print(f"Saved predictions to: {output_path}")
print(f"Total predictions: {len(predictions)}")
print(f"Prediction distribution:\n{pd.Series(predictions).value_counts()}")
print(f"Prediction distribution %:\n{pd.Series(predictions).value_counts(normalize=True)*100}")
print("\n✓ Done!")


# SUMMARY

In [None]:

# Recap of the run: data sizes, settings and the validation F1 scores from final_eval.
print("\n" + "="*60)
print("TRAINING SUMMARY")
print("="*60)
print(f"Language: {LANG}")
print(f"Model: {CONFIG['model_name']}")
print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Class weights used: {CONFIG['use_class_weights']}")
for metric_title, metric_key in (("F1 Binary", "eval_f1_binary"), ("F1 Macro", "eval_f1_macro")):
    metric_value = final_eval.get(metric_key)
    # Apply the float format only to a real number: formatting the 'N/A' fallback
    # string with ':.4f' raises ValueError instead of printing the placeholder.
    metric_text = f"{metric_value:.4f}" if metric_value is not None else "N/A"
    print(f"Final {metric_title}: {metric_text}")
print("="*60)



IMPROVED TRAINING FOR: HAU

Loading training data for hau...
Total samples: 3651
Class distribution:
polarization
0    3259
1     392
Name: count, dtype: int64
Class distribution %:
polarization
0    89.263216
1    10.736784
Name: proportion, dtype: float64

⚠️  Class imbalance ratio: 8.31:1
   ⚠️  HIGH IMBALANCE DETECTED - Using class weights!

Class weights: [0.56014115 4.65688776]

Train: 3614 | Val: 37

Loading tokenizer and model: xlm-roberta-base


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



STARTING TRAINING


Step,Training Loss,Validation Loss,F1 Macro,F1 Binary,F1 Weighted
50,0.6453,0.45029,0.766709,0.615385,0.885314
100,0.5338,0.27298,0.839827,0.727273,0.928045
150,0.5114,0.247093,0.839827,0.727273,0.928045
200,0.4131,0.065638,0.936752,0.888889,0.974267
250,0.4288,0.142963,0.884375,0.8,0.950507
300,0.3902,0.081875,0.936752,0.888889,0.974267
350,0.2768,0.070934,0.936752,0.888889,0.974267
400,0.2347,0.047249,1.0,1.0,1.0
450,0.2336,0.044146,1.0,1.0,1.0
500,0.2679,0.042196,0.936752,0.888889,0.974267



FINAL VALIDATION RESULTS


{'eval_loss': 0.047249045222997665, 'eval_f1_macro': 1.0, 'eval_f1_binary': 1.0, 'eval_f1_weighted': 1.0, 'eval_runtime': 0.1345, 'eval_samples_per_second': 275.073, 'eval_steps_per_second': 22.303, 'epoch': 10.0}


Device set to use cuda:0



Detailed Classification Report:
               precision    recall  f1-score   support

Not Polarized       1.00      1.00      1.00        33
    Polarized       1.00      1.00      1.00         4

     accuracy                           1.00        37
    macro avg       1.00      1.00      1.00        37
 weighted avg       1.00      1.00      1.00        37


CREATING PREDICTION PIPELINE

Loading dev data for hau...
Dev samples: 182

Generating predictions...


100%|██████████| 6/6 [00:02<00:00,  2.26it/s]


PREDICTION COMPLETE!
Saved predictions to: /content/drive/MyDrive/BLOC#4/NLP/SemEval/pred_hau.csv
Total predictions: 182
Prediction distribution:
0    157
1     25
Name: count, dtype: int64
Prediction distribution %:
0    86.263736
1    13.736264
Name: proportion, dtype: float64

✓ Done!

TRAINING SUMMARY
Language: hau
Model: xlm-roberta-base
Training samples: 3614
Validation samples: 37
Class weights used: True
Final F1 Binary: 1.0000
Final F1 Macro: 1.0000



