<a href="https://colab.research.google.com/github/chekwubeutomi/nlp-polarization-project/blob/main/subtask_2_and_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from sklearn.metrics import f1_score, hamming_loss
from sklearn.model_selection import train_test_split
from tqdm import tqdm



In [None]:
# Hyperparameters and runtime settings read by MultiLabelClassifier below.
CONFIG = {
    # Hugging Face checkpoint to fine-tune; the commented line is the earlier choice.
    #'model_name': 'roberta-base',
    'model_name': 'Davlan/afro-xlmr-large',
    'max_length': 256,        # tokenizer truncation length (tokens)
    'batch_size': 16,         # batch size for the train/validation DataLoaders
    'learning_rate': 2e-5,    # AdamW learning rate
    'num_epochs': 10,         # full passes over the training split
    'device': 'cuda' if torch.cuda.is_available() else 'cpu'
}

# Single source of truth for the checkpoint name: this used to be a separate,
# unused literal ("Davlan/afro-xlmr-base") that disagreed with CONFIG['model_name'].
# NOTE(review): the recorded output of the classifier cell reports afro-xlmr-base;
# confirm which checkpoint is actually intended before re-running.
MODEL_NAME = CONFIG['model_name']


# Alternative label set, left disabled in the notebook.
# LABEL_COLS = [
#     'stereotype', 'vilification', 'dehumanization',
#     'extreme_language', 'lack_of_empathy', 'invalidation'
# ]

# Target columns, in the order used for the model's output logits and for the
# prediction DataFrame columns.
LABEL_COLS = ["political", "racial/ethnic", "religious", "gender/sexual", "other"]

class MultiLabelDataset(Dataset):
    """Torch dataset that tokenizes one text per item and pairs it with its label vector.

    Args:
        texts: Sequence of texts (list, numpy array or pandas Series). Items are
            converted with ``str()`` when they are fetched.
        labels: 2-D array-like of shape (n_samples, n_labels) holding the
            multi-hot targets (list of lists, numpy array or pandas DataFrame).
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors when called with ``return_tensors='pt'``.
        max_length: Maximum number of tokens; longer texts are truncated.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Store positionally so that ``idx`` always means "row number". Indexing a
        # pandas Series/DataFrame with an int is label-based and raises KeyError
        # when the index is not 0..n-1 (e.g. after filtering or a train/val split).
        self.texts = list(texts)
        self.labels = np.asarray(labels, dtype=np.float32)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the un-padded encoding and float label vector for sample ``idx``.

        Padding is intentionally not applied here; it is left to the collate
        function so each batch is padded only to its own longest sequence.
        """
        text = str(self.texts[idx])
        # BCEWithLogitsLoss expects float targets.
        label = torch.tensor(self.labels[idx], dtype=torch.float)

        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        # return_tensors='pt' adds a leading batch dimension of size 1; drop it.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": label
        }



class MultiLabelClassifier:
    """Fine-tune and run a Hugging Face sequence-classification model on multi-label text.

    The wrapper owns a tokenizer and an ``AutoModelForSequenceClassification``
    created with ``problem_type="multi_label_classification"``. Hyperparameters
    (max length, batch size, learning rate, epochs, device) come from the
    module-level ``CONFIG`` dict and the target columns from ``LABEL_COLS``.
    """

    def __init__(self, model_name=CONFIG["model_name"], num_labels=len(LABEL_COLS)):
        """Load the tokenizer and model and move the model to ``CONFIG['device']``.

        Args:
            model_name: Hugging Face checkpoint name or path.
            num_labels: Size of the classification head; defaults to the number
                of entries in ``LABEL_COLS`` so the two cannot drift apart.
        """
        self.device = CONFIG['device']

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
            problem_type="multi_label_classification"
        ).to(self.device)

    def _pos_weights_from_labels(self, labels):
        """Return per-label ``negatives / positives`` as a float tensor on ``self.device``.

        Args:
            labels: 2-D array-like of shape (n_samples, n_labels) with 0/1 entries.
        """
        labels = np.asarray(labels, dtype=np.float64)
        total = labels.shape[0]
        # nansum mirrors pandas' DataFrame.sum(), which skips missing values.
        positives = np.nansum(labels, axis=0)
        # A label with no positive example would divide by zero and give an
        # infinite weight (and then a NaN loss); clamp the denominator to 1.
        pos_weight = (total - positives) / np.clip(positives, 1.0, None)
        return torch.tensor(pos_weight, dtype=torch.float).to(self.device)

    def compute_pos_weights(self, train_df):
        """Compute ``pos_weight`` for ``BCEWithLogitsLoss`` from a labelled DataFrame.

        Args:
            train_df: DataFrame containing every column listed in ``LABEL_COLS``.

        Returns:
            1-D float tensor with one weight per label, on ``self.device``.
        """
        return self._pos_weights_from_labels(train_df[LABEL_COLS].values)

    def _build_loader(self, texts, labels, shuffle):
        """Wrap texts/labels in a MultiLabelDataset behind a dynamically padded DataLoader."""
        dataset = MultiLabelDataset(texts, labels, self.tokenizer, CONFIG['max_length'])
        return DataLoader(
            dataset,
            batch_size=CONFIG['batch_size'],
            shuffle=shuffle,
            collate_fn=DataCollatorWithPadding(tokenizer=self.tokenizer)
        )

    def train(self, train_df, val_df=None, val_split=0.2, checkpoint_path="best_model.pt"):
        """Fine-tune the model and restore the weights with the lowest validation loss.

        Args:
            train_df: DataFrame with a ``text`` column and the ``LABEL_COLS`` columns.
            val_df: Optional validation DataFrame with the same columns. When
                omitted, ``val_split`` of ``train_df`` is held out (fixed
                ``random_state=42``).
            val_split: Fraction held out for validation when ``val_df`` is None.
            checkpoint_path: File the best ``state_dict`` is written to.

        Notes:
            * The positive-class weights are computed from the training split
              only, so held-out rows do not influence the loss.
            * The same weighted loss is used for the validation loss, so the
              printed train and validation losses are on the same scale.
        """
        if val_df is None:
            X_train, X_val, y_train, y_val = train_test_split(
                train_df['text'].values,
                train_df[LABEL_COLS].values,
                test_size=val_split,
                random_state=42
            )
        else:
            X_train = train_df['text'].values
            y_train = train_df[LABEL_COLS].values
            X_val = val_df['text'].values
            y_val = val_df[LABEL_COLS].values

        train_loader = self._build_loader(X_train, y_train, shuffle=True)
        val_loader = self._build_loader(X_val, y_val, shuffle=False)

        optimizer = AdamW(self.model.parameters(), lr=CONFIG['learning_rate'])
        total_steps = len(train_loader) * CONFIG['num_epochs']
        # Linear decay to zero over all optimizer steps, no warmup.
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=total_steps
        )

        pos_weight = self._pos_weights_from_labels(y_train)
        loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

        best_val_loss = float("inf")
        # Tracks whether THIS run wrote a checkpoint, so a stale file left by an
        # earlier run is never loaded by mistake.
        checkpoint_written = False

        for epoch in range(CONFIG['num_epochs']):
            print(f"\n===== Epoch {epoch + 1} / {CONFIG['num_epochs']} =====")

            self.model.train()
            train_loss = 0

            for batch in tqdm(train_loader, desc="Training"):
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                labels = batch["labels"].to(self.device)

                optimizer.zero_grad()

                # Only the logits are used; the loss is computed here so that
                # pos_weight can be applied.
                logits = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                ).logits

                loss = loss_fn(logits, labels)

                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()

                train_loss += loss.item()

            avg_train_loss = train_loss / len(train_loader)

            # Validation
            val_loss, metrics = self.evaluate(val_loader, loss_fn=loss_fn)

            print(f"Train Loss: {avg_train_loss:.4f}")
            print(f"Val Loss:   {val_loss:.4f}")
            print(f"F1 Micro:   {metrics['f1_micro']:.4f}")
            print(f"F1 Macro:   {metrics['f1_macro']:.4f}")
            print(f"Hamming:    {metrics['hamming']:.4f}")

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(self.model.state_dict(), checkpoint_path)
                checkpoint_written = True
                print("✔ Saved best model")

        if checkpoint_written:
            # map_location keeps loading working when the device differs from
            # the one the checkpoint was saved on.
            self.model.load_state_dict(
                torch.load(checkpoint_path, map_location=self.device)
            )
        print("\n🎉 Training Completed!")

    def evaluate(self, data_loader, loss_fn=None, threshold=0.5):
        """Compute the mean loss and multi-label metrics over ``data_loader``.

        Args:
            data_loader: DataLoader yielding ``input_ids``, ``attention_mask``
                and ``labels`` tensors.
            loss_fn: Loss applied to ``(logits, labels)``. Defaults to an
                unweighted ``BCEWithLogitsLoss``.
            threshold: Sigmoid probability above which a label counts as
                predicted. Note that ``predict`` defaults to 0.25, so pass the
                same value here to score the operating point used for inference.

        Returns:
            ``(avg_loss, metrics)`` where ``metrics`` has the keys
            ``f1_micro``, ``f1_macro`` and ``hamming``.

        Raises:
            ValueError: If ``data_loader`` yields no batches.
        """
        if len(data_loader) == 0:
            raise ValueError("evaluate() requires a data_loader with at least one batch")
        if loss_fn is None:
            loss_fn = torch.nn.BCEWithLogitsLoss()

        self.model.eval()
        total_loss = 0

        pred_chunks = []
        label_chunks = []

        with torch.no_grad():
            for batch in data_loader:
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                labels = batch["labels"].to(self.device)

                logits = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                ).logits

                total_loss += loss_fn(logits, labels).item()

                preds = (torch.sigmoid(logits) > threshold).int()

                pred_chunks.append(preds.cpu().numpy())
                label_chunks.append(labels.cpu().numpy())

        avg_loss = total_loss / len(data_loader)
        all_preds = np.concatenate(pred_chunks, axis=0)
        all_labels = np.concatenate(label_chunks, axis=0)

        # zero_division=0 keeps the score sklearn already reports for a label
        # with no predicted/true positives, without the per-call warning.
        metrics = {
            "f1_micro": f1_score(all_labels, all_preds, average="micro", zero_division=0),
            "f1_macro": f1_score(all_labels, all_preds, average="macro", zero_division=0),
            "hamming": hamming_loss(all_labels, all_preds),
        }

        return avg_loss, metrics

    def predict(self, texts, threshold=0.25):
        """Return 0/1 predictions for one text or a sequence of texts.

        Args:
            texts: A single string or an iterable of texts.
            threshold: Sigmoid probability above which a label is predicted.

        Returns:
            Integer array of shape (n_texts, n_labels).
        """
        probs = self.predict_with_probabilities(texts)
        return (probs > threshold).astype(int)

    def predict_with_probabilities(self, texts, batch_size=None):
        """Return per-label sigmoid probabilities for one text or a sequence of texts.

        Texts are tokenized and scored in padded batches instead of one at a
        time; padded positions are masked out via the attention mask.

        Args:
            texts: A single string or an iterable of texts. Items are converted
                with ``str()``, matching what MultiLabelDataset does in training.
            batch_size: Number of texts per forward pass; defaults to
                ``CONFIG['batch_size']``.

        Returns:
            Float array of shape (n_texts, n_labels); shape (0, n_labels) when
            ``texts`` is empty.
        """
        self.model.eval()

        if isinstance(texts, str):
            texts = [texts]
        texts = [str(text) for text in texts]

        if not texts:
            return np.empty((0, self.model.config.num_labels), dtype=np.float32)

        step = CONFIG['batch_size'] if batch_size is None else batch_size
        prob_chunks = []

        with torch.no_grad():
            for start in range(0, len(texts), step):
                encoding = self.tokenizer(
                    texts[start:start + step],
                    padding=True,
                    truncation=True,
                    max_length=CONFIG['max_length'],
                    return_tensors='pt'
                )

                input_ids = encoding['input_ids'].to(self.device)
                attention_mask = encoding['attention_mask'].to(self.device)

                logits = self.model(input_ids=input_ids, attention_mask=attention_mask).logits
                prob_chunks.append(torch.sigmoid(logits).cpu().numpy())

        return np.concatenate(prob_chunks, axis=0)



In [None]:
# Training data: later cells read its 'text' column and the LABEL_COLS columns.
df = pd.read_csv(filepath_or_buffer="sample_data/hau.csv")

In [None]:
# Column-wise totals of the label columns, i.e. positives per label.
df.loc[:, LABEL_COLS].sum(axis=0)

Unnamed: 0,0
political,178
racial/ethnic,115
religious,93
gender/sexual,29
other,14


In [None]:
# One output logit per entry in LABEL_COLS.
classifier = MultiLabelClassifier(num_labels=len(LABEL_COLS))

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# No separate validation frame: 20% of `df` is held out inside train().
classifier.train(train_df=df, val_df=None, val_split=0.2)


===== Epoch 1 / 10 =====


Training: 100%|██████████| 183/183 [01:02<00:00,  2.93it/s]


Train Loss: 1.4810
Val Loss:   0.2703
F1 Micro:   0.2451
F1 Macro:   0.1454
Hamming:    0.1045
✔ Saved best model

===== Epoch 2 / 10 =====


Training: 100%|██████████| 183/183 [01:01<00:00,  2.96it/s]


Train Loss: 1.4250
Val Loss:   0.2738
F1 Micro:   0.2604
F1 Macro:   0.1858
Hamming:    0.1026

===== Epoch 3 / 10 =====


Training: 100%|██████████| 183/183 [01:00<00:00,  3.03it/s]


Train Loss: 1.3045
Val Loss:   0.1792
F1 Micro:   0.3315
F1 Macro:   0.1993
Hamming:    0.0673
✔ Saved best model

===== Epoch 4 / 10 =====


Training: 100%|██████████| 183/183 [01:01<00:00,  2.97it/s]


Train Loss: 1.1126
Val Loss:   0.1436
F1 Micro:   0.3797
F1 Macro:   0.2625
Hamming:    0.0501
✔ Saved best model

===== Epoch 5 / 10 =====


Training: 100%|██████████| 183/183 [01:01<00:00,  2.96it/s]


Train Loss: 0.8345
Val Loss:   0.1689
F1 Micro:   0.3333
F1 Macro:   0.2398
Hamming:    0.0646

===== Epoch 6 / 10 =====


Training: 100%|██████████| 183/183 [01:00<00:00,  3.03it/s]


Train Loss: 0.6787
Val Loss:   0.1108
F1 Micro:   0.4206
F1 Macro:   0.3260
Hamming:    0.0369
✔ Saved best model

===== Epoch 7 / 10 =====


Training: 100%|██████████| 183/183 [01:01<00:00,  2.96it/s]


Train Loss: 0.5481
Val Loss:   0.1032
F1 Micro:   0.4305
F1 Macro:   0.3133
Hamming:    0.0347
✔ Saved best model

===== Epoch 8 / 10 =====


Training: 100%|██████████| 183/183 [01:01<00:00,  2.95it/s]


Train Loss: 0.4461
Val Loss:   0.1016
F1 Micro:   0.4593
F1 Macro:   0.3192
Hamming:    0.0309
✔ Saved best model

===== Epoch 9 / 10 =====


Training: 100%|██████████| 183/183 [01:01<00:00,  2.98it/s]


Train Loss: 0.4156
Val Loss:   0.0940
F1 Micro:   0.4583
F1 Macro:   0.3220
Hamming:    0.0285
✔ Saved best model

===== Epoch 10 / 10 =====


Training: 100%|██████████| 183/183 [01:01<00:00,  2.96it/s]


Train Loss: 0.3338
Val Loss:   0.0919
F1 Micro:   0.4737
F1 Macro:   0.3245
Hamming:    0.0274
✔ Saved best model

🎉 Training Completed!


In [None]:
# Test split to predict on; its label columns are empty in the preview below.
df_test = pd.read_csv(filepath_or_buffer="sample_data/test_hau2.csv")
df_test.head(n=5)

Unnamed: 0,id,text,political,racial/ethnic,religious,gender/sexual,other
0,hau_7bafacd606d9dee74e7cee95f8277d4e,@USER koh da arniya ce,,,,,
1,hau_cbf1bdd94361d60e55c6774b2a69198a,@USER arne kaga gemunsa😂,,,,,
2,hau_8e2286abdaa2b53c5a43e2a13e11cddd,@USER and the guy is busy calling him sakarai ...,,,,,
3,hau_1fcff27ecdc63dca7852720481daf56d,wallahi tallahi na tsani atiku abubakar da sen...,,,,,
4,hau_fbfe2c9ca8b5bb50f1fd4cd295c95a15,@USER ai hausawa majority gidadawa ne,,,,,


In [None]:
# Coerce every entry to str (missing texts become "") so the tokenizer called by
# classifier.predict never receives NaN/float values.
test_text = df_test["text"].fillna("").astype(str).tolist()

In [None]:
# threshold=0.25 is predict()'s default, spelled out because evaluate() scores at 0.5.
predictions = classifier.predict(texts=test_text, threshold=0.25)

In [None]:
# One row per test text, one 0/1 column per label, in LABEL_COLS order.
df_pred = pd.DataFrame(data=predictions, columns=LABEL_COLS)
df_pred.head(n=5)

Unnamed: 0,political,racial/ethnic,religious,gender/sexual,other
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,1,0,0,0,0
4,0,0,0,0,0


In [None]:
# Row identifiers from the test file, to be attached to the predictions.
pred_id = df_test.loc[:, 'id']
pred_id.head(n=5)

Unnamed: 0,id
0,hau_7bafacd606d9dee74e7cee95f8277d4e
1,hau_cbf1bdd94361d60e55c6774b2a69198a
2,hau_8e2286abdaa2b53c5a43e2a13e11cddd
3,hau_1fcff27ecdc63dca7852720481daf56d
4,hau_fbfe2c9ca8b5bb50f1fd4cd295c95a15


In [None]:
# Rebuild the frame from `pred_id` and `predictions` instead of from `df_pred`
# itself, so re-running this cell does not add a second 'id' column.
assert len(pred_id) == len(predictions), "ids and predictions must have one row per test text"
df_pred = pd.concat(
    [
        # concat(axis=1) aligns on the index; reset it so rows pair up by position.
        pred_id.reset_index(drop=True),
        pd.DataFrame(predictions, columns=LABEL_COLS),
    ],
    axis=1,
)
df_pred.head()

Unnamed: 0,id,political,racial/ethnic,religious,gender/sexual,other
0,hau_7bafacd606d9dee74e7cee95f8277d4e,0,0,0,0,0
1,hau_cbf1bdd94361d60e55c6774b2a69198a,0,0,0,0,0
2,hau_8e2286abdaa2b53c5a43e2a13e11cddd,0,0,0,0,0
3,hau_1fcff27ecdc63dca7852720481daf56d,1,0,0,0,0
4,hau_fbfe2c9ca8b5bb50f1fd4cd295c95a15,0,0,0,0,0


In [None]:
# Write the id + label columns without the DataFrame index.
df_pred.to_csv(path_or_buf="predictions22hau.csv", index=False)