In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_scheduler
from tqdm import tqdm
import pandas as pd

In [None]:
! pip install -q evaluate

In [3]:
import evaluate
import numpy as np

In [4]:
PATH = "/kaggle/input/dataset-topics/"
data_name = "humset_bias_train_en_normalized.jsonl"

In [5]:
data = pd.read_json(PATH + data_name, lines=True)

data['subpillars'] = data['subpillars'].apply(lambda x: ["No-topics"] if len(x) == 0 else x)
data['subpillars_labels'] = data['subpillars'].apply(lambda x: "~".join(x))

data.head(5)

Unnamed: 0,id,text,subpillars,subpillars_labels
0,293791,Highly vulnerable regions like Africa need at ...,"[Context->Economy, Context->Environment]",Context->Economy~Context->Environment
1,169537,The deterioration of the security situation ha...,"[Displacement->Push factors, Displacement->Typ...",Displacement->Push factors~Displacement->Type/...
2,155848,"To date, UNHCR and its partner the Fondation H...",[Information and communication->Communication ...,Information and communication->Communication m...
3,158303,"""We're seeing an alarming deterioration in foo...","[Humanitarian conditions->Living standards, Im...",Humanitarian conditions->Living standards~Impa...
4,337197,"In January and February 2021, UNICEF and its p...",[Capacities & response->International response...,Capacities & response->International response~...


In [6]:
data = data.sample(frac=0.5, random_state=42)

In [7]:
len(data)

28568

In [8]:
all_topics = set(topic for topics in data['subpillars'] for topic in topics)

In [9]:
all_labels = list(set(label for sublist in data["subpillars"] for label in sublist))
label_to_index = {label: idx for idx, label in enumerate(all_labels)}
num_labels = len(all_labels)

def encode_labels(subpillars):
    """Generate a multi-hot vector with 1s for the existing labels and 0s elsewhere."""
    multi_hot = [0.0] * num_labels
    for label in subpillars:
        multi_hot[label_to_index[label]] = 1.0
    return multi_hot

data["labels"] = data["subpillars"].apply(encode_labels)

In [10]:
label_to_index

{'Shock/event->Hazard & threats': 0,
 'At risk->Risk and vulnerabilities': 1,
 'Priority needs->Expressed by humanitarian staff': 2,
 'Humanitarian conditions->Number of people in need': 3,
 'Covid-19->Restriction measures': 4,
 'Shock/event->Underlying/aggravating factors': 5,
 'Displacement->Push factors': 6,
 'Humanitarian conditions->Coping mechanisms': 7,
 'Context->Security & stability': 8,
 'Context->Socio cultural': 9,
 'Casualties->Injured': 10,
 'Covid-19->Prevention campaign': 11,
 'Priority interventions->Expressed by humanitarian staff': 12,
 'Context->Economy': 13,
 'Displacement->Type/numbers/movements': 14,
 'Covid-19->Vaccination': 15,
 'Information and communication->Communication means and preferences': 16,
 'Context->Demography': 17,
 'Casualties->Dead': 18,
 'Capacities & response->People reached/response gaps': 19,
 'Capacities & response->National response': 20,
 'Humanitarian conditions->Living standards': 21,
 'Priority needs->Expressed by population': 22,
 'Im

In [11]:
from sklearn.model_selection import train_test_split


train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

In [12]:
class TextDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.loc[index, "text"]
        labels = self.data.loc[index, "labels"]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(labels, dtype=torch.float)
        }

In [13]:
model_name = "/kaggle/input/berta/transformers/default/1"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(all_topics), problem_type="multi_label_classification")
max_len = 512
epoch_num = 3

In [14]:
dataset_train = TextDataset(train_data, tokenizer, max_len)
data_loader_train = DataLoader(dataset_train, batch_size=32, shuffle=True)

dataset_val = TextDataset(val_data, tokenizer, max_len)
data_loader_val = DataLoader(dataset_val, batch_size=32, shuffle=True)

In [17]:
optimizer = AdamW(model.parameters(), lr=1e-5)
num_training_steps = len(data_loader_train) * 2
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

loss_fn = torch.nn.BCEWithLogitsLoss()

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = sigmoid(predictions)
    predictions = (predictions > 0.5).astype(int).reshape(-1)
    return clf_metrics.compute(predictions=predictions, references=labels.astype(int).reshape(-1))



In [None]:

for epoch in range(epoch_num):
    print(f"Starting epoch {epoch+1}/{epoch_num}")
    model.train()
    train_loss = 0
    for batch in tqdm(data_loader_train, desc="Training", leave=False):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(data_loader_train)
    print(f"Epoch {epoch+1} Training Loss: {avg_train_loss}")

    model.eval()
    val_loss = 0
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm(data_loader_val, desc="Validation", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = loss_fn(logits, labels)
            val_loss += loss.item()

            all_predictions.append(logits.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    avg_val_loss = val_loss / len(data_loader_val)
    print(f"Epoch {epoch+1} Validation Loss: {avg_val_loss}")

    all_predictions = np.vstack(all_predictions)
    all_labels = np.vstack(all_labels)
    metrics = compute_metrics((all_predictions, all_labels))
    print(f"Epoch {epoch+1} Metrics: {metrics}")

    model.save_pretrained(f"./bert_multilabel_classification_model_epoch_{epoch+1}")
    tokenizer.save_pretrained(f"./bert_multilabel_classification_model_epoch_{epoch+1}")