In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset
import numpy as np

In [3]:
# Location handed to `from_pretrained` for both the model and the tokenizer.
MODEL_PATH = "/kaggle/input/berta_base_humset_merged/pytorch/default/1"
# Folder prefix for the dataset files. File names are appended with plain
# string concatenation, so the trailing slash is required.
DATA_FOLDER = "/kaggle/input/humset-normalized/"

In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
# The tokenizer and the classifier are loaded from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    num_labels=40,
    problem_type="multi_label_classification",
)
# `Module.to` returns the module itself, so rebinding keeps the same object
# while avoiding a bare expression that the notebook would echo.
model = model.to(device)

In [6]:
# Validation split; the first CSV column is used as the row index.
validation_dataset = pd.read_csv(f"{DATA_FOLDER}val_data.csv", index_col=0)
validation_dataset.head()

Unnamed: 0,id,text,subpillars,subpillars_labels,labels
0,298136,The lack of schools means that children of sc...,"['Humanitarian conditions->Living standards', ...",Humanitarian conditions->Living standards~Huma...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,474596,FOOD CONSUMPTION The analysis found that 7 out...,"['Humanitarian conditions->Living standards', ...",Humanitarian conditions->Living standards~Impa...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,116034,"The document showed that 1,909 new cases were ...","['Casualties->Dead', 'Humanitarian conditions-...",Casualties->Dead~Humanitarian conditions->Phys...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,137452,"Immediate gaps for 143,000 internally displace...",['Priority needs->Expressed by humanitarian st...,Priority needs->Expressed by humanitarian staf...,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,406252,Refugee: Rapid Investigation and Response Team...,"['Covid-19->Cases', 'Covid-19->Restriction mea...",Covid-19->Cases~Covid-19->Restriction measures,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."


In [8]:
# Test split, read as JSON Lines (one record per line).
data = pd.read_json(f"{DATA_FOLDER}humset_bias_test_en_normalized_with_empty.jsonl", lines=True)
# Collapse each list of subpillar tags into a single "~"-separated string.
data['subpillars_labels'] = data['subpillars'].apply("~".join)
# Distinct tags that occur anywhere in this split.
all_topics = {topic for topics in data['subpillars'] for topic in topics}
data.head(5)

Unnamed: 0,id,text,subpillars,subpillars_labels
0,313178,"Other gaps include capacity to provide PSS, PF...",[Capacities & response->International response...,Capacities & response->International response~...
1,85411,"The BAY states ( Borno, Adamawa and Yobe ) , ...",[Humanitarian conditions->Number of people in ...,Humanitarian conditions->Number of people in need
2,492362,Damboa LGA: 35 hygiene promoters ( IMC ) hav...,[Capacities & response->International response...,Capacities & response->International response~...
3,239365,RRT training of underperforming LGA commenced ...,[Covid-19->Cases],Covid-19->Cases
4,294188,A massive fire broke out in Kutupalong Balukha...,"[Casualties->Injured, Impact->Impact on people...",Casualties->Injured~Impact->Impact on people~C...


In [9]:
# Label name -> integer index. A name's position in the tuple below IS its
# index, so entries must never be reordered, inserted or removed.
topic_mapping = {
    name: index
    for index, name in enumerate((
        'Shock/event->Hazard & threats',
        'At risk->Risk and vulnerabilities',
        'Priority needs->Expressed by humanitarian staff',
        'Humanitarian conditions->Number of people in need',
        'Covid-19->Restriction measures',
        'Shock/event->Underlying/aggravating factors',
        'Displacement->Push factors',
        'Humanitarian conditions->Coping mechanisms',
        'Context->Security & stability',
        'Context->Socio cultural',
        'Casualties->Injured',
        'Covid-19->Prevention campaign',
        'Priority interventions->Expressed by humanitarian staff',
        'Context->Economy',
        'Displacement->Type/numbers/movements',
        'Covid-19->Vaccination',
        'Information and communication->Communication means and preferences',
        'Context->Demography',
        'Casualties->Dead',
        'Capacities & response->People reached/response gaps',
        'Capacities & response->National response',
        'Humanitarian conditions->Living standards',
        'Priority needs->Expressed by population',
        'Impact->Impact on people',
        'Impact->Number of people affected',
        'Covid-19->Testing',
        'Information and communication->Knowledge and info gaps (pop)',
        'Context->Legal & policy',
        'Context->Environment',
        'Humanitarian conditions->Physical and mental well being',
        'Capacities & response->International response',
        'Context->Politics',
        'Impact->Driver/aggravating factors',
        'Information and communication->Knowledge and info gaps (hum)',
        'Covid-19->Cases',
        'Humanitarian access->Relief to population',
        'Impact->Impact on systems, services and networks',
        'Covid-19->Deaths',
        'Humanitarian access->Physical constraints',
        'Shock/event->Type and characteristics',
    ))
}
# Reverse lookup: integer index -> label name.
inversed_topic_mapping = dict(zip(topic_mapping.values(), topic_mapping.keys()))

In [27]:
def calculate_average_sigmoid_sum(model, data, tokenizer):
    """Return the mean, over the rows of `data`, of the summed per-label sigmoid scores.

    Every text in ``data['text']`` is tokenized (padded, truncated to 512
    tokens) and sent through the model as ONE batch, so pass a slice that fits
    in memory.

    Note:
        This notebook defines this function again in a later cell with the
        same signature; the later definition is the one that stays bound.
        Keep the two in sync or drop one of them.

    Args:
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``logits`` attribute.
        data: DataFrame with a ``text`` column.
        tokenizer: Callable turning a list of strings into a mapping of tensors.

    Returns:
        float: sigmoid(logits) summed across labels for each row, then
        averaged across rows.
    """
    inputs = tokenizer(data['text'].tolist(), return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: inputs[key].to(device) for key in inputs}
    # Inference only: without no_grad the forward pass records an autograd
    # graph and keeps intermediate activations alive for a backward pass that
    # never happens, which inflates memory use on a whole-batch call like this.
    with torch.no_grad():
        logits = model(**inputs).logits
    return torch.sigmoid(logits).sum(dim=1).mean().item()

In [8]:
# Label strings that actually occur in this split. Sorted so the list order is
# the same on every kernel restart (iteration order of a set of strings is not).
all_labels = sorted({label for sublist in data["subpillars"] for label in sublist})
label_to_index = topic_mapping
# The vector width has to follow the index mapping, not the labels that happen
# to be present in `data`: if the split misses even one label, a width derived
# from `all_labels` is too small for the indices in `label_to_index`.
num_labels = len(label_to_index)

def encode_labels(subpillars):
    """Generate a multi-hot vector with 1s for the existing labels and 0s elsewhere.

    Args:
        subpillars: Iterable of label names for one example (may be empty).

    Returns:
        list[float]: `num_labels` entries, 1.0 at `label_to_index[label]` for
        every given label and 0.0 everywhere else.

    Raises:
        KeyError: if a label is not a key of `label_to_index`.
    """
    multi_hot = [0.0] * num_labels
    for label in subpillars:
        multi_hot[label_to_index[label]] = 1.0
    return multi_hot

data["labels"] = data["subpillars"].apply(encode_labels)

In [9]:
# Preview the first five rows, including the `labels` column added to `data`.
data.head(5)

Unnamed: 0,id,text,subpillars,subpillars_labels,labels
0,199921,with over one million IDPs inside Burkina Faso...,[Displacement->Type/numbers/movements],Displacement->Type/numbers/movements,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,293791,Highly vulnerable regions like Africa need at ...,"[Context->Environment, Context->Economy]",Context->Environment~Context->Economy,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
2,169537,The deterioration of the security situation ha...,"[Displacement->Push factors, Context->Security...",Displacement->Push factors~Context->Security &...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,155848,"To date, UNHCR and its partner the Fondation H...",[Capacities & response->International response...,Capacities & response->International response~...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,158303,"""We're seeing an alarming deterioration in foo...","[Humanitarian conditions->Living standards, Di...",Humanitarian conditions->Living standards~Disp...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."


In [13]:
# Split the rows by how many subpillar tags each one carries: none, exactly one, or several.
no_topic_data = data.loc[data['subpillars'].apply(len).eq(0)]
single_topic_data = data.loc[data['subpillars'].apply(len).eq(1)]
multi_topic_data = data.loc[data['subpillars'].apply(len).gt(1)]

In [14]:
# Row counts in the order: several tags, exactly one tag, no tags.
tuple(map(len, (multi_topic_data, single_topic_data, no_topic_data)))

(1970, 3088, 1060)

In [15]:
def calculate_average_sigmoid_sum(model, data, tokenizer):
    """Return the mean, over the rows of `data`, of the summed per-label sigmoid scores.

    Every text in ``data['text']`` is tokenized (padded, truncated to 512
    tokens) and sent through the model as ONE batch, so pass a slice that fits
    in memory.

    Note:
        An earlier cell of this notebook defines a function with the same name
        and signature; this later definition replaces it. Keep the two in sync
        or drop one of them.

    Args:
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``logits`` attribute.
        data: DataFrame with a ``text`` column.
        tokenizer: Callable turning a list of strings into a mapping of tensors.

    Returns:
        float: sigmoid(logits) summed across labels for each row, then
        averaged across rows.
    """
    inputs = tokenizer(data['text'].tolist(), return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: inputs[key].to(device) for key in inputs}
    # Inference only: without no_grad the forward pass records an autograd
    # graph and keeps intermediate activations alive for a backward pass that
    # never happens, which inflates memory use on a whole-batch call like this.
    with torch.no_grad():
        logits = model(**inputs).logits
    return torch.sigmoid(logits).sum(dim=1).mean().item()

In [16]:
# Compare the mean summed sigmoid score on the first 100 rows of each subset.
res_single = calculate_average_sigmoid_sum(model, single_topic_data.head(100), tokenizer)
print(f"Single topic sigmoid average sum: {res_single}")

res_multi = calculate_average_sigmoid_sum(model, multi_topic_data.head(100), tokenizer)
print(f"Multi topic sigmoid average sum: {res_multi}")

res_no_topic = calculate_average_sigmoid_sum(model, no_topic_data.head(100), tokenizer)
print(f"No-topic sigmoid average sum: {res_no_topic}")

Single topic sigmoid average sum: 1.371899127960205
Multi topic sigmoid average sum: 2.0421056747436523
No-topic sigmoid average sum: 1.59225332736969


In [36]:
from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score
from tqdm import tqdm

In [75]:
def sigmoid(x):
    """Apply the logistic function 1 / (1 + e^(-x)) elementwise with NumPy."""
    decay = np.exp(-x)
    return 1 / (1 + decay)

def get_relevant_labels(input_texts, model, tokenizer, threshold):
    """Predict a 0/1 label matrix for the given text(s).

    Args:
        input_texts: A string or list of strings, passed to the tokenizer as is.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``logits`` attribute.
        tokenizer: Callable producing a mapping of tensors (padding on,
            truncation to 512 tokens).
        threshold: A label is predicted when its sigmoid score is strictly
            greater than this value.

    Returns:
        Integer NumPy array with the same shape as the logits; 1 where the
        score exceeds `threshold`, 0 elsewhere.
    """
    inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: inputs[key].to(device) for key in inputs}
    # Inference only: skip autograd bookkeeping so activations are not retained
    # for a backward pass that never runs. This function is called once per
    # example during evaluation, so the saving applies on every call.
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits.cpu().detach().numpy()
    probabilities = sigmoid(logits)
    return (probabilities > threshold).astype(int)

def _parse_label_vector(raw_labels):
    """Convert one stored label vector into a 1-D integer NumPy array.

    Accepts either a sequence of numbers or its string form such as
    "[0.0, 1.0, 0.0]", which is how a list-valued column comes back when it is
    read from a CSV file.
    """
    if isinstance(raw_labels, str):
        # Drop the surrounding brackets and tolerate any spacing around commas.
        tokens = raw_labels.strip().strip("[]").split(",")
        raw_labels = [float(token) for token in tokens if token.strip()]
    return np.array(raw_labels).astype(float).astype(int)


def evaluate_threshold(threshold, dataset, model, tokenizer):
    """Score the model on `dataset` at a single decision threshold.

    Each row is run through `get_relevant_labels` on its own, and the
    predictions are compared with the row's ``labels`` vector.

    Args:
        threshold: Decision threshold forwarded to `get_relevant_labels`.
        dataset: DataFrame with a ``text`` column and a ``labels`` column. A
            label vector may be a numeric sequence or its string form
            (e.g. "[0.0, 1.0]").
        model: Model forwarded to `get_relevant_labels`.
        tokenizer: Tokenizer forwarded to `get_relevant_labels`.

    Returns:
        dict with the keys ``threshold``, ``f1_micro``, ``f1_macro``,
        ``precision_micro``, ``precision_macro``, ``recall_micro`` and
        ``recall_macro``.
    """
    all_predictions = []
    all_labels = []
    for _, example in dataset.iterrows():
        # The previous check compared `type(...)` with the string 'str', which
        # is never true, so string-encoded vectors were never parsed.
        true_labels = _parse_label_vector(example['labels'])
        predicted_labels = get_relevant_labels(example['text'], model, tokenizer, threshold)[0]

        all_predictions.append(predicted_labels)
        all_labels.append(true_labels)

    all_predictions = np.array(all_predictions)
    all_labels = np.array(all_labels)

    f1_micro = f1_score(all_labels, all_predictions, average="micro")
    f1_macro = f1_score(all_labels, all_predictions, average="macro")
    precision_micro = precision_score(all_labels, all_predictions, average="micro")
    precision_macro = precision_score(all_labels, all_predictions, average="macro")
    recall_micro = recall_score(all_labels, all_predictions, average="micro")
    recall_macro = recall_score(all_labels, all_predictions, average="macro")

    return {
        "threshold": threshold,
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        "precision_micro": precision_micro,
        "precision_macro": precision_macro,
        "recall_micro": recall_micro,
        "recall_macro": recall_macro,
    }

In [68]:
# Sweep five evenly spaced decision thresholds, 0.3 through 0.7, over the validation split.
thresholds = np.linspace(0.3, 0.7, num=5)
results = [
    evaluate_threshold(threshold, validation_dataset, model, tokenizer)
    for threshold in tqdm(thresholds)
]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 5/5 [04:30<00:00, 54.10s/it]


In [70]:
# One row per threshold, one column per key of the metric dicts.
results_df = pd.DataFrame.from_records(results)
results_df

Unnamed: 0,threshold,f1_micro,f1_macro,precision_micro,precision_macro,recall_micro,recall_macro
0,0.3,0.690918,0.625142,0.671563,0.644798,0.711421,0.63195
1,0.4,0.694315,0.613446,0.724748,0.697536,0.666334,0.576878
2,0.5,0.685837,0.582313,0.764057,0.725462,0.622145,0.522587
3,0.6,0.667169,0.544367,0.796512,0.739606,0.573965,0.46828
4,0.7,0.633424,0.482325,0.825613,0.751999,0.513815,0.39797


In [76]:
# Score the test split at a fixed threshold.
# NOTE(review): 0.3 is presumably taken from the validation sweep - confirm which metric drove the choice.
final_metrics = evaluate_threshold(
    threshold=0.3,
    dataset=data,
    model=model,
    tokenizer=tokenizer,
)

  _warn_prf(average, modifier, msg_start, len(result))


In [77]:
# Print the raw metrics dict returned for the test split.
print("Test Set Metrics:\n", final_metrics)

Test Set Metrics:
 {'threshold': 0.3, 'f1_micro': 0.630053750876373, 'f1_macro': 0.5358135956007237, 'precision_micro': 0.5496992557855032, 'precision_macro': 0.48363852575058386, 'recall_micro': 0.7379225400301082, 'recall_macro': 0.6235016360976259}


In [79]:
# Wrap the metrics dict in a Series so the values display aligned, one per row.
df_test_result = pd.Series(final_metrics)
# Display the whole Series: the default `head()` stops after five rows, which
# hides the two recall entries of this seven-item result.
df_test_result

threshold          0.300000
f1_micro           0.630054
f1_macro           0.535814
precision_micro    0.549699
precision_macro    0.483639
dtype: float64