In [1]:
import os
print(os.getcwd())

/cs/student/project_msc/2025/aisd/gracelin/gracelin/code/ai4sd/cw2/src/HEARTS


In [2]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 4070 Ti SUPER


In [3]:
import logging
os.environ["HUGGINGFACE_TRAINER_ENABLE_PROGRESS_BAR"] = "1"
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.INFO)

In [4]:
import numpy as np
import pandas as pd
from codecarbon import EmissionsTracker
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, pipeline
from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from pathlib import Path

In [5]:
from datasets import Dataset as ds

In [6]:
model_name = "albert/albert-base-v2"
model_output_dir = "custom_albertv2"

In [7]:
# Binary classification: stereotype vs non-stereotype
# Map labels containing "stereotype" to 1, all others to 0
label2id = {"False": 0, "True": 1}
id2label = {0: "False", 1: "True"}
num_labels = 2

# Convert original labels to binary
def get_binary_label(label):
    return 1 if label else 0

In [8]:
custom_df = pd.read_csv("../stereotype_final.csv", index_col=False)
custom_df['category'] = custom_df['stereotype'].map(lambda x: 1 if x else 0)
custom_df.rename(columns={'sentence': 'text'}, inplace=True)

custom_df, test_df = train_test_split(custom_df, test_size = 0.10)


In [9]:
custom_df.columns

Index(['Unnamed: 0', 'generalisation_category_label', 'connotation',
       'gram_form', 'generalisation_situation', 'situation_evaluation', 'text',
       'scsc_score', 'stereotype', 'roberta_score', 'category'],
      dtype='object')

In [10]:
np.random.seed(88)

tracker = EmissionsTracker()
tracker.start()

try:
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels, 
        id2label=id2label, label2id=label2id, 
        ignore_mismatched_sizes=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding=True, truncation=True, max_length=512)
    
    custom_df = ds.from_pandas(custom_df)

    custom_train_split, custom_val = custom_df.train_test_split(test_size=0.2, seed=88).values()

    print("Sample input from train:", custom_df[0])

    tokenized_train = custom_train_split.map(tokenize_function, batched=True).map(
        lambda examples: {'labels': get_binary_label(examples['category'])}
    )
    tokenized_val = custom_val.map(tokenize_function, batched=True).map(
        lambda examples: {'labels': get_binary_label(examples['category'])}
    )
    
    # Remove columns that can't be converted to tensors (keep only what the model needs)
    columns_to_remove = [col for col in tokenized_train.column_names if col not in ['input_ids', 'attention_mask', 'token_type_ids', 'labels']]
    tokenized_train = tokenized_train.remove_columns(columns_to_remove)
    tokenized_val = tokenized_val.remove_columns(columns_to_remove)
    
    print("Sample tokenized input from train:", tokenized_train[0])
    print("Sample tokenized input from validation:", tokenized_val[0])
    print(f"Label type check (train): {type(tokenized_train[0]['labels'])} = {tokenized_train[0]['labels']}")
    print(f"Label type check (val): {type(tokenized_val[0]['labels'])} = {tokenized_val[0]['labels']}")
    print(f"Columns in tokenized_train: {tokenized_train.column_names}")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
        balanced_acc = balanced_accuracy_score(labels, predictions)
        return {"precision": precision, "recall": recall, "f1": f1, "balanced accuracy": balanced_acc}

    model_output_dir_path = Path(model_output_dir)
    model_output_dir_path.mkdir(parents=True, exist_ok=True)

    training_args = TrainingArguments(
        output_dir=model_output_dir_path, num_train_epochs=6, eval_strategy="epoch", learning_rate=2e-5,
        per_device_train_batch_size=64, per_device_eval_batch_size=64, weight_decay=0.01,
        save_strategy="epoch", load_best_model_at_end=True, save_total_limit=1)

    trainer = Trainer(
        model=model, args=training_args, processing_class=tokenizer, train_dataset=tokenized_train,
        eval_dataset=tokenized_val, compute_metrics=compute_metrics)

    trainer.train()
    trainer.save_model(model_output_dir_path)
    print("Training complete.")
finally:
    emissions: float = tracker.stop()

print(f"Estimated total emissions: {str(emissions)} kg CO2")

[codecarbon INFO @ 12:11:03] [setup] RAM Tracking...
[codecarbon INFO @ 12:11:03] [setup] CPU Tracking...


 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 12:11:03] CPU Model on constant consumption mode: Intel(R) Core(TM) i7-9800X CPU @ 3.80GHz
[codecarbon INFO @ 12:11:03] [setup] GPU Tracking...
[codecarbon INFO @ 12:11:03] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 12:11:03] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: cpu_load
                GPU Tracking Method: pynvml
            
[codecarbon INFO @ 12:11:03] >>> Tracker's metadata:
[codecarbon INFO @ 12:11:03]   Platform system: Linux-5.14.0-570.58.1.el9_6.x86_64-x86_64-with-glibc2.34
[codecarbon INFO @ 12:11:03]   Python version: 3.10.19
[codecarbon INFO @ 12:11:03]   CodeCarbon version: 3.2.0
[codecarbon INFO @ 12:11:03]   Available RAM : 30.799 GB
[codecarbon INFO @ 12:11:03]   CPU count: 16 thread(s) in 1 physical CPU(s)
[co

Sample input from train: {'Unnamed: 0': 234, 'generalisation_category_label': 'subset_generic target', 'connotation': 'neutral', 'gram_form': 'noun', 'generalisation_situation': 'enduring characteristics_abstract', 'situation_evaluation': 'negative', 'text': 'People living in syria just want to be violent', 'scsc_score': 0.7465039163141961, 'stereotype': True, 'roberta_score': 0.7847822, 'category': 1, '__index_level_0__': 234}


Map:   0%|          | 0/2845 [00:00<?, ? examples/s]

Map:   0%|          | 0/2845 [00:00<?, ? examples/s]

Map:   0%|          | 0/712 [00:00<?, ? examples/s]

Map:   0%|          | 0/712 [00:00<?, ? examples/s]

Sample tokenized input from train: {'input_ids': [2, 17044, 25, 21, 476, 16, 2257, 3860, 148, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': 0}
Sample tokenized input from validation: {'input_ids': [2, 14, 9148, 2014, 1687, 491, 57, 289, 24, 23, 228, 20, 233, 2230, 875, 34, 4721, 16435, 4894, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

[codecarbon INFO @ 12:11:09] [setup] RAM Tracking...
[codecarbon INFO @ 12:11:09] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 12:11:10] CPU Model on constant consumption mode: Intel(R) Core(TM) i7-9800X CPU @ 3.80GHz
[codecarbon INFO @ 12:11:10] [setup] GPU Tracking...
[codecarbon INFO @ 12:11:10] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 12:11:10] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: cpu_load
                GPU Tracking Method: pynvml
            
[codecarbon INFO @ 12:11:10] >>> Tracker's metadata:
[codecarbon INFO @ 12:11:10]   Platform system: Linux-5.14.0-570.58.1.el9_6.x86_64-x86_64-with-glibc2.34
[codecarbon INFO @ 12:11:10]   Python version: 3.10.19
[codecarbon INFO @ 12:11:10]   CodeCarbon version: 3.2.0
[codecarbon INFO @ 12:11:10]  

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Balanced accuracy
1,No log,0.505904,0.76247,0.765895,0.76069,0.765895
2,No log,0.410425,0.830849,0.809888,0.814562,0.809888
3,No log,0.410081,0.824248,0.821862,0.822886,0.821862
4,No log,0.404072,0.833413,0.833629,0.83352,0.833629
5,No log,0.402253,0.839704,0.838011,0.838773,0.838011
6,No log,0.431985,0.833079,0.830075,0.83133,0.830075


[codecarbon INFO @ 12:11:22] Energy consumed for RAM : 0.000086 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 12:11:23] Delta energy consumed for CPU with cpu_load : 0.000085 kWh, power : 19.74569610090001 W
[codecarbon INFO @ 12:11:23] Energy consumed for All CPU : 0.000085 kWh
[codecarbon INFO @ 12:11:23] Energy consumed for all GPUs : 0.000737 kWh. Total GPU Power : 165.7024668344477 W
[codecarbon INFO @ 12:11:23] 0.000908 kWh of electricity and 0.000000 L of water were used since the beginning.
[codecarbon INFO @ 12:11:29] Energy consumed for RAM : 0.000089 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 12:11:30] Delta energy consumed for CPU with cpu_load : 0.000079 kWh, power : 17.624555701500004 W
[codecarbon INFO @ 12:11:30] Energy consumed for All CPU : 0.000079 kWh
[codecarbon INFO @ 12:11:30] Energy consumed for all GPUs : 0.000946 kWh. Total GPU Power : 195.81271363108314 W
[codecarbon INFO @ 12:11:30] 0.001114 kWh of electricity and 0.000000 L of water were used since the beginni

Training complete.


[codecarbon INFO @ 12:12:41] Delta energy consumed for CPU with cpu_load : 0.000008 kWh, power : 16.554860602500003 W
[codecarbon INFO @ 12:12:41] Energy consumed for All CPU : 0.000452 kWh
[codecarbon INFO @ 12:12:41] Energy consumed for all GPUs : 0.005278 kWh. Total GPU Power : 101.16225085754458 W
[codecarbon INFO @ 12:12:41] 0.006233 kWh of electricity and 0.000000 L of water were used since the beginning.


Estimated total emissions: 0.0014809582398911231 kg CO2


In [14]:
np.random.seed(88)

tracker = EmissionsTracker()
tracker.start()

try:
    test_df = ds.from_pandas(test_df)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_output_dir,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id, 
        ignore_mismatched_sizes=True)
    tokenizer = AutoTokenizer.from_pretrained(model_output_dir)

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding=True, truncation=True, max_length=512)

    tokenized_test = test_df.map(tokenize_function, batched=True).map(
        lambda examples: {'labels': get_binary_label(examples['category'])})
    print("Sample tokenized input from test:", tokenized_test[0])

    result_output_dir = Path(model_output_dir).parent / "custom_results"
    result_output_dir.mkdir(parents=True, exist_ok=True)

    # Use GPU (device=0) instead of CPU (device=-1) for faster inference
    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)

    # Convert to list - the pipeline expects a list of strings
    test_texts = list(test_df['text'])
    predictions = pipe(test_texts, top_k=1)

    # Extract label and score from nested list results
    pred_labels = [1 if pred[0]['label'] == 'stereotype' else 0 for pred in predictions]
    pred_probs = [pred[0]['score'] for pred in predictions]
    y_true = [get_binary_label(label) for label in test_df['category']]
    results_df = pd.DataFrame({
        'text': test_df['text'],
        'predicted_label': pred_labels,
        'predicted_probability': pred_probs,
        'actual_label': y_true,
        'group': test_df['category'],
    })

    results_file_path = result_output_dir / "full_results.csv"
    results_df.to_csv(results_file_path, index=False)
finally:
    emissions: float = tracker.stop()

print(f"Estimated total emissions: {str(emissions)} kg CO2")

[codecarbon INFO @ 12:17:42] [setup] RAM Tracking...
[codecarbon INFO @ 12:17:42] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 12:17:43] CPU Model on constant consumption mode: Intel(R) Core(TM) i7-9800X CPU @ 3.80GHz
[codecarbon INFO @ 12:17:43] [setup] GPU Tracking...
[codecarbon INFO @ 12:17:43] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 12:17:43] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: cpu_load
                GPU Tracking Method: pynvml
            
[codecarbon INFO @ 12:17:43] >>> Tracker's metadata:
[codecarbon INFO @ 12:17:43]   Platform system: Linux-5.14.0-570.58.1.el9_6.x86_64-x86_64-with-glibc2.34
[codecarbon INFO @ 12:17:43]   Python version: 3.10.19
[codecarbon INFO @ 12:17:43]   CodeCarbon version: 3.2.0
[codecarbon INFO @ 12:17:43]  

Map:   0%|          | 0/396 [00:00<?, ? examples/s]

Map:   0%|          | 0/396 [00:00<?, ? examples/s]

Device set to use cuda:0


Sample tokenized input from test: {'Unnamed: 0': 1459, 'generalisation_category_label': 'subset_generic target', 'connotation': 'negative', 'gram_form': 'noun', 'generalisation_situation': 'enduring characteristics_abstract', 'situation_evaluation': 'negative', 'text': 'Qatar is nice around this time of year. That place is terrorist just like every other Arab country.', 'scsc_score': 0.7366418006826077, 'stereotype': True, 'roberta_score': 0.7959243, 'category': 1, '__index_level_0__': 1459, 'input_ids': [2, 12384, 25, 2210, 140, 48, 85, 16, 159, 9, 30, 209, 25, 10012, 114, 101, 352, 89, 3666, 475, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels'

[codecarbon INFO @ 12:17:51] Energy consumed for RAM : 0.000028 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 12:17:52] Delta energy consumed for CPU with cpu_load : 0.000026 kWh, power : 18.2777283678 W
[codecarbon INFO @ 12:17:52] Energy consumed for All CPU : 0.000026 kWh
[codecarbon INFO @ 12:17:52] Energy consumed for all GPUs : 0.000186 kWh. Total GPU Power : 119.2919555880455 W
[codecarbon INFO @ 12:17:52] 0.000241 kWh of electricity and 0.000000 L of water were used since the beginning.


Estimated total emissions: 5.721295564169141e-05 kg CO2
