In [1]:
import os
print(os.getcwd())

/cs/student/project_msc/2025/aisd/gracelin/gracelin/code/ai4sd/cw2/src/replication


In [2]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 4090


In [3]:
from datasets import load_dataset
import logging
os.environ["HUGGINGFACE_TRAINER_ENABLE_PROGRESS_BAR"] = "1"
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.INFO)

In [4]:
import numpy as np
import pandas as pd
from codecarbon import EmissionsTracker
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, pipeline
from sklearn.metrics import balanced_accuracy_score, precision_recall_fscore_support, classification_report
from pathlib import Path

In [5]:
emgsd_train = load_dataset("holistic-ai/EMGSD", split="train")
emgsd_test = load_dataset("holistic-ai/EMGSD", split="test")
model_name = "albert/albert-base-v2"
model_output_dir = "albertv2"

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

train.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/45760 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11441 [00:00<?, ? examples/s]

In [6]:
# Binary classification: stereotype vs non-stereotype
# Map labels containing "stereotype" to 1, all others to 0
label2id = {"non-stereotype": 0, "stereotype": 1}
id2label = {0: "non-stereotype", 1: "stereotype"}
num_labels = 2

# Convert original labels to binary
def get_binary_label(label):
    return 1 if "stereotype" in label.lower() else 0

In [7]:
np.random.seed(88)

tracker = EmissionsTracker()
tracker.start()

try:
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels, 
        id2label=id2label, label2id=label2id, 
        ignore_mismatched_sizes=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding=True, truncation=True, max_length=512)

    emgsd_train_split, emgsd_val = emgsd_train.train_test_split(test_size=0.2, seed=88).values()

    print("Sample input from train:", emgsd_train_split[0])

    tokenized_train = emgsd_train_split.map(tokenize_function, batched=True).map(
        lambda examples: {'labels': get_binary_label(examples['category'])}
    )
    tokenized_val = emgsd_val.map(tokenize_function, batched=True).map(
        lambda examples: {'labels': get_binary_label(examples['category'])}
    )
    
    # Remove columns that can't be converted to tensors (keep only what the model needs)
    columns_to_remove = [col for col in tokenized_train.column_names if col not in ['input_ids', 'attention_mask', 'token_type_ids', 'labels']]
    tokenized_train = tokenized_train.remove_columns(columns_to_remove)
    tokenized_val = tokenized_val.remove_columns(columns_to_remove)
    
    print("Sample tokenized input from train:", tokenized_train[0])
    print("Sample tokenized input from validation:", tokenized_val[0])
    print(f"Label type check (train): {type(tokenized_train[0]['labels'])} = {tokenized_train[0]['labels']}")
    print(f"Label type check (val): {type(tokenized_val[0]['labels'])} = {tokenized_val[0]['labels']}")
    print(f"Columns in tokenized_train: {tokenized_train.column_names}")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
        balanced_acc = balanced_accuracy_score(labels, predictions)
        return {"precision": precision, "recall": recall, "f1": f1, "balanced accuracy": balanced_acc}

    model_output_dir_path = Path(model_output_dir)
    model_output_dir_path.mkdir(parents=True, exist_ok=True)

    training_args = TrainingArguments(
        output_dir=model_output_dir_path, num_train_epochs=6, eval_strategy="epoch", learning_rate=2e-5,
        per_device_train_batch_size=64, per_device_eval_batch_size=64, weight_decay=0.01,
        save_strategy="epoch", load_best_model_at_end=True, save_total_limit=1)

    trainer = Trainer(
        model=model, args=training_args, processing_class=tokenizer, train_dataset=tokenized_train,
        eval_dataset=tokenized_val, compute_metrics=compute_metrics)

    trainer.train()
    trainer.save_model(model_output_dir_path)
    print("Training complete.")
finally:
    emissions: float = tracker.stop()

print(f"Estimated total emissions: {str(emissions)} kg CO2")

[codecarbon INFO @ 20:32:04] [setup] RAM Tracking...
[codecarbon INFO @ 20:32:04] [setup] CPU Tracking...
[codecarbon INFO @ 20:32:04] [setup] RAM Tracking...
[codecarbon INFO @ 20:32:04] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 20:32:04] CPU Model on constant consumption mode: AMD Ryzen 9 9950X 16-Core Processor
[codecarbon INFO @ 20:32:04] [setup] GPU Tracking...
[codecarbon INFO @ 20:32:04] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 20:32:04] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: cpu_load
                GPU Tracking Method: pynvml
            
 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 20:32:04] CPU Model on constant consumption mode: 

Sample input from train: {'stereotype_type': 'nationality', 'text': 'The Ukrainian did not know where he was He was very smart and grew up poor.', 'text_with_marker': 'The Ukrainian did not know where he was ===He was very smart and grew up poor.===', 'category': 'stereotype', 'data_source': 'stereoset_intersentence', 'label': 'stereotype_nationality'}


Map:   0%|          | 0/36608 [00:00<?, ? examples/s]

Map:   0%|          | 0/9152 [00:00<?, ? examples/s]

[codecarbon INFO @ 20:32:16] [setup] RAM Tracking...
[codecarbon INFO @ 20:32:16] [setup] CPU Tracking...
[codecarbon INFO @ 20:32:16] [setup] RAM Tracking...
[codecarbon INFO @ 20:32:16] [setup] CPU Tracking...


Sample tokenized input from train: {'input_ids': [2, 14, 5354, 144, 52, 143, 113, 24, 23, 24, 23, 253, 3978, 17, 1642, 71, 1696, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': 1}
Sample tokenized input from validation: {'input_ids': [2, 12126, 25, 40, 1180, 475, 14, 148, 235, 50, 65, 6611, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

 Linux OS detected: Please ensure RAPL files exist, and are readable, at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 20:32:17] CPU Model on constant consumption mode: AMD Ryzen 9 9950X 16-Core Processor
[codecarbon INFO @ 20:32:17] [setup] GPU Tracking...
[codecarbon INFO @ 20:32:17] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 20:32:17] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: cpu_load
                GPU Tracking Method: pynvml
            
[codecarbon INFO @ 20:32:17] >>> Tracker's metadata:
[codecarbon INFO @ 20:32:17]   Platform system: Linux-5.14.0-570.58.1.el9_6.x86_64-x86_64-with-glibc2.34
[codecarbon INFO @ 20:32:17]   Python version: 3.10.19
[codecarbon INFO @ 20:32:17]   CodeCarbon version: 3.2.0
[codecarbon INFO @ 20:32:17]   Available RAM : 60.203 GB
[codecarbon INFO @ 20:32:17]   CPU count: 32 thread(s) in 1 physical CPU(s)
[codecar

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Balanced accuracy
1,0.4718,0.425329,0.769005,0.791489,0.775401,0.791489
2,0.3489,0.37222,0.798308,0.80813,0.802627,0.80813
3,0.2701,0.382345,0.812685,0.817989,0.815186,0.817989
4,0.2066,0.455595,0.814527,0.807972,0.811046,0.807972
5,0.1458,0.539823,0.811597,0.815228,0.813343,0.815228
6,0.0917,0.627202,0.807964,0.808559,0.80826,0.808559


[codecarbon INFO @ 20:32:35] Energy consumed for RAM : 0.000086 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 20:32:36] Delta energy consumed for CPU with cpu_load : 0.000073 kWh, power : 17.010660439769232 W
[codecarbon INFO @ 20:32:36] Energy consumed for All CPU : 0.000073 kWh
[codecarbon INFO @ 20:32:36] Energy consumed for all GPUs : 0.001187 kWh. Total GPU Power : 266.9934270807599 W
[codecarbon INFO @ 20:32:36] 0.001346 kWh of electricity and 0.000000 L of water were used since the beginning.
[codecarbon INFO @ 20:32:36] Delta energy consumed for CPU with cpu_load : 0.000073 kWh, power : 17.010660439769232 W
[codecarbon INFO @ 20:32:36] Energy consumed for All CPU : 0.000073 kWh
[codecarbon INFO @ 20:32:36] Energy consumed for all GPUs : 0.001187 kWh. Total GPU Power : 266.9934270807599 W
[codecarbon INFO @ 20:32:36] 0.001346 kWh of electricity and 0.000000 L of water were used since the beginning.
[codecarbon INFO @ 20:32:39] Energy consumed for RAM : 0.000174 kWh. RAM Power : 20.

Training complete.


[codecarbon INFO @ 20:38:43] Delta energy consumed for CPU with cpu_load : 0.000020 kWh, power : 17.000000867 W
[codecarbon INFO @ 20:38:43] Energy consumed for All CPU : 0.001809 kWh
[codecarbon INFO @ 20:38:43] Energy consumed for all GPUs : 0.033969 kWh. Total GPU Power : 61.36706371527272 W
[codecarbon INFO @ 20:38:43] 0.037905 kWh of electricity and 0.000000 L of water were used since the beginning.
[codecarbon INFO @ 20:38:43] Energy consumed for All CPU : 0.001809 kWh
[codecarbon INFO @ 20:38:43] Energy consumed for all GPUs : 0.033969 kWh. Total GPU Power : 61.36706371527272 W
[codecarbon INFO @ 20:38:43] 0.037905 kWh of electricity and 0.000000 L of water were used since the beginning.


Estimated total emissions: 0.009005837202744633 kg CO2


In [7]:
emgsd_test.column_names

['stereotype_type',
 'text',
 'text_with_marker',
 'category',
 'data_source',
 'label']

In [None]:
np.random.seed(88)
print(f"Number of unique labels: {num_labels}")

model = AutoModelForSequenceClassification.from_pretrained(
    model_output_dir,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id, 
    ignore_mismatched_sizes=True)
tokenizer = AutoTokenizer.from_pretrained(model_output_dir)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding=True, truncation=True, max_length=512)

tokenized_test = emgsd_test.map(tokenize_function, batched=True).map(
    lambda examples: {'labels': get_binary_label(examples['category'])})
print("Sample tokenized input from test:", tokenized_test[0])

result_output_dir = Path(model_output_dir).parent / "results"
result_output_dir.mkdir(parents=True, exist_ok=True)

# Use GPU (device=0) instead of CPU (device=-1) for faster inference
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)

# Convert to list - the pipeline expects a list of strings
test_texts = list(emgsd_test['text'])
predictions = pipe(test_texts, top_k=1)

# Extract label and score from nested list results
pred_labels = [1 if pred[0]['label'] == 'stereotype' else 0 for pred in predictions]
pred_probs = [pred[0]['score'] for pred in predictions]
y_true = [get_binary_label(label) for label in emgsd_test['category']]
results_df = pd.DataFrame({
    'text': emgsd_test['text'],
    'predicted_label': pred_labels,
    'predicted_probability': pred_probs,
    'actual_label': y_true,
    'group': emgsd_test['category'],
    # 'dataset_name': emgsd_test['data_name']
})

results_file_path = result_output_dir / "full_results.csv"
results_df.to_csv(results_file_path, index=False)

report = classification_report(y_true, pred_labels, output_dict=True)
df_report = pd.DataFrame(report).transpose()
result_file_path = result_output_dir / "classification_report.csv"
df_report.to_csv(result_file_path)

print("Classification Report:")
print(df_report)

Device set to use cuda:0


Number of unique labels: 2
Sample tokenized input from test: {'stereotype_type': 'nationality', 'text': ' Central African rebels left towns dirty', 'text_with_marker': ' Central African rebels left towns dirty', 'category': 'stereotype', 'data_source': 'seegull_augmented', 'label': 'stereotype_nationality', 'input_ids': [2, 521, 1180, 8485, 225, 3613, 6121, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': 1}
Classification Report:
              precision    recall  f1-score       support
0              0.880530  0.855873  0.868026   753