# RUNS: hate-speech-CNERG/bert-base-uncased-hatexplain

## Colab load

In [1]:
!git clone https://github.com/dierop/ALCLabs

Cloning into 'ALCLabs'...
remote: Enumerating objects: 286, done.[K
remote: Counting objects: 100% (286/286), done.[K
remote: Compressing objects: 100% (192/192), done.[K
remote: Total 286 (delta 151), reused 217 (delta 89), pack-reused 0 (from 0)[K
Receiving objects: 100% (286/286), 4.37 MiB | 9.88 MiB/s, done.
Resolving deltas: 100% (151/151), done.


In [1]:
!pip install datasets



In [2]:
cd ALCLabs/Exist2025/

/content/ALCLabs/Exist2025


In [3]:
ls

[0m[01;34mdata[0m/                       task3_2_hard_ScalaR_1.json  [01;34mtrainer_output[0m/
dataloader.py               task3_2_soft_ScalaR_1.json  train.py
[01;34m__pycache__[0m/                task3_3_hard_ScalaR_1.json  [01;34mwandb[0m/
task3_1_hard_ScalaR_1.json  task3_3_soft_ScalaR_1.json
task3_1_soft_ScalaR_1.json  test.ipynb


## Temp fix

In [4]:
from train import BertTrainerWrapper


In [4]:
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    f1_score, log_loss, accuracy_score
)
from datasets import Dataset

import torch
import numpy as np

from dataloader import load_data_json
from typing import List, Dict, Any
import json

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score, log_loss
from datasets import Dataset
from collections import Counter, defaultdict
import torch, numpy as np, pandas as pd
from scipy.special import expit
from typing import List, Dict, Any
import json

class Test:
    def __init__(self, df=None, label_name="label1", soft=False,
                 model_name="bert-base-multilingual-cased"):
        if df is None:
            df = load_data_json("data/EXIST2025_training_videos.json", soft=soft)
        self.df = df.dropna(subset=[label_name]).reset_index(drop=True)
        self.label_name = label_name
        self.soft = soft
        self.model_name = model_name

        self.label_encoder = None
        self.keys = None
        self.num_labels = None
        self.problem_type = None

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = None
        self.dataset = None
        self.trainer = None

    # ------------------------------------------------------------------
    # 1. PREPARAR ETIQUETAS
    # ------------------------------------------------------------------
    def prepare_labels(self, threshold=0.5):
        col = self.df[self.label_name]

        # -------- SOFT -------------------------------------------------
        if self.soft:
            # dict → vector
            if isinstance(col.iloc[0], dict):
                self.keys = sorted(set().union(*col.apply(lambda x: x.keys())))
                self.df["labels"] = col.apply(
                    lambda d: [d.get(k, 0.0) for k in self.keys])
                self.num_labels = len(self.keys)
                self.problem_type = "multi_label_classification"

            # float → regresión binaria
            else:
                self.df["labels"] = col.astype(float)
                self.num_labels = 1
                self.problem_type = "regression"

            return  # fin SOFT

        # -------- HARD -------------------------------------------------
        # HARD MULTI-LABEL  (lista de etiquetas)
        if isinstance(col.iloc[0], (list, set)):
            # multi-label hard  → vector multihot float
            self.keys = sorted(set().union(*col))

            def to_multihot(lst):
                return [float(1) if k in lst else float(0) for k in self.keys]  # 🆕 float

            self.df["labels"] = col.apply(to_multihot)
            self.num_labels   = len(self.keys)
            self.problem_type = "multi_label_classification"

        # HARD SINGLE-LABEL  (cadena)
        else:
            self.label_encoder = LabelEncoder()
            self.df["labels"] = self.label_encoder.fit_transform(col)
            self.num_labels = len(self.label_encoder.classes_)
            self.problem_type = "single_label_classification"

    # ------------------------------------------------------------------
    # 2. TOKENIZAR
    # ------------------------------------------------------------------
    def tokenize_dataset(self):
        ds = Dataset.from_pandas(self.df[["text", "labels"]])
        ds = ds.train_test_split(test_size=0.15, shuffle=True)
        ds = ds.map(lambda x: self.tokenizer(
            x["text"], truncation=True, padding="max_length",
            max_length=128), batched=False)
        self.dataset = ds

    # ------------------------------------------------------------------
    # 3. CONSTRUIR MODELO
    # ------------------------------------------------------------------
    def build_model(self):
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels,
            problem_type=self.problem_type,
            ignore_mismatched_sizes=True
        )

    # ------------------------------------------------------------------
    # 4. ENTRENAR
    # ------------------------------------------------------------------
    def train(self, epochs=3, batch_size=8, lr=2e-5, eval_batch_size=2,
              eval_accumulation_steps=32):
        self.prepare_labels()
        self.tokenize_dataset()
        self.build_model()

        args = TrainingArguments(
            eval_strategy="epoch",
            learning_rate=lr,
            per_device_train_batch_size=batch_size,
            num_train_epochs=epochs,
            weight_decay=0.01,
            per_device_eval_batch_size = eval_batch_size,
            eval_accumulation_steps    = eval_accumulation_steps,
        )

        self.trainer = Trainer(
            model=self.model,
            args=args,
            train_dataset=self.dataset['train'],
            eval_dataset=self.dataset['test'],
            tokenizer=self.tokenizer,
            compute_metrics=self.compute_metrics
        )

        self.trainer.train()

    # ------------------------------------------------------------------
    # 5. MÉTRICAS
    # ------------------------------------------------------------------
    def compute_metrics(self, eval_pred, th=0.5):
      logits = eval_pred.predictions        # shape (N, K)
      labels = eval_pred.label_ids
      if isinstance(logits, (tuple, list)):
        logits = logits[0]

      # Si es lista de lotes, concaténalos
      if isinstance(logits, (list, tuple)) and isinstance(logits[0], np.ndarray):
          logits = np.concatenate(logits, axis=0)

      # ----- SOFT ------------------------------------------------------
      if self.soft:
          probs = expit(logits)            # sin torch, sin GPU

          if self.num_labels == 1:
              y_true = labels.squeeze()
              y_pred = probs.squeeze()
              y_true_h = (y_true >= th).astype(int)
              y_pred_h = (y_pred >= th).astype(int)

              return {
                  "mae":     float(np.mean(np.abs(y_pred - y_true))),
                  "logloss": float(log_loss(y_true_h, y_pred)),
                  "f1_micro": float(f1_score(y_true_h, y_pred_h, average="micro")),
                  "f1_macro": float(f1_score(y_true_h, y_pred_h, average="macro")),
              }

          # multilabel
          y_true   = labels
          y_pred   = probs
          y_true_h = (y_true >= th).astype(int)
          y_pred_h = (y_pred >= th).astype(int)

          ce = np.mean([log_loss(y_true_h[:, k], y_pred[:, k])
                        for k in range(self.num_labels)])

          return {
              "mae":       float(np.mean(np.abs(y_pred - y_true))),
              "logloss":   float(ce),
              "f1_micro":  float(f1_score(y_true_h.reshape(-1),
                                          y_pred_h.reshape(-1),
                                          average="micro", zero_division=0)),
              "f1_macro":  float(f1_score(y_true_h,
                                          y_pred_h,
                                          average="macro", zero_division=0)),
          }

      # ----- HARD MULTILABEL ------------------------------------------
      if self.problem_type == "multi_label_classification":
          probs = expit(logits)
          y_pred = (probs >= th).astype(int)
          micro  = f1_score(labels.reshape(-1), y_pred.reshape(-1),
                            average="micro", zero_division=0)
          macro  = f1_score(labels, y_pred,
                            average="macro", zero_division=0)
          return {"f1_micro": float(micro), "f1_macro": float(macro)}

      # ----- HARD SINGLE-LABEL ----------------------------------------
      preds = np.argmax(logits, axis=-1)
      return {"accuracy": float(accuracy_score(labels, preds))}

    # ------------------------------------------------------------------
    # 6. PREDICT FLEXIBLE
    # ------------------------------------------------------------------
    @torch.no_grad()
    def predict(self, texts, threshold=0.5, return_probabilities=True):
        if isinstance(texts, str):
            texts = [texts]

        toks = self.tokenizer(
            texts, return_tensors="pt",
            truncation=True, padding="max_length", max_length=128
        )
        device = next(self.model.parameters()).device
        toks = {k: v.to(device) for k, v in toks.items()}
        self.model.eval()

        logits = self.model(**toks).logits

        # ---------- soft ---------------------------------------------
        if self.soft:
            # binario soft
            if self.num_labels == 1:
                probs = torch.sigmoid(logits).squeeze().cpu().tolist()
                return probs if return_probabilities else [
                    int(p >= threshold) for p in probs
                ]
            # multilabel soft
            probs = torch.sigmoid(logits).cpu().numpy()
            outputs = []
            for vec in probs:
                d = {k: float(v) for k, v in zip(self.keys, vec)}
                if return_probabilities:
                    outputs.append(d)
                else:
                    sel = [k for k, p in d.items() if p >= threshold] or \
                          [max(d, key=d.get)]
                    outputs.append(sel)
            return outputs

        # ---------- hard ---------------------------------------------
        if self.problem_type == "multi_label_classification":
            probs = torch.sigmoid(logits).cpu().numpy()
            outputs = []
            for vec in probs:
                labels = [k for k, v in zip(self.keys, vec) if v >= threshold]
                outputs.append(labels or [self.keys[np.argmax(vec)]])
            return outputs
        else:
            pred_ids = logits.argmax(-1).cpu().numpy()
            return self.label_encoder.inverse_transform(pred_ids)


    def build_submission(
        self,
        threshold: float = 0.5,
        test_case: str = "EXIST2025",
        return_probabilities: bool = True,
        model_name: str = None,
        task_name: str = "task3_3",
        team_name: str = "ScalaR",
        run:int=1,
    ) -> List[Dict[str, Any]]:
        """
        label_name ..... 'label1', 'label2' o 'label3'
        threshold ...... umbral para convertir prob→etiqueta (solo hard multietiqueta)
        test_case ...... literal a incluir en cada JSON
        return_probabilities
            soft=True  -> dict de probabilidades (como tus ejemplos) si True,
                        lista de etiquetas si False
            soft=False -> se ignora; siempre lista/str de etiquetas duras
        """
        # ------------------------------------------------------
        # 1. Normalizar samples a lista ordenada [ {"id":..,"text":..}, ... ]
        # ------------------------------------------------------
        data=load_data_json("data/EXIST2025_training_videos.json")

        # DAta is a pandas dataframe

        texts = data['text'].tolist()
        ids = data['id'].tolist()
        # ------------------------------------------------------
        # 2. Inferencia
        # ------------------------------------------------------
        preds = [self.predict(
            p,
            threshold=threshold,
            return_probabilities=return_probabilities
        ) for p in texts]

        # ------------------------------------------------------
        # 3. Empaquetar resultado por muestra
        # ------------------------------------------------------
        outputs = []
        for s, p in zip(ids, preds):
            # p puede ser:
            #   • dict  -> probabilidades por clase
            #   • list  -> lista de etiquetas duras
            #   • int/str -> etiqueta única (hard-single)
            #   • float -> prob. binaria (soft escalar)
            value: Any
            if isinstance(p, list):
                value = p
                if isinstance(p[0], dict):
                    # soft multi-label
                    # if k=='-' convertimos a 'NO'
                    value = {k if k != "-" else 'NO': v for k, v in p[0].items()}

            elif isinstance(p,  str):
                value = p
            elif isinstance(p, np.ndarray):
                value=  p[0]
                value = 'NO' if value  == '-' else value

            outputs.append(
                {
                    "test_case": test_case,
                    "id": s,
                    "value": value,
                }
            )

        output_file = f'{task_name}_{"soft" if self.soft else "hard"}_{team_name}_{run}.json'
        with open(output_file, "w") as f:
            json.dump(outputs, f, indent=4, ensure_ascii=False)
        print(f"Submission saved to {output_file}")
        return outputs


## Soft

In [5]:
bert1 = Test(soft=True, model_name="hate-speech-CNERG/bert-base-uncased-hatexplain", label_name="label1")
bert1.train(epochs=3)
bert1.build_submission(task_name="task3_1",run=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/2145 [00:00<?, ? examples/s]

Map:   0%|          | 0/379 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hate-speech-CNERG/bert-base-uncased-hatexplain and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mdiegorospagan[0m ([33mdiegorospagan-universitat-polit-cnica-de-val-ncia[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Mae,Logloss,F1 Micro,F1 Macro
1,No log,0.634674,0.372965,0.635147,0.622691,0.614152
2,0.646400,0.624521,0.342619,0.621247,0.658311,0.65427
3,0.646400,0.631771,0.332298,0.626889,0.671504,0.670955


Submission saved to task3_1_soft_ScalaR_2.json


[{'test_case': 'EXIST2025',
  'id': '120001',
  'value': {'NO': 0.854523777961731, 'YES': 0.134112149477005}},
 {'test_case': 'EXIST2025',
  'id': '120002',
  'value': {'NO': 0.5989495515823364, 'YES': 0.4302096366882324}},
 {'test_case': 'EXIST2025',
  'id': '120003',
  'value': {'NO': 0.2501389980316162, 'YES': 0.7219666242599487}},
 {'test_case': 'EXIST2025',
  'id': '120004',
  'value': {'NO': 0.22631269693374634, 'YES': 0.7692561149597168}},
 {'test_case': 'EXIST2025',
  'id': '120005',
  'value': {'NO': 0.826419472694397, 'YES': 0.18121089041233063}},
 {'test_case': 'EXIST2025',
  'id': '120006',
  'value': {'NO': 0.2964475154876709, 'YES': 0.6951873302459717}},
 {'test_case': 'EXIST2025',
  'id': '120007',
  'value': {'NO': 0.16292542219161987, 'YES': 0.825683057308197}},
 {'test_case': 'EXIST2025',
  'id': '120008',
  'value': {'NO': 0.1506296694278717, 'YES': 0.831970751285553}},
 {'test_case': 'EXIST2025',
  'id': '120009',
  'value': {'NO': 0.2976810038089752, 'YES': 0.69082

In [8]:
#liberate GPU RAM
import torch
torch.cuda.empty_cache()



In [7]:
bert2 = Test(soft=True, model_name="hate-speech-CNERG/bert-base-uncased-hatexplain", label_name="label2")
bert2.train(epochs=3)
bert2.build_submission(task_name="task3_2",run=2)

Map:   0%|          | 0/2145 [00:00<?, ? examples/s]

Map:   0%|          | 0/379 [00:00<?, ? examples/s]

  self.trainer = Trainer(


Epoch,Training Loss,Validation Loss,Mae,Logloss,F1 Micro,F1 Macro
1,No log,0.550013,0.322967,0.55132,0.716799,0.233436
2,0.537600,0.532096,0.299052,0.532928,0.737907,0.38827
3,0.537600,0.54328,0.291516,0.544458,0.737027,0.451699


Submission saved to task3_2_soft_ScalaR_2.json


[{'test_case': 'EXIST2025',
  'id': '120001',
  'value': {'NO': 0.7327743172645569,
   'DIRECT': 0.33858272433280945,
   'JUDGEMENTAL': 0.037225548177957535}},
 {'test_case': 'EXIST2025',
  'id': '120002',
  'value': {'NO': 0.7137416005134583,
   'DIRECT': 0.22379235923290253,
   'JUDGEMENTAL': 0.042803045362234116}},
 {'test_case': 'EXIST2025',
  'id': '120003',
  'value': {'NO': 0.5235983729362488,
   'DIRECT': 0.41495561599731445,
   'JUDGEMENTAL': 0.06807385385036469}},
 {'test_case': 'EXIST2025',
  'id': '120004',
  'value': {'NO': 0.3096788823604584,
   'DIRECT': 0.5201644897460938,
   'JUDGEMENTAL': 0.13389261066913605}},
 {'test_case': 'EXIST2025',
  'id': '120005',
  'value': {'NO': 0.5652806162834167,
   'DIRECT': 0.31144458055496216,
   'JUDGEMENTAL': 0.06545636802911758}},
 {'test_case': 'EXIST2025',
  'id': '120006',
  'value': {'NO': 0.7175052165985107,
   'DIRECT': 0.2622414529323578,
   'JUDGEMENTAL': 0.04356658086180687}},
 {'test_case': 'EXIST2025',
  'id': '120007',


In [9]:
bert3 = Test(soft=True, model_name="hate-speech-CNERG/bert-base-uncased-hatexplain", label_name="label3")
bert3.train(epochs=3)
bert3.build_submission(task_name="task3_3",run=2)

Map:   0%|          | 0/2145 [00:00<?, ? examples/s]

Map:   0%|          | 0/379 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hate-speech-CNERG/bert-base-uncased-hatexplain and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.trainer = Trainer(


Epoch,Training Loss,Validation Loss,Mae,Logloss,F1 Micro,F1 Macro
1,No log,0.322942,0.172009,0.32341,0.863237,0.120968
2,0.341500,0.310256,0.156942,0.303119,0.870273,0.130478
3,0.341500,0.313652,0.157075,0.304277,0.873791,0.166078


Submission saved to task3_3_soft_ScalaR_2.json


[{'test_case': 'EXIST2025',
  'id': '120001',
  'value': {'NO': 0.8136799931526184,
   'IDEOLOGICAL-INEQUALITY': 0.02814907394349575,
   'MISOGYNY-NON-SEXUAL-VIOLENCE': 0.02100774459540844,
   'OBJECTIFICATION': 0.06081663817167282,
   'SEXUAL-VIOLENCE': 0.023641666397452354,
   'STEREOTYPING-DOMINANCE': 0.0880696177482605}},
 {'test_case': 'EXIST2025',
  'id': '120002',
  'value': {'NO': 0.7127540111541748,
   'IDEOLOGICAL-INEQUALITY': 0.050591424107551575,
   'MISOGYNY-NON-SEXUAL-VIOLENCE': 0.02621469832956791,
   'OBJECTIFICATION': 0.05364593118429184,
   'SEXUAL-VIOLENCE': 0.02907707542181015,
   'STEREOTYPING-DOMINANCE': 0.10909833759069443}},
 {'test_case': 'EXIST2025',
  'id': '120003',
  'value': {'NO': 0.3495138883590698,
   'IDEOLOGICAL-INEQUALITY': 0.17356429994106293,
   'MISOGYNY-NON-SEXUAL-VIOLENCE': 0.055212683975696564,
   'OBJECTIFICATION': 0.05426902323961258,
   'SEXUAL-VIOLENCE': 0.04155711084604263,
   'STEREOTYPING-DOMINANCE': 0.24586300551891327}},
 {'test_case':

## Hard

In [14]:
#liberate GPU RAM
import torch
torch.cuda.empty_cache()


In [11]:
bert1 = Test(soft=False, model_name="hate-speech-CNERG/bert-base-uncased-hatexplain", label_name="label1")
bert1.train(epochs=3)
bert1.build_submission(task_name="task3_1",run=2)

Map:   0%|          | 0/2131 [00:00<?, ? examples/s]

Map:   0%|          | 0/377 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hate-speech-CNERG/bert-base-uncased-hatexplain and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.628192,0.65252
2,0.597600,0.642867,0.660477
3,0.597600,0.718777,0.633952


Submission saved to task3_1_hard_ScalaR_2.json


[{'test_case': 'EXIST2025', 'id': '120001', 'value': 'YES'},
 {'test_case': 'EXIST2025', 'id': '120002', 'value': 'NO'},
 {'test_case': 'EXIST2025', 'id': '120003', 'value': 'YES'},
 {'test_case': 'EXIST2025', 'id': '120004', 'value': 'YES'},
 {'test_case': 'EXIST2025', 'id': '120005', 'value': 'YES'},
 {'test_case': 'EXIST2025', 'id': '120006', 'value': 'YES'},
 {'test_case': 'EXIST2025', 'id': '120007', 'value': 'YES'},
 {'test_case': 'EXIST2025', 'id': '120008', 'value': 'YES'},
 {'test_case': 'EXIST2025', 'id': '120009', 'value': 'YES'},
 {'test_case': 'EXIST2025', 'id': '120010', 'value': 'YES'},
 {'test_case': 'EXIST2025', 'id': '120011', 'value': 'YES'},
 {'test_case': 'EXIST2025', 'id': '120012', 'value': 'YES'},
 {'test_case': 'EXIST2025', 'id': '120013', 'value': 'YES'},
 {'test_case': 'EXIST2025', 'id': '120014', 'value': 'YES'},
 {'test_case': 'EXIST2025', 'id': '120015', 'value': 'YES'},
 {'test_case': 'EXIST2025', 'id': '120016', 'value': 'YES'},
 {'test_case': 'EXIST2025

In [13]:
bert2 = Test(soft=False, model_name="hate-speech-CNERG/bert-base-uncased-hatexplain", label_name="label2")
bert2.train(epochs=3)
bert2.build_submission(task_name="task3_2",run=2)

Map:   0%|          | 0/2096 [00:00<?, ? examples/s]

Map:   0%|          | 0/370 [00:00<?, ? examples/s]

  self.trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.918856,0.589189
2,0.898600,0.905311,0.591892
3,0.898600,1.005541,0.581081


Submission saved to task3_2_hard_ScalaR_2.json


[{'test_case': 'EXIST2025', 'id': '120001', 'value': 'DIRECT'},
 {'test_case': 'EXIST2025', 'id': '120002', 'value': 'NO'},
 {'test_case': 'EXIST2025', 'id': '120003', 'value': 'DIRECT'},
 {'test_case': 'EXIST2025', 'id': '120004', 'value': 'DIRECT'},
 {'test_case': 'EXIST2025', 'id': '120005', 'value': 'NO'},
 {'test_case': 'EXIST2025', 'id': '120006', 'value': 'NO'},
 {'test_case': 'EXIST2025', 'id': '120007', 'value': 'DIRECT'},
 {'test_case': 'EXIST2025', 'id': '120008', 'value': 'DIRECT'},
 {'test_case': 'EXIST2025', 'id': '120009', 'value': 'DIRECT'},
 {'test_case': 'EXIST2025', 'id': '120010', 'value': 'DIRECT'},
 {'test_case': 'EXIST2025', 'id': '120011', 'value': 'DIRECT'},
 {'test_case': 'EXIST2025', 'id': '120012', 'value': 'DIRECT'},
 {'test_case': 'EXIST2025', 'id': '120013', 'value': 'DIRECT'},
 {'test_case': 'EXIST2025', 'id': '120014', 'value': 'NO'},
 {'test_case': 'EXIST2025', 'id': '120015', 'value': 'NO'},
 {'test_case': 'EXIST2025', 'id': '120016', 'value': 'DIRECT

In [15]:
bert3 = Test(soft=False, model_name="hate-speech-CNERG/bert-base-uncased-hatexplain", label_name="label3")
bert3.train(epochs=3)
bert3.build_submission(task_name="task3_3",run=2)

Map:   0%|          | 0/1937 [00:00<?, ? examples/s]

Map:   0%|          | 0/342 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hate-speech-CNERG/bert-base-uncased-hatexplain and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([6]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Micro,F1 Macro
1,No log,0.328574,0.863548,0.126233
2,No log,0.332288,0.866959,0.175042
3,0.323500,0.331233,0.864035,0.207232


Submission saved to task3_3_hard_ScalaR_2.json


[{'test_case': 'EXIST2025', 'id': '120001', 'value': [['-']]},
 {'test_case': 'EXIST2025', 'id': '120002', 'value': [['-']]},
 {'test_case': 'EXIST2025',
  'id': '120003',
  'value': [['STEREOTYPING-DOMINANCE']]},
 {'test_case': 'EXIST2025',
  'id': '120004',
  'value': [['STEREOTYPING-DOMINANCE']]},
 {'test_case': 'EXIST2025', 'id': '120005', 'value': [['-']]},
 {'test_case': 'EXIST2025', 'id': '120006', 'value': [['-']]},
 {'test_case': 'EXIST2025',
  'id': '120007',
  'value': [['IDEOLOGICAL-INEQUALITY']]},
 {'test_case': 'EXIST2025',
  'id': '120008',
  'value': [['IDEOLOGICAL-INEQUALITY']]},
 {'test_case': 'EXIST2025', 'id': '120009', 'value': [['-']]},
 {'test_case': 'EXIST2025',
  'id': '120010',
  'value': [['STEREOTYPING-DOMINANCE']]},
 {'test_case': 'EXIST2025', 'id': '120011', 'value': [['-']]},
 {'test_case': 'EXIST2025', 'id': '120012', 'value': [['-']]},
 {'test_case': 'EXIST2025',
  'id': '120013',
  'value': [['STEREOTYPING-DOMINANCE']]},
 {'test_case': 'EXIST2025', 'id