In [1]:
# Environment setup: install the libraries this notebook relies on.
# NOTE(review): no version is pinned, so a fresh runtime may resolve different
# releases than the ones that produced the recorded outputs.
# `-U` upgrades accelerate to the newest available release.
!pip install accelerate -U
# Provides `load_dataset`, used to fetch the cultural-items dataset.
!pip install datasets
# Provides the accuracy / F1 metrics loaded in `compute_metrics`.
!pip install evaluate
# NOTE(review): `wikidata` is not imported in any visible cell — confirm it is still needed.
!pip install wikidata
# Presumably required by the tokenizer of the checkpoint loaded later — TODO confirm.
!pip install sentencepiece
# NOTE(review): `wandb` is not referenced explicitly in the visible cells;
# presumably used for experiment logging — confirm.
!pip install wandb

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [2]:
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed
)
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from tqdm import tqdm


In [3]:
# Make Google Drive visible under /content/drive/ so the corpus files and the
# saved checkpoint can be read; a remount is requested on every run.
from google.colab import drive

drive.mount(
    '/content/drive/',
    force_remount=True,
)

Mounted at /content/drive/


In [None]:
import os

from huggingface_hub import login

# Never paste the access token into the notebook: it would be saved with the
# file and leak when the notebook is shared. Read it from the HF_TOKEN
# environment variable instead; when the variable is unset or empty, `token`
# is None and `login` falls back to its interactive prompt.
token = os.environ.get("HF_TOKEN") or None

# If prompted, do not tick "Add token as git credential".
login(token=token)

In [9]:
root = '/content/drive/MyDrive/SocioEmbeddings/data'


def _read_corpus_lines(path):
    """Return the content of the text file at `path` as a list of lines.

    The file is decoded as UTF-8 explicitly (instead of relying on the
    platform default) and split on the newline character, so a trailing
    newline in the file yields a final empty string.
    """
    with open(path, "r", encoding="utf-8") as f:
        return f.read().split("\n")


train_corpus = _read_corpus_lines(f"{root}/wikitext_trainset.txt")
test_corpus = _read_corpus_lines(f"{root}/wikitext_testset.txt")

# NOTE(review): the recorded run shows 6271 / 301 lines here but 6251 / 300
# dataset rows. WikiDataset pairs corpus[i] with dataset[i] by position, so
# confirm that the files really are row-aligned with the dataset splits.
display(len(train_corpus))
display(len(test_corpus))

6271

301

In [10]:
# Fetch the cultural-items dataset from the Hugging Face Hub.
dataset = load_dataset('sapienzanlp/nlp2025_hw1_cultural_dataset')

# The hub's "validation" split is what the rest of the notebook calls `test`.
train = dataset["train"]
test = dataset["validation"]

# Show the number of rows in each split.
display(len(train), len(test))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.31k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/946k [00:00<?, ?B/s]

valid.csv:   0%|          | 0.00/45.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6251 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/300 [00:00<?, ? examples/s]

6251

300

In [11]:
# Label mapping: class name (as looked up from each row's "label" field) -> class id.
label2id = dict(
    zip(
        ("cultural exclusive", "cultural agnostic", "cultural representative"),
        range(3),
    )
)

# Inverse lookup (class id -> class name), derived from the mapping above so
# the two dictionaries cannot drift apart.
id2label = {class_id: name for name, class_id in label2id.items()}

In [12]:
# Define the custom WikiDataset
class WikiDataset(torch.utils.data.Dataset):
    """Pre-tokenized, fixed-length samples for sequence classification.

    Row ``i`` of ``dataset`` is paired by position with ``corpus[i]``, rendered
    into a prompt, tokenized without truncation and cut into consecutive chunks
    of ``max_length`` token ids. Every sample is built eagerly in ``__init__``
    and kept in memory in ``self.samples``.

    Args:
        dataset: Indexable collection of dict rows. Each row must have a
            "label" key; "name", "description", "type" and "category" are
            optional and default to "".
        corpus: Indexable collection of article texts, aligned by index with
            ``dataset``.
        label2id: Mapping from label name to integer class id. Rows whose
            label is not in this mapping are skipped.
        id2label: Inverse mapping; stored on the instance but not otherwise
            used by this class.
        tokenizer: Callable tokenizer returning "input_ids" and
            "attention_mask" tensors, and exposing ``pad_token_id``.
        max_length: Number of token ids in every emitted chunk.
        max_chunks_per_article: Optional cap on the number of chunks kept per
            row; ``None`` (or 0) keeps all of them.
    """

    def __init__(self, dataset, corpus, label2id, id2label, tokenizer, max_length, max_chunks_per_article=None):
        self.dataset = dataset
        self.corpus = corpus
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.id2label = id2label
        self.max_length = max_length
        self.max_chunks_per_article = max_chunks_per_article
        self.samples = self.retrieve_samples(max_length, max_chunks_per_article)

    def retrieve_samples(self, max_length, max_chunks_per_article):
        """Build the list of chunked samples for every usable row.

        Returns:
            A list of dicts with keys "input_ids" and "attention_mask" (each
            padded to ``max_length`` entries) and "labels" (a scalar tensor
            holding the class id).
        """
        # Pad with the tokenizer's own pad id; fall back to 0 (the value that
        # was previously hard-coded) when the tokenizer defines none.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = 0

        samples = []
        for i in range(len(self.dataset)):
            # Fetch the row once instead of re-indexing the dataset per field.
            row = self.dataset[i]
            label = self.label2id.get(row["label"], -1)
            if label == -1:
                # Unknown label: the row cannot be used for training/evaluation.
                continue
            item_name = row.get("name", "")
            description = row.get("description", "")
            item_type = row.get("type", "")
            category = row.get("category", "")
            article = self.corpus[i]

            # Enriched text with task prompt and definitions
            text = f"""
              Task: You are given a cultural item. Classify it into one of the three categories: 'exclusive', 'agnostic', or 'representative'.

              Definitions:
              Cultural Exclusive: The item is known or used only within a specific culture and is not widely recognized outside of it.
              Cultural Agnostic: The item is commonly known or used worldwide, without strong association to any particular culture.
              Cultural Representative: The item originated in a specific culture and is culturally claimed, but it is also known and used across other cultures.

              Instructions:
              Carefully read the information provided below. Based on the definitions above, assign the most appropriate label to the item.

              Item: {item_name}
              Description: {description}
              Type: {item_type}
              Category: {category}

              Full text: {article}
              """
            tokenized = self.tokenizer(
                text,
                truncation=False,
                return_attention_mask=True,
                return_tensors="pt"
            )

            # The tokenizer output carries a leading batch axis of size 1.
            input_tokens = tokenized["input_ids"][0]
            attention_mask = tokenized["attention_mask"][0]

            if len(input_tokens) == 0:
                continue

            # Chunks are plain slices of the full encoding, so whatever special
            # tokens the tokenizer placed around the text are not repeated
            # inside each chunk.
            for chunk_index, start in enumerate(range(0, len(input_tokens), max_length)):
                if max_chunks_per_article and chunk_index >= max_chunks_per_article:
                    break

                chunk_input_ids = input_tokens[start:start + max_length]
                chunk_attention_mask = attention_mask[start:start + max_length]

                # Right-pad the (last) short chunk up to max_length.
                pad_len = max_length - len(chunk_input_ids)
                if pad_len > 0:
                    chunk_input_ids = torch.cat([
                        chunk_input_ids,
                        torch.full((pad_len,), pad_id, dtype=chunk_input_ids.dtype),
                    ])
                    chunk_attention_mask = torch.cat([
                        chunk_attention_mask,
                        torch.zeros(pad_len, dtype=chunk_attention_mask.dtype),
                    ])

                samples.append({
                    "input_ids": chunk_input_ids,
                    "attention_mask": chunk_attention_mask,
                    "labels": torch.tensor(label)
                })
        return samples

    def __len__(self):
        """Number of chunked samples (not the number of dataset rows)."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the pre-built sample dict at position `idx`."""
        return self.samples[idx]


In [13]:
# Metrics used during training
import evaluate

# Metric objects are loaded on first use and then reused, so the periodic
# evaluations during training do not reload them every time.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it only on first request."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with keys "accuracy" and "f1".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    acc = _get_metric("accuracy").compute(predictions=predictions, references=labels)["accuracy"]
    f1 = _get_metric("f1").compute(predictions=predictions, references=labels, average='macro')["f1"]

    return {"accuracy": acc, "f1": f1}


In [19]:
# Name of the base architecture. It is not passed to any loader below: the
# weights and tokenizer are read from a checkpoint directory on Drive
# (presumably the one written by the save cell at the end of the notebook).
language_model_name = "microsoft/deberta-v3-base"

# Load the tokenizer and the sequence-classification model from the same
# checkpoint directory.
checkpoint_dir = "/content/drive/MyDrive/SocioEmbeddings/data/transformer/"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)

In [20]:
# Build train and test datasets.
# Both splits share the tokenizer, label maps and chunking settings; only the
# rows and the matching article list differ.
shared_dataset_kwargs = dict(
    label2id=label2id,
    id2label=id2label,
    tokenizer=tokenizer,
    max_length=512,
    max_chunks_per_article=1,
)

train_dataset = WikiDataset(dataset=train, corpus=train_corpus, **shared_dataset_kwargs)
test_dataset = WikiDataset(dataset=test, corpus=test_corpus, **shared_dataset_kwargs)

# Collator handed to the Trainer to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
# Set seed and device
device = "cuda" if torch.cuda.is_available() else "cpu"
set_seed(42)

# Compute class weights for imbalance.
# Only the label column is read. Rows whose label is missing from `label2id`
# are ignored, mirroring WikiDataset (which skips such rows) instead of
# raising a KeyError here.
labels = [label2id[name] for name in train["label"] if name in label2id]
# Derive the class ids from the mapping rather than hard-coding [0, 1, 2].
classes = np.array(sorted(label2id.values()))
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=labels)
weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

print("Class weights:", weights_tensor)


Class weights: tensor([0.7743, 1.1131, 1.2344], device='cuda:0')


In [22]:
# Define the training configuration
training_args = TrainingArguments(
    output_dir="deberta-v3-base",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    warmup_steps=250,
    weight_decay=1e-6,
    # Evaluate every 100 steps; "f1" is the macro-F1 returned by
    # `compute_metrics`, and higher is better.
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="best",
    metric_for_best_model="f1",
    greater_is_better=True,
    # NOTE(review): very small learning rate; presumably intentional because
    # training continues from an already fine-tuned checkpoint — confirm.
    learning_rate=1e-6,
    # NOTE(review): the directory name looks like a leftover from a DistilBERT
    # run; kept unchanged so existing logs stay in the same place.
    logging_dir="./distilbert_log/",
    logging_steps=100,
    # Enable fp16 mixed precision only when a CUDA device is present, in line
    # with the cuda-or-cpu `device` fallback used for the class weights.
    fp16=torch.cuda.is_available(),
)

In [23]:
# Define a custom Trainer to apply class weights
from transformers import Trainer

class WeightedTrainer(Trainer):
    """Trainer that replaces the model's own loss with class-weighted cross-entropy.

    The per-class weights come from the module-level `weights_tensor`.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; must contain "labels" plus the model inputs.
            return_outputs: When true, also return the model outputs.
            **kwargs: Extra arguments passed by the Trainer; accepted and ignored.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when `return_outputs` is true.
        """
        labels = inputs.get("labels")
        # Forward without the labels so the model does not also compute its own
        # unweighted loss, which would be thrown away. `inputs` itself is left
        # untouched for the caller.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")
        # Keep the weights on the same device as the logits.
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights_tensor.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Initialize the weighted trainer.
# The tokenizer is passed as `processing_class`, which supersedes the
# deprecated `tokenizer` argument of Trainer.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = WeightedTrainer(


In [None]:
# # Start the training
# print(f"# Training samples: {len(train_dataset)}")
# print("Sample input text:", tokenizer.decode(train_dataset[0]["input_ids"]))

# trainer.train()

# Training samples: 6251
Sample input text: [CLS] Task: You are given a cultural item. Classify it into one of the three categories: 'exclusive', 'agnostic', or 'representative'. Definitions: Cultural Exclusive: The item is known or used only within a specific culture and is not widely recognized outside of it. Cultural Agnostic: The item is commonly known or used worldwide, without strong association to any particular culture. Cultural Representative: The item originated in a specific culture and is culturally claimed, but it is also known and used across other cultures. Instructions: Carefully read the information provided below. Based on the definitions above, assign the most appropriate label to the item. Item: 916 Description: 2012 film by M. Mohanan Type: entity Category: films Full text: 916 is a 2012 indian malayalam-language drama film written and directed by m. mohanan, starring mukesh, anoop menon, malavika menon, asif ali and meera vasudev. the film is about maintaining the



Step,Training Loss,Validation Loss,Accuracy,F1
100,0.5907,0.767923,0.766667,0.751548
200,0.638,0.702157,0.776667,0.767842
300,0.6188,0.844823,0.763333,0.745701
400,0.5248,0.757808,0.786667,0.775133
500,0.6273,0.864154,0.763333,0.751904
600,0.4498,0.780066,0.783333,0.770671
700,0.4584,0.867496,0.763333,0.755926
800,0.4394,0.953371,0.76,0.743242
900,0.5886,0.742116,0.803333,0.791121
1000,0.6488,0.92976,0.77,0.754691


TrainOutput(global_step=4689, training_loss=0.551530819708006, metrics={'train_runtime': 857.7961, 'train_samples_per_second': 21.862, 'train_steps_per_second': 5.466, 'total_flos': 4934254410408960.0, 'train_loss': 0.551530819708006, 'epoch': 3.0})

In [None]:
# save_path = "/content/drive/MyDrive/SocioEmbeddings/data/transformer"
# # Save model and tokenizer
# trainer.save_model(save_path)
# tokenizer.save_pretrained(save_path)

# # Save trainer state (scheduler, optimizer, step, etc.)
# trainer.save_state()