In [45]:
!pip install datasets --quiet
!pip install evaluate --quiet
import evaluate
import pandas as pd
import transformers
from datasets import load_dataset
from datasets import Dataset, DatasetDict
from huggingface_hub import login
import torch
import numpy as np
from typing import Dict
from transformers import DataCollatorWithPadding
from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    set_seed,
)
from transformers import DebertaV2Model, DebertaV2PreTrainedModel
import torch.nn as nn
import torch
from transformers import AutoConfig


In [46]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [47]:
dataset = load_dataset('sapienzanlp/nlp2025_hw1_cultural_dataset')

In [48]:
train = dataset['train'].to_pandas()
validation = dataset['validation'].to_pandas()

In [49]:
# use of deberta
language_model_name = "microsoft/deberta-v3-base"

### Training Argurments

batch_size = 32
learning_rate = 1e-5
weight_decay = 0.001
epochs = 10
device = "cuda" if torch.cuda.is_available() else "cpu"


set_seed(42)

In [50]:
# creating training sentences
training_sentences = []

for i, row in train.iterrows():
  string = ''
  string += 'It is about '
  string += str(row['name'])
  string += '. Described as '
  string += row.description
  string += "."
  training_sentences.append(string)

In [51]:
validation_sentences = []

for i, row in validation.iterrows():
  string = ''
  string += 'It is about '
  string += str(row['name'])
  string += '. Described as '
  string += row.description
  string += "."
  validation_sentences.append(string)

In [52]:
# adding customized sentences to the dataset

train['sentences'] = training_sentences
validation['sentences'] = validation_sentences

In [53]:
# encoding the labels for classifier

labels_dict = {'cultural representative': 0, 'cultural exclusive': 1, 'cultural agnostic': 2}
labels_dict_reversed = {0: 'cultural representative', 1: 'cultural exclusive', 2: 'cultural agnostic'}

train = train.replace({'label': labels_dict})
validation = validation.replace({'label': labels_dict})

In [55]:
# formating data for the trainer
train["idx"] = train.index
validation["idx"] = validation.index

train = train[['idx', 'sentences', 'label']]
validation = validation[['idx', 'sentences', 'label']]

# creating Hugging Face formated dataset
train = Dataset.from_pandas(train)
validation = Dataset.from_pandas(validation)

dataset_dict = DatasetDict({
    "train": train,
    "validation": validation
})

In [56]:
# loading the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(language_model_name,
                                                                   ignore_mismatched_sizes=True,
                                                                   output_attentions=False, output_hidden_states=False,
                                                                   num_labels=3)
# Load the pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained(language_model_name)

# setting the data collator, needed for padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    return tokenizer(examples["sentences"], padding=True, truncation=True)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
# freezing first layers to keep knowledge from pretrained model

for name, param in model.deberta.named_parameters():
    if any(name.startswith(f'encoder.layer.{i}') for i in range(0, 6)):
        param.requires_grad = False

for param in model.deberta.embeddings.parameters():
    param.requires_grad = False

In [58]:
# tokenizing dataset
dataset_dict = dataset_dict.map(tokenize_function, batched=True)

Map:   0%|          | 0/6251 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [59]:
# adapted metrics function with accuracy, f1 score, precision and recall

def compute_metrics(eval_pred):
   load_accuracy = evaluate.load("accuracy")
   load_f1 = evaluate.load("f1")
   load_precision = evaluate.load("precision")
   load_recall = evaluate.load("recall")

   logits, label = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=label)["accuracy"]
   f1 = load_f1.compute(predictions=predictions, references=label, average='weighted')["f1"]
   precision = load_precision.compute(predictions=predictions, references=label, average='weighted')["precision"]
   recall = load_recall.compute(predictions=predictions, references=label, average='weighted')["recall"]
   return {"accuracy": accuracy, "f1": f1, "precision": precision, "recall": recall}

In [60]:
training_args = TrainingArguments(
    output_dir="training_dir",                    # output directory [Mandatory]
    num_train_epochs=epochs,                      # total number of training epochs
    per_device_train_batch_size=batch_size,       # batch size per device during training
    warmup_steps=500,                             # number of warmup steps for learning rate scheduler
    weight_decay=weight_decay,                    # strength of weight decay
    save_strategy="no",
    learning_rate=learning_rate,
    report_to="none",
    logging_dir="classifier_logs",
    logging_strategy="steps",
    logging_steps=10
)



In [61]:
# initializing the trainer
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=dataset_dict['train'],
   eval_dataset=dataset_dict['validation'],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [62]:
trainer.train()

Step,Training Loss
10,1.084
20,1.0799
30,1.091
40,1.0945
50,1.0916
60,1.0899
70,1.0829
80,1.0896
90,1.0911
100,1.079


TrainOutput(global_step=1960, training_loss=0.5348198104877862, metrics={'train_runtime': 805.8685, 'train_samples_per_second': 77.568, 'train_steps_per_second': 2.432, 'total_flos': 2248409226583260.0, 'train_loss': 0.5348198104877862, 'epoch': 10.0})

In [63]:
trainer.evaluate()

{'eval_loss': 0.6571194529533386,
 'eval_accuracy': 0.7733333333333333,
 'eval_f1': 0.7730489794559318,
 'eval_precision': 0.7727886152862168,
 'eval_recall': 0.7733333333333333,
 'eval_runtime': 3.6514,
 'eval_samples_per_second': 82.161,
 'eval_steps_per_second': 10.407,
 'epoch': 10.0}

In [64]:
# saving weights
trainer.save_model("lm_trained_judith_esteban")
tokenizer.save_pretrained("tokenizer_judith_esteban")

('tokenizer_judith_esteban/tokenizer_config.json',
 'tokenizer_judith_esteban/special_tokens_map.json',
 'tokenizer_judith_esteban/spm.model',
 'tokenizer_judith_esteban/added_tokens.json',
 'tokenizer_judith_esteban/tokenizer.json')

In [65]:
# inference for one sentence

def predict_class(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
    return labels_dict_reversed[preds.item()]

In [66]:
predict_class('It is about 77 records. Described as UK record label.', model, tokenizer, device)

'cultural exclusive'

In [67]:
# generalized inference to any new dataset

dataset = load_dataset('sapienzanlp/nlp2025_hw1_cultural_dataset')
validation = dataset['validation'].to_pandas()

validation_sentences = []

for i, row in validation.iterrows():
  string = ''
  string += 'It is about '
  string += str(row['name'])
  string += '. Described as '
  string += row.description
  string += "."
  validation_sentences.append(string)

predictions = []
for sentence in validation_sentences:
  predictions.append(predict_sentiment(sentence, model, tokenizer, device))

# saving the predictions

pd.DataFrame(predictions).to_csv('predictions.csv')

### Training without freezing the layers

In [68]:
# loading the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(language_model_name,
                                                                   ignore_mismatched_sizes=True,
                                                                   output_attentions=False, output_hidden_states=False,
                                                                   num_labels=3)
# Load the pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained(language_model_name)

# setting the data collator, needed for padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    return tokenizer(examples["sentences"], padding=True, truncation=True)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [69]:
# tokenizing dataset
dataset_dict = dataset_dict.map(tokenize_function, batched=True)

Map:   0%|          | 0/6251 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [70]:
training_args = TrainingArguments(
    output_dir="training_dir_unfrozen",                    # output directory [Mandatory]
    num_train_epochs=epochs,                      # total number of training epochs
    per_device_train_batch_size=batch_size,       # batch size per device during training
    warmup_steps=500,                             # number of warmup steps for learning rate scheduler
    weight_decay=weight_decay,                    # strength of weight decay
    save_strategy="no",
    learning_rate=learning_rate,
    report_to="none",
    logging_dir="classifier_logs_unfrozen",
    logging_strategy="steps",
    logging_steps=10
)

In [71]:
# initializing the trainer

trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=dataset_dict['train'],
   eval_dataset=dataset_dict['validation'],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [72]:
trainer.train()

Step,Training Loss
10,1.1041
20,1.1001
30,1.1051
40,1.0996
50,1.0944
60,1.0951
70,1.0887
80,1.0897
90,1.0803
100,1.0705


TrainOutput(global_step=1960, training_loss=0.3925497573249194, metrics={'train_runtime': 1075.5327, 'train_samples_per_second': 58.12, 'train_steps_per_second': 1.822, 'total_flos': 2248409226583260.0, 'train_loss': 0.3925497573249194, 'epoch': 10.0})

In [73]:
trainer.evaluate()

{'eval_loss': 0.8083745241165161,
 'eval_accuracy': 0.79,
 'eval_f1': 0.788543198908917,
 'eval_precision': 0.7875611345898231,
 'eval_recall': 0.79,
 'eval_runtime': 3.4327,
 'eval_samples_per_second': 87.394,
 'eval_steps_per_second': 11.07,
 'epoch': 10.0}

## Test on a customized classifier

In [74]:
# customized model with one more fully connected layer in the classifier

class CustomDebertaV3Classifier(DebertaV2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.deberta = DebertaV2Model(config)  # V2 same as V3 in terms of model structure

        # 💡 adding a fully-connected layer
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, config.num_labels)
        )

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # token for classification

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None: # resistence to empty input
            loss_fct = nn.CrossEntropyLoss()

            # loss computation on the logits of the classifier
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}


In [75]:
# using customized model with pretrained weights on the transformers part
config = AutoConfig.from_pretrained("microsoft/deberta-v3-base", num_labels=3)
model = CustomDebertaV3Classifier.from_pretrained("microsoft/deberta-v3-base", config=config)

training_args = TrainingArguments(
    output_dir="training_dir_modified_model",     # output directory [Mandatory]
    num_train_epochs=epochs,                      # total number of training epochs
    per_device_train_batch_size=batch_size,       # batch size per device during training
    warmup_steps=500,                             # number of warmup steps for learning rate scheduler
    weight_decay=weight_decay,                    # strength of weight decay
    save_strategy="no",
    learning_rate=learning_rate,
    report_to="none",
    logging_dir="model_modified_logs",
    logging_strategy="steps",
    logging_steps=10
)

# initializing the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['validation'],
    compute_metrics=compute_metrics,
    data_collator=data_collator
)


Some weights of CustomDebertaV3Classifier were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.3.bias', 'classifier.3.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [76]:
trainer.train()

Step,Training Loss
10,1.0871
20,1.0937
30,1.101
40,1.0981
50,1.0963
60,1.0895
70,1.0878
80,1.0863
90,1.093
100,1.0867


TrainOutput(global_step=1960, training_loss=0.3855784734901117, metrics={'train_runtime': 1073.204, 'train_samples_per_second': 58.246, 'train_steps_per_second': 1.826, 'total_flos': 2243221189741788.0, 'train_loss': 0.3855784734901117, 'epoch': 10.0})

In [77]:
trainer.evaluate()

{'eval_loss': 0.8317744731903076,
 'eval_accuracy': 0.77,
 'eval_f1': 0.7708671973188103,
 'eval_precision': 0.7719473684210526,
 'eval_recall': 0.77,
 'eval_runtime': 3.6421,
 'eval_samples_per_second': 82.371,
 'eval_steps_per_second': 10.434,
 'epoch': 10.0}

In [None]:
!zip -r lm_trained_judith_esteban.zip lm_trained_judith_esteban/
!zip -r tokenizer_judith_esteban.zip tokenizer_judith_esteban/

updating: lm_trained_judith_esteban/ (stored 0%)
updating: lm_trained_judith_esteban/tokenizer_config.json (deflated 73%)
updating: lm_trained_judith_esteban/added_tokens.json (stored 0%)
updating: lm_trained_judith_esteban/model.safetensors