In [56]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

import evaluate
import torch
import numpy as np

In [57]:
num_val = "1000" #1000, 2000, 5000, 9000

In [58]:
#load dataset
dataset = load_dataset(
    "csv",
    data_files={
        "train": f"../dataset_completo/balanced_datasets/balanced_train_{num_val}.csv", 
        "test": f"../dataset_completo/balanced_datasets/balanced_test.csv", 
        "val": f"../dataset_completo/balanced_datasets/balanced_validation.csv" 
    },
)
# product,component,priority,severity,first_comment,first_priority,first_severity,days_resolution,comments,label
# Define the fields to concatenate
def concatenate_fields(example):
    fields_to_concat = [
        example['source'],
        #example['product'], 
        example['short_desc'], 
        example['priority'], 
        example['bug_severity'],
        #example['days_resolution'],
        #example['comments'], #elaborazione troppo costosa
    ]
    
    # Join the fields into a single string for the 'text' column
    example['text'] = ' '.join([str(field) for field in fields_to_concat if field])
    return example

# Apply the concatenation to both train and test datasets
dataset = dataset.map(concatenate_fields)
dataset = dataset.remove_columns([
    'product', 
    'short_desc', 
    'priority', 
    'bug_severity',
    'source',
    'days_resolution',
    'comments']) # lasciamo solo la colonna text per la classificazione

print(dataset['train'][1])
dataset


{'label': 0, 'text': 'KDE key list too big NOR normal'}


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 990
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 2250
    })
    val: Dataset({
        features: ['label', 'text'],
        num_rows: 2250
    })
})

In [59]:
model_name = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base' # you can alternatively use roberta-base but this model is bigger thus training will take longer

# define label maps
id2label = {0: "fast", 1: "slow"}
label2id = {"fast": 0, "slow": 1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Pre processing del dataset

In [60]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [61]:
# create tokenize function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)


In [62]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Rimuove il testo originale per risparmiare memoria
tokenized_dataset = tokenized_dataset.remove_columns(["text"])

# Converti i dataset in formato PyTorch
tokenized_dataset.set_format("torch")


print(tokenized_dataset['train'][0])
print(tokenized_dataset['val'][0])
print(tokenized_dataset['test'][0])

Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

Map: 100%|██████████| 2250/2250 [00:00<00:00, 11210.60 examples/s]

{'label': tensor(1), 'input_ids': tensor([  101, 10650,  2229, 25509,  7361,  2011,  3775, 13512, 22287, 13102,
        20644, 11751,  9739,  5582, 18384, 20389, 15950, 24548,  9331, 20644,
         2487, 24590,  3973, 11896,  5396,  3671,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,  




Evaluation

In [63]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")


In [64]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt

# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    
    # Compute metrics
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    
    # Create and save confusion matrix
    cm = confusion_matrix(labels, predictions)
    plt.figure(figsize=(10, 8))
    plt.imshow(cm, interpolation='nearest', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.colorbar()
    tick_marks = np.arange(len(np.unique(labels)))
    plt.xticks(tick_marks, np.unique(labels), rotation=45)
    plt.yticks(tick_marks, np.unique(labels))
    plt.xlabel('Predicted')
    plt.ylabel('True')

    # Add text annotations
    thresh = cm.max() / 2.
    for i, j in np.ndindex(cm.shape):
        plt.text(j, i, format(cm[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.savefig('confusion_matrix.png')
    plt.close()
    
    return {
        "accuracy": acc,
        "f1": f1,
        "recall": recall
    }

Addestriamo il modello 

In [68]:


if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # ✅ Fix padding issue
    tokenizer.padding_side = "right"

model.train() 


# define training arguments
training_args = TrainingArguments(
    output_dir= f"{model_name}_{num_val}_ft",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="steps",  #  Valutazione più frequente
    eval_steps=100, 
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=3,  #  Evita troppi checkpoint
    lr_scheduler_type="cosine",
    load_best_model_at_end=True,
    logging_steps=50,  # Log ogni 50 passi
    metric_for_best_model="eval_loss",  # 👈 Assicura che il modello salvi in base alla Validation Loss
    greater_is_better=False  # 👈 Perché una loss minore è meglio
)
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer_stats = trainer.train()
eval_results = trainer.evaluate()
print(trainer_stats)
print(eval_results)

  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1,Recall
100,0.2539,0.810502,0.688889,0.68706,0.688889
200,0.0785,1.144195,0.704889,0.704569,0.704889
300,0.0368,1.211562,0.704444,0.704444,0.704444


Could not locate the best model at distilbert-base-uncased_1000_ft/checkpoint-100/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=310, training_loss=0.1575372213317502, metrics={'train_runtime': 84.1445, 'train_samples_per_second': 58.827, 'train_steps_per_second': 3.684, 'total_flos': 655713623347200.0, 'train_loss': 0.1575372213317502, 'epoch': 5.0})
{'eval_loss': 1.211676836013794, 'eval_accuracy': 0.7048888888888889, 'eval_f1': 0.7048888888888889, 'eval_recall': 0.7048888888888889, 'eval_runtime': 9.9026, 'eval_samples_per_second': 227.214, 'eval_steps_per_second': 14.239, 'epoch': 5.0}


In [None]:
import pandas as pd
from pathlib import Path

# Definiamo le metriche da salvare
training_results = {
    "Dataset Size": num_val,  # Numero di dati usati per il fine-tuning
    "Training Loss": trainer_stats.training_loss,  # Training Loss
    "Train Time (s)": trainer_stats.metrics["train_runtime"],  # Tempo di addestramento
    "Steps": trainer_stats.global_step,  # Numero di passi (steps)
    "Samples/sec": trainer_stats.metrics["train_samples_per_second"],  # Campioni al secondo
    "Steps/sec": trainer_stats.metrics["train_steps_per_second"],  # Passi al secondo
    "Validation Loss": eval_result.get("eval_loss", None),  # Valutazione della loss
    "Validation Accuracy": eval_result.get("eval_accuracy", None),  # Accuracy della validazione
    "Validation F1": eval_result.get("eval_f1", None)  # F1 della validazione
}

# Definiamo il file di destinazione per i risultati
results_file = f"{model_name}_fine_tuned_on_{num_val}/training_comparison.csv"

# Carica i dati precedenti se esiste già un file
try:
    df_results = pd.read_csv(results_file)
except FileNotFoundError:
    df_results = pd.DataFrame()  # Se il file non esiste, creiamo un DataFrame vuoto

# Converti il dizionario in un DataFrame
training_results_df = pd.DataFrame([training_results])  # Passiamo una lista contenente il dizionario

# Assicura che la cartella esista prima di salvare
Path(results_file).parent.mkdir(parents=True, exist_ok=True)

# Aggiungi il nuovo risultato al DataFrame esistente
df_results = pd.concat([df_results, training_results_df], ignore_index=True)

# Salva il DataFrame aggiornato
df_results.to_csv(results_file, index=False)

# Mostra la tabella aggiornata
print(df_results)

  Dataset Size  Training Loss  Train Time (s)  Steps  Samples/sec  Steps/sec
0         1000       0.527281         84.9187    310       58.291      3.651
1         1000       0.550503         84.2204    310       58.774      3.681


In [67]:
model.save_pretrained(f"./fine_tuned_model_distilbert-base-uncased_{num_val}")
tokenizer.save_pretrained(f"./fine_tuned_model_distilbert-base-uncased_{num_val}")

('./fine_tuned_model_distilbert-base-uncased_1000/tokenizer_config.json',
 './fine_tuned_model_distilbert-base-uncased_1000/special_tokens_map.json',
 './fine_tuned_model_distilbert-base-uncased_1000/vocab.txt',
 './fine_tuned_model_distilbert-base-uncased_1000/added_tokens.json',
 './fine_tuned_model_distilbert-base-uncased_1000/tokenizer.json')