In [None]:
import os
import time
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import pynvml
import seaborn as sns
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

Controllare attentamente questi parametri prima di lanciare la valutazione sui dati di test, sul modello fine-tunato oppure sul modello base:
- `num_val`: 1000, 2000, 5000 o 9000
- `fine_tuned`: `True` per valutare il modello fine-tunato, `False` per valutare il modello "base"
- `model_name`: nome del modello su cui è avvenuto il fine-tuning

In [None]:
# Evaluation settings: double-check these before running the notebook.
num_val = "9000"  # one of: 1000, 2000, 5000, 9000
model_name = "distilbert-base-uncased"
# Alternative: model_name = "meta-llama/Llama-3.1-8B-Instruct"

# True evaluates the fine-tuned model, False evaluates the base model.
fine_tuned = True

# A checkpoint directory is only defined when a fine-tuned model is requested.
if fine_tuned:
    fine_tuned_path = "./fine_tuned_model_{}_{}".format(model_name, num_val)
else:
    fine_tuned_path = None

In [None]:
from dotenv import load_dotenv
# Model loading helper
def load_model(model_name, fine_tuned=False, fine_tuned_path=None, device="cuda"):
    """
    Load a pre-trained or fine-tuned sequence-classification model and its tokenizer.

    The model is configured with two labels (0 -> "fast", 1 -> "slow"). The
    tokenizer is always loaded from ``model_name``, also when the weights come
    from ``fine_tuned_path``.

    :param model_name: Name of the pre-trained model (e.g. 'distilbert-base-uncased')
    :param fine_tuned: If True, load the weights from ``fine_tuned_path`` when it exists
    :param fine_tuned_path: Directory containing the fine-tuned model
    :param device: Requested device ('cuda' or 'cpu'); 'cpu' is used whenever
        CUDA is not available
    :return: Tuple ``(model, tokenizer)``
    """
    id2label = {0: "fast", 1: "slow"}
    label2id = {"fast": 0, "slow": 1}

    # HF_TOKEN (read from the environment / a .env file) is needed for gated repositories.
    load_dotenv()
    hf_token = os.getenv("HF_TOKEN")

    use_fine_tuned = bool(fine_tuned and fine_tuned_path and os.path.exists(fine_tuned_path))
    if fine_tuned and not use_fine_tuned:
        # Falling back keeps the previous behaviour, but it must not be silent:
        # the base model gets a freshly initialised classification head, so any
        # metrics computed with it do not describe the fine-tuned model.
        warnings.warn(
            f"fine_tuned=True but no checkpoint was found at {fine_tuned_path!r}; "
            f"falling back to the base model '{model_name}'.",
            stacklevel=2,
        )

    if use_fine_tuned:
        print(f"Loading fine-tuned model from: {fine_tuned_path}")
        model = AutoModelForSequenceClassification.from_pretrained(
            fine_tuned_path, num_labels=2, id2label=id2label, label2id=label2id
        )
    else:
        print(f"Loading base model: {model_name}")
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=2, id2label=id2label, label2id=label2id, token=hf_token
        )

    # Same tokenizer source in both cases; pass the token so gated repositories
    # resolve for the tokenizer as they do for the model.
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

    device = torch.device(device if torch.cuda.is_available() else "cpu")
    model.to(device)
    print(f"📌 Model loaded on: {device}")

    return model, tokenizer

In [None]:
# Classification metrics helper
def calculate_metrics(true_labels, predictions):
    """
    Compute accuracy, precision, recall and F1-score with scikit-learn.

    Precision, recall and F1 use ``average='binary'`` and ``zero_division=0``.

    :param true_labels: Ground-truth labels
    :param predictions: Predicted labels
    :return: Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'
    """
    binary_options = {"average": "binary", "zero_division": 0}
    binary_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )

    scores = {"accuracy": accuracy_score(true_labels, predictions)}
    for key, scorer in binary_scorers:
        scores[key] = scorer(true_labels, predictions, **binary_options)
    return scores

In [None]:
# Model evaluation helper
def evaluate_bert_model(model, tokenizer, eval_dataset, model_name, fine_tuned, num_val):
    """
    Evaluate a sequence-classification model on a test dataset.

    The dataset is processed in batches of 8. For every batch the wall-clock
    time (tokenization + forward pass) and CPU / RAM / GPU utilisation are
    sampled. Results are written to an output directory derived from
    ``model_name``, ``fine_tuned`` and ``num_val``:

    * ``metrics.csv``            - accuracy, precision, recall, F1
    * ``avg_system_metrics.csv`` - per-batch averages of cpu, ram, gpu, time
    * ``confusion_matrix.png``   - heatmap of the 2x2 confusion matrix

    Side effects: sets ``tokenizer.padding_side`` to 'right', fills in a missing
    ``tokenizer.pad_token`` with the EOS token and a missing
    ``model.config.pad_token_id`` with the tokenizer's pad token id.

    :param model: Classification model; inputs are moved to ``model.device``
    :param tokenizer: Tokenizer matching the model
    :param eval_dataset: Dataset whose slices expose 'text' and 'label' entries
    :param model_name: Model name, used for the output directory and the report
    :param fine_tuned: Whether the evaluated model is the fine-tuned one
    :param num_val: Identifier appended to the output directory for fine-tuned runs
    :return: None
    """
    print("\nStarting evaluation phase...")
    model.eval()

    # Configure the tokenizer
    tokenizer.padding_side = 'right'
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Decoder-only classification heads (e.g. LlamaForSequenceClassification) need
    # config.pad_token_id to handle batches larger than 1; keep it in sync with the tokenizer.
    if getattr(model.config, "pad_token_id", None) is None:
        model.config.pad_token_id = tokenizer.pad_token_id

    predictions, true_labels = [], []
    batch_size = 8
    if fine_tuned:
        output_dir = f"{model_name}_fine_tuned_on_{num_val}"
    else:
        output_dir = f"{model_name}_not_fine_tuned"
    os.makedirs(output_dir, exist_ok=True)

    # CUDA synchronisation and NVML sampling only make sense when the model runs on a GPU;
    # load_model falls back to CPU when CUDA is unavailable.
    on_cuda = torch.cuda.is_available() and model.device.type == "cuda"
    system_metrics = []

    # The first cpu_percent() call has no reference interval and returns a meaningless
    # value; prime it so that per-batch readings are relative to the previous call.
    psutil.cpu_percent()

    if on_cuda:
        pynvml.nvmlInit()
    try:
        # GPU 0 is monitored, as before; fetch the handle once instead of per batch.
        gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0) if on_cuda else None

        # Process the dataset in batches
        for offset in tqdm(range(0, len(eval_dataset), batch_size), desc="Evaluating", unit="batch"):
            batch = eval_dataset[offset:offset + batch_size]
            texts, labels = batch['text'], batch['label']

            if on_cuda:
                torch.cuda.synchronize()
            start_time = time.perf_counter()

            inputs = tokenizer(
                texts,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512
            ).to(model.device)

            with torch.no_grad():
                logits = model(**inputs).logits
                predictions_batch = torch.argmax(logits, dim=-1).cpu().tolist()

            predictions.extend(predictions_batch)
            true_labels.extend(labels)

            # Wait for pending GPU work so the elapsed time covers the whole batch.
            if on_cuda:
                torch.cuda.synchronize()
            elapsed = time.perf_counter() - start_time

            if on_cuda:
                gpu_usage = pynvml.nvmlDeviceGetUtilizationRates(gpu_handle).gpu
            else:
                gpu_usage = float("nan")  # no GPU to sample on CPU runs
            system_metrics.append({
                "batch": offset // batch_size,
                "cpu": psutil.cpu_percent(),
                "ram": psutil.virtual_memory().percent,
                "gpu": gpu_usage,
                "time": elapsed,
            })
    finally:
        if on_cuda:
            pynvml.nvmlShutdown()

    # Compute the classification metrics
    metrics = calculate_metrics(true_labels, predictions)
    pd.DataFrame([metrics]).to_csv(os.path.join(output_dir, "metrics.csv"), index=False)

    # Save the averaged system metrics
    if system_metrics:
        n_batches = len(system_metrics)
        avg_metrics = {
            key: sum(sample[key] for sample in system_metrics) / n_batches
            for key in ("cpu", "ram", "gpu", "time")
        }
        pd.DataFrame([avg_metrics]).to_csv(os.path.join(output_dir, "avg_system_metrics.csv"), index=False)

    # Build the confusion matrix; fixing labels keeps it 2x2 even when a class is
    # absent from both the ground truth and the predictions.
    cm = confusion_matrix(true_labels, predictions, labels=[0, 1])
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['0', '1'], yticklabels=['0', '1'], ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    fig.savefig(os.path.join(output_dir, "confusion_matrix.png"), format="png")
    plt.close(fig)

    print("\nEvaluation Results:")
    print(f"Model: {model_name}")
    print(f"Samples evaluated: {len(true_labels)}")
    print("\nMetrics:")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value:.4f}")

In [17]:
# Load the test split from the balanced CSV file
test_files = {"test": "../dataset_completo/balanced_datasets/balanced_test.csv"}
dataset = load_dataset("csv", data_files=test_files)

In [18]:
# Prepare the dataset
def concatenate_fields(example):
    """
    Build the model input text for one example.

    Joins the 'source', 'product', 'short_desc', 'priority' and 'bug_severity'
    values (in that order, skipping falsy ones) with single spaces and stores
    the result under 'text'.

    :param example: Mapping holding the five fields above; modified in place
    :return: The same mapping, with the 'text' entry added
    """
    field_order = ('source', 'product', 'short_desc', 'priority', 'bug_severity')

    pieces = []
    for key in field_order:
        value = example[key]
        if value:
            pieces.append(str(value))

    example['text'] = ' '.join(pieces)
    return example

# Add the 'text' column, then drop the raw columns that are no longer needed.
# The two steps stay separate calls: columns returned by the mapped function
# must be removed after the map has finished.
columns_to_drop = ['product', 'short_desc', 'priority', 'bug_severity', 'days_resolution', 'comments']
dataset = dataset.map(concatenate_fields).remove_columns(columns_to_drop)

In [19]:
# Load the model and its tokenizer
model, tokenizer = load_model(
    model_name=model_name,
    fine_tuned=fine_tuned,
    fine_tuned_path=fine_tuned_path,
    device="cuda",
)

Loading base model: meta-llama/Llama-3.1-8B-Instruct


Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.49s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.1-8B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    """
    Tokenize a batch of examples with the notebook-level ``tokenizer``.

    :param examples: Batch mapping whose "text" entry holds the input strings
    :return: Tokenizer output, padded and truncated to 512 tokens
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# padding="max_length" requires a pad token. Some tokenizers define none, so fall
# back to the EOS token, the same fallback evaluate_bert_model applies.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# NOTE: the evaluation cell passes the raw dataset["test"] to evaluate_bert_model,
# which tokenizes each batch itself; tokenized_dataset is not consumed there.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["text"])
tokenized_dataset.set_format("torch")

In [21]:
# Run the evaluation on the raw test split (the function tokenizes each batch itself)
test_split = dataset["test"]
evaluate_bert_model(
    model=model,
    tokenizer=tokenizer,
    eval_dataset=test_split,
    model_name=model_name,
    fine_tuned=fine_tuned,
    num_val=num_val,
)


Starting evaluation phase...


Evaluating: 100%|██████████| 2250/2250 [1:27:01<00:00,  2.32s/batch]



Evaluation Results:
Model: meta-llama/Llama-3.1-8B-Instruct
Samples evaluated: 2250

Metrics:
accuracy: 0.5062
precision: 0.5138
recall: 0.2320
f1: 0.3197
