In [1]:
# Torch e CUDA
import torch
from torch.utils.data import Subset

# Transformers e Training
from transformers import (
    TextStreamer,
    TrainingArguments,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model
# Dataset e valutazione
from datasets import load_dataset, Dataset
from evaluate import load
import bitsandbytes as bnb
# Metriche di valutazione

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Data manipulation
import pandas as pd
import numpy as np

# Sistema e utility
import os
from dotenv import load_dotenv
from pathlib import Path
from datetime import datetime

# Visualizzazione
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Suffix selecting which fine-tuned checkpoint to evaluate. It is kept as a
# string because it is interpolated into the checkpoint path
# (./fine_tuned_model_distilbert-base-uncased_<num_val>) and the results folder.
# Values used so far: 1000, 2000, 5000, 9000.
num_val = "1000"

In [3]:
import time
import torch
import psutil
import pynvml
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm import tqdm

def evaluate_bert_model(model, tokenizer, eval_dataset, model_name, fine_tuned, num_val):
    """Run batched inference over ``eval_dataset`` and persist the results.

    For every batch the function tokenizes the ``text`` column, takes the
    arg-max of the model logits as the predicted class, and samples CPU, RAM
    and (when NVML is usable) GPU utilisation together with the batch latency.

    Side effects:
        * Creates an output directory named after ``model_name`` and the
          fine-tuning status and writes ``metrics.csv``,
          ``avg_system_metrics.csv`` and ``confusion_matrix.png`` into it.
        * Sets ``tokenizer.padding_side`` to ``'right'`` and, when the
          tokenizer has no pad token, assigns its EOS token as pad token.
        * Puts ``model`` into evaluation mode.

    :param model: Sequence-classification model exposing ``eval()``,
        ``device`` and a forward pass whose output has a ``logits`` attribute.
    :param tokenizer: Tokenizer matching ``model``.
    :param eval_dataset: Sliceable dataset whose slices are dicts of columns
        containing ``text``, ``label`` and ``source``.
    :param model_name: Name used as prefix of the output directory and in the
        printed summary.
    :param fine_tuned: Whether the evaluated model is a fine-tuned checkpoint;
        only affects the output directory name.
    :param num_val: Tag appended to the output directory name when
        ``fine_tuned`` is true.
    :return: ``(metrics, predictions, true_labels, prediction_sources)``.
        When no prediction could be produced the tuple is
        ``(None, [], [], [])``.
    """
    print("\nStarting evaluation phase...")

    model.eval()  # Set to evaluation mode

    # BERT-style sequence classifiers pool the hidden state of the first token
    # ([CLS]). Left padding would place [PAD] at position 0 for every sequence
    # shorter than the longest one in the batch, so pad on the right.
    tokenizer.padding_side = 'right'
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    predictions, true_labels, prediction_sources = [], [], []
    batch_size = 8
    invalid = 0  # Arg-max always yields a class id, so nothing is ever rejected.

    # Create results folder
    fine_tuned_status = "fine_tuned" if fine_tuned else "not_fine_tuned"
    output_dir = f"{model_name}_{fine_tuned_status}_on_{num_val}" if fine_tuned else f"{model_name}_{fine_tuned_status}"
    os.makedirs(output_dir, exist_ok=True)

    # Only synchronise with the GPU when there is one; the model may have been
    # loaded on CPU.
    cuda_available = torch.cuda.is_available()

    # NVML is optional: without an NVIDIA driver GPU utilisation is recorded
    # as NaN instead of aborting the whole evaluation.
    nvml_ready = False
    gpu_handle = None
    try:
        pynvml.nvmlInit()
        nvml_ready = True
        gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    except pynvml.NVMLError as err:
        print(f"GPU utilisation will not be recorded (NVML unavailable: {err})")

    system_metrics = []

    # The first cpu_percent() call has no reference interval and reports a
    # meaningless 0.0; prime it so the first batch gets a real reading.
    psutil.cpu_percent()

    try:
        # Process dataset in batches
        for i in tqdm(range(0, len(eval_dataset), batch_size), desc="Evaluating", unit="batch"):
            batch = eval_dataset[i:i + batch_size]
            texts, labels, sources = batch['text'], batch['label'], batch['source']

            if cuda_available:
                torch.cuda.synchronize()
            start_time = time.perf_counter()

            inputs = tokenizer(
                texts,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512
            ).to(model.device)

            with torch.no_grad():
                logits = model(**inputs).logits
                predictions_batch = torch.argmax(logits, dim=-1).cpu().tolist()

            predictions.extend(predictions_batch)
            true_labels.extend(labels)
            prediction_sources.extend(sources)

            # Wait for pending GPU kernels so the latency covers the full batch.
            if cuda_available:
                torch.cuda.synchronize()
            elapsed = time.perf_counter() - start_time

            if gpu_handle is not None:
                gpu_usage = pynvml.nvmlDeviceGetUtilizationRates(gpu_handle).gpu
            else:
                gpu_usage = float("nan")

            system_metrics.append({
                "batch": i // batch_size,
                "cpu": psutil.cpu_percent(),
                "ram": psutil.virtual_memory().percent,
                "gpu": gpu_usage,
                "time": elapsed,
            })
    finally:
        if nvml_ready:
            pynvml.nvmlShutdown()

    if not predictions:
        print("No valid predictions were generated!")
        return None, [], [], []

    # Compute metrics
    metrics = calculate_metrics(true_labels, predictions)

    # Save metrics to CSV
    metrics_path_csv = os.path.join(output_dir, "metrics.csv")
    pd.DataFrame([metrics]).to_csv(metrics_path_csv, index=False)

    # Compute average system metrics
    if system_metrics:
        n_batches = len(system_metrics)
        avg_metrics = {
            key: sum(m[key] for m in system_metrics) / n_batches
            for key in ("cpu", "ram", "gpu", "time")
        }
        avg_metrics_path = os.path.join(output_dir, "avg_system_metrics.csv")
        pd.DataFrame([avg_metrics]).to_csv(avg_metrics_path, index=False)

    # Generate confusion matrix. Fixing the label set keeps the matrix 2x2
    # (matching the tick labels) even when one class is absent.
    cm = confusion_matrix(true_labels, predictions, labels=[0, 1])
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['0', '1'], yticklabels=['0', '1'], ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    cm_path = os.path.join(output_dir, "confusion_matrix.png")
    fig.savefig(cm_path, format="png")
    plt.close(fig)

    # Display results
    print("\nEvaluation Results:")
    print(f"Model: {model_name}")
    print(f"Samples evaluated: {len(true_labels)}")
    print(f"Invalid predictions: {invalid}")
    print("\nMetrics:")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value:.4f}")

    # Same tuple shape as the early return above.
    return metrics, predictions, true_labels, prediction_sources

In [4]:
def calculate_metrics(true_labels, predictions):
    """Compute accuracy, precision, recall and F1 for a binary task.

    Precision, recall and F1 use ``average='binary'`` and report 0 instead of
    warning when the denominator is zero.

    :param true_labels: Ground-truth class ids.
    :param predictions: Predicted class ids, aligned with ``true_labels``.
    :return: Dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``.
    """
    binary_options = {"average": "binary", "zero_division": 0}
    scores = {'accuracy': accuracy_score(true_labels, predictions)}
    for key, scorer in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    ):
        scores[key] = scorer(true_labels, predictions, **binary_options)
    return scores

In [5]:
# Load the balanced test split from CSV; only a "test" split is defined here.
dataset = load_dataset(
    "csv",
    data_files={"test": "../dataset_completo/balanced_datasets/balanced_test.csv"},
)
# product,component,priority,severity,first_comment,first_priority,first_severity,days_resolution,comments,label
# Define the fields to concatenate
def concatenate_fields(example):
    """Add a ``text`` entry built from the bug source and its short description.

    Falsy values (``None``, empty string, ...) are skipped; the remaining ones
    are converted with ``str`` and joined by a single space. The example is
    modified in place and returned.

    :param example: Mapping with at least the keys ``source`` and ``short_desc``.
    :return: The same mapping, with ``text`` set.
    """
    # product, priority, bug_severity, days_resolution and comments are
    # currently left out of the text.
    selected_values = (example['source'], example['short_desc'])

    pieces = []
    for value in selected_values:
        if value:
            pieces.append(str(value))

    example['text'] = ' '.join(pieces)
    return example

# Build the `text` column for every loaded split (only "test" here), then drop
# the raw columns. `source` is kept on purpose because evaluate_bert_model
# reads it from each batch; `label` is kept as the target.
raw_columns_to_drop = ['product', 'short_desc', 'priority', 'bug_severity', 'days_resolution', 'comments']
dataset = dataset.map(concatenate_fields).remove_columns(raw_columns_to_drop)

print(dataset['test'][1])
dataset


{'source': 'FreeDesktop', 'label': 0, 'text': 'FreeDesktop Gstvappi04Vaapipostproc failed to convert format NV12 to RGB'}


DatasetDict({
    test: Dataset({
        features: ['source', 'label', 'text'],
        num_rows: 2250
    })
})

Scegliere se fare l'evaluation del modello fine-tunato o del modello base (eseguire solo una delle due celle seguenti).

In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import os

def load_model(model_name= 'distilbert-base-uncased', fine_tuned=False, fine_tuned_path=None, device="cuda"):
    """Load a two-class sequence-classification model and its tokenizer.

    The model is configured with the labels ``0 -> "fast"`` and ``1 -> "slow"``.
    The tokenizer is always loaded from ``model_name``, also when the weights
    come from ``fine_tuned_path``.

    :param model_name: Name of the pre-trained base model
        (e.g. ``'bert-base-uncased'``).
    :param fine_tuned: If true, load the weights from ``fine_tuned_path``.
    :param fine_tuned_path: Directory of the fine-tuned checkpoint. When
        ``fine_tuned`` is true but this path is missing or does not exist, a
        warning is emitted and the base model is loaded instead.
    :param device: Target device (``'cuda'``, ``'cpu'``, ...). A CUDA request
        falls back to CPU when CUDA is not available.
    :return: ``(model, tokenizer)``
    """
    import warnings

    # define label maps
    id2label = {0: "fast", 1: "slow"}
    label2id = {"fast": 0, "slow": 1}

    use_fine_tuned = bool(fine_tuned and fine_tuned_path and os.path.exists(fine_tuned_path))
    if use_fine_tuned:
        print(f"Loading fine-tuned model from: {fine_tuned_path}")
        checkpoint = fine_tuned_path
    else:
        if fine_tuned:
            # The base model has a randomly initialised classification head, so
            # make the fallback visible rather than evaluating it silently as
            # if it were the fine-tuned checkpoint.
            warnings.warn(
                f"fine_tuned=True but no checkpoint was found at {fine_tuned_path!r}; "
                f"falling back to the base model {model_name!r}.",
                stacklevel=2,
            )
        print(f"Loading base model: {model_name}")
        checkpoint = model_name

    model = AutoModelForSequenceClassification.from_pretrained(
        checkpoint, num_labels=2, id2label=id2label, label2id=label2id
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Move to the requested device; only a CUDA request needs the CPU fallback.
    device = torch.device(device)
    if device.type == "cuda" and not torch.cuda.is_available():
        device = torch.device("cpu")
    model.to(device)
    print(f"📌 Model loaded on: {device}")

    return model, tokenizer


In [12]:
# Evaluate the base (not fine-tuned) checkpoint.
fine_tuned = False
model_name = 'distilbert-base-uncased'
model, tokenizer = load_model(
    model_name=model_name,
    fine_tuned=fine_tuned,
    fine_tuned_path=None,
    device="cuda",
)

Loading base model: distilbert-base-uncased


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


📌 Model loaded on: cuda


In [None]:
# Evaluate the checkpoint fine-tuned with the configuration selected by num_val.
fine_tuned = True
fine_tuned_path = f"./fine_tuned_model_distilbert-base-uncased_{num_val}"
model, tokenizer = load_model(
    model_name='distilbert-base-uncased',
    fine_tuned=fine_tuned,
    fine_tuned_path=fine_tuned_path,
    device="cuda",
)

In [13]:
# Register a '[PAD]' token when the tokenizer has none, and resize the model's
# token embeddings to the new vocabulary size. Nothing happens when a pad
# token is already defined.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))

In [14]:
# create tokenize function
def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch with the notebook-level tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens.

    :param examples: Batch mapping that contains a ``text`` entry.
    :return: Whatever the tokenizer returns for that batch.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        max_length=512,
        truncation=True,
        padding="max_length",
    )


In [15]:
# Tokenize every loaded split in batches, then drop the raw text column to
# save memory.
tokenized_dataset = dataset.map(tokenize_function, batched=True).remove_columns(["text"])

# Return PyTorch tensors when the dataset is indexed.
tokenized_dataset.set_format("torch")

# Keep the existing pad token when there is one, otherwise reuse the EOS token;
# pad on the right.
tokenizer.pad_token = tokenizer.pad_token if tokenizer.pad_token is not None else tokenizer.eos_token
tokenizer.padding_side = "right"


In [16]:
# Run the evaluation on the test split for whichever model was loaded above.
# Only the reported model name and the num_val tag differ between the
# fine-tuned and the base case.
eval_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    eval_dataset=dataset["test"],
    fine_tuned=fine_tuned,
)
if fine_tuned:
    eval_kwargs.update(
        model_name=f"./fine_tuned_model_distilbert-base-uncased_{num_val}",
        num_val=num_val,
    )
else:
    eval_kwargs.update(model_name=model_name, num_val=0)

# Trailing semicolon: the cell never echoes the function's return value.
evaluate_bert_model(**eval_kwargs);



Starting evaluation phase...


Evaluating: 100%|██████████| 282/282 [00:01<00:00, 184.63batch/s]



Evaluation Results:
Model: distilbert-base-uncased
Samples evaluated: 2250
Invalid predictions: 0

Metrics:
accuracy: 0.5080
precision: 0.5054
recall: 0.7484
f1: 0.6034
