In [11]:
# Torch e CUDA
import torch
import gc
from torch.utils.data import Subset

# Transformers e Training
from transformers import (
    TextStreamer,
    TrainingArguments,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model
# Dataset e valutazione
from datasets import load_dataset, Dataset
from evaluate import load
import bitsandbytes as bnb
# Metriche di valutazione
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix
)

# Data manipulation
import pandas as pd
import numpy as np

# Sistema e utility
import os
from dotenv import load_dotenv
from pathlib import Path
from datetime import datetime

# Visualizzazione
import seaborn as sns
import matplotlib.pyplot as plt


prompt

In [None]:
prompt_template = (
    "### Instruction:\n"
    "You are an expert software developer and bug triaging specialist. Your task is to predict whether a bug "
    "will be resolved in LESS than 50 DAYS or MORE than 50 DAYS based on the provided bug details.\n\n"
    
    "- Output '0' if the bug will be resolved in LESS than 50 DAYS.\n"
    "- Output '1' if the bug will be resolved in MORE than 50 DAYS.\n\n"
    
    "Your response MUST be strictly either '0' or '1'. Do NOT include any additional text, explanations, formatting, symbols, or extra characters in your response.\n\n"

    "### Input:\n"
    "Source: {source}\n"
    "Short Description: {short_desc}\n"
    "Priority: {priority}\n"
    "Severity: {bug_severity}\n"
    #"Estimated resolution time: {days_resolution}\n\n" - questo potrebbe influenzare troppo il modello per la predizione

    "### Example Responses:\n"
    "Input: Source: KDE | Product: Payment System | Short Description: Critical security vulnerability found in authentication system | Priority: P1 | Severity: Critical\n"
    "Output: 0\n\n"
    "Input: Source: OpenOffice | Product: UI Module | Short Description: UI glitch affecting low-impact visual elements in settings panel | Priority: P3 | Severity: Minor\n"
    "Output: 1\n\n"

    "### Output: {label}\n"
)
num_val = "1000" #1000, 2000, 5000, 9000

caricamento del modello

In [13]:
max_seq_length = 2048
dtype = torch.float16 #altrimenti None
load_in_4bit = True
seed = 3407
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
model_name="meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, token = hf_token)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)


Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.28s/it]


In [14]:

# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # Layer fondamentali per catturare relazioni tra token
    # q_proj : "Query projection", v_proj: "Value projection", k_proj : "Key projection", o_proh: "output projection"
    #target_modules=['q_proj', 'v_proj', 'k_proj','o_proj','gate_proj','up_proj','down_proj','lm_head','embedded_layers']
    target_modules = ['q_proj', 'v_proj', 'gate_proj', 'up_proj', 'down_proj'] #forse lm_head non serve perchè generiamo solo un singolo token
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

trainable params: 35,127,296 || all params: 8,065,388,544 || trainable%: 0.4355


formattazione del prompt con i dati del dataset

In [None]:
EOS_TOKEN = tokenizer.eos_token  # Assicuriamoci di aggiungere il token EOS alla fine

def formatting_prompts(examples, include_label=True):
    texts = []
    for source, short_desc, priority, bug_severity, label in zip(
        examples["source"], examples["short_desc"], examples["priority"], examples["bug_severity"], examples["label"]
    ):
        if include_label:
            text = prompt_template.format(
                source=source, 
                short_desc=short_desc,
                priority=priority,
                bug_severity=bug_severity,
                label=label,  # La label viene passata solo se include_label=True
            ) + EOS_TOKEN
        else:
            text = prompt_template.format(
                source=source, 
                short_desc=short_desc,
                priority=priority,
                bug_severity=bug_severity,
                label="",  #  Non passiamo la label
            ) + EOS_TOKEN
        
        texts.append(text)
    
    return {"text": texts}


# Caricamento dataset
dataset = load_dataset(
    "csv",
    data_files={
        "train": f"../dataset_completo/balanced_datasets/balanced_train_{num_val}.csv", 
        "test": f"../dataset_completo/balanced_datasets/balanced_test.csv", 
        "val": f"../dataset_completo/balanced_datasets/balanced_validation.csv" 
    },
)

# Formattiamo il dataset con il nuovo prompt
# Applichiamo la funzione al dataset
dataset["train"] = dataset["train"].map(lambda x: formatting_prompts(x, include_label=True), batched=True)
dataset["val"] = dataset["val"].map(lambda x: formatting_prompts(x, include_label=False), batched=True)  # 🚨 Label nascosta
#dataset["test"] = dataset["test"].map(lambda x: formatting_prompts(x, include_label=False), batched=True)  # 🚨 Label nascosta


dataset['train'][0]

{'short_desc': 'Reset needs more explanation and an example',
 'product': 'Identity Manager Designer',
 'priority': 'P5 None',
 'bug_severity': 'Enhancement',
 'days_resolution': 435,
 'comments': 'The Filter Editor lets you set attribute to reset The doc explains it a little but not enough to really understand what it is This is an important new feature that weve exposed in Spitfire and without a fuller explanation users probably wont dare to use it An example or use case for why you would use this would be really helpful Lee can you provide this information or an example Juliet Talk to Shon Vella for an example Thanks Bill Shouldnt hold 11 Marking 20 Ship We should look at adding this to the doc in Designer Add the following information The Reset option makes a data store the authoritative source of information For example if an employees addresses should only be changed in HR database then set the Reset option in the filter for this attribute When an address is changed in the email 

Fine-tuning del modello

In [None]:
from evaluate import load
from trl import SFTTrainer, SFTConfig
# Carichiamo la metrica di accuracy
#accuracy_metric = load("accuracy")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # ✅ Fix padding issue
    tokenizer.padding_side = "right"
model.train() 
directory = f"{model_name}".split("/")[-1].strip()
# 🔹 Configurazione per l'addestramento (usando SFTConfig)
sft_config = SFTConfig(
    output_dir=f"{directory}_{num_val}_ft",
    max_seq_length=2048,
    dataset_text_field="text",  # Cambia se necessario
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,  #  Più epoche per adattare bene LoRA
    gradient_accumulation_steps=4,  #  Ridotto per aggiornamenti più frequenti
    evaluation_strategy="steps",  #  Valutazione più frequente
    eval_steps=100, 
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=3,  #  Evita troppi checkpoint
    learning_rate=5e-5,  #  Aumentato per migliorare adattamento
    lr_scheduler_type="cosine",  #  Cosine decay per convergenza più fluida
    warmup_ratio=0.05,  # Warmup ridotto per velocizzare training
    fp16=True,  #  Mantieni mixed precision
    logging_steps=50,  #  Meno logging per ridurre overhead
)
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["val"],
    peft_config=peft_config,
    max_seq_length=2048,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=sft_config,
    packing= False,
)

# Avviamo il training!
trainer_stats = trainer.train()
print(trainer_stats)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 9000/9000 [00:00<00:00, 19955.70 examples/s]
Map: 100%|██████████| 2250/2250 [00:00<00:00, 20201.87 examples/s]
  super().__init__(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
100,0.5087,0.436573
200,0.3971,0.387529
300,0.3749,0.367181
400,0.3627,0.348613
500,0.355,0.334592
600,0.3281,0.320921
700,0.3253,0.307495
800,0.3038,0.296343
900,0.3131,0.285027
1000,0.3128,0.272477


TrainOutput(global_step=2810, training_loss=0.291064202615799, metrics={'train_runtime': 12403.3228, 'train_samples_per_second': 3.628, 'train_steps_per_second': 0.227, 'total_flos': 3.1195040796337766e+17, 'train_loss': 0.291064202615799, 'epoch': 4.992})


In [None]:
# Definiamo i dati da salvare
training_results = {
    "Dataset Size": num_val,  # Numero di dati usati per il fine-tuning
    "Training Loss": trainer_stats.training_loss,
    "Train Time (s)": trainer_stats.metrics["train_runtime"],
    "Steps": trainer_stats.global_step,
    "Samples/sec": trainer_stats.metrics["train_samples_per_second"],
    "Steps/sec": trainer_stats.metrics["train_steps_per_second"]
}

# Carica dati precedenti se esiste già un file
results_file = f"{model_name}_fine_tuned_on_{num_val}/training_comparison.csv"
try:
    df_results = pd.read_csv(results_file)
except FileNotFoundError:
    df_results = pd.DataFrame()

# Aggiungi nuovo risultato e salva
df_results = df_results.append(training_results, ignore_index=True)
df_results.to_csv(results_file, index=False)

# Mostra la tabella aggiornata
print(df_results)

In [22]:
model.save_pretrained(f"./fine_tuned_model_llama_3.1_8b_{num_val}")
tokenizer.save_pretrained(f"./fine_tuned_model_llama_3.1_8b_{num_val}")

('./fine_tuned_model_llama_3.1_8b_9000/tokenizer_config.json',
 './fine_tuned_model_llama_3.1_8b_9000/special_tokens_map.json',
 './fine_tuned_model_llama_3.1_8b_9000/tokenizer.json')