Installazione dipendenze

In [None]:
%pip install -q -U transformers peft accelerate optimum
%pip install bitsandbytes
%pip install auto-gptq
%pip install nltk
%pip install trl
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Codice completo per il fine tuning

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer,AutoModelForCausalLM
from trl import SFTTrainer



# GPTQConfig is used below but is not part of the imports at the top of the
# cell; import it here so the model can actually be loaded.
from transformers import GPTQConfig

# Base model: GPTQ-quantized Gemma 7B checkpoint (per its Hub id).
base_model_id = "TechxGenus/gemma-7b-GPTQ"

# Load the base model in 4 bit with the exllama kernels turned off.
# `use_exllama=False` is the current spelling of the deprecated
# `disable_exllama=True`.
quantization_config_loading = GPTQConfig(bits=4, use_exllama=False)
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=quantization_config_loading,
    device_map="auto",
)

# Tokenizer: pad on the left and add BOS/EOS tokens to every encoded sequence.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

# Sequence-length limit for training examples (presumably in tokens).
max_length = 1024

import json


def formatting_func(example):
    """Build one training prompt per example of a batched dataset slice.

    Every prompt is made of:
      - a header describing the entity-extraction task and the labels,
      - the input text of the example,
      - the expected response: a JSON object with one list of entities per
        label (an empty list when the example has no entity for a label).

    Args:
        example: Batched mapping from column name to a list of values. It
            must provide an "input" column (the texts) and one column per
            label ("Metric", "Method", "Material", "Task",
            "OtherScientificTerm", "Generic") holding the entities of each
            text.

    Returns:
        A list with one prompt string per element of ``example["input"]``.
    """
    keys = ["Metric", "Method", "Material", "Task", "OtherScientificTerm", "Generic"]
    output_texts = []

    # NOTE(review): the prompt wording (including the "Lables" spelling) is
    # kept byte-for-byte; inference prompts presumably have to match it.
    for i, question in enumerate(example["input"]):
        text = f"""Extract the entities for the following labels from the given text and provide the results in JSON format
- Entities must be extracted exactly as mentioned in the text.
- Return each entity under its label without creating new labels.
- Provide a list of entities for each label, ensuring that if no entities are found for a label, an empty list is returned.
- Accuracy and relevance in your responses are key.

Lables and their Descriptions:
- Task: applications, problems to solve, systems to construct.
- Method: methods, models, systems to use, tools, components of a system.
- Metric: metrics, measures, or entities that can express quality of a system/method.
- Material: data, datasets, resources, Corpus, Knowledge base.
- OtherScientificTerm: phrases that are a scientific terms but do not fall into any of the above classes.
- Generic: general terms or pronouns that may refer to a entity but are not themselves informative.

### Input text: {question}

### Response:
{{"""

        # Serialize each entity list with json.dumps rather than str(): the
        # Python repr uses single quotes (and switches quoting when an entity
        # contains an apostrophe), which is not the JSON the prompt asks for.
        entries = []
        for key in keys:
            value = json.dumps(example[key][i], ensure_ascii=False)
            entries.append(f' \n"{key}": {value}')

        # Same layout as before: entries separated by ", " and no trailing
        # comma before the closing brace.
        text += ", ".join(entries)
        text += "\n}"
        output_texts.append(text)

    return output_texts


# Load the training and validation sets from their JSON files.
# Each call reads a single file, and its records are requested through
# split='train' -- also for the dev file (the `datasets` loaders expose a lone
# data file under the 'train' split; see the load_dataset docs).
train_dataset = load_dataset('json', data_files='/content/drive/MyDrive/Tesi/dataset/scierc_train_inc.json', split='train')
val_dataset = load_dataset('json', data_files='/content/drive/MyDrive/Tesi/dataset/scierc_dev_inc.json', split='train')

# Prepare the model to be trained with PEFT in 4 bit.

from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing, then hand the model to PEFT's k-bit
# preparation helper; `model` is rebound to the prepared model.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()``, totals the element counts of all
    parameters and of those with ``requires_grad`` set, and prints both
    totals together with the trainable share as a percentage.
    """
    # (element count, is trainable) for every parameter of the model.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]

    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    percentage = 100 * trainable_params / all_param

    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}"
    )

# LoRA configuration for PEFT.

from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=32,  # rank of the LoRA update matrices
    lora_alpha=32,  # LoRA scaling factor (equal to r here)
    # Modules that receive LoRA adapters: the attention projections, the MLP
    # projections and the output head.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",  # bias terms are not trained
    lora_dropout=0.05,  # dropout applied on the LoRA layers
    task_type="CAUSAL_LM",
)

# Wrap the model with the LoRA adapters and report how many parameters
# are left trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

# Accelerator configuration for training.

from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

# FSDP plugin: both the model state dict and the optimizer state dict are
# configured as "full" state dicts, offloaded to CPU, with rank0_only=False.
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

# Let the accelerator prepare the PEFT model; `model` is rebound to the result.
# NOTE(review): the Trainer presumably manages its own Accelerator as well --
# confirm this explicit preparation is still needed.
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)
model = accelerator.prepare_model(model)

import transformers

# Name of the fine-tuning run and of its output folder.
#
# An earlier set of values (project "ner-finetune-new-prompt", base model
# "mistral") used to be assigned here and was immediately overwritten by the
# assignments below; that dead code has been removed. `transformers` is
# already imported right above, so the duplicate import is dropped as well.

from datetime import datetime

# NOTE(review): the project is called "no-inc" while the dataset files loaded
# earlier are named "*_inc.json" -- confirm the run name matches the data.
project = "no-inc"
base_model_name = "gemma"
run_name = base_model_name + "-" + project
output_dir = "/content/drive/MyDrive/Tesi/models/" + run_name

# Supervised fine-tuning run.
#
# NOTE(review): this is written against the TRL/transformers API in which
# SFTTrainer accepts `tokenizer=` / `max_seq_length=` and TrainingArguments
# accepts `evaluation_strategy=`; newer releases rename these, so pin the
# library versions or adapt the argument names accordingly.
trainer = SFTTrainer(
    model=model,
    # Hand over the tokenizer configured earlier in this cell. Without it the
    # trainer loads the model's default tokenizer, and the padding side and
    # BOS/EOS settings chosen above are not applied when the prompts are
    # tokenized.
    tokenizer=tokenizer,
    # Make the sequence-length limit explicit instead of relying on the
    # trainer's default.
    max_seq_length=max_length,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Turns each batch of examples into full prompt strings.
    formatting_func=formatting_func,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=1,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        max_steps=1200,
        learning_rate=2e-4,
        bf16=False,
        fp16=False,
        optim="paged_adamw_32bit",
        logging_steps=1,               # log at every step
        logging_dir="./logs",
        save_strategy="steps",
        save_steps=200,                # checkpoint every 200 steps
        evaluation_strategy="steps",
        eval_steps=200,                # evaluate every 200 steps
        do_eval=True,
        # Run name suffixed with the start time, to tell runs apart.
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
    ),
    # Causal-LM collation (mlm=False).
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

