In [1]:
import re
import os

import datasets
import evaluate
import numpy as np
import pandas as pd

pd.options.display.max_columns = 500

from datasets import load_dataset
from peft import get_peft_model
from peft import LoraConfig
from peft import PeftConfig
from peft import PeftModelForCausalLM
from peft import prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import BitsAndBytesConfig
from transformers import DataCollatorForLanguageModeling
from transformers import default_data_collator
from transformers import Trainer
from transformers import TrainingArguments
from transformers import GenerationConfig
from trl import SFTTrainer
import torch

SENTENCE_MAX_LENGTH=512

[2024-02-24 17:25:15,584] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)




In [2]:
ds = datasets.Dataset.from_json(path_or_paths='/opt/data_share/nfe_produto_1000_estruturado_anotado_train.json', split='train')
ds = ds.select_columns(['text', 'estruturado'])

def complete_prompt(row):
    """Add instruction to text context"""
    text = f"""\
Você é um assistente que organiza itens de notas fiscais. Para cada descrição oferecida, organize os dados em formato json colocando as informações de produto, marca, quantidade e unidade de medida separados. Outras informações devem ser colocadas em um campo separado. Se os números estiverem com vírgula para separar decimais, utilizar a notação americana no json.

### Descrição:
{row['text']}

### Resposta:
{row["estruturado"]}

### Fim
"""
    return {'text_prompt': text}

ds = ds.map(complete_prompt)
ds = ds.train_test_split(test_size=0.05, seed=42)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

In [3]:
from pprint import pprint
pprint(ds['train'][0])

{'estruturado': '{\n'
                "    'produto': 'LEITE UHT INTEGRAL',\n"
                "    'marca': 'SIG LANGUIRU',\n"
                "    'quantidade': 1,\n"
                "    'unidade': 'l',\n"
                "    'observacao': ''\n"
                '}',
 'text': 'LEITE UHT INTEGRAL  SIG LANGUIRU 1LT',
 'text_prompt': 'Você é um assistente que organiza itens de notas fiscais. '
                'Para cada descrição oferecida, organize os dados em formato '
                'json colocando as informações de produto, marca, quantidade e '
                'unidade de medida separados. Outras informações devem ser '
                'colocadas em um campo separado. Se os números estiverem com '
                'vírgula para separar decimais, utilizar a notação americana '
                'no json.\n'
                '\n'
                '### Descrição:\n'
                'LEITE UHT INTEGRAL  SIG LANGUIRU 1LT\n'
                '\n'
                '### Resposta:\n'
           

In [4]:
BASE_MODEL = '/opt/data_share/model/mistral-7b-instruct_pt_ds_v02_block_3072/checkpoint-11500/'
SUFFIX = 'nfe-v92'
NAME = f'llama2govbr-{SUFFIX}'
OUTPUT = f"/opt/data_share/model/finetunning/{NAME}"


tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL,
    use_fast=False,
)


if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '<PAD>'})
tokenizer.padding_side = 'right'


model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    load_in_4bit=True,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    ),
    device_map="auto",
    trust_remote_code=True,
)

model.resize_token_embeddings(model.config.vocab_size + 1)
model.config.use_cache = False


peft_config = LoraConfig(
    r=128,
    lora_alpha=256,
    lora_dropout=0.06,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)


ta = TrainingArguments(
    report_to="none",
    output_dir=OUTPUT,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    bf16=False,
    fp16=True,
    eval_steps=20,
    learning_rate=3e-5,
    weight_decay=0.001,
	evaluation_strategy="steps",
    save_steps=20,
    save_total_limit=3,
    logging_steps=10,
    logging_first_step=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_bleu",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')


def preprocess_logits_for_metrics(logits, labels):
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    
    s_preds = preds.copy()
    s_labels = labels.copy()

    s_preds[s_preds==-100] = tokenizer.pad_token_id
    s_labels[s_labels==-100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(s_preds, skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(s_labels, skip_special_tokens=True)
    
    labels = labels[:, 1:].reshape(-1)
    preds = preds[:, :-1].reshape(-1)

    me = {}
    try:
        me.update(rouge.compute(predictions=pred_str, references=labels_str))
    except:
        traceback.print_exc()
        print("Error computing rouge")
        pass

    try:
        me.update(bleu.compute(predictions=pred_str, references=labels_str))
    except:
        traceback.print_exc()
        print("Error computing bleu")
        pass

    return me


def tokenize(prompt):
    result = tokenizer(
        prompt,
        return_tensors='pt', 
        padding='max_length', 
        max_length=SENTENCE_MAX_LENGTH, 
        truncation=True,
    )
    return {
        "input_ids": result["input_ids"].reshape(-1),
        "attention_mask": result["attention_mask"].reshape(-1),
    }


ds = ds.map(lambda x: tokenize(x['text_prompt']))

Map:   0%|          | 0/855 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

In [6]:
trainer = SFTTrainer(
    model=model,
    args=ta,
    tokenizer=tokenizer,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    dataset_text_field='text_prompt',
    peft_config=peft_config,
    max_seq_length=SENTENCE_MAX_LENGTH,
    packing=False,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model(OUTPUT)

Map:   0%|          | 0/855 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Precisions,Brevity Penalty,Length Ratio,Translation Length,Reference Length
20,0.5211,0.32093,0.903838,0.855682,0.90136,0.903186,0.838348,"[0.8955532574974147, 0.8507306889352818, 0.8195995785036881, 0.791063829787234]",1.0,1.009395,4835,4790
40,0.2582,0.237159,0.938177,0.903861,0.937097,0.937558,0.901881,"[0.94285116181704, 0.9146238377007607, 0.8916151056112652, 0.8698836708315382]",0.997282,0.997286,4777,4790
60,0.2448,0.212992,0.939901,0.906228,0.938,0.939282,0.908088,"[0.9460363940598201, 0.918918918918919, 0.8974632274568323, 0.878174773999139]",0.998119,0.998121,4781,4790
80,0.2413,0.205225,0.940796,0.907841,0.939719,0.940344,0.908703,"[0.9460927705808608, 0.9185825775152922, 0.8969335604770017, 0.8776607181251344]",0.999165,0.999165,4786,4790
100,0.237,0.20109,0.94071,0.908814,0.93969,0.940386,0.91092,"[0.9475115014638227, 0.9214692843571881, 0.9004688832054561, 0.8816440714439423]",0.998328,0.99833,4782,4790
