# first trial to finetuning llama2 on a generic summarization dataset

## take eur-lex-sum dataset from huggingface just italian

In [1]:
import pandas as pd
import os

EUR_LEX_SUM_PATH = "../data/raw/summarization/eur-lex-sum_it/"
TRAIN_EUR_PATH = os.path.join(EUR_LEX_SUM_PATH, "train.json")
TEST_EUR_PATH = os.path.join(EUR_LEX_SUM_PATH, "test.json")

# Read JSON file line by line
with open(TRAIN_EUR_PATH, "r", encoding="utf-8") as file:
    lines = file.readlines()

# Join the lines into a single JSON string
json_data_train = "[" + ",".join(lines) + "]"

# Load JSON into Pandas DataFrame
try:
    data_train = pd.read_json(json_data_train)
    print(data_train)
except ValueError as e:
    print("Error:", e)

  data_train = pd.read_json(json_data_train)


            celex_id                                          reference  \
0         32019R0980  21.6.2019\nIT\nGazzetta ufficiale dell'Unione ...   
1         32019D0785  16.5.2019\nIT\nGazzetta ufficiale dell'Unione ...   
2         32019R1122  2.7.2019\nIT\nGazzetta ufficiale dell'Unione e...   
3         32019R0856  28.5.2019\nIT\nGazzetta ufficiale dell'Unione ...   
4     22020A0724(01)  24.7.2020\nIT\nGazzetta ufficiale dell’Unione ...   
...              ...                                                ...   
1023  21986A0618(01)  EUR-Lex - 21986A0618(01) - IT\nAvis juridique ...   
1024      32004R0785  30.4.2004\nIT\nGazzetta ufficiale dell'Unione ...   
1025      31965R0019  EUR-Lex - 31965R0019 - IT\nAvis juridique impo...   
1026  31958R0003(01)  EUR-Lex - 31958R0003(01) - IT\nAvis juridique ...   
1027  22004A0210(01)  EUR-Lex - 22004A0210(01) - IT\nAvis juridique ...   

                                                summary  
0     Prospetto da pubblicare per l’offer

In [2]:
with open(TEST_EUR_PATH, "r", encoding="utf-8") as file:
    lines = file.readlines()

# Join the lines into a single JSON string
json_data_test = "[" + ",".join(lines) + "]"

# Load JSON into Pandas DataFrame
try:
    data_test = pd.read_json(json_data_test)
    print(data_test)
except ValueError as e:
    print("Error:", e)

  data_test = pd.read_json(json_data_test)


           celex_id                                          reference  \
0        32019R1156  12.7.2019\nIT\nGazzetta ufficiale dell'Unione ...   
1        32019R1020  25.6.2019\nIT\nGazzetta ufficiale dell'Unione ...   
2        32019L0789  17.5.2019\nIT\nGazzetta ufficiale dell'Unione ...   
3        32019R0515  29.3.2019\nIT\nGazzetta ufficiale dell'Unione ...   
4        32019R0517  29.3.2019\nIT\nGazzetta ufficiale dell'Unione ...   
..              ...                                                ...   
183      32008R0762  13.8.2008\nIT\nGazzetta ufficiale dell’Unione ...   
184      32010R1236  31.12.2010\nIT\nGazzetta ufficiale dell'Unione...   
185      32010R0066  30.1.2010\nIT\nGazzetta ufficiale dell’Unione ...   
186      32009R0469  16.6.2009\nIT\nGazzetta ufficiale dell'Unione ...   
187  22009A0610(01)  10.6.2009\nIT\nGazzetta ufficiale dell’Unione ...   

                                               summary  
0    Distribuzione transfrontaliera degli organismi...

## model llama2

In [3]:
import os

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer



In [4]:
# model = "NousResearch/llama-2-7b-chat-hf"

In [5]:
# tokenizer = AutoTokenizer.from_pretrained(model)
# pipeline = pipeline(
#     "text-generation",
#     model=model,
#     torch_dtype=torch.float16,
#     device_map="auto",
# )

In [6]:
# use this if you have access to the official LLaMA 2 model "meta-llama/Llama-2-7b-chat-hf", though keep in mind you'll need to pass a Hugging Face key argument
model_name = "NousResearch/llama-2-7b-chat-hf"
dataset_name = "/content/train.jsonl"
new_model = "llama-2-7b-law-gpt"
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
output_dir = "./results"
num_train_epochs = 1
fp16 = False
bf16 = False
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "constant"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 25
logging_steps = 5
max_seq_length = None
packing = False
device_map = {"": 0}

In [7]:
prompt = "Sei un modello che fa riassunto di leggi"
temperature = 0.4
number_of_examples = 100

In [8]:
from datasets import Dataset

dataset_train = Dataset.from_pandas(data_train[:100])
dataset_test = Dataset.from_pandas(data_test[:50])

In [9]:
dataset_train

Dataset({
    features: ['celex_id', 'reference', 'summary'],
    num_rows: 100
})

In [10]:
# Preprocess datasets
train_dataset_mapped = dataset_train.map(
    lambda examples: {
        "text": [
            f"[INST] <<SYS>>\nSei un LawGPT\n<</SYS>>\n\n"
            + reference
            + " [/INST] "
            + summary
            for reference, summary in zip(examples["reference"], examples["summary"])
        ]
    },
    batched=True,
)
valid_dataset_mapped = dataset_test.map(
    lambda examples: {
        "text": [
            f"[INST] <<SYS>>\nSei un LawGPT\n<</SYS>>\n\n"
            + reference
            + " [/INST] "
            + summary
            for reference, summary in zip(examples["reference"], examples["summary"])
        ]
    },
    batched=True,
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [11]:
# ! questo diventa torch.float16
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [19]:
import torch
import os

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

torch.__version__

'2.0.1+cu117'

In [20]:
import torch

print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))
print(torch.cuda.current_device())

True
1
GRID A100D-2-20C MIG 2g.20gb
0


In [14]:
model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=bnb_config, device_map=device_map
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [15]:
model.config.use_cache = False
model.config.pretraining_tp = 1
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

In [16]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="all",
    evaluation_strategy="steps",
    eval_steps=5,  # Evaluate every 20 steps
)

In [17]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset_mapped,
    eval_dataset=valid_dataset_mapped,  # Pass validation dataset here
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [18]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 20.00 GiB total capacity; 4.92 GiB already allocated; 94.81 MiB free; 5.04 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
trainer.model.save_pretrained(new_model)

In [None]:
# Cell 4: Test the model
# logging.set_verbosity(logging.CRITICAL)
# prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\nWrite a function that reverses a string. [/INST]" # replace the command here with something relevant to your task
# pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
# result = pipe(prompt)
# print(result[0]['generated_text'])

In [None]:
model_path = "../models/llama-7b-hf-law_gpt"

# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"