In [None]:
import torch
from datasets import load_from_disk
from peft import LoraConfig, get_peft_model
import evaluate
from transformers import DataCollatorForSeq2Seq, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

import matplotlib.pyplot as plt
from utils import prepare_prompt, print_trainable_parameters, evaluate_model
import transformers
from tqdm import tqdm

import json

# Dataset Loading (pre-computed train / validation / test splits)

In [None]:
# Load the dataset stored under the local 'dataset_split' path; its 'train',
# 'validation' and 'test' splits are the ones used further down.
dataset_split = load_from_disk('dataset_split')

# Show the loaded object as a quick sanity check.
print(dataset_split)

# Model Fine-Tuning

In [None]:
# Hugging Face identifier of the small language model that gets fine-tuned.
slm_name = "Qwen/Qwen2.5-0.5B-Instruct"

# The tokenizer pads on the left, so a padded prompt still ends right where
# generation starts.
tokenizer_slm = AutoTokenizer.from_pretrained(
    slm_name,
    padding_side="left",
    cache_dir="/Data/gabriel-mercier/slm_models",
)

# Use the end-of-sequence token as the padding token.
tokenizer_slm.pad_token = tokenizer_slm.eos_token

In [None]:
# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized base model; `device_map="auto"` decides where the weights
# are placed.
model_raw = AutoModelForCausalLM.from_pretrained(
    slm_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    cache_dir="/Data/gabriel-mercier/slm_models",
)


In [None]:
# LoRA adapters (rank 16, alpha 32, 5% dropout) attached to the attention
# projections only; bias terms are not trained.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias='none',
    task_type="CAUSAL_LM",
)

# Wrap the quantized base model with the trainable LoRA adapters.
model = get_peft_model(model_raw, lora_config)

# Device used for the *inputs* in the following cells.
# The model itself is deliberately NOT moved with `.to(device)`: it was loaded
# 4-bit quantized with `device_map="auto"`, which already placed its weights.
# Relocating such a model afterwards is redundant on a single device and can
# undo the automatic placement when several devices are involved.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



In [None]:
# Work directly on the model's own generation config (same object, not a copy).
generation_config = model.generation_config

# Sampling settings: at most 200 new tokens, a single nucleus-sampled sequence
# (temperature 0.7, top-p 0.7); the EOS token doubles as the padding token.
generation_overrides = {
    "max_new_tokens": 200,
    "temperature": 0.7,
    "top_p": 0.7,
    "num_return_sequences": 1,
    "pad_token_id": tokenizer_slm.eos_token_id,
    "eos_token_id": tokenizer_slm.eos_token_id,
    "do_sample": True,
}
for option_name, option_value in generation_overrides.items():
    setattr(generation_config, option_name, option_value)

# Report how many parameters the LoRA wrapping leaves trainable.
print_trainable_parameters(model)

### Example (before fine-tuning)

In [None]:
# Text after which the summary is expected (presumably the tail of the prompt
# built by `prepare_prompt` -- confirm in utils). It stays defined here because
# the post-training example cell reads this name too.
assistant_start = "Résumé concis et structuré (100 mots maximum) :"

# One training example: its reference summary and the prompt without summary.
example = dataset_split['train'][1]
summary_data = example['summary']
prompt = prepare_prompt(example, summary_included=False)

encoding = tokenizer_slm(prompt, return_tensors="pt").to(device)

with torch.inference_mode():
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config,
    )

# For a causal LM, `generate` returns the prompt tokens followed by the newly
# generated ones. Slicing by the prompt length isolates the summary exactly:
# searching the decoded text for a marker string and skipping one extra
# character can drop the first letter of the summary, cut at the wrong place
# when the marker also occurs in the source text, or print the whole prompt
# when the marker is not found.
prompt_length = encoding.input_ids.shape[1]
prediction = tokenizer_slm.decode(
    outputs[0][prompt_length:], skip_special_tokens=True
).strip()

print("=== GENERATED SUMMARY ===")
print(prediction)
print(len(prediction.split()))

print("=== LABEL SUMMARY ===")
print(summary_data)
print(len(summary_data.split()))

In [None]:
def generate_and_tokenize_prompt(data_point):
    """Tokenize one example and mask the prompt part of its labels.

    The full training text is `prepare_prompt(data_point)` followed by the
    tokenizer's EOS token. Labels are a copy of the input ids in which every
    position up to and including the marker
    "Résumé concis et structuré (100 mots maximum) :" is set to -100, so only
    the tokens after the marker (and the EOS token) remain as targets.

    Args:
        data_point: One dataset row, forwarded unchanged to `prepare_prompt`.

    Returns:
        A dict with 1-D tensors `input_ids`, `labels` and `attention_mask`.

    Raises:
        ValueError: If the tokenized marker cannot be found in the tokenized
            prompt (previously this surfaced as an unbound-variable error).
    """
    full_prompt = prepare_prompt(data_point) + tokenizer_slm.eos_token
    tokenized_full_prompt = tokenizer_slm(full_prompt, return_tensors='pt')
    input_ids = tokenized_full_prompt.input_ids.flatten()
    labels = input_ids.clone()

    # The marker is tokenized in two pieces, as in the original code: the first
    # piece is searched for, the second only contributes its token count.
    # `add_special_tokens=False` keeps BOS/EOS-style tokens out of both pieces,
    # so the search and the length stay correct for tokenizers that add them.
    marker_ids = tokenizer_slm(
        "Résumé concis et structuré", return_tensors='pt', add_special_tokens=False
    )['input_ids'].flatten()
    complement_ids = tokenizer_slm(
        "(100 mots maximum) :", return_tensors='pt', add_special_tokens=False
    )['input_ids'].flatten()

    # Scan from the end and stop at the first hit: this selects the LAST
    # occurrence of the marker, exactly like the original forward scan that
    # kept overwriting its result, without visiting the whole sequence.
    marker_len = len(marker_ids)
    end_prompt_idx = None
    for start in range(len(input_ids) - marker_len, -1, -1):
        if torch.equal(input_ids[start:start + marker_len], marker_ids):
            end_prompt_idx = start + marker_len
            break

    if end_prompt_idx is None:
        raise ValueError(
            "Prompt marker 'Résumé concis et structuré' was not found in the "
            "tokenized prompt; cannot determine which label positions to mask."
        )

    # Ignore the prompt (marker and its complement included) in the loss.
    labels[:end_prompt_idx + len(complement_ids)] = -100

    return {
        'input_ids': input_ids,
        'labels': labels,
        'attention_mask': tokenized_full_prompt.attention_mask.flatten(),
    }


In [None]:
# Training split: shuffle (fixed seed), tokenize, then drop the raw text columns.
dataset_train = (
    dataset_split["train"]
    .shuffle(seed=42)
    .map(generate_and_tokenize_prompt)
    .remove_columns(["text", "summary"])
)

# Validation split: same pipeline as the training split.
dataset_val = (
    dataset_split["validation"]
    .shuffle(seed=42)
    .map(generate_and_tokenize_prompt)
    .remove_columns(["text", "summary"])
)

# The test split stays untouched; it is handed to `evaluate_model` later on.
dataset_test = dataset_split["test"]


In [None]:
# Print each prepared split in turn: train, validation, test.
for split_dataset in (dataset_train, dataset_val, dataset_test):
    print(split_dataset)

### Training

In [None]:
training_args = transformers.TrainingArguments(
    # Directory receiving checkpoints and logs. Passed explicitly because
    # `output_dir` is a required argument in the Transformers releases that
    # still accept `evaluation_strategy`; without it this call raises.
    output_dir="trainer_output",
    # One sequence per step, accumulated over 4 steps before each update.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    bf16=True,
    # Keep at most three checkpoints on disk.
    save_total_limit=3,
    logging_steps=1,
    # NOTE(review): newer Transformers releases rename this argument to
    # `eval_strategy`; switch when upgrading the library.
    evaluation_strategy="epoch",
    optim="paged_adamw_8bit",
    # Cosine decay after a warm-up covering the first 5% of the steps.
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
)


In [None]:
# Collator that turns the tokenized examples (input_ids, attention_mask,
# labels) into batches.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer_slm, model=model)

# Trainer for the LoRA-wrapped model, evaluated on the validation split.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning; as the last expression of the cell, the value returned
# by `train()` is displayed by the notebook.
trainer.train()

# Model Evaluation

### Example (after fine-tuning)

In [None]:
# Same example as before training: `prompt` and `summary_data` come from the
# pre-training example cell.
encoding = tokenizer_slm(prompt, return_tensors="pt").to(device)

# Make sure dropout (including the LoRA dropout) is disabled while generating.
model.eval()
with torch.inference_mode():
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config,
    )

# For a causal LM, `generate` returns the prompt tokens followed by the newly
# generated ones. Slicing by the prompt length isolates the summary exactly:
# searching the decoded text for a marker string and skipping one extra
# character can drop the first letter of the summary, cut at the wrong place
# when the marker also occurs in the source text, or print the whole prompt
# when the marker is not found.
prompt_length = encoding.input_ids.shape[1]
prediction = tokenizer_slm.decode(
    outputs[0][prompt_length:], skip_special_tokens=True
).strip()

print("=== GENERATED SUMMARY ===")
print(prediction)
print(len(prediction.split()))
print("=== LABEL SUMMARY ===")
print(summary_data)
print(len(summary_data.split()))

In [None]:
# Load the ROUGE and BERTScore metrics through the `evaluate` library.
# NOTE(review): neither object is passed to `evaluate_model` in the cells
# below -- presumably utils loads its own metrics; confirm this cell is needed.
rouge = evaluate.load("rouge")
bert_score = evaluate.load("bertscore")

In [None]:
# Score the fine-tuned model on the test split; `evaluate_model` (from utils)
# returns a pair that is stored as ROUGE results and BERTScore results.
rouges_results_finetune, bert_results_finetune = evaluate_model(model, dataset_test, tokenizer_slm, device, generation_config)

In [None]:
# Bundle both metric families of the fine-tuned model under fixed keys.
results_finetune = dict(
    rouge=rouges_results_finetune,
    bert=bert_results_finetune,
)

# Persist them as indented JSON next to the notebook.
with open("evaluation_results_finetune.json", "w") as results_file:
    json.dump(results_finetune, results_file, indent=4)


In [None]:
# Reload the untouched base checkpoint as the baseline. No quantization config
# is passed here, unlike for the model that was fine-tuned.
base_model_kwargs = {"cache_dir": "/Data/gabriel-mercier/slm_models"}
model_raw = AutoModelForCausalLM.from_pretrained(slm_name, **base_model_kwargs)

# Move the baseline to the evaluation device (the notebook displays the module).
model_raw.to(device)


In [None]:
# Evaluate the base model reloaded in the previous cell (`model_raw`).
# Passing the fine-tuned `model` here would store the fine-tuned scores a
# second time under the "raw" name and make the comparison meaningless.
rouges_results_raw, bert_results_raw = evaluate_model(model_raw, dataset_test, tokenizer_slm, device, generation_config)

# Bundle both metric families of the baseline under the same keys as the
# fine-tuned results.
results_raw = {
    "rouge": rouges_results_raw,
    "bert": bert_results_raw
}

# Persist them as indented JSON next to the notebook.
with open("evaluation_results_raw.json", "w") as f:
    json.dump(results_raw, f, indent=4)