In [None]:
%%capture
# Install the `rouge_score` package; %%capture hides pip's console output.
# NOTE(review): no visible cell imports rouge_score directly — confirm it is still needed.
!pip install rouge_score

In [None]:
%%capture
# Install the `evaluate` package (imported later via `from evaluate import load`).
!pip install evaluate

In [None]:
%%capture
# Step 1: regular install of unsloth, which also lets pip resolve its dependencies.
!pip install unsloth
# Step 2: remove that release and reinstall unsloth (nightly branch) and unsloth-zoo
# straight from GitHub. Because of --no-deps, this step installs no dependencies itself.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git@nightly git+https://github.com/unslothai/unsloth-zoo.git

In [None]:
%%capture
# Install the `datasets` package (used later for `Dataset.from_pandas`).
!pip install datasets

In [None]:
%%capture
# Upgrade transformers, peft, accelerate and bitsandbytes to their newest releases (-U).
# NOTE(review): this is unpinned and runs after the unsloth install — confirm the
# upgraded versions are ones the installed unsloth build supports.
!pip install -U transformers peft accelerate bitsandbytes

In [None]:
%%capture
import os
# Choose an install recipe depending on whether any environment variable name
# contains the substring "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    # No such variable: plain install, pip resolves dependencies.
    !pip install unsloth
else:
    # Such a variable exists: install the listed packages without their
    # dependencies (xformers pinned to 0.0.29), then the supporting libraries,
    # and finally unsloth itself, also without dependencies.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    !pip install --no-deps cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
# Mount Google Drive at /content/drive; the data file and adapter directory
# referenced in later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Directory on the mounted Drive that holds saved LoRA adapter files.
lora_path = "/content/drive/MyDrive/English_SRL_unsloth_Llama-3.2-3B-Instruct-bnb-4bit/lora/"
# Print the directory contents as a quick check that the mount and path are valid.
# NOTE(review): `lora_path` is not used again in the visible cells — confirm whether
# the adapter is meant to be loaded.
print(os.listdir(lora_path))

['README.md', 'adapter_model.safetensors', 'tokenizer.json', 'tokenizer_config.json', 'adapter_config.json', 'special_tokens_map.json']


In [None]:
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
from trl import SFTTrainer
import torch
import pandas as pd
import ast
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import Trainer, TrainingArguments, BatchEncoding#AdamW, get_linear_schedule_with_warmup

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.5.1+cu121 with CUDA 1201 (you have 2.6.0+cu124)
    Python  3.11.11 (you have 3.11.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# Handle for the CUDA device.
device = torch.device("cuda")

# Read the annotated corpus from Drive and discard the listed metadata columns.
json_data = '/content/drive/MyDrive/davodi.json'
persian_df = pd.read_json(json_data).drop(
    columns=[
        '_id', 'sentence', 'frame', 'lexicalUnit', 'status', 'issuer',
        'is_active', 'createdAt', 'updatedAt', 'PId', 'lang', 'description',
        'lexicalUnitHint', 'reviewer', 'lexicalUnitHelper', 'frameHelper',
        'frameName', 'lexicalUnitName',
    ]
)

In [None]:
def framenet_tags(FN_tags):
    """Convert a sequence of FrameNet tag records into a flat list of labels.

    Each record is a dict. A record whose ``'tagType'`` equals 5 contributes the
    ``'name'`` stored in its ``'element'`` dict; every other record contributes
    the outside label ``'O'``.

    A missing ``'element'``, a null ``'element'``, a missing ``'name'`` or a
    null ``'name'`` all produce an empty string, so null values in the source
    JSON cannot raise or leak ``None`` into the label list.

    Args:
        FN_tags: iterable of tag dicts, one per token.

    Returns:
        list of labels, one per input record, in input order.
    """
    framenet_pattern = []
    for FN_tag in FN_tags:
        if FN_tag.get('tagType') == 5:
            # `or {}` also covers an 'element' key that is present but null.
            element = FN_tag.get('element') or {}
            element_name = element.get('name', '')
            framenet_pattern.append('' if element_name is None else element_name)
        else:
            framenet_pattern.append('O')
    return framenet_pattern

In [None]:
# Replace each row's raw tag records with the flat label list from framenet_tags.
# The column is overwritten in place, so this cell is meant to run once per load.
persian_df['frameNetTags'] = persian_df['frameNetTags'].apply(framenet_tags)

In [None]:
import numpy as np
import os
import random

# Ask PyTorch to release cached GPU memory it is not currently using.
torch.cuda.empty_cache()

# A single seed shared by the torch, numpy and stdlib random generators.
seed = 42
for _seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    _seed_fn(seed)

In [None]:
import datasets
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

# Hold out 8% of the rows, then split that held-out part again (5% of it
# becomes the test split, the remainder the validation split).
# NOTE(review): the Map progress output in this notebook shows 92 / 7 / 1 rows,
# i.e. a single test example — confirm that this split is intended.
persian_df_tr, persian_df_temp = train_test_split(persian_df, test_size=.08, random_state=seed)
persian_df_val, persian_df_te = train_test_split(persian_df_temp, test_size=.05, random_state=seed)
del persian_df_temp

# Renumber each split from zero and wrap it as a Hugging Face Dataset.
persian_df_tr = Dataset.from_pandas(persian_df_tr.reset_index(drop=True))
persian_df_val = Dataset.from_pandas(persian_df_val.reset_index(drop=True))
persian_df_te = Dataset.from_pandas(persian_df_te.reset_index(drop=True))

In [None]:
# Chat-style prompt with three positional slots that str.format fills in order:
#   1. the input text, 2. the list of allowed PropBank roles, 3. the expected
#   answer (the assistant turn).
# The slots used to be wrapped in leftover string-concatenation syntax
# ('"+ {}+"'), which ended up as literal quote and plus characters in every
# prompt; they are now plain placeholders.
prop_prompt_template = """<|start_header_id|>system<|end_header_id|>
Please generate PropBank roles for the provided text.
<|eot_id|><|start_header_id|>user<|end_header_id|>
You are an expert in the field of Semantic Role Labeling and lexical resources especially PropBank.
You know anything about how to label sentence tokens with PropBank roles.
Please use the following text: {}
Here are the propbank roles you have to use for labeling: {}
Your task is to generate PropBank roles for the provided text.
The output should be a list of roles in a list format. IF a token does not have any role, put 'O'.
Make sure that you do NOT use any roles other than the ones I provided in this prompt.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
{}"""

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# Checkpoint to fine-tune; the commented line is the larger alternative.
#model_name = "unsloth/Llama-3.3-70B-Instruct-bnb-4bit"
model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"

# Load the checkpoint in 4-bit with float16 compute and wrap it with a small
# LoRA adapter on the attention query/value projections.
# NOTE(review): `model` and `tokenizer` are reassigned by the FastLanguageModel
# cell that follows, so the objects created here are not the ones that get
# trained — confirm whether this second load of the same checkpoint is needed.
bnb_config = BitsAndBytesConfig(load_in_4bit=True,bnb_4bit_compute_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,quantization_config=bnb_config,device_map="auto")
lora_config = LoraConfig(r=8,lora_alpha=32,target_modules=["q_proj", "v_proj"],lora_dropout=0.05,bias="none")
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Report the checkpoint that was actually loaded (the message used to name a
# different model family).
print(f"✅ {model_name} loaded with LoRA and 4-bit precision!")

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

trainable params: 2,293,760 || all params: 3,215,043,584 || trainable%: 0.0713
✅ DeepSeek LLM Loaded with LoRA and 4-bit Precision!


In [None]:
from unsloth import FastLanguageModel

# Load the base checkpoint and its tokenizer through Unsloth in 4-bit,
# replacing any `model` / `tokenizer` created earlier.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)

# Attach LoRA adapters to the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [None]:
# End-of-sequence marker appended to every training prompt.
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_function(examples):
    """Render a batch of examples into complete training prompts.

    For every (words, propBankTags) pair, the prompt template is filled with
    the input words, the global list of allowed roles (``prop_roles``) and the
    gold tags, then terminated with the tokenizer's EOS token.

    The rendered prompts are returned under the ``"text"`` key, the column the
    trainer is configured to read (``dataset_text_field="text"``). The earlier
    version built these prompts but then discarded them and returned the
    tokenized raw ``words`` column, so prompt and labels never reached training.

    Args:
        examples: batched mapping with ``"words"`` and ``"propBankTags"``
            columns (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        dict with a single ``"text"`` list, one prompt per input example.
    """
    texts = [
        prop_prompt_template.format(word, str(prop_roles), prop) + EOS_TOKEN
        for word, prop in zip(examples["words"], examples["propBankTags"])
    ]
    return {"text": texts}

In [None]:
# Collect every distinct PropBank role that occurs anywhere in the corpus.
prop_roles = set()
for tags in persian_df['propBankTags']:
    # str() + literal_eval accepts both real lists and stringified lists.
    prop_roles.update(ast.literal_eval(str(tags)))
# Sort before freezing into a list: the list is embedded verbatim in every
# prompt, and the iteration order of a set of strings can differ between
# Python sessions, which would silently change the prompt from run to run.
prop_roles = sorted(prop_roles, key=str)

In [None]:
def _as_prompt_dataset(split):
    """Format one split with formatting_prompts_function (batched, dropping the
    split's original columns) and return it shuffled with the notebook seed."""
    formatted = split.map(
        formatting_prompts_function,
        remove_columns=split.column_names,
        batched=True,
    )
    return formatted.shuffle(seed=seed)

main_dataset = _as_prompt_dataset(persian_df_tr)
eval_dataset = _as_prompt_dataset(persian_df_val)
test_dataset = _as_prompt_dataset(persian_df_te)

Map:   0%|          | 0/92 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
# Restrict this process to GPU 0. This has to be a plain Python statement:
# with a leading `!` the line is handed to the shell, which fails with
# "command not found" and leaves the environment unchanged.
# NOTE(review): torch/unsloth are imported in earlier cells — confirm the
# variable still takes effect when set this late.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

/bin/bash: line 1: os.environ[CUDA_VISIBLE_DEVICES]: command not found


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from evaluate import load
import numpy as np
import math
import torch
import torch.nn.functional as F

def compute_metrics(eval_pred):
    """Compute loss, perplexity and token-level metrics for a causal LM.

    The model's logits at position ``t`` score the token at position ``t + 1``,
    so logits and labels are shifted by one before any comparison. Positions
    whose label is -100 are excluded from every metric, including the loss
    average.

    Args:
        eval_pred: pair ``(logits, labels)``. ``logits`` is array-like with
            shape (batch, seq_len, vocab), or a tuple whose first item is that
            array. ``labels`` is array-like with shape (batch, seq_len), where
            -100 marks ignored positions. ``labels`` may be ``None``.

    Returns:
        dict that may contain:
            manual_eval_loss: mean of the per-sample mean token losses.
            perplexity: ``exp(manual_eval_loss)`` (``inf`` on overflow).
            loss_per_sample: list with one mean token loss per sample that
                has at least one scored token.
            accuracy_score: fraction of scored tokens predicted correctly.
            token_accuracy: the same value as a percentage.
            precision / recall / f1: macro-averaged over token ids.
        Keys are omitted when there is nothing to compute them from.
    """
    ignore_index = -100
    logits, labels = eval_pred
    # Some models return a tuple of outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    results = {}
    if labels is None:
        return results

    logits_tensor = torch.as_tensor(logits, dtype=torch.float32)
    labels_tensor = torch.as_tensor(labels, dtype=torch.long)

    # Align predictions with the tokens they predict.
    shift_logits = logits_tensor[:, :-1, :]
    shift_labels = labels_tensor[:, 1:]
    scored = shift_labels != ignore_index

    # 1. Loss / perplexity, one sample at a time to bound peak memory.
    per_sample_losses = []
    for sample_logits, sample_labels, sample_scored in zip(shift_logits, shift_labels, scored):
        if not bool(sample_scored.any()):
            continue  # nothing to score in this sample
        sample_loss = F.cross_entropy(sample_logits[sample_scored], sample_labels[sample_scored])
        per_sample_losses.append(sample_loss.item())

    if per_sample_losses:
        mean_loss = sum(per_sample_losses) / len(per_sample_losses)
        results["manual_eval_loss"] = mean_loss
        try:
            results["perplexity"] = math.exp(mean_loss)
        except OverflowError:
            results["perplexity"] = float("inf")
        results["loss_per_sample"] = per_sample_losses

    # 2. Token-level metrics over scored positions only.
    flattened_preds = shift_logits.argmax(dim=-1)[scored].detach().cpu().numpy()
    flattened_labels = shift_labels[scored].detach().cpu().numpy()

    if flattened_labels.size:
        accuracy = float(accuracy_score(flattened_labels, flattened_preds))
        results["accuracy_score"] = accuracy
        results["token_accuracy"] = accuracy * 100
        # zero_division=0 avoids warnings for token ids that are never predicted.
        results["precision"] = precision_score(flattened_labels, flattened_preds, average="macro", zero_division=0)
        results["recall"] = recall_score(flattened_labels, flattened_preds, average="macro", zero_division=0)
        results["f1"] = f1_score(flattened_labels, flattened_preds, average="macro", zero_division=0)
    return results

In [None]:
# Use bf16 when the hardware supports it, otherwise fall back to fp16.
use_bf16 = is_bfloat16_supported()

# Optimisation settings for the single fine-tuning epoch.
training_args = TrainingArguments(
    per_device_train_batch_size=6,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    num_train_epochs=1,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    logging_strategy="steps",
    logging_steps=1,
)

# Supervised fine-tuning trainer reading prompts from the "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=main_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    dataset_num_proc=2,
    packing=False,
    args=training_args,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate with the trainer (it was given eval_dataset and compute_metrics at
# construction) and print the resulting metrics.
eval_results = trainer.evaluate()
print(eval_results)