#### Install Dependencies

In [None]:
%%capture
# Install Unsloth and pandas into the kernel environment; %%capture hides the installer output.
# NOTE(review): none of these installs is version-pinned, so results may differ between runs.
%pip install unsloth
%pip install pandas
# Also get the latest nightly Unsloth!
%pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

In [None]:
# Install transformers straight from its GitHub repository (default branch, not a pinned release).
!pip install git+https://github.com/huggingface/transformers



In [None]:
#!pip uninstall transformers -y
#!pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git"

#### Mount Google Drive

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd drive/MyDrive/phi_unsloth_1

#### Path

In [None]:
from unsloth import FastLanguageModel
import torch

# Checkpoint to fine-tune.
model_name = "unsloth/Phi-3.5-mini-instruct"

# Loading options.
max_seq_length = 1024  # Choose any! We auto support RoPE Scaling internally!
dtype = None           # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True    # Use 4bit quantization to reduce memory usage. Can be False.

# Load the base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Wrap the loaded model with LoRA adapters on the q/k/v/o and gate/up/down projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    r = 256, # LoRA rank. Template value: 16; best value found for this task: 256
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj", ], # Candidate extra target (not enabled): "lora_magnitude_vector"
    lora_alpha = 128, # Best value found for this task: 128
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    # Disabled experiments, kept for reference (not part of the original setting):
    #use_dora=True
    #lora_query=True,
    #lora_value=True

)

#### Data Preparation

In [None]:
import pandas as pd
from datasets import Dataset
# Load the labelled sentences and keep only the columns used in this notebook.
df = pd.read_excel('LR_Dataset_Original_Sythetic_Final.xlsx')
df = df[['Sentence', 'Category', 'Classification']]

# Normalise whitespace first, then capitalise. str.capitalize() upper-cases only the
# very first character, so capitalising before strip() left any sentence that starts
# with whitespace entirely lower-case.
df['Sentence'] = (df['Sentence']
                  .str.strip()
                  .str.replace(r'\n|\r', ' ', regex=True)
                  .str.replace(r'\s{2,}', ' ', regex=True)
                  .str.capitalize())

# Shuffle the rows with a fixed seed (same value as the other seeds in this notebook)
# so that a restart-and-run-all yields the same row order.
df = df.sample(frac=1, random_state=3407).reset_index(drop=True)
df


In [None]:
# Split on the predefined 'Classification' label.
# .copy() gives every split its own DataFrame, so adding columns to a split later
# does not write to a slice of `df` (which raises pandas' SettingWithCopyWarning).
train_df = df[df['Classification'] == 'TRAINING'].copy()
val_df = df[df['Classification'] == 'VALIDATION'].copy()
test_df = df[df['Classification'] == 'TEST'].copy()

In [None]:
# Sanity check: list the columns of the training split.
print(train_df.columns)

In [None]:
# Sanity check: show the first row of the training split.
print(train_df.head(1))

In [None]:
def create_conversation(row):
    """Build a two-turn ShareGPT-style conversation for one labelled row.

    Args:
        row: mapping-like object providing 'Sentence' and 'Category'.

    Returns:
        A list with the "human" message (task instructions followed by the
        sentence) and the "gpt" message ("Classification: <Category>").
    """
    # The instruction text is assembled from fixed segments; only the last one varies.
    prompt_segments = [
        "You are a researcher that should assign a classification to a sentence from scientific articles, choosing from one of the following seven categories. Each category corresponds to a specific aspect of scientific discourse, either related to a topic or a study. A topic is defined as a scientific domain, such as “Computer Science” or “Machine  Learning”. A previous study refers to a prior paper on the topic.\n",
        "Categories:\n ",
        "1. OVERALL: Describes, introduces, classifies, or defines research topics often based on the discussion of multiple previous studies together.\n ",
        "2. RESEARCH GAP: Highlights the need for further research within the topic.\n",
        "3. DESCRIPTION: Outlines the objectives, methodology, or design of one previous study, without mentioning results.\n",
        "4. RESULT: Describes specific findings or outcomes drawn from previous studies. This category includes empirical results, theoretical insights, and observed patterns reported by researchers. It often uses verbs like “showed”, “found”, “demonstrated”, and “observed” or phrases like “the findings indicate”.\n",
        "5. LIMITATION: Describes a constraint, challenge, or weakness inherent in the methodology of a previous study that hinders generalizability or reliability in a previous study.\n",
        "6. EXTENSION: Describes how the current study addresses or extends previous studies by stating the overall idea, contrasting ideas or elaborating further ideas. It usually uses the words “we” or “our”.\n",
        "7. OTHER: Any text that does not fit the above categories.\n",
        "Procedure:\n",
        "1. Determine whether the subject of the setence is a topic or a study.\n",
        "2. Identify the most suitable category based on the content. Do not create new categories. Use the categories given above.\n",
        "3. Provide the category number that best fits the sentence. Just provide the category number without any explanation.\n",
        f"Sentence: {row['Sentence']}.\n",
    ]
    user_turn = {"from": "human", "value": "".join(prompt_segments)}
    assistant_turn = {"from": "gpt", "value": f"Classification: {row['Category']}"}
    return [user_turn, assistant_turn]

In [None]:
def create_conversation_test(row):
    """Build the prompt-only conversation used at inference time.

    Args:
        row: mapping-like object providing 'Sentence'.

    Returns:
        A one-element list holding the "human" message: the task instructions
        followed by the sentence. No "gpt" answer is included.
    """
    instructions = (
        "You are a researcher that should assign a classification to a sentence from scientific articles, choosing from one of the following seven categories. Each category corresponds to a specific aspect of scientific discourse, either related to a topic or a study. A topic is defined as a scientific domain, such as “Computer Science” or “Machine  Learning”. A previous study refers to a prior paper on the topic.\n",
        "Categories:\n ",
        "1. OVERALL: Describes, introduces, classifies, or defines research topics often based on the discussion of multiple previous studies together.\n ",
        "2. RESEARCH GAP: Highlights the need for further research within the topic.\n",
        "3. DESCRIPTION: Outlines the objectives, methodology, or design of one previous study, without mentioning results.\n",
        "4. RESULT: Describes specific findings or outcomes drawn from previous studies. This category includes empirical results, theoretical insights, and observed patterns reported by researchers. It often uses verbs like “showed”, “found”, “demonstrated”, and “observed” or phrases like “the findings indicate”.\n",
        "5. LIMITATION: Describes a constraint, challenge, or weakness inherent in the methodology of a previous study that hinders generalizability or reliability in a previous study.\n",
        "6. EXTENSION: Describes how the current study addresses or extends previous studies by stating the overall idea, contrasting ideas or elaborating further ideas. It usually uses the words “we” or “our”.\n",
        "7. OTHER: Any text that does not fit the above categories.\n",
        "Procedure:\n",
        "1. Determine whether the subject of the setence is a topic or a study.\n",
        "2. Identify the most suitable category based on the content. Do not create new categories. Use the categories given above.\n",
        "3. Provide the category number that best fits the sentence. Just provide the category number without any explanation.\n",
    )
    # The sentence under test is appended after the fixed instructions.
    question = "".join(instructions) + f"Sentence: {row['Sentence']}.\n"
    return [{"from": "human", "value": question}]


In [None]:
# Add a 'conversations' column to every split: train/validation rows get the prompt
# plus the expected answer, test rows get the prompt only.
for split_frame, build_conversation in (
    (train_df, create_conversation),
    (val_df, create_conversation),
    (test_df, create_conversation_test),
):
    split_frame['conversations'] = split_frame.apply(build_conversation, axis=1)

In [None]:
# Preview the generated conversations for the training split
# (swap in one of the commented lines to inspect another split).
train_df['conversations']
#val_df['conversations']
#test_df['conversations']


In [None]:
from unsloth.chat_templates import get_chat_template

# Attach the "phi-3" chat template to the tokenizer. The mapping declares that messages
# use the ShareGPT-style keys "from"/"value" with the role names "human"/"gpt",
# i.e. the same keys the conversation builders in this notebook emit.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "phi-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

def formatting_prompts_func(examples):
    """Render every conversation of a batch into one chat-formatted string.

    Args:
        examples: batch mapping whose "conversations" entry is a list of
            conversations, each a list of {"from": ..., "value": ...} messages.

    Returns:
        A dict with the single key "text": the rendered strings, in input order.
    """
    texts = []
    for convo in examples["conversations"]:
        # A conversation that does not end with the assistant ("gpt") turn is an
        # inference prompt: ask the template to append the assistant header so that
        # generation starts the answer. Complete (training) conversations are
        # rendered exactly as before.
        awaiting_answer = bool(convo) and convo[-1]["from"] != "gpt"
        texts.append(
            tokenizer.apply_chat_template(
                convo,
                tokenize = False,
                add_generation_prompt = awaiting_answer,
            )
        )
    return { "text" : texts, }

In [None]:
from datasets import load_dataset

# Convert each split to a Dataset and render its conversations into the "text" column.
train_dataset = Dataset.from_pandas(train_df).map(formatting_prompts_func, batched=True)
val_dataset = Dataset.from_pandas(val_df).map(formatting_prompts_func, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(formatting_prompts_func, batched=True)

In [None]:
# Inspect one rendered training example (row 100) to check the chat formatting.
print(train_dataset[100]["text"])

#### Train the model

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, EarlyStoppingCallback
from unsloth import is_bfloat16_supported


# Project and Model Setup
# Run naming: everything produced by this run is written under ./<base_model_name>-<project>
project = "lro-finetune"
base_model_name = "Phi-Instruct"
run_name = f"{base_model_name}-{project}"
output_dir = f"./{run_name}"

# Stop training early when the monitored evaluation metric stops improving.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2,  # Number of evaluations without improvement before stopping
    early_stopping_threshold=0.01  # Minimum improvement threshold
)

# Supervised fine-tuning on the rendered "text" column, evaluated on the validation split.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 8,
        per_device_eval_batch_size = 8,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60, # Original: 60
        # NOTE(review): newer transformers releases name this argument `eval_strategy` —
        # confirm against the installed version.
        evaluation_strategy = "steps",  # Can also be "epoch"
        eval_steps = 10,  # Evaluate every 10 steps
        # Checkpoint on the same schedule as evaluation. With the default save interval
        # (500 steps) no checkpoint is written within max_steps = 60, so
        # load_best_model_at_end would have no best checkpoint to restore.
        save_strategy = "steps",
        save_steps = 10,
        save_total_limit = 2,  # Keep disk usage bounded
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01, # Original: 0.01
        lr_scheduler_type = "linear",
        seed = 3407,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        output_dir = output_dir,
        report_to = "none",
    ),
    # Early stopping is an addition to the original setting; remove this argument to restore it.
    callbacks = [early_stopping_callback],
    # neftune_noise_alpha = 5,  # Disabled experiment: add noise to embeddings
)

In [None]:
#@title Show current memory stats
# GPU baseline before training; byte counts are converted to GB using 1024**3 bytes per GB.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run fine-tuning. The returned object exposes run metrics via `.metrics`
# (e.g. 'train_runtime', which the final stats cell reads).
trainer_stats = trainer.train()



In [None]:
#@title Show final memory and time stats
# Peak GPU memory after training, compared with the pre-training baseline, plus wall time.
train_seconds = trainer_stats.metrics['train_runtime']
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
    sep="\n",
)

#### Save the model

In [None]:
# Write the fine-tuned model and the tokenizer to output_dir.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Convert the test Dataset back to a DataFrame so predictions can be added per row.
test_df_converted = test_dataset.to_pandas()
test_df_converted

In [None]:
# Pick one rendered test prompt (row label 13) for a manual inference check.
prompt=test_df_converted['text'].loc[13]
prompt


In [None]:
# Switch the model to Unsloth's inference mode, then generate for the sample prompt.
FastLanguageModel.for_inference(model)

inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

# Decode the whole batch and keep its only sequence.
decoded_batch = tokenizer.batch_decode(outputs, skip_special_tokens=True)
answer = decoded_batch[0]

answer

In [None]:
import re
def extract_text(text):
    """Return the label that follows the first "Classification:" marker in `text`.

    The rest of that line is taken, runs of whitespace are collapsed to single
    spaces and the result is upper-cased. Returns "" when the marker is absent.
    """
    found = re.search(r"Classification:\s*(.*)", text)
    if found is None:
        return ""
    # split() + join collapses inner whitespace and drops leading/trailing whitespace.
    return " ".join(found.group(1).split()).upper()

In [None]:
# Parse the label out of the sample generation.
extract_text(answer)

In [None]:
# Getting the Classification
def get_classification(data_point,model,tokenizer):
    """
    Generate the fine-tuned model's answer for one test row and attach it to the row.

    Args:
        data_point: one row as passed by DataFrame.apply(axis=1); must provide a
            'text' entry holding the rendered prompt.
        model: model used for generation.
        tokenizer: tokenizer used to encode the prompt and decode the output.

    Returns:
        The same row with two entries added: 'Prediction_Finetune' (the decoded
        output) and 'Prediction_Finetune_Clean' (the label parsed by extract_text).
    """
    FastLanguageModel.for_inference(model)

    inputs = tokenizer(
                data_point['text'],
                return_tensors="pt"
            ).to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=100, use_cache=True)
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    # Progress log. A row's `.index` holds the column labels, so `index[-1]` printed
    # the last column name for every row; the row's own label is `.name`.
    print(f"Sentence : {data_point.name}")
    print(answer)
    data_point['Prediction_Finetune']=answer
    data_point['Prediction_Finetune_Clean']=extract_text(answer)

    return data_point

# Run get_classification on every test row with DataFrame.apply (one generation call per row)
test_df_converted = test_df_converted.apply(lambda row: get_classification(row, model, tokenizer), axis=1)

In [None]:
# Display the test rows together with the added prediction columns.
test_df_converted

In [None]:
# Export the predictions to CSV in the current working directory.
# NOTE(review): the file name mentions Mistral7B_Nemo, but this notebook loads
# "unsloth/Phi-3.5-mini-instruct" — confirm the intended name before sharing results.
test_df_converted.to_csv('Mistral7B_Nemo_FT_Test_Change2.csv', index=False)


#### Make Inference