#### Install Dependencies

In [None]:
%%capture
# %%capture suppresses the (long) pip output of this cell.
# First install the released Unsloth package from PyPI ...
!pip install unsloth
# Also get the latest nightly Unsloth!
# ... then replace it with the current GitHub build (colab-new extra).
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

In [None]:
#!pip uninstall transformers -y
#!pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git"

#### Mount Google Drive

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd drive/MyDrive/gemma_unsloth_1

#### Load the Fine-Tuned Model

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options forwarded unchanged to FastLanguageModel.from_pretrained.
max_seq_length = 1024
# None is passed through as-is; presumably lets the library pick the dtype — TODO confirm.
dtype = None
load_in_4bit = True



# Load the fine-tuned model
# NOTE(review): "GemmaInstruct-lro-finetune" is a relative name; presumably a
# directory inside the Drive folder selected by the earlier %cd — confirm.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="GemmaInstruct-lro-finetune",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)

# Prepare the model for inference
FastLanguageModel.for_inference(model)

#### Data Preparation

In [None]:
import pandas as pd
from datasets import Dataset

# Fixed seed for the row shuffle so that Restart & Run All reproduces the
# same row order (and therefore the same rows in every later cell).
SHUFFLE_SEED = 42

# Load Data: read the spreadsheet and keep only the three columns used below.
df = pd.read_excel('LR_Dataset_Original_Sythetic_Final.xlsx')
df = df[['Sentence', 'Category', 'Classification']]

# Upper-case the first character of each sentence and lower-case the rest.
# NOTE(review): this runs before .str.strip(), so a sentence with leading
# whitespace ends up fully lower-cased; left as-is because the text presumably
# has to match the preprocessing used for fine-tuning — confirm.
df['Sentence'] = df['Sentence'].str.capitalize()

# Shuffle all rows (frac=1) deterministically and renumber the index.
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Normalise whitespace: trim the ends, turn line breaks into spaces, then
# collapse runs of two or more whitespace characters into a single space.
df['Sentence'] = (df['Sentence']
                  .str.strip()
                  .str.replace(r'\n|\r', ' ', regex=True)
                  .str.replace(r'\s{2,}', ' ', regex=True))
df

In [None]:
# Keep only the rows labelled 'TEST'. The explicit .copy() makes test_df an
# independent frame, so adding the 'conversations' column later does not
# assign into a slice of df (which triggers SettingWithCopyWarning).
test_df = df[df['Classification'] == 'TEST'].copy()

In [None]:
# Show the column labels of the TEST subset.
print(test_df.columns)

In [None]:
# Print the first row of the TEST subset as a quick sanity check.
print(test_df.head(1))

In [None]:
def create_conversation_test(row):
    """Build the single-turn, ShareGPT-style conversation for one test row.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['Sentence']`` (e.g. a DataFrame row).

    Returns
    -------
    list of dict
        A one-element list ``[{"from": "human", "value": prompt}]`` where
        ``prompt`` is the fixed classification instructions followed by
        ``"Sentence: <sentence>.\\n"``.
    """
    # The instruction text is runtime input for the model and is reproduced
    # verbatim (including spacing and spelling).
    # NOTE(review): presumably identical to the prompt used for fine-tuning,
    # so it must not be "corrected" here — confirm.
    prompt_parts = [
        "You are a researcher that should assign a classification to a sentence from scientific articles, choosing from one of the following seven categories. Each category corresponds to a specific aspect of scientific discourse, either related to a topic or a study. A topic is defined as a scientific domain, such as “Computer Science” or “Machine  Learning”. A previous study refers to a prior paper on the topic.\n",
        "Categories:\n ",
        "1. OVERALL: Describes, introduces, classifies, or defines research topics often based on the discussion of multiple previous studies together.\n ",
        "2. RESEARCH GAP: Highlights the need for further research within the topic.\n",
        "3. DESCRIPTION: Outlines the objectives, methodology, or design of one previous study, without mentioning results.\n",
        "4. RESULT: Describes specific findings or outcomes drawn from previous studies. This category includes empirical results, theoretical insights, and observed patterns reported by researchers. It often uses verbs like “showed”, “found”, “demonstrated”, and “observed” or phrases like “the findings indicate”.\n",
        "5. LIMITATION: Describes a constraint, challenge, or weakness inherent in the methodology of a previous study that hinders generalizability or reliability in a previous study.\n",
        "6. EXTENSION: Describes how the current study addresses or extends previous studies by stating the overall idea, contrasting ideas or elaborating further ideas. It usually uses the words “we” or “our”.\n",
        "7. OTHER: Any text that does not fit the above categories.\n",
        "Procedure:\n",
        "1. Determine whether the subject of the setence is a topic or a study.\n",
        "2. Identify the most suitable category based on the content. Do not create new categories. Use the categories given above.\n",
        "3. Provide the category number that best fits the sentence. Just provide the category number without any explanation.\n",
    ]
    # The sentence to classify goes last, followed by a period and a newline.
    prompt_parts.append(f"Sentence: {row['Sentence']}.\n")

    prompt_text = "".join(prompt_parts)
    return [{"from": "human", "value": prompt_text}]


In [None]:
# Build one conversation per row (axis=1 hands each row to the helper) and
# store the result in a new 'conversations' column.
test_df['conversations'] = test_df.apply(create_conversation_test, axis=1)

In [None]:
# Display the generated conversations column.
test_df['conversations']


In [None]:
from unsloth.chat_templates import get_chat_template

# Rebind the tokenizer to one configured with the "chatml" chat template.
# The mapping declares that messages use the keys "from"/"value" and the role
# names "human"/"gpt" (ShareGPT style), matching create_conversation_test.
# NOTE(review): presumably the same template that was used for fine-tuning — confirm.
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

def formatting_prompts_func(examples):
    """Render every conversation of a batch into a prompt string for generation.

    Parameters
    ----------
    examples : mapping
        Batch whose ``"conversations"`` entry is a list of conversations,
        each conversation being a list of message dicts.

    Returns
    -------
    dict
        ``{"text": [...]}`` with one rendered string per conversation, in
        the same order as the input.
    """
    # The conversations hold only the human turn and the rendered text is fed
    # directly to model.generate(). add_generation_prompt=True makes the chat
    # template append the assistant header so the model starts its reply
    # right away; with False the prompt stops after the user turn.
    texts = [
        tokenizer.apply_chat_template(
            convo,
            tokenize=False,
            add_generation_prompt=True,
        )
        for convo in examples["conversations"]
    ]
    return {"text": texts}

In [None]:
from datasets import load_dataset

# Wrap the TEST frame in a Hugging Face Dataset and, in the same chain, add
# the rendered "text" column by running formatting_prompts_func batch-wise.
test_dataset = Dataset.from_pandas(test_df).map(
    formatting_prompts_func,
    batched=True,
)

#### Run Inference on the Test Set

In [None]:
# Convert the mapped Dataset (now including the "text" column) back to a
# pandas DataFrame and display it.
test_df_converted = test_dataset.to_pandas()
test_df_converted

In [None]:
# Take the rendered prompt stored under index label 4 as a single example
# and display it.
prompt=test_df_converted['text'].loc[4]
prompt


In [None]:
# Single-prompt check of the generation pipeline.
FastLanguageModel.for_inference(model)

# Tokenise the example prompt as PyTorch tensors and move them to the GPU.
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate at most 100 new tokens, decode without special tokens and keep the
# first (only) decoded sequence.
outputs = model.generate(**inputs, max_new_tokens=100, use_cache=True)
decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
answer = decoded[0]

answer

In [None]:
import re
def extract_text(text):
    """Return the label that follows the first "Classification:" in ``text``.

    The rest of the line after the marker (leading whitespace, including
    line breaks, is skipped) is whitespace-normalised and upper-cased.
    Returns an empty string when the marker does not occur.
    """
    found = re.search(r"Classification:\s*(.*)", text)
    if found is None:
        return ""

    # split() without arguments drops leading/trailing whitespace and
    # collapses inner runs, so joining with single spaces yields clean text.
    words = found.group(1).split()
    return " ".join(words).upper()

In [None]:
# Try the extractor on the single decoded answer produced earlier.
extract_text(answer)

In [None]:
# Getting the Classification
def get_classification(data_point,model,tokenizer):
    """
    Gets the classification for a data point using the fine-tuned model.

    Parameters
    ----------
    data_point : pandas.Series
        One row holding the rendered prompt under ``'text'``.
    model :
        Model whose ``generate`` method produces the output token ids.
    tokenizer :
        Tokenizer used to encode the prompt and decode the output.

    Returns
    -------
    pandas.Series
        A copy of ``data_point`` with two extra entries:
        ``'Prediction_Finetune'`` (the decoded output) and
        ``'Prediction_Finetune_Clean'`` (``extract_text`` of that output).
    """
    FastLanguageModel.for_inference(model)

    inputs = tokenizer(data_point['text'], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=100, use_cache=True)
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # data_point.name is the row label when called through
    # DataFrame.apply(axis=1); data_point.index holds the column labels, so
    # index[-1] would print the same column name for every row.
    print(f"Sentence : {data_point.name}")
    print(answer)

    # Write the predictions to a copy: DataFrame.apply does not support
    # functions that mutate the row object they are handed.
    result = data_point.copy()
    result['Prediction_Finetune'] = answer
    result['Prediction_Finetune_Clean'] = extract_text(answer)
    return result

# Run get_classification on every row via DataFrame.apply(axis=1); the frame
# is rebuilt from the returned rows, which carry the two prediction columns.
test_df_converted = test_df_converted.apply(lambda row: get_classification(row, model, tokenizer), axis=1)

In [None]:
# Display the frame including the new prediction columns.
test_df_converted

In [None]:
# Write the predictions to CSV in the current working directory, without the
# index column. The commented line is an alternative output file name.
#test_df_converted.to_csv('Gemma_Instruct_FT_Test_NEFT_Augmented1.csv', index=False)
test_df_converted.to_csv('Gemma_Instruct_FT_Test_LoRA_Augmented1.csv', index=False)


#### Make Inference