#### Install Dependencies

In [None]:
%%capture
# `%%capture` has to stay on the first line of the cell; it keeps pip's output out of the notebook.
# Install the released package first (presumably to pull in its dependency set - TODO confirm).
!pip install unsloth
# Then replace it with the latest nightly build installed straight from GitHub.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

Collecting ninja
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (5.3 kB)
Collecting flash-attn>=2.6.3
  Downloading flash_attn-2.6.3.tar.gz (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Using cached ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
Building wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash-attn: filename=flash_attn-2.6.3-cp310-cp310-linux_x86_64.whl size=187309225 sha256=237ef9c6157db394e1ddde4ba609a21ebb98382377a27041edc09318801a6f24
  Stored in directory: /root/.cache/pip/wheels/7e/e3/c3/89c7a2f3c4adc07cd1c675f8bb7b9ad4d18f64a72bccdfe826
Successfully built flash-attn
Installing collected packages: ninja, flash-attn
Successfully installed flash-attn-2.6.3 ninja-1.11.1.1


In [None]:
# Optional: switch to the latest transformers build from GitHub. Unsloth's load-time message
# recommends this when the installed transformers lacks its gradient-accumulation fix.
#!pip uninstall transformers -y
#!pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git"

#### Mount Google Drive

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd drive/MyDrive/gemma_unsloth_1

Mounted at /content/drive
/content/drive/MyDrive/gemma_unsloth_1


#### Load the Fine-Tuned Model

In [None]:
from unsloth import FastLanguageModel
import torch

# Loader settings: maximum sequence length, dtype (None - presumably lets the loader pick
# one; confirm against the Unsloth docs) and 4-bit loading.
max_seq_length, dtype, load_in_4bit = 1024, None, True

# Fetch the fine-tuned checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="GemmaInstruct-lro-finetune",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Switch the model to inference mode; as the last expression, its return value is displayed.
FastLanguageModel.for_inference(model)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.10.3: Fast Gemma2 patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 22.168 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Unsloth: We fixed a gradient accumulation bug, but it seems like you don't have the latest transformers version!
Please update transformers via:
`pip uninstall transformers -y && pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git"`
Unsloth 2024.10.3 patched 42 layers with 42 QKV layers, 42 O layers and 42 MLP layers.


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 3584)
        (layers): ModuleList(
          (0-41): 42 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3584, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=256, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=256, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(


#### Data Preparation

In [None]:
import pandas as pd
from datasets import Dataset

# Load the labelled sentences and keep only the columns used downstream.
df = pd.read_excel('LR_Dataset_Original_Sythetic_Final.xlsx')
df = df[['Sentence', 'Category', 'Classification']]

# Upper-case the first character and lower-case the rest of every sentence.
# NOTE(review): this also lower-cases proper nouns and acronyms; it presumably mirrors the
# preprocessing used for fine-tuning, so it is left unchanged - confirm.
df['Sentence'] = df['Sentence'].str.capitalize()

# Shuffle with a fixed seed so the row order (and every displayed sample) is the same
# after Restart & Run All; without `random_state` each run produced a different order.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Trim outer whitespace, turn line breaks into spaces and collapse runs of whitespace.
df['Sentence'] = (df['Sentence']
                  .str.strip()
                  .str.replace(r'\n|\r', ' ', regex=True)
                  .str.replace(r'\s{2,}', ' ', regex=True))
df

Unnamed: 0,Sentence,Category,Classification
0,Despite the substantial time investment in loc...,RESEARCH GAP,TRAINING
1,"Despite ongoing investigations, scholars have ...",RESEARCH GAP,TRAINING
2,Although not explicitly mentioning double-coun...,DESCRIPTION,TEST
3,"Nonetheless, it fails to adequately maintain p...",RESULT,TRAINING
4,He wishes to convey his appreciation to macqua...,OTHER,TRAINING
...,...,...,...
2935,"Over the past few decades, video games have ga...",OVERALL,TRAINING
2936,The de-simple [13] approach integrates the sta...,DESCRIPTION,TRAINING
2937,"However, a couple of studies have revealed a p...",RESULT,VALIDATION
2938,Our findings indicate that no widely recognize...,RESEARCH GAP,TRAINING


In [None]:
# Keep only the held-out TEST rows. `.copy()` makes this an independent frame, so columns
# added to it later do not trigger pandas' SettingWithCopyWarning.
test_df = df[df['Classification'] == 'TEST'].copy()

In [None]:
# Sanity check: list the columns carried by the test split.
print(test_df.columns)

Index(['Sentence', 'Category', 'Classification'], dtype='object')


In [None]:
# Sanity check: show the first test row as plain text.
print(test_df.head(1))

                                            Sentence     Category  \
2  Although not explicitly mentioning double-coun...  DESCRIPTION   

  Classification  
2           TEST  


In [None]:
def create_conversation_test(row):
    """Build a single-turn conversation asking the model to classify one sentence.

    Args:
        row: mapping-like record (e.g. a DataFrame row) with a 'Sentence' entry.

    Returns:
        A one-element list holding a dict with keys "from" (always "human") and
        "value" (the full instruction text followed by the sentence).
    """
    # Fixed instruction text. NOTE(review): presumably this must match the prompt used
    # during fine-tuning character for character (including the "setence" typo and the
    # stray spaces after some newlines) - do not edit without retraining; confirm.
    instruction_parts = [
        "You are a researcher that should assign a classification to a sentence from scientific articles, choosing from one of the following seven categories. Each category corresponds to a specific aspect of scientific discourse, either related to a topic or a study. A topic is defined as a scientific domain, such as “Computer Science” or “Machine  Learning”. A previous study refers to a prior paper on the topic.\n",
        "Categories:\n ",
        "1. OVERALL: Describes, introduces, classifies, or defines research topics often based on the discussion of multiple previous studies together.\n ",
        "2. RESEARCH GAP: Highlights the need for further research within the topic.\n",
        "3. DESCRIPTION: Outlines the objectives, methodology, or design of one previous study, without mentioning results.\n",
        "4. RESULT: Describes specific findings or outcomes drawn from previous studies. This category includes empirical results, theoretical insights, and observed patterns reported by researchers. It often uses verbs like “showed”, “found”, “demonstrated”, and “observed” or phrases like “the findings indicate”.\n",
        "5. LIMITATION: Describes a constraint, challenge, or weakness inherent in the methodology of a previous study that hinders generalizability or reliability in a previous study.\n",
        "6. EXTENSION: Describes how the current study addresses or extends previous studies by stating the overall idea, contrasting ideas or elaborating further ideas. It usually uses the words “we” or “our”.\n",
        "7. OTHER: Any text that does not fit the above categories.\n",
        "Procedure:\n",
        "1. Determine whether the subject of the setence is a topic or a study.\n",
        "2. Identify the most suitable category based on the content. Do not create new categories. Use the categories given above.\n",
        "3. Provide the category number that best fits the sentence. Just provide the category number without any explanation.\n",
    ]

    # The sentence to classify goes last, followed by a period and a newline.
    prompt_text = "".join(instruction_parts) + f"Sentence: {row['Sentence']}.\n"

    user_turn = {"from": "human", "value": prompt_text}
    return [user_turn]


In [None]:
# Build one single-turn conversation per test row (the function receives each row in turn).
test_df['conversations'] = test_df.apply(create_conversation_test, axis="columns")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['conversations'] = test_df.apply(create_conversation_test, axis=1)


In [None]:
# Inspect the generated conversations column.
test_df['conversations']


Unnamed: 0,conversations
2,"[{'from': 'human', 'value': 'You are a researc..."
29,"[{'from': 'human', 'value': 'You are a researc..."
51,"[{'from': 'human', 'value': 'You are a researc..."
65,"[{'from': 'human', 'value': 'You are a researc..."
115,"[{'from': 'human', 'value': 'You are a researc..."
...,...
2842,"[{'from': 'human', 'value': 'You are a researc..."
2849,"[{'from': 'human', 'value': 'You are a researc..."
2864,"[{'from': 'human', 'value': 'You are a researc..."
2872,"[{'from': 'human', 'value': 'You are a researc..."


In [None]:
from unsloth.chat_templates import get_chat_template

# Switch the tokenizer to the ChatML template. The mapping translates ShareGPT-style
# records ("from"/"value" fields, "human"/"gpt" speakers) into the template's own names.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",  # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping={
        "role": "from",
        "content": "value",
        "user": "human",
        "assistant": "gpt",
    },
)

def formatting_prompts_func(examples):
    """Render a batch of conversations into prompt strings for generation.

    Args:
        examples: batch mapping with a "conversations" entry, i.e. a list of
            conversations, each a list of {"from", "value"} turns.

    Returns:
        A dict with a single "text" entry: one rendered prompt string per conversation.
    """
    convos = examples["conversations"]
    # These prompts are fed to `model.generate`, so the template must end with the
    # assistant header (`add_generation_prompt=True`). Without it the prompt stops after
    # the user turn and the model has to produce the header itself before answering.
    texts = [
        tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = True)
        for convo in convos
    ]
    return { "text" : texts, }

<|im_start|> is already a token. Skipping.
<|im_end|> is already a token. Skipping.


In [None]:
from datasets import load_dataset

# Convert the pandas test split into a Dataset and add the rendered "text" prompt column
# in one chained step (batched, so the formatter receives lists of conversations).
test_dataset = Dataset.from_pandas(test_df).map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/140 [00:00<?, ? examples/s]

#### Generate Predictions on the Test Set

In [None]:
# Convert the formatted Dataset back to pandas so predictions can be added row by row.
test_df_converted = test_dataset.to_pandas()
test_df_converted

Unnamed: 0,Sentence,Category,Classification,conversations,__index_level_0__,text
0,Although not explicitly mentioning double-coun...,DESCRIPTION,TEST,"[{'from': 'human', 'value': 'You are a researc...",2,<bos><|im_start|>user\nYou are a researcher th...
1,A related task to playlist recommendation is (...,OVERALL,TEST,"[{'from': 'human', 'value': 'You are a researc...",29,<bos><|im_start|>user\nYou are a researcher th...
2,There is lacking guidance for many of the pris...,RESEARCH GAP,TEST,"[{'from': 'human', 'value': 'You are a researc...",51,<bos><|im_start|>user\nYou are a researcher th...
3,Tie [41] is a time-aware incremental embedding...,DESCRIPTION,TEST,"[{'from': 'human', 'value': 'You are a researc...",65,<bos><|im_start|>user\nYou are a researcher th...
4,This result is most likely explained by both c...,RESULT,TEST,"[{'from': 'human', 'value': 'You are a researc...",115,<bos><|im_start|>user\nYou are a researcher th...
...,...,...,...,...,...,...
135,This procedure was approved by the ethics boar...,OTHER,TEST,"[{'from': 'human', 'value': 'You are a researc...",2842,<bos><|im_start|>user\nYou are a researcher th...
136,"Additionally, authors would like to give thank...",OTHER,TEST,"[{'from': 'human', 'value': 'You are a researc...",2849,<bos><|im_start|>user\nYou are a researcher th...
137,"With talebrush, we focus on visual encodings t...",EXTENSION,TEST,"[{'from': 'human', 'value': 'You are a researc...",2864,<bos><|im_start|>user\nYou are a researcher th...
138,Mcauley et al. [21] mine key attributes from t...,DESCRIPTION,TEST,"[{'from': 'human', 'value': 'You are a researc...",2872,<bos><|im_start|>user\nYou are a researcher th...


In [None]:
# Pick the rendered prompt of the row labelled 4 for a single-example check.
prompt = test_df_converted.loc[4, 'text']
prompt


'<bos><|im_start|>user\nYou are a researcher that should assign a classification to a sentence from scientific articles, choosing from one of the following seven categories. Each category corresponds to a specific aspect of scientific discourse, either related to a topic or a study. A topic is defined as a scientific domain, such as “Computer Science” or “Machine  Learning”. A previous study refers to a prior paper on the topic.\nCategories:\n 1. OVERALL: Describes, introduces, classifies, or defines research topics often based on the discussion of multiple previous studies together.\n 2. RESEARCH GAP: Highlights the need for further research within the topic.\n3. DESCRIPTION: Outlines the objectives, methodology, or design of one previous study, without mentioning results.\n4. RESULT: Describes specific findings or outcomes drawn from previous studies. This category includes empirical results, theoretical insights, and observed patterns reported by researchers. It often uses verbs lik

In [None]:
# Single-prompt check before scoring the whole test split.
FastLanguageModel.for_inference(model)

# Tokenize the rendered prompt and move the tensors to the GPU.
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate up to 100 new tokens, then decode the first (only) returned sequence with
# special tokens stripped.
outputs = model.generate(**inputs, max_new_tokens=100, use_cache=True)
answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

answer

'user\nYou are a researcher that should assign a classification to a sentence from scientific articles, choosing from one of the following seven categories. Each category corresponds to a specific aspect of scientific discourse, either related to a topic or a study. A topic is defined as a scientific domain, such as “Computer Science” or “Machine  Learning”. A previous study refers to a prior paper on the topic.\nCategories:\n 1. OVERALL: Describes, introduces, classifies, or defines research topics often based on the discussion of multiple previous studies together.\n 2. RESEARCH GAP: Highlights the need for further research within the topic.\n3. DESCRIPTION: Outlines the objectives, methodology, or design of one previous study, without mentioning results.\n4. RESULT: Describes specific findings or outcomes drawn from previous studies. This category includes empirical results, theoretical insights, and observed patterns reported by researchers. It often uses verbs like “showed”, “foun

In [None]:
import re
def extract_text(text):
    """Return the label that follows the first "Classification:" marker in `text`.

    The label is whatever remains on that line after the marker and any whitespace,
    with inner whitespace runs collapsed to single spaces and the result upper-cased.
    Returns an empty string when the marker is absent.
    """
    found = re.search(r"Classification:\s*(.*)", text)
    if found is None:
        return ""
    # split() + join normalises whitespace and leaves no leading or trailing blanks.
    words = found.group(1).split()
    return " ".join(words).upper()

In [None]:
# Check the label parser on the single decoded answer.
extract_text(answer)

'RESULT'

In [None]:
# Getting the Classification
def get_classification(data_point,model,tokenizer):
    """
    Run the fine-tuned model on one row's prompt and attach its prediction.

    Args:
        data_point: a row (pandas Series, as passed by ``DataFrame.apply(axis=1)``)
            whose 'text' entry holds the rendered prompt.
        model: the model used for generation.
        tokenizer: tokenizer used to encode the prompt and decode the output.

    Returns:
        The same ``data_point`` with two added entries: 'Prediction_Finetune'
        (the full decoded output) and 'Prediction_Finetune_Clean' (the label
        parsed from it by ``extract_text``).
    """
    FastLanguageModel.for_inference(model)

    inputs = tokenizer(
                data_point['text'],
                return_tensors="pt"
            ).to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=100, use_cache=True)
    # The full generated sequence is decoded, so `answer` starts with the prompt text.
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Log the row's own index label. The previous `data_point.index[-1]` is the last
    # column name of the row, so every row was logged as "Sentence : text".
    print(f"Sentence : {data_point.name}")
    print(answer)
    data_point['Prediction_Finetune']=answer
    data_point['Prediction_Finetune_Clean']=extract_text(answer)

    return data_point

# Score every test row with pandas `apply`: each row is passed to get_classification,
# with the model and tokenizer forwarded as extra positional arguments.
test_df_converted = test_df_converted.apply(get_classification, axis=1, args=(model, tokenizer))

Sentence : text
user
You are a researcher that should assign a classification to a sentence from scientific articles, choosing from one of the following seven categories. Each category corresponds to a specific aspect of scientific discourse, either related to a topic or a study. A topic is defined as a scientific domain, such as “Computer Science” or “Machine  Learning”. A previous study refers to a prior paper on the topic.
Categories:
 1. OVERALL: Describes, introduces, classifies, or defines research topics often based on the discussion of multiple previous studies together.
 2. RESEARCH GAP: Highlights the need for further research within the topic.
3. DESCRIPTION: Outlines the objectives, methodology, or design of one previous study, without mentioning results.
4. RESULT: Describes specific findings or outcomes drawn from previous studies. This category includes empirical results, theoretical insights, and observed patterns reported by researchers. It often uses verbs like “showe

In [None]:
# Inspect the test rows together with the raw and cleaned predictions.
test_df_converted

Unnamed: 0,Sentence,Category,Classification,conversations,__index_level_0__,text,Prediction_Finetune,Prediction_Finetune_Clean
0,Although not explicitly mentioning double-coun...,DESCRIPTION,TEST,"[{'from': 'human', 'value': 'You are a researc...",2,<bos><|im_start|>user\nYou are a researcher th...,user\nYou are a researcher that should assign ...,DESCRIPTION
1,A related task to playlist recommendation is (...,OVERALL,TEST,"[{'from': 'human', 'value': 'You are a researc...",29,<bos><|im_start|>user\nYou are a researcher th...,user\nYou are a researcher that should assign ...,OVERALL
2,There is lacking guidance for many of the pris...,RESEARCH GAP,TEST,"[{'from': 'human', 'value': 'You are a researc...",51,<bos><|im_start|>user\nYou are a researcher th...,user\nYou are a researcher that should assign ...,RESEARCH GAP
3,Tie [41] is a time-aware incremental embedding...,DESCRIPTION,TEST,"[{'from': 'human', 'value': 'You are a researc...",65,<bos><|im_start|>user\nYou are a researcher th...,user\nYou are a researcher that should assign ...,DESCRIPTION
4,This result is most likely explained by both c...,RESULT,TEST,"[{'from': 'human', 'value': 'You are a researc...",115,<bos><|im_start|>user\nYou are a researcher th...,user\nYou are a researcher that should assign ...,RESULT
...,...,...,...,...,...,...,...,...
135,This procedure was approved by the ethics boar...,OTHER,TEST,"[{'from': 'human', 'value': 'You are a researc...",2842,<bos><|im_start|>user\nYou are a researcher th...,user\nYou are a researcher that should assign ...,OTHER
136,"Additionally, authors would like to give thank...",OTHER,TEST,"[{'from': 'human', 'value': 'You are a researc...",2849,<bos><|im_start|>user\nYou are a researcher th...,user\nYou are a researcher that should assign ...,OTHER
137,"With talebrush, we focus on visual encodings t...",EXTENSION,TEST,"[{'from': 'human', 'value': 'You are a researc...",2864,<bos><|im_start|>user\nYou are a researcher th...,user\nYou are a researcher that should assign ...,EXTENSION
138,Mcauley et al. [21] mine key attributes from t...,DESCRIPTION,TEST,"[{'from': 'human', 'value': 'You are a researc...",2872,<bos><|im_start|>user\nYou are a researcher th...,user\nYou are a researcher that should assign ...,DESCRIPTION


In [None]:
# Save the test rows with their predictions to CSV, without the index column.
# The commented line keeps the output name of an alternative ("NEFT") run.
#test_df_converted.to_csv('Gemma_Instruct_FT_Test_NEFT_Augmented1.csv', index=False)
test_df_converted.to_csv('Gemma_Instruct_FT_Test_LoRA_Augmented1.csv', index=False)


#### Make Inference