In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd drive/MyDrive/olmo
# original:%cd drive/MyDrive/mistral

In [None]:
!pip install "torch==2.5.0"

In [None]:
!pip install -qU transformers accelerate bitsandbytes peft trl datasets evaluate #ai2-olmo

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never embed an access token in the notebook. Any token that was
# ever saved in this file must be treated as leaked and revoked on the Hub.
# The token is read from the HF_TOKEN environment variable when present and
# requested interactively (without echo) otherwise.
login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: "))

In [None]:
import torch
import torch.nn as nn
import bitsandbytes as bnb
from peft import LoraConfig, PeftConfig
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from trl import SFTTrainer

In [None]:
import pandas as pd
from datasets import Dataset

# Which rows to keep: 'Original' restricts to human-written sentences,
# any other value keeps the full (original + synthetic) sheet.
#data_type='Original'
data_type='Synthetic'

# Load Data
df = pd.read_excel('LR_Dataset_Original_Sythetic_Final.xlsx')
#df = pd.read_excel('LR_Dataset_Original_Sythetic_Experiment_70.xlsx') # 20% of the training and validation sets

if data_type == 'Original':
    df = df[df['Source'] == 'Original']  # Only original

# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the filtered frame (SettingWithCopyWarning).
df = df[['Sentence', 'Category', 'Classification']].copy()

# Normalise whitespace first, then capitalise. Capitalising before stripping
# left sentences with leading whitespace entirely lower-case, because
# str.capitalize() upper-cases only the very first character.
# Note: str.capitalize() also lower-cases the rest of the sentence.
df['Sentence'] = (df['Sentence']
                  .str.strip()
                  .str.replace(r'\n|\r', ' ', regex=True)
                  .str.replace(r'\s{2,}', ' ', regex=True)
                  .str.capitalize())

# Seeded shuffle so that a fresh "Restart & Run All" reproduces the same row order.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Preview only; dumping the whole frame bloats the notebook output.
df.head()

In [None]:
def create_instruction(row):
    """Assemble the classification prompt for a single dataset row.

    Args:
        row: Mapping-like record (for example a DataFrame row) that provides
            the text to classify under the key 'Sentence'.

    Returns:
        str: The task description, the seven category definitions, the
        three-step procedure and, last, the sentence itself followed by a
        period and a newline.
    """
    prompt_parts = [
        "You are a researcher that should assign a classification to a sentence from scientific articles, choosing from one of the following seven categories. Each category corresponds to a specific aspect of scientific discourse, either related to a topic or a study. A topic is defined as a scientific domain, such as “Computer Science” or “Machine  Learning”. A previous study refers to a prior paper on the topic.\n",
        "Categories:\n ",
        "1. OVERALL: Describes, introduces, classifies, or defines research topics often based on the discussion of multiple previous studies together.\n ",
        "2. RESEARCH GAP: Highlights the need for further research within the topic.\n",
        "3. DESCRIPTION: Outlines the objectives, methodology, or design of one previous study, without mentioning results.\n",
        "4. RESULT: Describes specific findings or outcomes drawn from previous studies. This category includes empirical results, theoretical insights, and observed patterns reported by researchers. It often uses verbs like “showed”, “found”, “demonstrated”, and “observed” or phrases like “the findings indicate”.\n",
        "5. LIMITATION: Describes a constraint, challenge, or weakness inherent in the methodology of a previous study that hinders generalizability or reliability in a previous study.\n",
        "6. EXTENSION: Describes how the current study addresses or extends previous studies by stating the overall idea, contrasting ideas or elaborating further ideas. It usually uses the words “we” or “our”.\n",
        "7. OTHER: Any text that does not fit the above categories.\n",
        "Procedure:\n",
        "1. Determine whether the subject of the sentence is a topic or a study.\n",
        "2. Identify the most suitable category based on the content. Do not create new categories. Use the categories given above.\n",
        "3. Provide the category number that best fits the sentence. Just provide the category number without any explanation.\n",
        # The sentence to classify always comes last.
        f"Sentence: {row['Sentence']}.\n",
    ]
    return "".join(prompt_parts)


In [None]:
# Attach the per-row prompt and expose the gold label under the name 'response'.
df = (
    df.assign(instruction=df.apply(create_instruction, axis=1))
      .rename(columns={'Category': 'response'})
)

df

In [None]:
def _shuffled_split(frame, split_label):
    """Return the rows whose 'Classification' equals split_label, shuffled with a fixed seed."""
    subset = frame[frame['Classification'] == split_label]
    return subset.sample(frac=1, random_state=42).reset_index(drop=True)


train_dataset = _shuffled_split(df, 'TRAINING')
eval_dataset = _shuffled_split(df, 'VALIDATION')
test_dataset = _shuffled_split(df, 'TEST')


In [None]:
# Display the shuffled TRAINING split as a sanity check.
train_dataset

In [None]:
def generate_prompt(example):
    """Render one labelled example as a complete two-turn chat transcript.

    Args:
        example: Mapping-like record with 'instruction' (the prompt text) and
            'response' (the gold label).

    Returns:
        str: The user turn followed by the assistant turn, each wrapped in
        <|im_start|> / <|im_end|> markers and separated by a newline.
    """
    user_turn = f"<|im_start|>user\n{example['instruction']}<|im_end|>"
    assistant_turn = f"<|im_start|>assistant\n{example['response']}<|im_end|>"
    return "\n".join((user_turn, assistant_turn))

def generate_test_prompt(example):
    """Format an inference prompt: the user turn plus an opened assistant turn.

    The training format places the label directly after
    "<|im_start|>assistant\n". Ending the inference prompt with that same
    header makes the model continue with the label itself, instead of first
    having to reproduce the turn header.

    Args:
        example: Mapping-like record with an 'instruction' entry.

    Returns:
        str: The prompt text, ending right where the answer should start.
    """
    return f"<|im_start|>user\n{example['instruction']}<|im_end|>\n<|im_start|>assistant\n"


In [None]:
# Add the formatted 'text' column to every split: labelled transcripts for
# training/validation, prompt-only text for the test split.
for split_frame, formatter in (
    (train_dataset, generate_prompt),
    (eval_dataset, generate_prompt),
    (test_dataset, generate_test_prompt),
):
    split_frame['text'] = split_frame.apply(formatter, axis=1)

# Only the 'text' column is handed to the trainer.
train_data, eval_data = (
    Dataset.from_pandas(split_frame[['text']])
    for split_frame in (train_dataset, eval_dataset)
)

In [None]:
# Only the final expression of a cell is rendered, so this cell shows `train_data`;
# the bare `eval_data` expression before it produces no output.
eval_data
train_data

In [None]:
# Spot-check one formatted validation example. Index 33 is an arbitrary pick
# and requires the validation split to contain at least 34 rows.
#train_data['text'][0]
#eval_data
eval_data['text'][33]

In [None]:
# Checkpoint to fine-tune; the commented lines are alternative OLMo instruct checkpoints.
#model_name="hamishivi/OLMo-1B-0724-Instruct-hf"
model_name="allenai/OLMo-7B-0724-Instruct-hf" # OLMo_FT_Test_NEST_2.csv
#model_name="allenai/OLMo-7B-Instruct-hf"


# Quantization settings: 4-bit NF4 weights with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    # NOTE(review): the BitsAndBytesConfig option for double quantization is
    # spelled `bnb_4bit_use_double_quant` — confirm before re-enabling the line below.
    #bnb_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)


# Load the causal LM with the quantization config above; device_map='auto'
# leaves the placement of the weights to the loader.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config, # Comment for hamishivi/OLMo-1B-0724-Instruct-hf
    device_map='auto',
    trust_remote_code=True,
    #torch_dtype=torch.float16, # This new #torch.bfloat16
)


In [None]:
# Tokenizer that belongs to the checkpoint selected in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"


In [None]:
# List the tokenizer's special tokens (useful for checking which chat markers it knows).
tokenizer.all_special_tokens

In [None]:
# Take the first test prompt for a quick check of the model before fine-tuning.
prompt=test_dataset['text'].iloc[0]
prompt

In [None]:
# Baseline: let the not-yet-fine-tuned model continue the prompt.
encoded = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False)
# Move every input tensor to the GPU.
inputs = {name: tensor.to('cuda') for name, tensor in encoded.items()}
# Sampling variant kept for reference:
#response = model.generate(**inputs, max_new_tokens=20, do_sample=True, top_k=50, top_p=0.95)
response = model.generate(max_new_tokens=20, use_cache=True, **inputs)

# Decode the single returned sequence, dropping special tokens.
answer = tokenizer.batch_decode(response, skip_special_tokens=True)[0]
answer

In [None]:
# LoRA adapter configuration: rank 16, alpha 32, dropout 0.1, no bias terms.
# Adapters are attached to the attention projections (q/k/v/o) and to the MLP
# projections (gate/up/down).
peft_config = LoraConfig(
    r=16, # 256  Original 16
    lora_alpha=32, #128 Original 32
    target_modules=[
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        "mlp.gate_proj",
        "mlp.up_proj",
        "mlp.down_proj",
        #"mlp.lm_head",
    ],
    bias="none",
    lora_dropout=0.1,  # Conventional
    task_type="CAUSAL_LM",
)

# Project and Model Setup
project = "lro-finetune"
# Derive the run label from the checkpoint that is actually loaded (`model_name`)
# so the output directory cannot drift from the model being trained; the
# previous hard-coded "olmo_1B_Instruct" mislabelled runs of the 7B checkpoint.
base_model_name = model_name.split("/")[-1]
run_name = base_model_name + "-" + project
output_dir = "./" + run_name



# Training hyper-parameters. With a per-device batch size of 1 and 2 gradient
# accumulation steps, each optimizer step covers 2 examples per device.
# Mixed precision is fp16 (bf16 disabled), matching the float16 compute dtype
# chosen for quantization. Evaluation and checkpointing both happen once per epoch.
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,# Orginal 4
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2, # 4
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True, # Original =True
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="none",
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Save the model every epoch

)


# Supervised fine-tuning trainer: trains LoRA adapters (peft_config) on the
# 'text' column of the training split and evaluates on the validation split.
# Examples are not packed together, and the sequence-length limit is 1024.
# NOTE(review): `dataset_text_field`, `tokenizer`, `packing` and `max_seq_length`
# as direct SFTTrainer arguments depend on the installed TRL version — confirm.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
    max_seq_length=1024,
    #neftune_noise_alpha=5 # Comment out for NEFT.
)




The following cells train the model with the `trainer.train()` method and then save the trained model and its tokenizer to the run's output directory (`output_dir`). On a standard GPU, such as the P100 offered by Kaggle, training should be quite fast.

In [None]:
# Train model: runs the fine-tuning loop configured in `training_arguments`
# (evaluation and checkpointing are both set to happen once per epoch).
trainer.train()

In [None]:
# Save trained model
# The tokenizer is written to the same directory so both can be reloaded from `output_dir`.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

Afterwards, loading the TensorBoard extension and starting TensorBoard pointed at the logs/runs directory (which is assumed to contain the training logs and checkpoints for your model) will let you see how the model fits during training.

In [None]:
import gc

# Drop the training-time references so the garbage collector can actually
# release the quantized base model before the adapter is reloaded for merging;
# without this, clearing the CUDA cache afterwards has nothing to free.
# globals().pop(...) is used instead of `del` so the cell is safe to re-run.
for _name in ("trainer", "model"):
    globals().pop(_name, None)

# Broader clean-up variants, left disabled:
#del [model, tokenizer, peft_config, trainer, train_data, eval_data, bnb_config, training_arguments]
#del [df, train_dataset, eval_dataset]
#del [TrainingArguments, SFTTrainer, LoraConfig, BitsAndBytesConfig]

In [None]:
# Collect unreachable Python objects first, then return the blocks held by
# PyTorch's caching allocator to the GPU driver. A single pass does the work;
# repeating the pair 100 times does not release anything further.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Show current GPU utilisation and memory usage.
!nvidia-smi

In [None]:
from peft import AutoPeftModelForCausalLM

# Reload the adapter from the directory the trainer saved to (`output_dir`).
# The previous hard-coded "./smollm2_7B_Instruct-lro-finetune/" path did not
# match the directory written by this notebook.
finetuned_model = output_dir
compute_dtype = getattr(torch, "float16")
# Use the tokenizer that was saved together with the fine-tuned model rather
# than the tokenizer of an unrelated SmolLM2 checkpoint.
tokenizer = AutoTokenizer.from_pretrained(finetuned_model)

model = AutoPeftModelForCausalLM.from_pretrained(
     finetuned_model,
     torch_dtype=compute_dtype,
     return_dict=False,
     low_cpu_mem_usage=True,
     # `device` was never defined in this notebook (NameError on a fresh
     # kernel); "auto" matches the placement used when the base model was loaded.
     device_map="auto",
)

# Fold the LoRA weights into the base weights and store a standalone model
# (safetensors shards of at most 2GB) together with its tokenizer.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("./merged_model",safe_serialization=True, max_shard_size="2GB")
tokenizer.save_pretrained("./merged_model")

In [None]:
# Take the first test prompt to try out the merged model.
prompt=test_dataset['text'].iloc[0]
prompt

In [None]:
# Smoke test: run the merged model on a single prompt.
# do_sample=False requests deterministic greedy decoding explicitly;
# temperature=0.0 is not a valid sampling temperature and only produced
# greedy output as a side effect of sampling being off.
pipe = pipeline(task="text-generation",
                model=merged_model,
                tokenizer=tokenizer,
                max_new_tokens=100,
                do_sample=False,
                )
# The EOS token id is passed as the padding id for generation.
result = pipe(prompt, pad_token_id=pipe.tokenizer.eos_token_id)
answer = result[0]['generated_text']

In [None]:
def extract_text(text):
    """Return the part of a generated string that follows the assistant marker.

    The markers "<|im_end|>assistant" and "assistant\\n" are tried in that
    order. For the first marker present in the text, everything after its
    first occurrence is returned with surrounding whitespace stripped.

    Args:
        text: Generated text to search.

    Returns:
        str: The stripped text after the marker, or the input unchanged
        (not stripped) when neither marker occurs.
    """
    for marker in ("<|im_end|>assistant", "assistant\n"):
        # partition() splits at the first occurrence; `found` is empty on a miss.
        _, found, tail = text.partition(marker)
        if found:
            return tail.strip()

    # No marker present: hand the text back untouched.
    return text

In [None]:
# Raw 'generated_text' returned by the pipeline for the test prompt.
answer

In [None]:
# The same output reduced to the text after the assistant marker.
extract_text(answer)

In [None]:
# Getting the Classification
def _build_classification_pipeline(merged_model, tokenizer):
    """Create the greedy text-generation pipeline used for classification."""
    return pipeline(task="text-generation",
                    model=merged_model,
                    tokenizer=tokenizer,
                    max_new_tokens=100,
                    do_sample=False,  # deterministic greedy decoding (replaces temperature=0.0)
                    )


def get_classification_finetuning(data_point, merged_model, tokenizer, pipe=None):
    """
    Gets the classification for a data point using the fine-tuned model.

    Args:
        data_point: One row of the test frame; its 'text' entry is the prompt.
        merged_model: Model handed to the text-generation pipeline.
        tokenizer: Tokenizer handed to the text-generation pipeline.
        pipe: Optional ready-made pipeline. Pass one when classifying many
            rows so it is not rebuilt on every call; when omitted, a pipeline
            is created from merged_model and tokenizer.

    Returns:
        The same data_point with two added entries: 'Prediction_Finetune'
        (the raw generated text) and 'Prediction_Finetune_Cleaned' (the
        result of extract_text on it).
    """
    if pipe is None:
        pipe = _build_classification_pipeline(merged_model, tokenizer)
    result = pipe(data_point['text'], pad_token_id=pipe.tokenizer.eos_token_id)
    answer = result[0]['generated_text']
    # data_point.name is the row label. data_point.index holds the column
    # names of the row, so index[-1] printed the last column name on every row.
    print(f"Sentence : {data_point.name}")
    print(answer)
    #answer = answer.split("=")[-1].lower()
    data_point['Prediction_Finetune'] = answer  # Assign the result to the data point
    data_point['Prediction_Finetune_Cleaned'] = extract_text(answer)

    return data_point


# Build the pipeline once and reuse it for every row instead of constructing
# a new one per sentence.
classification_pipe = _build_classification_pipeline(merged_model, tokenizer)

# Apply the classification function to every row of the test split.
test_dataset = test_dataset.apply(
    lambda row: get_classification_finetuning(row, merged_model, tokenizer, pipe=classification_pipe),
    axis=1,
)

In [None]:
# Inspect the test split together with the prediction columns added by the classification cell.
test_dataset

In [None]:
# Write the test-set predictions to a CSV file in the current working directory (row index omitted).
# NOTE(review): the file name says "SmolLM2" although `model_name` in this
# notebook is an OLMo checkpoint — confirm the intended file name.
test_dataset.to_csv('SmolLM2_FT_Test_LoRA2.csv', index=False)
