# Building Reflection by Finetuning Gemma-2-2b

## Installing dependencies

In [None]:
!pip install transformers datasets peft trl bitsandbytes

## Importing necessary libraries

In [3]:
from dataclasses import dataclass, field
from typing import Optional
import torch

from transformers import AutoTokenizer, HfArgumentParser, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


## Writing important arguments

In [3]:

@dataclass
class ScriptArguments:
    """
    These arguments vary depending on how many GPUs you have, what their capacity and features are, and what size model you want to train.
    """
    per_device_train_batch_size: Optional[int] = field(default=4)
    per_device_eval_batch_size: Optional[int] = field(default=2)
    gradient_accumulation_steps: Optional[int] = field(default=4)
    learning_rate: Optional[float] = field(default=2e-4)
    max_grad_norm: Optional[float] = field(default=0.3)
    weight_decay: Optional[int] = field(default=0.001)
    lora_alpha: Optional[int] = field(default=16)
    lora_dropout: Optional[float] = field(default=0.1)
    lora_r: Optional[int] = field(default=8)
    max_seq_length: Optional[int] = field(default=2048)
    model_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc."
        }
    )
    dataset_name: Optional[str] = field(
        default="gsayak/pratibimb-test",
        # default="mayura-ai/pratibimb",
        metadata={"help": "The preference dataset to use."},
    )
    fp16: Optional[bool] = field(
        default=True,
        metadata={"help": "Enables fp16 training."},
    )
    bf16: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables bf16 training."},
    )
    # packing: Optional[bool] = field(
    #     default=True,
    #     metadata={"help": "Use packing dataset creating."},
    # )
    gradient_checkpointing: Optional[bool] = field(
        default=True,
        metadata={"help": "Enables gradient checkpointing."},
    )
    use_flash_attention_2: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables Flash Attention 2."},
    )
    optim: Optional[str] = field(
        default="paged_adamw_32bit",
        metadata={"help": "The optimizer to use."},
    )
    lr_scheduler_type: str = field(
        default="constant",
        metadata={"help": "Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis"},
    )
    max_steps: int = field(default=100, metadata={"help": "How many optimizer update steps to take"})
    warmup_ratio: float = field(default=0.03, metadata={"help": "Fraction of steps to do a warmup for"})
    save_steps: int = field(default=50, metadata={"help": "Save checkpoint every X updates steps."})
    logging_steps: int = field(default=10, metadata={"help": "Log every X updates steps."})
    output_dir: str = field(
        default="./results",
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
    )


In [4]:
parser = HfArgumentParser(ScriptArguments)

# Check if running in a Jupyter notebook
import sys
if 'ipykernel' in sys.modules:
    # In a Jupyter notebook, provide a list of arguments or an empty list
    script_args = parser.parse_args_into_dataclasses(args=[])[0]
else:
    # When running as a script, parse the actual command-line arguments
    script_args = parser.parse_args_into_dataclasses()[0]

In [6]:
dataset = load_dataset(script_args.dataset_name, split="train")

In [7]:
total_dataset = dataset.train_test_split(test_size=0.2)

In [8]:
total_dataset['train'].to_json('training_data.json')
total_dataset['test'].to_json('testing_data.json')

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 25.81ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 143.93ba/s]


402276

In [9]:
# Load the GG model - this is the local one, update it to the one on the Hub
model_id = "google/gemma-2-2b-it"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)


In [None]:
!pip install -U bitsandbytes

In [11]:
from huggingface_hub import login
login(token="HUGGINGFACE_TOKEN")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.


Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [12]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    torch_dtype=torch.float32,
    attn_implementation="sdpa" if not script_args.use_flash_attention_2 else "flash_attention_2",

)

# Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="right")
# tokenizer.pad_token_id = tokenizer.eos_token_id

`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.42s/it]


In [14]:
tokenizer.special_tokens_map

{'bos_token': '<bos>',
 'eos_token': '<eos>',
 'unk_token': '<unk>',
 'pad_token': '<pad>',
 'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']}

In [15]:
special_tokens_dict = {'additional_special_tokens': ['<thinking_start>','<thinking_end>','<reflection_start>','<reflection_end>' ,'<output_start>','<output_end>', '<start_of_turn>', '<end_of_turn>']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

Embedding(256006, 2304, padding_idx=0)

In [16]:
tokenizer.special_tokens_map

{'bos_token': '<bos>',
 'eos_token': '<eos>',
 'unk_token': '<unk>',
 'pad_token': '<pad>',
 'additional_special_tokens': ['<thinking_start>',
  '<thinking_end>',
  '<reflection_start>',
  '<reflection_end>',
  '<output_start>',
  '<output_end>',
  '<start_of_turn>',
  '<end_of_turn>']}

In [17]:
tokenizer.bos_token

'<bos>'

In [18]:
def apply_prompt_template(element):
    template = tokenizer.bos_token

    for message in element["messages"]:
        role = message["role"]
        content = message["content"]
        message_type = message["type"]

        if role == "user":
            template += f"<start_of_turn>{role}\n{content}<end_of_turn>\n"

        elif role == "model":
            if not any(tag in template for tag in ("<thinking_start>", "<reflection_starts>", "<output_starts>")):
                template += f"<start_of_turn>{role}\n"  # Only one start of turn for model

            if message_type == "thinking":
                template += f"<thinking_start>\n{content}\n<thinking_ends>\n"
            elif message_type == "reflection":
                template += f"<reflection_start>\n{content}\n<reflection_end>\n"
            elif message_type == "output":
                template += f"<output_start>\n{content}\n<output_end>\n"

    template += f"<end_of_turn>{tokenizer.eos_token}"
    return template

In [19]:
train_dataset = load_dataset('json', data_files='training_data.json', split='train')
test_dataset = load_dataset('json', data_files='testing_data.json', split='train')

Generating train split: 648 examples [00:00, 86293.78 examples/s]
Generating train split: 162 examples [00:00, 54345.14 examples/s]


In [20]:
train_dataset

Dataset({
    features: ['messages'],
    num_rows: 648
})

In [21]:
test_dataset

Dataset({
    features: ['messages'],
    num_rows: 162
})

In [25]:
max_length = model.config.max_position_embeddings

In [27]:
def preprocess_function(examples):
    inputs = []
    targets = []

    for element in examples['messages']:
        template = apply_prompt_template({'messages': element})
        start_of_model = "<start_of_turn>model\n"

        # Check if the template can be split correctly
        if start_of_model in template:
            user_input, model_output = template.split(start_of_model, 1)
        else:
            # Skip this example or handle the error
            continue

        # Construct the full input
        full_input = user_input + start_of_model + model_output

        # Ensure neither input nor model_output is empty
        if full_input.strip() and model_output.strip():
            inputs.append(full_input)
            targets.append(model_output)
        else:
            # Skip empty inputs/outputs or handle the error
            continue

    model_inputs = tokenizer(inputs, padding=True, truncation=True, max_length=max_length, add_special_tokens=False)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, padding=True, truncation=True, max_length=max_length, add_special_tokens=False)

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# Apply the function to the dataset
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

Map: 100%|██████████| 648/648 [00:00<00:00, 726.59 examples/s]
Map: 100%|██████████| 162/162 [00:00<00:00, 850.58 examples/s]


In [36]:
lora_config = LoraConfig(
    r=script_args.lora_r,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM",
    lora_alpha=script_args.lora_alpha,
    lora_dropout=script_args.lora_dropout
)

# train_dataset = load_dataset('gsayak/reflection', split="train")
# train_dataset['train'].to_json('new_data.json')
# train_dataset = load_dataset('json', data_files='new_data.json', split="train")

# TODO: make that configurable
YOUR_HF_USERNAME = "gsayak"
output_dir = f"{YOUR_HF_USERNAME}/"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    optim=script_args.optim,
    save_steps=script_args.save_steps,
    logging_steps=script_args.logging_steps,
    learning_rate=script_args.learning_rate,
    max_grad_norm=script_args.max_grad_norm,
    max_steps=script_args.max_steps,
    warmup_ratio=script_args.warmup_ratio,
    lr_scheduler_type=script_args.lr_scheduler_type,
    gradient_checkpointing=script_args.gradient_checkpointing,
    fp16=script_args.fp16,
    bf16=script_args.bf16,
)


In [38]:
tokenized_train_dataset.column_names

['messages', 'input_ids', 'attention_mask', 'labels']

In [39]:
# Remove the 'messages' column from both the train and test datasets
tokenized_train_dataset = tokenized_train_dataset.remove_columns(["messages"])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(["messages"])

# Verify the column names
print(tokenized_train_dataset.column_names)
print(tokenized_test_dataset.column_names)

['input_ids', 'attention_mask', 'labels']
['input_ids', 'attention_mask', 'labels']


In [41]:
tokenized_train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 648
})

In [44]:
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    peft_config=lora_config,
    tokenizer=tokenizer,
    max_seq_length=script_args.max_seq_length,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


In [47]:
trainer.train()

It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,0.9088
20,0.5571
30,0.4836
40,0.4634
50,0.3978
60,0.3367
70,0.3414
80,0.3354
90,0.2532
100,0.2365


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


TrainOutput(global_step=100, training_loss=0.43138901472091673, metrics={'train_runtime': 1162.1535, 'train_samples_per_second': 1.377, 'train_steps_per_second': 0.086, 'total_flos': 4.57510917390336e+16, 'train_loss': 0.43138901472091673, 'epoch': 2.4691358024691357})

In [48]:
trainer.save_model('results')



# Testing the MODEL

In [None]:
tokenizer2 = AutoTokenizer.from_pretrained("./results", local_files_only=True)

In [51]:
from peft import PeftModel, PeftConfig

In [52]:
config = PeftConfig.from_pretrained("./results", local_files_only=True)
base_model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b-it")
model.resize_token_embeddings(len(tokenizer2))
model2 = PeftModel.from_pretrained(model, "./results", local_files_only=True)

In [53]:
question = [
    {
    "role" : "user",
    "content" : "Calculate the derivative of the function f(x) = 3x^2 sin(x) using the product rule and chain rule"
    }]

In [54]:
prompt = tokenizer2.apply_chat_template(question, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
print(prompt)

tensor([[     2,    106,   1645,    108,  51654,    573,  33635,    576,    573,
           1411,    517, 235278, 235297, 235275,    589, 235248, 235304, 235297,
         235393, 235284,   3270, 235278, 235297, 235275,   2177,    573,   3225,
           6933,    578,   9488,   6933,    107,    108,    106,   2516,    108]],
       device='cuda:0')


In [55]:
print(tokenizer2.decode(model2.generate(prompt, max_new_tokens=1000)[0]))

<bos><start_of_turn>user
Calculate the derivative of the function f(x) = 3x^2 sin(x) using the product rule and chain rule<end_of_turn>
<start_of_turn>model
<thinking_start>
To find the derivative of the given function, we will apply the product rule and chain rule. The product rule states that if we have a function of the form f(x) = u(x)v(x), then the derivative is given by f'(x) = u'(x)v(x) + u(x)v'(x). In this case, we have u(x) = 3x^2 and v(x) = sin(x).

First, we will find the derivatives of u(x) and v(x) separately. The derivative of u(x) = 3x^2 is u'(x) = 6x, and the derivative of v(x) = sin(x) is v'(x) = cos(x).

Now, we will apply the product rule to find the derivative of f(x). We have:

f'(x) = u'(x)v(x) + u(x)v'(x)
= 6x sin(x) + 3x^2 cos(x)

Next, we will apply the chain rule to the second term in the product rule. The chain rule states that if we have a function of the form f(x) = g(h(x)), then the derivative is given by f'(x) = g'(h(x))h'(x). In this case, we have g(x) =

In [None]:
prompt = tokenizer.apply_chat_template(question, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
print(prompt)

In [None]:
print(tokenizer.decode(model.generate(prompt, max_new_tokens=1000)[0]))

# Uploading the model to Huggingface

In [4]:
from huggingface_hub import HfApi
# os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')
api = HfApi()

In [6]:
api.upload_folder(
    folder_path="results",
    repo_id="gsayak/pratibimb-sdpa",
    repo_type="model"
)



adapter_model.safetensors:   0%|          | 0.00/4.76G [00:00<?, ?B/s]
adapter_model.safetensors:   0%|          | 16.4k/4.76G [00:00<22:38:42, 58.4kB/s]
training_args.bin: 100%|██████████| 5.43k/5.43k [00:01<00:00, 4.66kB/s].51MB/s]   
tokenizer.json: 100%|██████████| 17.5M/17.5M [00:05<00:00, 3.15MB/s], 4.52MB/s]
adapter_model.safetensors: 100%|██████████| 4.76G/4.76G [03:07<00:00, 25.4MB/s]


Upload 3 LFS files: 100%|██████████| 3/3 [03:08<00:00, 62.85s/it] 


CommitInfo(commit_url='https://huggingface.co/gsayak/pratibimb-sdpa/commit/8ab1c4066290ef7ba4866c25740250cc9d9d1145', commit_message='Upload folder using huggingface_hub', commit_description='', oid='8ab1c4066290ef7ba4866c25740250cc9d9d1145', pr_url=None, repo_url=RepoUrl('https://huggingface.co/gsayak/pratibimb-sdpa', endpoint='https://huggingface.co', repo_type='model', repo_id='gsayak/pratibimb-sdpa'), pr_revision=None, pr_num=None)