In [None]:
print("Step 1: Installing and upgrading libraries...")
!pip install -q --upgrade transformers bitsandbytes accelerate peft trl datasets

print("\nStep 2: Importing libraries...")
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer
import os

print("\nStep 3: Loading Gemma model and tokenizer...")
model_name = "google/gemma-2b-it"

# 4-bit NF4 quantization settings handed to from_pretrained below; the
# compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)
# The generation cache is switched off on the model config before training.
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token and pad on the right-hand side.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

print("\nStep 4: Loading and formatting the dataset...")
dataset_path = "pandas_dataset.jsonl"

# Stop early with an actionable message when the data file has not been
# uploaded, instead of letting the loader fail later.
dataset_file_missing = not os.path.exists(dataset_path)
if dataset_file_missing:
    raise FileNotFoundError(f"Error: {dataset_path} not found. Please upload it to the Colab session.")

# Read the JSON-lines file through the generic "json" builder, keeping only
# the "train" split.
dataset = load_dataset(path="json", data_files=dataset_path, split="train")

def format_chat_template(row):
    """Render one dataset row as a single Gemma chat-formatted training string.

    Args:
        row: Mapping with an ``instruction`` entry, an ``output`` entry and an
            optional ``input`` entry (extra context such as a code snippet).

    Returns:
        str: A user turn holding the instruction (followed by the input on its
        own line when one is present) and a model turn holding ``output``.

    Raises:
        KeyError: If ``instruction`` or ``output`` is missing from ``row``.
    """
    user_turn = row["instruction"]

    # Only append the extra input when there is something to append. Without
    # this guard a missing key raises KeyError, a None value is rendered as
    # the literal text "None", and an empty string leaves a dangling newline
    # that the input-less inference prompt in this notebook does not contain.
    extra_input = row.get("input")
    if extra_input is not None and str(extra_input).strip():
        user_turn = f"{user_turn}\n{extra_input}"

    return (
        f"<bos><start_of_turn>user\n{user_turn}<end_of_turn>\n"
        f"<start_of_turn>model\n{row['output']}<end_of_turn><eos>"
    )

print("\nStep 5: Configuring LoRA...")
# Module names that the LoRA configuration targets.
lora_target_modules = ["q_proj", "o_proj", "k_proj", "v_proj"]

# Rank 8, alpha 32, dropout 0.05, for a causal language-modelling task.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
)

# Wrap the loaded model with the adapter configuration defined above.
model = get_peft_model(model, lora_config)

print("\nStep 6: Configuring Training Arguments...")
training_args = TrainingArguments(
    output_dir="./pandas_tutor_gemma",
    num_train_epochs=1,
    # Batch size 1 with 8 accumulation steps: gradients from 8 sequences are
    # combined per optimizer step (per device).
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    optim="paged_adamw_32bit",
    logging_steps=10,
    learning_rate=2e-4,
    # NOTE(review): fp16 mixed precision is requested here while the 4-bit
    # compute dtype configured in Step 3 is bfloat16 - confirm this pairing is
    # intended for the target GPU.
    fp16=True,
    push_to_hub=False,
    # Turn off auto-detected experiment trackers. Without this, an installed
    # wandb integration stops trainer.train() on an interactive API-key prompt,
    # which breaks unattended "Restart & Run All" execution.
    report_to="none",
    gradient_checkpointing=True,
    # Non-reentrant checkpointing, kept from the original configuration as a
    # fix for a potential reentrant error.
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

print("\nStep 7: Creating SFTTrainer...")
# `model` was already wrapped with the LoRA adapters by get_peft_model(...) in
# Step 5, so the same LoraConfig is deliberately NOT passed again through
# `peft_config`: that argument is meant for wrapping a plain base model, and
# supplying it together with an existing PeftModel is redundant.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    args=training_args,
    # Each row is rendered to one chat-formatted training string.
    formatting_func=format_chat_template,
)

# Run the fine-tuning loop, bracketed by banner lines so the training logs are
# easy to locate in the cell output.
print("\n\n==================== STARTING TRAINING ====================")
trainer.train()
print("==================== TRAINING COMPLETE ====================\n")

print("Step 8: Saving the trained adapter weights...")
adapter_model_name = "pandas-tutor-gemma-adapters"

# Persist the trainer's model into the adapter directory named above.
trained_model = trainer.model
trained_model.save_pretrained(adapter_model_name)

print(f"Adapter model saved successfully to '{adapter_model_name}'")

Step 1: Installing and upgrading libraries...

Step 2: Importing libraries...

Step 3: Loading Gemma model and tokenizer...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


Step 4: Loading and formatting the dataset...

Step 5: Configuring LoRA...

Step 6: Configuring Training Arguments...

Step 7: Creating SFTTrainer...


Applying formatting function to train dataset:   0%|          | 0/181 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/181 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/181 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/181 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.






<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mcsmishra952[0m ([33mcsmishra952-odisha-university-of-technology-and-cet[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,6.0323
20,4.2857





Step 8: Saving the trained adapter weights...
Adapter model saved successfully to 'pandas-tutor-gemma-adapters'


In [None]:
from peft import PeftModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

print("--- Loading base model for inference ---")
base_model_name = "google/gemma-2b-it"

# Same 4-bit NF4 / bfloat16 quantization settings as used when loading the
# model for training.
inference_quant_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    quantization_config=inference_quant_config,
)

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# EOS doubles as the padding token; the original marks this as important for
# generation.
tokenizer.pad_token = tokenizer.eos_token

print("--- Loading fine-tuned adapters ---")
# Attach the saved LoRA adapters on top of the quantized base model.
adapter_model_name = "pandas-tutor-gemma-adapters"
model = PeftModel.from_pretrained(model, adapter_model_name)

def generate_response(instruction, code_input="", max_new_tokens=100):
    """Generate the fine-tuned model's reply to one instruction.

    Uses the module-level ``model`` and ``tokenizer`` loaded earlier in this
    cell.

    Args:
        instruction: Natural-language task description placed in the user turn.
        code_input: Optional extra context (e.g. a Pandas snippet) appended on
            its own line after the instruction; omitted when empty.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        str: The decoded text of the newly generated tokens only.
    """
    user_turn = f"{instruction}\n{code_input}" if code_input else instruction

    # Same chat layout as the training formatter in this notebook.
    # NOTE(review): the prompt embeds a literal <bos>; the tokenizer may add
    # another one on its own - kept as-is to mirror the training strings,
    # confirm whether a double BOS is intended.
    prompt = f"<bos><start_of_turn>user\n{user_turn}<end_of_turn>\n<start_of_turn>model\n"

    # Move inputs to wherever the model actually lives instead of assuming
    # "cuda"; the model was loaded with device_map="auto".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Decode only the tokens after the prompt. Splitting the full decoded text
    # on "model\n" would truncate any answer that itself contains that text.
    prompt_length = inputs["input_ids"].shape[1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)


# --- Test Case 1: Code-to-NL ---
instruction = "Explain what this Pandas code does."
code_input = "df.groupby('department')['salary'].agg(['mean', 'max'])"

print("\n--- Generating response for Test Case 1 (Code-to-NL) ---")
model_response = generate_response(instruction, code_input, max_new_tokens=100)
print(f"Instruction: {instruction}")
print(f"Input Code: {code_input}")
print(f"Model's Explanation:\n{model_response}")


# --- Test Case 2: NL-to-Code ---
instruction = "Write the Pandas code to select the 'name' and 'age' columns from a dataframe named df."
code_input = ""

print("\n--- Generating response for Test Case 2 (NL-to-Code) ---")
model_response = generate_response(instruction, code_input, max_new_tokens=50)
print(f"Instruction: {instruction}")
print(f"Model's Generated Code:\n{model_response}")

--- Loading base model for inference ---


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

--- Loading fine-tuned adapters ---

--- Generating response for Test Case 1 (Code-to-NL) ---
Instruction: Explain what this Pandas code does.
Input Code: df.groupby('department')['salary'].agg(['mean', 'max'])
Model's Explanation:
This code is used to calculate the mean and maximum salary for each department.
It uses the groupby method to group the data by department and then uses the aggregate method to calculate the mean and maximum salary for each department.

--- Generating response for Test Case 2 (NL-to-Code) ---
Instruction: Write the Pandas code to select the 'name' and 'age' columns from a dataframe named df.
Model's Generated Code:
Sure, here is the code to select the 'name' and 'age' columns from a dataframe named df:
```python
df['name']
df['age']
```


In [None]:
from peft import PeftModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

print("--- Merging the adapter with the base model ---")

# Reload the base model unquantized (bfloat16) so the adapters can be merged
# into it.
base_model_name = "google/gemma-2b-it"
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Attach the saved LoRA adapters to the freshly loaded base model.
adapter_model_name = "pandas-tutor-gemma-adapters"
adapter_model = PeftModel.from_pretrained(base_model, adapter_model_name)

# Merge the LoRA layers into the base model and drop the PEFT wrapper.
merged_model = adapter_model.merge_and_unload()

print("--- Saving the merged model ---")
# Local directory name for the final, merged model.
final_model_name = "Pandas-Tutor-Gemma-2B"

# Load the tokenizer in this cell instead of relying on a `tokenizer` variable
# left behind by an earlier cell; otherwise the cell fails with NameError on a
# fresh kernel. This also means the saved tokenizer carries the base
# checkpoint's own settings rather than the pad_token override applied for
# generation in the inference cell.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Save the merged model and its tokenizer side by side.
merged_model.save_pretrained(final_model_name)
tokenizer.save_pretrained(final_model_name)

print(f"Merged model and tokenizer saved to '{final_model_name}'")

--- Merging the adapter with the base model ---


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

--- Saving the merged model ---
Merged model and tokenizer saved to 'Pandas-Tutor-Gemma-2B'


In [None]:
# Authenticate this session with the Hugging Face Hub for the upload performed
# in the next cell. notebook_login() renders an interactive login widget, so
# this step needs manual input and cannot run unattended.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import os

from huggingface_hub import HfApi

# The local folder containing my final model
final_model_name = "Pandas-Tutor-Gemma-2B"

hf_repo_name = "csmishra952/Pandas-Tutor-Gemma-2B"

# Upload from the saved folder rather than from in-memory `merged_model` /
# `tokenizer` objects, so this cell still works after a kernel restart as long
# as the merge-and-save cell has produced the folder.
if not os.path.isdir(final_model_name):
    raise FileNotFoundError(
        f"Error: {final_model_name} not found. Run the merge-and-save cell first."
    )

print(f"Uploading model to Hugging Face repository: {hf_repo_name}")

# Uploading the folder (the repository is created first if it does not exist)
hub_api = HfApi()
hub_api.create_repo(repo_id=hf_repo_name, repo_type="model", exist_ok=True)
hub_api.upload_folder(
    folder_path=final_model_name,
    repo_id=hf_repo_name,
    repo_type="model",
)

print("--- Upload Complete! ---")
print(f"You can find your model at: https://huggingface.co/{hf_repo_name}")

Uploading model to Hugging Face repository: csmishra952/Pandas-Tutor-Gemma-2B


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/34.4M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

--- Upload Complete! ---
You can find your model at: https://huggingface.co/csmishra952/Pandas-Tutor-Gemma-2B
