In [None]:
# Project Name: Chain of Thought Reasoning
# Team Number: 6
# Members: Anvesh, Eshita, Neha, Sandeep, Saumya

# File Name: airflow-llama-ft.ipynb
# File Usage: 
# This is a Jupyter Notebook file that contains the code for fine-tuning the Llama model using the Hugging Face library.
# It includes the necessary libraries, data loading, model training, and evaluation steps.
# It is used to train the Llama model on a DPO specific dataset.
# This is triggered remotely on Kaggle using the Airflow DAG.

# Metadata:
# {
#     "id": "eshitagupta151991/airflow-llama-ft-v2",
#     "title": "airflow-llama-ft-v2",
#     "code_file": "airflow-llama-ft.ipynb",
#     "language": "python",
#     "kernel_type": "notebook",
#     "is_private": true,
#     "enable_gpu": true,
#     "enable_tpu": false,
#     "enable_internet": true,
#     "dataset_sources": ["eshitagupta151991/secrets"],
#     "competition_sources": [],
#     "kernel_sources": [],
#     "model_sources": []
#   }

# Train Model with DPO - LLaMA-3.2-1B-Instruct

In [None]:
!pip install -q  datasets trl accelerate

In [1]:
from datasets import load_dataset
from trl import DPOConfig, DPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

2025-05-08 06:10:04.201306: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746684604.224662     310 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746684604.231785     310 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
import json

with open('/kaggle/input/secrets/secrets.json') as f:
    secrets = json.load(f)

HF_TOKEN = secrets['HF_TOKEN']
WANDB_API_KEY = secrets['WANDB_API_KEY']
AWS_ACCESS_KEY_ID = secrets['AWS_ACCESS_KEY_ID']
AWS_SECRET_ACCESS_KEY = secrets['AWS_SECRET_ACCESS_KEY']

In [None]:
from huggingface_hub import login
hf_token = HF_TOKEN
login(token=hf_token)

In [None]:
import os
import wandb

os.environ["WANDB_API_KEY"] = WANDB_API_KEY
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mshtgupta8[0m ([33mshtgupta8-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
import boto3
s3 = boto3.client(
    's3',
    aws_access_key_id= AWS_ACCESS_KEY_ID,
    aws_secret_access_key= AWS_SECRET_ACCESS_KEY
)

bucket_name = 'finetuning-stage'

In [5]:
import io
from io import BytesIO
import pyarrow as pa
import pyarrow.parquet as pq
from datasets import DatasetDict, Dataset

train_file_key = 'train_dataset.parquet'
valid_file_key = 'valid_dataset.parquet'

def load_parquet_from_s3(bucket_name, file_key):
    # Create an in-memory buffer
    buffer = BytesIO()
    
    # Download the file from S3 into the buffer
    s3.download_fileobj(bucket_name, file_key, buffer)
    
    # Move the buffer cursor to the beginning
    buffer.seek(0)
    
    # Load the Parquet data into a pandas DataFrame using pyarrow
    table = pq.read_table(buffer)
    df = table.to_pandas()
    
    return df

# Load the train and valid datasets from S3
train_df = load_parquet_from_s3(bucket_name, train_file_key)
valid_df = load_parquet_from_s3(bucket_name, valid_file_key)

# Convert the pandas DataFrames to Hugging Face Datasets
train_ds = Dataset.from_pandas(train_df)
valid_ds = Dataset.from_pandas(valid_df)

# Combine into a DatasetDict (similar to load_dataset)
dataset = DatasetDict({
    'train': train_ds,
    'valid': valid_ds,
})

# Print the dataset to verify
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'rejected', '__index_level_0__'],
        num_rows: 27
    })
    valid: Dataset({
        features: ['prompt', 'chosen', 'rejected'],
        num_rows: 3
    })
})


In [None]:
# dataset = load_dataset("Eshita-ds/cot-llm-dpo-dataset-final")

In [None]:
!rm -rf /kaggle/working/*

In [6]:
model_name = "Eshita-ds/Llama-3.2-1B-DPO"

# model = AutoModelForCausalLM.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
model.gradient_checkpointing_enable()
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token # set pad token

In [7]:
def format_chat_prompt(user_input, system_message="You are a helpful AI Tutor."):

    # Format user message
    user_prompt = f"<|chat_start|>user\n{user_input}<|chat_end|>\n"

    # Start assistant's turn
    tutor_prompt = "<|chat_start|>tutor\n"

    # Combine prompts
    formatted_prompt = user_prompt + tutor_prompt

    return formatted_prompt

In [8]:
# Set up text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Example prompt
prompt = format_chat_prompt(dataset['valid']['prompt'][0][0]['content'])

# Generate output
outputs = generator(prompt, max_length=700, truncation=True, num_return_sequences=1, temperature=0.7)

print(outputs[0]['generated_text'])

Device set to use cuda:0


<|chat_start|>user
You are an AI tutor that thinks and provides detailed and step-by-step explanations for the provided maths question.

**Question**: what is 2+2 ?<|chat_end|>
<|chat_start|>tutor
**Answer**: The result of 2+2 is 4.

Here's a step-by-step explanation:

1. We are given the numbers 2 and 2.
2. We need to find the sum of these two numbers.
3. When we add two numbers, we add their individual digits together.
4. So, 2 + 2 = 2 + 0 + 0 = 2 + 2 = 4.

In other words, when we combine two identical numbers, the result is always the same number.

**Why is this true?** Because when you add two identical numbers, you are essentially adding the same number twice. The result of this operation is always the same, which is the original number.

**Example:** 2 + 2 = 4, 3 + 3 = 6, 4 + 4 = 8

I hope this explanation helps clarify the answer to the question. Do you have any further questions or would you like to try another question?


In [9]:
ft_model_name = model_name.split('/')[1].replace("Instruct", "DPO")

training_args = DPOConfig(
    output_dir=ft_model_name,
    logging_steps=25,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    load_best_model_at_end=False,
    metric_for_best_model="eval_loss",
    save_strategy="steps",  # Change to steps for finer control
    save_steps=5000,
    eval_strategy="steps",
    eval_steps=1000,
    gradient_checkpointing=True,
    save_only_model=True,
    save_total_limit=1
)

device = torch.device('cuda')

In [10]:
trainer = DPOTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset['valid'],
)
trainer.train()

Extracting prompt in train dataset:   0%|          | 0/27 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/27 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/27 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/3 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/3 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/3 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss


TrainOutput(global_step=9, training_loss=0.5953776041666666, metrics={'train_runtime': 248.4117, 'train_samples_per_second': 0.326, 'train_steps_per_second': 0.036, 'total_flos': 0.0, 'train_loss': 0.5953776041666666, 'epoch': 2.2962962962962963})

In [11]:
# Load the fine-tuned model
ft_model = trainer.model

In [None]:
from datetime import datetime

timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")

model_id = f"Eshita-ds/{ft_model_name}-{timestamp}"
trainer.push_to_hub(model_id)

training_args.bin:   0%|          | 0.00/6.20k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/15.7M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1746684677.23f90a5e6159.310.0:   0%|          | 0.00/6.45k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Eshita-ds/Llama-3.2-1B-DPO/commit/4a55e273fe340fbdb320231212ee3c67d889b647', commit_message='Eshita-ds/Llama-3.2-1B-DPO-V2', commit_description='', oid='4a55e273fe340fbdb320231212ee3c67d889b647', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Eshita-ds/Llama-3.2-1B-DPO', endpoint='https://huggingface.co', repo_type='model', repo_id='Eshita-ds/Llama-3.2-1B-DPO'), pr_revision=None, pr_num=None)

In [None]:
s3.delete_object(Bucket=bucket_name, Key="train_dataset.parquet")
s3.delete_object(Bucket=bucket_name, Key="valid_dataset.parquet")