In [1]:
!pip install datasets
!pip install trl
!pip install peft

[0m

In [2]:
from huggingface_hub import login
import json

with open("config.json", "r") as config_file:
    config = json.load(config_file)
    access_token = config["HF_ACCESS_TOKEN"]

login(token=access_token)

In [3]:
import os
from dataclasses import dataclass, field
from typing import Optional
from datasets.arrow_dataset import Dataset
import torch
from datasets import load_dataset
from peft import LoraConfig
from peft import AutoPeftModelForCausalLM
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
)

from trl import SFTTrainer
from argparse import Namespace


script_args = Namespace(
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=4,
    learning_rate=3e-4,
    max_grad_norm=0.3,
    weight_decay=0.01,
    lora_alpha=16,
    lora_dropout=0.0,
    lora_r=8,
    max_seq_length=256,
    model_name="meta-llama/Llama-3.2-1B-Instruct",
    tokenizer_path="tokenizer.model",
    # dataset_name="tatsu-lab/alpaca",
    dataset_name="instruction-data.json",
    device_map="cuda",
    use_4bit=True,
    bnb_4bit_compute_dtype="float16",
    num_train_epochs=10,
    fp16=False,
    bf16=True,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    max_steps=200,
    warmup_steps=50,
    group_by_length=True, # Group sequences into batches with same length
    eval_steps=10,
    save_steps=10,
    logging_steps=10, # Log every X updates steps
    report_to="wandb",
    output_dir="./results_packing",
)

2025-01-09 02:15:24.974573: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-09 02:15:24.974627: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-09 02:15:24.975660: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-09 02:15:24.981234: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
import os
import json

def split_data(args):
    """ Split the saved data into train and val"""
    with open(args.dataset_name,"r") as f:
        ds=json.load(f)
    train_split=int(0.9*len(ds))
    ds_train=ds[:train_split]
    ds_val=ds[train_split:]

    # write into json file
    train_file="instruction-data-train.json"
    val_file="instruction-data-val.json"

    with open(train_file, "w") as train_f:
        json.dump(ds_train, train_f, indent=4)
    print(f"Training data saved to {train_file}")

    # write validation data to a JSON file
    with open(val_file, "w") as val_f:
        json.dump(ds_val, val_f, indent=4)
    print(f"Validation data saved to {val_file}")

split_data(script_args)

def gen_train_input():
    """ Format all data input in alpaca style
        Return: a data object which can be accessed via for loop
    """
    # load data
    data_file="instruction-data-train.json"
    with open(data_file,"r") as f:
        ds= json.load(f)

    for sample in iter(ds):
        # extract instruction, input and output text
        instruction=sample['instruction']
        input_text=sample['input']
        output_text=sample['output']
        formatted_prompt=None

        if input_text is None or input_text == "":
            formatted_prompt=(
                f"<|start_header_id|>user<|end_header_id|>\n\n"
                f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
                f"### Instruction:\n{instruction}\n\n"
                f"### Response:\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" # format to signal the model's response
                f"{output_text}<|eot_id|><|end_of_text|>"
            )
        else:
            formatted_prompt=(
                f"<|start_header_id|>user<|end_header_id|>\n\n"
                f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
                f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n"
                f"### Response:\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
                f"{output_text}<|eot_id|><|end_of_text|>"
            )
        formatted_prompt="".join(formatted_prompt) # exclude trailing white spaces
        yield {'text': formatted_prompt}           # stream text into the dataloader, one by one

        
def gen_val_input():
    """ Format all data input in alpaca style
        Return: a data object which can be accessed via for loop
    """
    # load data
    data_file="instruction-data-val.json"
    with open(data_file,"r") as f:
        ds= json.load(f)

    for sample in iter(ds):
        # extract instruction, input and output text
        instruction=sample['instruction']
        input_text=sample['input']
        output_text=sample['output']
        formatted_prompt=None

        if input_text is None or input_text == "":
            formatted_prompt=(
                f"<|start_header_id|>user<|end_header_id|>\n\n"
                f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
                f"### Instruction:\n{instruction}\n\n"
                f"### Response:\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" # format to signal the model's response
                f"{output_text}<|eot_id|><|end_of_text>|"
            )
        else:
            formatted_prompt=(
                f"<|start_header_id|>user<|end_header_id|>\n\n"
                f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
                f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n"
                f"### Response:\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
                f"{output_text}<|eot_id|><|end_of_text|>"
            )
        formatted_prompt="".join(formatted_prompt) # exclude trailing white spaces
        yield {'text': formatted_prompt} # stream text into the dataloader, one by one
        
train_gen = Dataset.from_generator(gen_train_input)
val_gen=Dataset.from_generator(gen_val_input)


Training data saved to instruction-data-train.json
Validation data saved to instruction-data-val.json


In [5]:
def create_and_prepare_model(args):
    device_map ="auto"
    model=AutoModelForCausalLM.from_pretrained(
        args.model_name,
        # quantization_config=args.bnb_config,
        device_map=args.device_map,
        token=True
    )

    peft_config=LoraConfig(
        lora_alpha=script_args.lora_alpha,
        lora_dropout=script_args.lora_dropout,
        r=script_args.lora_r,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['q_proj', 'k_proj', 'v_proj'],
        # target_modules=["query_key_value"]
    )
    tokenizer=AutoTokenizer.from_pretrained(script_args.model_name, trust_remote_code=True)
    tokenizer.add_special_tokens
    tokenizer.pad_token="<|end_of_text|>" # this token is already available in tokenizer list
    tokenizer.padding_side = "right"

    return model,peft_config,tokenizer

model,peft_config,tokenizer=create_and_prepare_model(script_args)

In [6]:
training_arguments=TrainingArguments(
    output_dir=script_args.output_dir,
    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    optim=script_args.optim,
    save_steps=script_args.save_steps,
    logging_steps=script_args.logging_steps,
    learning_rate=script_args.learning_rate,
    fp16=script_args.fp16,
    bf16=script_args.bf16,
    max_grad_norm=script_args.max_grad_norm,
    max_steps=script_args.max_steps,
    warmup_steps=script_args.warmup_steps,
    group_by_length=script_args.group_by_length,
    lr_scheduler_type=script_args.lr_scheduler_type,
    report_to="none", # prevent error with wandb
    eval_strategy="steps",  # Evaluate periodically
    eval_steps=script_args.eval_steps,  # Perform evaluation every X steps
)


trainer=SFTTrainer(
    model=model,
    train_dataset=train_gen,
    eval_dataset=val_gen,
    peft_config=peft_config,
    #dataset_text_field="text",
    #max_seq_length=script_args.max_seq_length,
    processing_class=tokenizer,
    args=training_arguments,
)

trainer.train()



Map:   0%|          | 0/110 [00:00<?, ? examples/s]

[2025-01-09 02:15:36,114] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Step,Training Loss,Validation Loss
10,3.3308,3.688517
20,3.0691,3.073312
30,2.1651,2.045742
40,1.1654,1.425161
50,0.8246,1.313111
60,0.7455,1.32016
70,0.7307,1.308774
80,0.7253,1.303558
90,0.6666,1.297198
100,0.6719,1.278009


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=200, training_loss=1.023242733478546, metrics={'train_runtime': 91.3593, 'train_samples_per_second': 70.053, 'train_steps_per_second': 2.189, 'total_flos': 2323685094236160.0, 'train_loss': 1.023242733478546, 'epoch': 6.451612903225806})

## Generate text

In [14]:
def generate(model, prompt, tokenizer, max_new_tokens, context_size=256, temperature=0.0, top_k=1, eos_id=[128001,128009]):
    """ Generate till reaching max_new_tokens or till eos_id=<|end_of_text|>"""
    # format prompt
    # formatted_prompt=(
    #     f"<|start_header_id|>user<|end_header_id|>\n\n"
    #     f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
    #     f"### Instruction:\n{prompt}"
    # )
    formatted_prompt=(
        f"<|start_header_id|>user<|end_header_id|>\n\n"
        f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
        f"### Instruction:\n{prompt}"
        f"### Response:\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    )
    idx=tokenizer.encode(formatted_prompt)
    idx=torch.tensor(idx).unsqueeze(0).to(script_args.device_map) # add batch dimension
    _,num_tokens=idx.shape
    #print("Number of input tokens: ",num_tokens)

    # For-loop is the same as before: Get logits, and only focus on last time step
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            outputs = model.forward(idx_cond)
            logits=outputs.logits
        # last time step
        logits = logits[:, -1, :]

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits)

        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if idx_next in eos_id:  # Stop generating early if <|eot_id|> or<|end_of_text|> token is encountered 
            break

        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)
        
    #print(f"Output tensor: {idx.shape}")
    # remove batch dimension
    idx_flat=idx.squeeze(0)
    generated_ids=idx_flat[num_tokens:] # take out the input prompt
    generated_text=tokenizer.decode(generated_ids)
    
    return generated_text


In [16]:
prompt="explain the function of human heart"
generate(model, prompt, tokenizer, max_new_tokens=100)

'The human heart is a muscular organ that pumps blood throughout the body, supplying oxygen and nutrients to tissues and organs. It also helps to regulate blood pressure and maintain blood flow to the brain and other vital organs.'

In [20]:
# save model dict
model_file_name="LLAMA32_fine_tuned.pth"
torch.save(model.state_dict(), model_file_name)
print(f"Model saved as {model_file_name}")

Model saved as LLAMA32_fine_tuned.pth


In [18]:
from tqdm import tqdm

# load test data
test_data_path="instruction-data-val.json"
with open(test_data_path,"r") as f:
    test_data=json.load(f)

for i,entry in tqdm(enumerate(test_data),total=len(test_data)):
    generated_text=generate(model, entry["instruction"], tokenizer, max_new_tokens=100)
    test_data[i]["model response"]=generated_text

# write into a file
test_data_path="test-data-with-response.json"

with open(test_data_path,"w") as file:
    json.dump(test_data,file, indent=4)
print(f"Response saved as {test_data_path}")



100%|██████████| 110/110 [00:35<00:00,  3.09it/s]

Response saved as test-data-with-response.json





In [19]:
# download file
from IPython.display import FileLink

# Replace 'filename.ext' with your file name
FileLink("/notebooks/Llama32-finetune/LLAMA32_fine_tuned.pt")