<a href="https://colab.research.google.com/github/cybersamurai2410/sql-instruct-qlora-llm/blob/main/peft_qlora_llm_instruct.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Instruction Tuning LLMs using QLoRA**

# Install Dependencies

In [3]:
!pip install git+https://github.com/huggingface/transformers.git
!pip install git+https://github.com/huggingface/peft.git
!pip install git+https://github.com/huggingface/accelerate.git
!pip install git+https://github.com/huggingface/trl.git
!pip install bitsandbytes datasets huggingface_hub

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-kqw58fzt
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-kqw58fzt
  Resolved https://github.com/huggingface/transformers.git to commit 0b5b5e6a70249837293499e9363a64765a57111c
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.47.0.dev0-py3-none-any.whl size=10106230 sha256=78fae9540689f346a5e1d8f5403c9f39e1a85470e28d585c09f83e810a581fc7
  Stored in directory: /tmp/pip-ephem-wheel-cache-5d4n6m3_/wheels/e7/9c/5b/e1a9c8007c343041e61cc484433d512ea9274272e3fcbe7c16
Successfully b

In [4]:
!pip install tensorboard
!pip install wandb



In [None]:
!pip show bitsandbytes
!pip show transformers
!pip show peft
!pip show accelerate
!pip show trl
!pip show datasets
!pip show huggingface_hub
!pip show wandb

Name: bitsandbytes
Version: 0.43.3
Summary: k-bit optimizers and matrix multiplication routines.
Home-page: https://github.com/TimDettmers/bitsandbytes
Author: Tim Dettmers
Author-email: dettmers@cs.washington.edu
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy, torch
Required-by: 
Name: transformers
Version: 4.42.4
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, trl
Name: peft
Version: 0.12.0
Summary: Parameter-Efficient Fine-Tuning (PEFT)
Home-page: https://github.com/huggingface

In [7]:
import torch
import bitsandbytes as bnb
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, Trainer, TrainingArguments, TrainerCallback
from peft import AutoPeftModelForCausalLM, PeftModel, PeftConfig, LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig, DataCollatorForCompletionOnlyLM

In [None]:
help(SFTTrainer)
help(SFTConfig)
help(LoraConfig)

# Load Model

In [None]:
# https://huggingface.co/tiiuae/tiiuae/falcon-11B
model_id = "tiiuae/falcon-11B"
device = 0 if torch.cuda.is_available() else -1

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, # Reduce model weights to 4-bit precision
    bnb_4bit_use_double_quant=True, # Apply additional quantization layer
    bnb_4bit_quant_type="nf4", # Normal float 4-bit format to optimize weights storage
    bnb_4bit_compute_dtype=torch.bfloat16 # 4-bit weights temporarily upscaled to brain float 16-bit for matrix computations
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto"
    )
model.config.pretraining_tp = 1 # Tensor parallelism for distributed computing; 1 degree operates on single GPU

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token # Set padding token same as end-of-sequence token
tokenizer.padding_side = "right"

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/31.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/2.43G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.30k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/448 [00:00<?, ?B/s]

In [None]:
# Allocated memory - memory currently allocated on the GPU by PyTorch and Indicates the memory directly being used for tensors and operations
# Reserved memory - total memory reserved by PyTorch on the GPU, including memory allocated and potentially available for future use.

print(f"Memory Allocated after loading model: {torch.cuda.memory_allocated()/1e9} GB")
print(f"Memory Reserved after loading model: {torch.cuda.memory_reserved()/1e9} GB")

Memory Allocated after loading model: 6.519184896 GB
Memory Reserved after loading model: 6.725566464 GB


In [None]:
# Falcon-11B model weights at full precision is around 24 GB
for name, param in model.named_parameters():
    print(f"{name}: dtype={param.dtype}, type={type(param)}")

base_model.model.transformer.word_embeddings.weight: dtype=torch.float32, type=<class 'torch.nn.parameter.Parameter'>
base_model.model.transformer.h.0.self_attention.query_key_value.base_layer.weight: dtype=torch.uint8, type=<class 'bitsandbytes.nn.modules.Params4bit'>
base_model.model.transformer.h.0.self_attention.query_key_value.lora_A.default.weight: dtype=torch.float32, type=<class 'torch.nn.parameter.Parameter'>
base_model.model.transformer.h.0.self_attention.query_key_value.lora_B.default.weight: dtype=torch.float32, type=<class 'torch.nn.parameter.Parameter'>
base_model.model.transformer.h.0.self_attention.dense.weight: dtype=torch.uint8, type=<class 'bitsandbytes.nn.modules.Params4bit'>
base_model.model.transformer.h.0.mlp.dense_h_to_4h.weight: dtype=torch.uint8, type=<class 'bitsandbytes.nn.modules.Params4bit'>
base_model.model.transformer.h.0.mlp.dense_4h_to_h.weight: dtype=torch.uint8, type=<class 'bitsandbytes.nn.modules.Params4bit'>
base_model.model.transformer.h.0.input_

In [None]:
print(model.config)

FalconConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "tiiuae/falcon-11B",
  "activation": "gelu",
  "alibi": false,
  "architectures": [
    "FalconForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "tiiuae/falcon-11B--configuration_falcon.FalconConfig",
    "AutoModel": "tiiuae/falcon-11B--modeling_falcon.FalconModel",
    "AutoModelForCausalLM": "tiiuae/falcon-11B--modeling_falcon.FalconForCausalLM",
    "AutoModelForQuestionAnswering": "tiiuae/falcon-11B--modeling_falcon.FalconForQuestionAnswering",
    "AutoModelForSequenceClassification": "tiiuae/falcon-11B--modeling_falcon.FalconForSequenceClassification",
    "AutoModelForTokenClassification": "tiiuae/falcon-11B--modeling_falcon.FalconForTokenClassification"
  },
  "bias": false,
  "bos_token_id": 11,
  "eos_token_id": 11,
  "ff_factor": 4,
  "ffn_hidden_size": 16384,
  "hidden_dropout": 0.0,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "ma

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


## Move Pretrained Model from Cache to Drive (Optional)

In [None]:
import os
from transformers import file_utils

print(f"Default cache directory: {file_utils.default_cache_path}")

# List the contents of the Hugging Face cache directory
cache_dir = "/root/.cache/huggingface/hub"
print(f"Files and directories in cache directory: {os.listdir(cache_dir)}")

# Path to the model in the cache directory
model_cache_dir = "/root/.cache/huggingface/hub/models--tiiuae--falcon-11B"

# List all files and directories within the model's cache directory
for root, dirs, files in os.walk(model_cache_dir):
    level = root.replace(model_cache_dir, '').count(os.sep)
    indent = ' ' * 4 * (level)
    print(f"{indent}{os.path.basename(root)}/")
    subindent = ' ' * 4 * (level + 1)
    for f in files:
        print(f"{subindent}{f}")


Default cache directory: /root/.cache/huggingface/hub
Files and directories in cache directory: ['version.txt', '.locks', 'models--tiiuae--falcon-11B']
models--tiiuae--falcon-11B/
    snapshots/
        64c2a7d3b48022973d881bb100ec52ec572567d1/
            model-00003-of-00005.safetensors
            model-00001-of-00005.safetensors
            config.json
            generation_config.json
            tokenizer_config.json
            model-00002-of-00005.safetensors
            model-00005-of-00005.safetensors
            tokenizer.json
            model-00004-of-00005.safetensors
            special_tokens_map.json
            model.safetensors.index.json
    blobs/
        1ad7f5e2d082410b40baa5762366f86eb506bc41
        5f0df799812b7ce0b3ae6ed859f34d188e85b4ebb2f31154b1bdf6c87e562367
        6271c1adaaa7446af30f1e3539b7f490e2b78b8d
        43299e48da013aff2037099c2a79c50788143c1eb7d39e3e7c7e29f54a8bd135
        60e1bbe9c4f99844b703e9a675fb6c6041542a3e
        7747d394a507c6e6f84d7

In [None]:
import shutil

# Define the source and destination directories
cache_dir = "/root/.cache/huggingface/hub/models--tiiuae--falcon-mamba-7b"
destination_dir = "/content/drive/MyDrive/hf_models/falcon_mamba_7b_local"

# Copy the directory
shutil.copytree(cache_dir, destination_dir)

print(f"Model files copied to {destination_dir}")


Model files copied to /content/drive/MyDrive/hf_models/falcon_mamba_7b_local


## Run Base Model

In [None]:
pipe = pipeline("text-generation", model=model_id, device=0)
input = "In the future, AI will"
output = pipe(input, max_new_tokens=50)
print(output[0]["generated_text"])

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In the future, AI will be able to help us make better decisions, solve complex problems, and even create new technologies that we can’t even imagine yet.
AI is already being used in a variety of industries, and its potential applications are endless.
AI is already


# Preprocessing Model

In [None]:
model.gradient_checkpointing_enable() # Gradient checkpointing for memory efficiency
model = prepare_model_for_kbit_training(model) # Freezes layers for quantization except for those specified for fine-tuning via peft lora config

In [None]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
print_trainable_parameters(model)

trainable params: 0 || all params: 5817999360 || trainable%: 0.0


In [None]:
# Target modules
for name, module in model.named_modules():
    print("name: ", name)
    print("module: ", module)

name:  
module:  FalconForCausalLM(
  (transformer): FalconModel(
    (word_embeddings): Embedding(65024, 4096)
    (h): ModuleList(
      (0-59): 60 x FalconDecoderLayer(
        (self_attention): FalconAttention(
          (query_key_value): Linear4bit(in_features=4096, out_features=6144, bias=False)
          (dense): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
          (rotary_emb): FalconRotaryEmbedding()
        )
        (mlp): FalconMLP(
          (dense_h_to_4h): Linear4bit(in_features=4096, out_features=16384, bias=False)
          (act): GELUActivation()
          (dense_4h_to_h): Linear4bit(in_features=16384, out_features=4096, bias=False)
        )
        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_f): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
    (rotary_emb): FalconRotaryEmbedding()
  )
  (lm_head): Linear(in_features=4096, out_f

In [None]:
for name, _ in model.named_modules():
    print("name: ", name)

name:  
name:  transformer
name:  transformer.word_embeddings
name:  transformer.h
name:  transformer.h.0
name:  transformer.h.0.self_attention
name:  transformer.h.0.self_attention.query_key_value
name:  transformer.h.0.self_attention.dense
name:  transformer.h.0.self_attention.attention_dropout
name:  transformer.h.0.self_attention.rotary_emb
name:  transformer.h.0.mlp
name:  transformer.h.0.mlp.dense_h_to_4h
name:  transformer.h.0.mlp.act
name:  transformer.h.0.mlp.dense_4h_to_h
name:  transformer.h.0.input_layernorm
name:  transformer.h.1
name:  transformer.h.1.self_attention
name:  transformer.h.1.self_attention.query_key_value
name:  transformer.h.1.self_attention.dense
name:  transformer.h.1.self_attention.attention_dropout
name:  transformer.h.1.self_attention.rotary_emb
name:  transformer.h.1.mlp
name:  transformer.h.1.mlp.dense_h_to_4h
name:  transformer.h.1.mlp.act
name:  transformer.h.1.mlp.dense_4h_to_h
name:  transformer.h.1.input_layernorm
name:  transformer.h.2
name:  t

# LoRA Configuration

In [None]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

trainable params: 4915200 || all params: 5822914560 || trainable%: 0.08441133644248422


# Load Dataset Q/A Tasks

In [None]:
data = load_dataset("HuggingFaceH4/testing_self_instruct_small", split="train")
data

README.md:   0%|          | 0.00/461 [00:00<?, ?B/s]

(…)-00000-of-00001-a61a142eaa61eaa0.parquet:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

(…)-00000-of-00001-a74d6359be9ca599.parquet:   0%|          | 0.00/20.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['prompt', 'completion'],
    num_rows: 100
})

In [None]:
data[0]

{'prompt': 'Find the word that is closest in meaning to "fear".\n\nOutput:',
 'completion': 'fear - fear, dread, terror, alarm, fright, horror, panic, dismay, consternation, trepidation, apprehension, disquiet, uneasiness, concern, worry, anxiety, agitation, perturbation, solicitude, misgiving, qualm, quandary, dilemma, doubt, suspicion, mistrust, wariness, watchfulness, care, heed, circumspection, vigilance, alertness, caution, concern, solicitude, uneasiness, disquiet, apprehension, misgiving, distrust, suspicion, mistrust, wariness, watchfulness, care, heed, circumspection, vigilance, alertness, caution.'}

In [None]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['prompt'])):
        text = f"### Question: {example['prompt'][i]}\n ### Answer: {example['completion'][i]}"
        output_texts.append(text)
    return output_texts

response_template = " ### Answer:"
collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)

In [None]:
len(data['prompt'])

100

In [None]:
formatting_prompts_func(data[0]) # no loop return string

'### Question: Find the word that is closest in meaning to "fear".\n\nOutput:\n### Answer: fear - fear, dread, terror, alarm, fright, horror, panic, dismay, consternation, trepidation, apprehension, disquiet, uneasiness, concern, worry, anxiety, agitation, perturbation, solicitude, misgiving, qualm, quandary, dilemma, doubt, suspicion, mistrust, wariness, watchfulness, care, heed, circumspection, vigilance, alertness, caution, concern, solicitude, uneasiness, disquiet, apprehension, misgiving, distrust, suspicion, mistrust, wariness, watchfulness, care, heed, circumspection, vigilance, alertness, caution.'

In [None]:
formatting_prompts_func(data) # loop return list

['### Question: Find the word that is closest in meaning to "fear".\n\nOutput:\n ### Answer: fear - fear, dread, terror, alarm, fright, horror, panic, dismay, consternation, trepidation, apprehension, disquiet, uneasiness, concern, worry, anxiety, agitation, perturbation, solicitude, misgiving, qualm, quandary, dilemma, doubt, suspicion, mistrust, wariness, watchfulness, care, heed, circumspection, vigilance, alertness, caution, concern, solicitude, uneasiness, disquiet, apprehension, misgiving, distrust, suspicion, mistrust, wariness, watchfulness, care, heed, circumspection, vigilance, alertness, caution.',
 '### Question: Tell me what you would do if your boss asked you to perform a task that is against the law. Output:\n ### Answer: I would tell my boss that it is against the law and refuse to do it.',
 '### Question: In a news article, identify the location (i.e., "state", "country") based on the description.\nInput: The United States is considering imposing new sanctions on Russi

In [None]:
collator

DataCollatorForCompletionOnlyLM(tokenizer=GemmaTokenizerFast(name_or_path='google/gemma-7b', vocab_size=256000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<bos>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'pad_token': '<eos>', 'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<eos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<bos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	5: AddedToken("<2mass

## SQL Dataset

In [None]:
data = load_dataset("kaxap/pg-wikiSQL-sql-instructions-80k", split="train")
data = data.filter(lambda x: x['sql_query'] and x['create_table_statement']) # Filter out rows with missing values
truncated_data = data.select(range(1000)) # Truncate dataset to n rows
truncated_data

README.md:   0%|          | 0.00/746 [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/19.4M [00:00<?, ?B/s]

dev.csv:   0%|          | 0.00/2.88M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/5.47M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/56312 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8411 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/15871 [00:00<?, ? examples/s]

Filter:   0%|          | 0/56312 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'create_table_statement', 'sql_query', 'wiki_sql_table_id'],
    num_rows: 1000
})

In [None]:
def instruction_prompt_format(example):
  return [f"""Use the Instruction and Input to write Output as SQL query.

    ### Instruction:
    {example['question']}

    ### Input:
    {example['create_table_statement']}

    ### Output:
    {example['sql_query']}
    """]

collator = DataCollatorForCompletionOnlyLM(response_template=" ### Output:", tokenizer=tokenizer)

# Training

In [None]:
path = "/content/gdrive/MyDrive/hf_models/falcon11b-sql_instruct"

In [None]:
import wandb

wandb.login() # b14d0df440f0a1de4fa058ac7e53005e5ceffdfc

%env WANDB_PROJECT=sql_llm-instruct
%env WANDB_LOG_MODEL=end
%env WANDB_WATCH=false
# 'checkpoint' logging for each checkpoint or 'end' for end of training

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


env: WANDB_PROJECT=sql_llm-instruct
env: WANDB_LOG_MODEL=end
env: WANDB_WATCH=false


In [None]:
# training_args = TrainingArguments(
#         per_device_train_batch_size=4,
#         gradient_accumulation_steps=4,
#         warmup_steps=100,
#         max_steps=200,
#         learning_rate=2e-4,
#         fp16=True,
#         logging_steps=1,
#         output_dir='falcon-mamba_instruct'
#         )

# trainer = Trainer(
#     model=model,
#     train_dataset=tokenized_datasets['train'],
#     args=training_args,
#     data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False) # Ensures batched data is padded correctly using information from the tokenizer
#     )

# model.config.use_cache = False

# trainer.train()
# trainer.save_model()

In [None]:
!nvidia-smi


Wed Nov 27 03:42:56 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0              47W / 400W |   8965MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import torch
torch.cuda.empty_cache()
print(f"Memory allocated after: {torch.cuda.memory_allocated() / 1e9} GB")
print(f"Memory reserved after: {torch.cuda.memory_reserved() / 1e9} GB")

Memory allocated after: 7.661559296 GB
Memory reserved after: 7.85383424 GB


In [None]:
# Supervised Fine-Tuning
sft_config = SFTConfig(
    output_dir=path, # Directory to save the fine-tuned model (mount drive)
    overwrite_output_dir=True, # Overwrites the output directory if it exists
    num_train_epochs=5, # number of examples / batch size per step = total steps per epoch
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    weight_decay=0.001, # Regularization
    max_grad_norm=0.3, # Gradient clipping
    warmup_ratio=0.03,  # n*100% steps before scheduler
    optim="paged_adamw_32bit",
    per_device_train_batch_size=8, # Batch size per device (GPU); examples processed per step
    gradient_accumulation_steps=4, # Number of steps before applying gradients (total accumulated gradients applied on nth step; delaying parameter update to handle large batch sizes e.g. 4 steps accumulated * 8 batches = 32 examples then update params)
    gradient_checkpointing=True, # Save memory by recomputing activations instead of storing in memory
    save_steps=100, # Saves the model state every n steps
    logging_dir=f"{path}/logs",
    logging_steps=25, # Log training metrics every n steps
    max_seq_length=2048, # Set to max context length of llm
    # packing=True, # Combines sequences to fit context length (not compatilble with DataCollatorForCompletionOnlyLM)
    report_to="wandb", # Logging to Weights & Biases
)

sfttrainer = SFTTrainer(
    model,
    train_dataset=truncated_data,
    args=sft_config,
    peft_config=lora_config,
    formatting_func=instruction_prompt_format, # The `formatting_func` should return a list of processed strings since it can lead to silent bugs.
    data_collator=collator,
    processing_class=tokenizer,
    )


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
sfttrainer.train()
sfttrainer.save_model()
wandb.finish()



[34m[1mwandb[0m: Currently logged in as: [33madityas-ai2410[0m ([33madityas-ai2410-upwork[0m). Use [1m`wandb login --relogin`[0m to force relogin



    ### Instruction:
    ['Tell me what the notes are for South Australia ', 'What is the current series where the new series began in June 2011?', 'What is the format for South Australia?', 'Name the background colour for the Australian Capital Territory', 'how many times is the fuel propulsion is cng?', 'what is the fuel propulsion where the fleet series (quantity) is 310-329 (20)?', 'who is the manufacturer for the order year 1998?', 'how many times is the model ge40lfr?', 'how many times is the fleet series (quantity) is 468-473 (6)?', 'what is the powertrain (engine/transmission) when the order year is 2000?', 'What if the description of a ch-47d chinook?', 'What is the max gross weight of the Robinson R-22?', 'What school did player number 6 come from?', 'What school did the player that has been in Toronto from 2012-present come from?', 'What school did the player that has been in Toronto from 2010-2012 go to?', 'What position did the player from Baylor play?', 'Who played in th

Step,Training Loss


VBox(children=(Label(value='1.071 MB of 18.806 MB uploaded\r'), FloatProgress(value=0.05693396499818889, max=1…

0,1
train/epoch,▁
train/global_step,▁

0,1
total_flos,666095572746240.0
train/epoch,5.0
train/global_step,5.0
train_loss,0.0
train_runtime,48.9489
train_samples_per_second,0.102
train_steps_per_second,0.102


In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

sfttrainer.save_model(path)

Mounted at /content/gdrive


In [None]:
%load_ext tensorboard
%tensorboard --logdir /content/drive/MyDrive/falcon11b_instruction_tuning/logs

# Push Fine-Tuned Model to HuggingFace Hub

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
repo_id = "adityas2410/falcon11b-sql_instruct"
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

In [None]:
# repo_id = "adityas2410/falcon11b-sql_instruct"
# # Merge LoRA with the base model and save the merged model
# merged = trained_model.merge_and_unload()
# merged.save_pretrained("merged", safe_serialization=True)
# tokenizer.save_pretrained("merged")

# merged.push_to_hub(repo_id)
# tokenizer.push_to_hub(repo_id)

# Load Fine-Tuned Model

*Note: Fine-tuned models do not preserve quantization once saved and uses the original precision.*


In [5]:
# Load model from drive
from google.colab import drive
drive.mount('/content/gdrive')
model_id = '/content/gdrive/MyDrive/hf_models/falcon11b-sql_instruct'

Mounted at /content/gdrive


In [None]:
# model_repos = "tiiuae/falcon-11B"
# base_model = AutoModelForCausalLM.from_pretrained(model_repos)
# tokenizer = AutoTokenizer.from_pretrained(model_repos)
# peft_model = PeftModel.from_pretrained(base_model, model_id)


In [9]:
# model_id = "adityas2410/falcon11b-sql_instruct" # Repos ID pushed to HF hub
model_id = '/content/gdrive/MyDrive/hf_models/falcon11b-sql_instruct'

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, # Reduce model weights to 4-bit precision
    bnb_4bit_use_double_quant=True, # Apply additional quantization layer
    bnb_4bit_quant_type="nf4", # Normal float 4-bit format to optimize weights storage
    bnb_4bit_compute_dtype=torch.bfloat16 # 4-bit weights temporarily upscaled to brain float 16-bit for matrix computations
)

# Load local peft adapters with remote base model
instruction_tuned_model = AutoPeftModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

configuration_falcon.py:   0%|          | 0.00/9.25k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-11B:
- configuration_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_falcon.py:   0%|          | 0.00/76.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-11B:
- modeling_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/31.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/2.43G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

# Inference

In [10]:
question = "Tell me what the notes are for South Australia"

create_table_statement = """
CREATE TABLE "table1_1000181_1" ( "state_territory" text, "text_background_colour" text, "format" text, "current_slogan" text, "current_series" text, "notes" text );
"""

prompt = f"""Use the Instruction and Input to write Output as SQL query.

### Instruction:
{question}

### Input:
{create_table_statement}

### Output:

"""


In [13]:
import torch
import time

start_memory = torch.cuda.memory_allocated()
start_time = time.time()

instruction_tuned_model.eval()  # Set model in inference mode

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch = tokenizer(prompt, return_tensors='pt').to(device)

# Use mixed precision and disable gradients
with torch.no_grad(), torch.amp.autocast('cuda'):
    output_tokens = instruction_tuned_model.generate(**batch, max_new_tokens=100)

result = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
print(result, '\n')

end_time = time.time()
print(f"Inference time: {end_time - start_time} seconds")

end_memory = torch.cuda.memory_allocated()
peak_memory = torch.cuda.max_memory_allocated()
print(f"Memory allocated before inference: {start_memory} bytes")
print(f"Memory allocated after inference: {end_memory} bytes")
print(f"Memory used during inference: {end_memory - start_memory} bytes")
print(f"Peak allocated memory: {peak_memory}")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Use the Instruction and Input to write Output as SQL query.

### Instruction:
Tell me what the notes are for South Australia

### Input:

CREATE TABLE "table1_1000181_1" ( "state_territory" text, "text_background_colour" text, "format" text, "current_slogan" text, "current_series" text, "notes" text );


### Output:

 SELECT "notes" FROM "table1_1000181_1" WHERE "state_territory" = 'South Australia' 

Inference time: 4.747222423553467 seconds
Memory allocated before inference: 6799026688 bytes
Memory allocated after inference: 6799028224 bytes
Memory used during inference: 1536 bytes
Peak allocated memory: 7420221440


In [None]:
import wandb
wandb.login()

wandb.init(project='sql_llm-instruct', name='inference_run')
wandb.log({
    'inference_time_seconds': end_time - start_time,
    'memory_allocated_before_inference_bytes': start_memory,
    'memory_allocated_after_inference_bytes': end_memory,
    'memory_used_during_inference_bytes': end_memory - start_memory,
    'peak_memory_allocated_bytes': peak_memory
})
wandb.finish()

0,1
inference_time_seconds,▁
memory_allocated_after_inference_bytes,▁
memory_allocated_before_inference_bytes,▁
memory_used_during_inference_bytes,▁
peak_memory_allocated_bytes,▁

0,1
inference_time_seconds,5.532
memory_allocated_after_inference_bytes,14976009216.0
memory_allocated_before_inference_bytes,14976005632.0
memory_used_during_inference_bytes,3584.0
peak_memory_allocated_bytes,15598865408.0


In [None]:
"""
qlora with wandb example - https://abvijaykumar.medium.com/fine-tuning-llm-parameter-efficient-fine-tuning-peft-lora-qlora-part-2-d8e23877ac6f
transformers wandb docs - https://docs.wandb.ai/guides/integrations/huggingface
qlora kaggle example - https://www.kaggle.com/code/harpdeci/how-to-instuct-tune-a-huggingface-model
"""