In [None]:
import warnings

# Blanket-ignore every warning category from here on so that library warnings
# do not clutter the outputs of the cells below.
warnings.simplefilter("ignore")

In [None]:
# Print the GPUs visible to this runtime (model, driver/CUDA version, memory use)
# before anything is installed or loaded.
!nvidia-smi

Thu Apr 11 05:45:04 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla T4                       Off | 00000000:00:05.0 Off |  

In [None]:
!pip install -q -U transformers torch==2.2.1 datasets huggingface_hub wandb
!pip install -q -U accelerate bitsandbytes peft trl python-dotenv
!echo "Installations completed!"

Installations completed!


In [None]:
import transformers
import datasets
import trl
import accelerate
import peft
import bitsandbytes
import torch
import huggingface_hub
import wandb

# Record the installed version of every core dependency, one line per library,
# so the environment that produced the outputs below is visible in the notebook.
for library in (
    transformers,
    datasets,
    trl,
    accelerate,
    peft,
    bitsandbytes,
    torch,
    huggingface_hub,
    wandb,
):
    print(f"{library.__name__} version:", library.__version__)

transformers version: 4.39.3
datasets version: 2.18.0
trl version: 0.8.1
accelerate version: 0.29.2
peft version: 0.10.0
bitsandbytes version: 0.43.0
torch version: 2.2.1+cu121
huggingface_hub version: 0.22.2
wandb version: 0.16.6


In [None]:
from datasets import load_dataset, Dataset
from datasets.exceptions import DatasetNotFoundError

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from torch import bfloat16

from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
from trl import SFTTrainer

from huggingface_hub import notebook_login, logging
import wandb

2024-04-11 05:48:07.234531: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-11 05:48:07.234656: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-11 05:48:07.402845: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Authenticate against the Hugging Face Hub and Weights & Biases.
#
# On Google Colab the credentials are read from environment variables (populated
# from a .env file by python-dotenv); everywhere else the interactive login
# prompts are used.
#
# SECURITY: credentials must never be pasted into the notebook, not even inside
# comments, and must never be printed. Any token that was ever committed with
# this file has to be revoked and regenerated.
import os

try:
    from google.colab import drive
except ModuleNotFoundError:
    # Not running on Colab: fall through to the interactive branch below.
    drive = None

if drive is not None:
    # Imported here because only the Colab branch needs them; python-dotenv is
    # installed by the setup cell at the top of the notebook.
    from dotenv import load_dotenv
    from huggingface_hub import login

    print("Using Google Colab")
    drive.mount('/content/drive')
    os.chdir('/content/drive/MyDrive/SQL-Query-Generator/')

    load_dotenv()

    huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
    wandb_api_key = os.environ.get("WANDB_API_KEY")

    # Actually use the token: without a Hub login the push_to_hub calls at the
    # end of the notebook have no credentials to work with.
    if huggingface_token:
        login(token=huggingface_token)
    else:
        notebook_login()

    # With key=None wandb falls back to its own login flow.
    wandb.login(key=wandb_api_key)
else:
    notebook_login()
    logging.set_verbosity(logging.CRITICAL)
    wandb.login()
    print("Using other environment")

In [None]:
def _build_prompt(row):
    """Format one raw row (``instruction`` / ``response``) as a single training prompt.

    Returns a dict with the single key ``prompt`` so it can be used with
    ``Dataset.map``.
    """
    input_text = row['instruction']
    output_text = row['response']
    # The second line keeps its six leading spaces on purpose: rebuilt prompts
    # must stay byte-identical to the ones already stored in the published
    # pre-formatted dataset.
    prompt = (
        " <s> [INST] You are a SQL query generator (text-to-sql). "
        "Your task is to generate a SQL query from the given question.\n"
        f"      Question : {input_text} [/INST] SQL Query : {output_text} </s>"
    )
    return {'prompt': prompt}


def load_data(use_my_dataset=True, train_fraction=0.8, push_to_hub=True):
    """Load the text-to-SQL prompt dataset and split it into train/validation sets.

    When ``use_my_dataset`` is true, the pre-formatted dataset
    ``fawern/Text-to-sql-query-generation`` is tried first. If it is not
    requested, or cannot be found, the raw ``Clinton/Text-to-sql-v1`` dataset is
    downloaded instead and every row is rewritten into a single ``prompt``
    column.

    Args:
        use_my_dataset: Try the pre-formatted dataset before rebuilding it.
        train_fraction: Share of rows that goes to the training split. Rows are
            taken in stored order (no shuffling); the remainder becomes the
            validation split.
        push_to_hub: When the dataset had to be rebuilt from the raw source,
            also upload it to ``fawern/Text-to-sql-query-generation``.

    Returns:
        A ``(train_dataset, val_dataset)`` tuple.

    Raises:
        ValueError: If ``train_fraction`` is not strictly between 0 and 1.
    """
    if not 0.0 < train_fraction < 1.0:
        raise ValueError(f"train_fraction must be between 0 and 1, got {train_fraction!r}")

    dataset = None
    if use_my_dataset:
        try:
            print("Using fawern/Text-to-sql-query-generation")
            dataset = load_dataset("fawern/Text-to-sql-query-generation", split='train')
            print(dataset[0])
        except DatasetNotFoundError:
            # Fall back to rebuilding the prompts from the raw dataset below.
            dataset = None

    if dataset is None:
        print("Clinton/Text-to-sql-v1")
        raw_dataset = load_dataset("Clinton/Text-to-sql-v1", split='train')
        print(raw_dataset[0])

        # Drop every original column so only the generated 'prompt' remains.
        dataset = raw_dataset.map(_build_prompt, remove_columns=raw_dataset.column_names)
        if push_to_hub:
            dataset.push_to_hub("fawern/Text-to-sql-query-generation")

    split_index = int(len(dataset) * train_fraction)

    # select() keeps the rows in the underlying Arrow table instead of
    # materialising the whole dataset as Python objects just to split it.
    train_dataset = dataset.select(range(split_index))
    val_dataset = dataset.select(range(split_index, len(dataset)))

    return train_dataset, val_dataset


train_dataset, val_dataset = load_data()

Using fawern/Text-to-sql-query-generation


Downloading readme:   0%|          | 0.00/283 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 31.8M/31.8M [00:00<00:00, 40.3MB/s]


Generating train split:   0%|          | 0/262208 [00:00<?, ? examples/s]

{'prompt': ' <s> [INST] You are a SQL query generator (text-to-sql). Your task is to generate a SQL query from the given question.\n      Question : Name the home team for carlton away team [/INST] SQL Query : SELECT home_team FROM table_name_77 WHERE away_team = "carlton" </s>'}


In [11]:
# Base checkpoint that is loaded in 4-bit and prepared for adapter training.
base_model_name = 'defog/sqlcoder-7b-2'

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.add_eos_token = True
tokenizer.padding_side = 'right'
# NOTE(review): EOS doubles as the padding token here. If the data collator
# masks padding positions in the labels, genuine EOS tokens are presumably
# masked as well -- confirm the fine-tuned model still learns to stop.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantisation without double quantisation; matmuls run in bfloat16.
# NOTE(review): the nvidia-smi output at the top of the notebook shows Tesla T4
# GPUs -- confirm bfloat16 is supported and performant on that hardware.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    torch_dtype=bfloat16,
    device_map='auto',
)

# prepare_model_for_kbit_training turns gradient checkpointing on itself
# (use_gradient_checkpointing defaults to True; spelled out here for clarity),
# so a separate model.gradient_checkpointing_enable() call beforehand is
# redundant and has been dropped.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [12]:
 # Names of the sub-modules that receive LoRA adapters.
 lora_target_modules = [
     "q_proj",
     "k_proj",
     "v_proj",
     "o_proj",
     "gate_proj",
     "up_proj",
     "down_proj",
     "lm_head",
 ]

 # Rank-8 adapters with alpha 16, 5% dropout and no bias terms, for causal LM.
 lora_config = LoraConfig(
     task_type='CAUSAL_LM',
     r=8,
     lora_alpha=16,
     lora_dropout=0.05,
     bias='none',
     target_modules=lora_target_modules,
 )

 # Wrap the prepared base model so that the adapters are attached to it.
 model = get_peft_model(model, lora_config)

In [13]:
# Disable the generation KV cache before the trainer is built: it is not needed
# for training, and gradient checkpointing is enabled on this model.
model.config.use_cache = False

training_args = TrainingArguments(
    output_dir='./sql-coder-7B-2-results',
    # The run length is set by max_steps alone. A positive max_steps overrides
    # num_train_epochs, so the former `num_train_epochs=3` had no effect and was
    # removed to avoid suggesting that three epochs are trained.
    max_steps=50,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=41,
    optim='paged_adamw_32bit',
    weight_decay=0.001,
    # Checkpoint and evaluate on the same 25-step cadence.
    save_strategy='steps',
    save_steps=25,
    evaluation_strategy='steps',
    eval_steps=25,
    do_eval=True,
    report_to='wandb',
)

# NOTE(review): the captured output shows ~52k validation examples being mapped;
# evaluating all of them at steps 25 and 50 may dominate the runtime -- consider
# passing a subset as eval_dataset.
# NOTE(review): `model` has already been wrapped by get_peft_model in the LoRA
# cell; confirm that passing peft_config here as well is intended.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=lora_config,
    dataset_text_field='prompt',
)

Map:   0%|          | 0/209766 [00:00<?, ? examples/s]

Map:   0%|          | 0/52442 [00:00<?, ? examples/s]

In [None]:
# Start the fine-tuning run with the trainer built in the previous cell; the
# value returned by train() is shown as this cell's output.
trainer.train()

Step,Training Loss,Validation Loss


In [None]:
# Hub repository that receives the fine-tuned artifacts.
trained_model_name = "fawern/sqlcoder-7b2-SQL-query-generator"
# Upload the trainer's model first, then the tokenizer, to that same repository.
trainer.model.push_to_hub(trained_model_name)
tokenizer.push_to_hub(trained_model_name)

In [None]:
# Marker cell -- presumably left so that a finished "Run All" is easy to spot,
# since the string is echoed as this cell's output.
'completed'