In [1]:
%%bash
export CUDA_VISIBLE_DEVICES=0,1
#jupyter notebook

In [2]:
#from utils import clean_mem

In [3]:
#clean_mem()

In [4]:
import torch; torch.cuda.is_available()

True

In [5]:


# This command is run in a bash shell due to '%%bash' at the beginning.
# 'pip -q install' is used to install Python packages with pip, Python's package installer, in a quiet mode which reduces the output verbosity.
# 'huggingface_hub', 'transformers', 'peft', and 'bitsandbytes' are the packages being installed by the first command.
# These packages are necessary for the fine-tuning and inference of the Phi-3 model.
# 'trl' and 'xformers' are additional packages being installed by the second command.
# 'datasets' is a package for providing access to a vast range of datasets, installed by the third command.
# The last command ensures that 'torch' version is at least 1.10. If it's already installed but the version is lower, it will be upgraded.
# %%bash
# pip -q install huggingface_hub transformers peft bitsandbytes
# pip -q install trl xformers
# pip -q install datasets
# pip install torch>=1.10
     

# Import necessary modules from the transformers library
# AutoModelForCausalLM: This is a class for causal language models. It's used for tasks like text generation.
# AutoTokenizer: This class is used for tokenizing input data, a necessary step before feeding data into a model.
# TrainingArguments: This class is used for defining the parameters for model training, like learning rate, batch size, etc.
# BitsAndBytesConfig: This class is used for configuring the BitsAndBytes quantization process.
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig

# Import necessary modules from the huggingface_hub library
# ModelCard: This class is used for creating a model card, which provides information about a model.
# ModelCardData: This class is used for defining the data of a model card.
# HfApi: This class provides an interface to the Hugging Face API, allowing you to interact with the Hugging Face Model Hub.
from huggingface_hub import ModelCard, ModelCardData, HfApi

# Import the load_dataset function from the datasets library. This function is used for loading datasets.
from datasets import load_dataset

# Import the Template class from the jinja2 library. This class is used for creating dynamic HTML templates.
from jinja2 import Template

# Import the SFTTrainer class from the trl library. This class is used for training models.
from trl import SFTTrainer

# Import the yaml module. This module is used for working with YAML files.
import yaml

# Import the torch library. This library provides tools for training and running deep learning models.
import torch
     

# MODEL_ID is a string that specifies the identifier of the pre-trained model that will be fine-tuned. 
# In this case, the model is 'Phi-3-mini-4k-instruct' from Microsoft.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# NEW_MODEL_NAME is a string that specifies the name of the new model after fine-tuning.
# Here, the new model will be named 'opus-samantha-phi-3-mini-4k'.
NEW_MODEL_NAME = "sql-leetcoder-phi-3-mini-4k"
     

# DATASET_NAME is a string that specifies the name of the dataset to be used for fine-tuning.
# Replace "replace with your dataset" with the actual name of your dataset.
DATASET_NAME = "gretelai/synthetic_text_to_sql"

# SPLIT specifies the portion of the dataset to be used. In this case, the 'train' split of the dataset will be used.
SPLIT = "train"

# MAX_SEQ_LENGTH is an integer that specifies the maximum length of the sequences that the model will handle.
MAX_SEQ_LENGTH = 2048

# num_train_epochs is an integer that specifies the number of times the training process will go through the entire dataset.
num_train_epochs = 1

# license is a string that specifies the license under which the model is distributed. In this case, it's Apache License 2.0.
license = "apache-2.0"

# username is a string that specifies the GitHub username of the person who is fine-tuning the model.
username = "dapopov-st"

# learning_rate is a float that specifies the learning rate to be used during training.
learning_rate = 1.41e-5

# per_device_train_batch_size is an integer that specifies the number of samples to work through before updating the internal model parameters.
per_device_train_batch_size = 4

# gradient_accumulation_steps is an integer that specifies the number of steps to accumulate gradients before performing a backward/update pass.
gradient_accumulation_steps = 1
     

# This code checks if the current CUDA device supports bfloat16 (Brain Floating Point) computations.
# If bfloat16 is supported, it sets the compute_dtype to torch.bfloat16.
# If not, it sets the compute_dtype to torch.float16.
# bfloat16 and float16 are both half-precision floating-point formats, but bfloat16 provides better performance on some hardware.
if torch.cuda.is_bf16_supported():
  compute_dtype = torch.bfloat16
else:
  compute_dtype = torch.float16
     

# Load the pre-trained model specified by MODEL_ID using the AutoModelForCausalLM class.
# The 'trust_remote_code=True' argument allows the execution of code from the model card (if any).
#model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True).save_pretrained('./phi3')
model = AutoModelForCausalLM.from_pretrained('./phi3')
#model = AutoModelForCausalLM.from_pretrained('./phi3')
tokenizer = AutoTokenizer.from_pretrained('./phi3')

  from .autonotebook import tqdm as notebook_tqdm


[2024-09-23 13:41:21,859] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/mainuser/anaconda3/envs/sqlft/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 13.28it/s]


In [6]:
model= model.to('cuda')

In [7]:
model

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
      )
    )
    (norm): Phi3RMSNorm((3072,), eps=1e-05)
  )
 

In [8]:
prompt = 'Tell me about yourself'
inputs = tokenizer(prompt,
                   return_tensors="pt").to("cuda")

# Generate a response from the model
# do_sample=True means the model will generate text by sampling from the distribution of possible outputs
# max_new_tokens=120 limits the length of the generated text to 120 tokens
outputs = model.generate(**inputs,
                         do_sample=True, max_new_tokens=120)

You are not running the flash-attention implementation, expect numerical differences.


In [9]:
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     

# Print the generated response from the model
print(response)
     

Tell me about yourself, Jack."

Jack looked down at his hands, fidgeting with his wedding ring. He took a deep breath and began, "Well, I'm Jack Turner, first-generation American. My parents, Jameson and Ella Turner, were both immigrants from Ireland. My father worked as a carpenter in the bustling shipyards during the Great Depression, and my mother was a seamstress in a local factory. Despite the financial hardships they faced, they worked tirelessly to give their children a better life."



In [10]:
from datasets import load_from_disk
#dataset.save_to_disk('./data')
dataset = load_from_disk('./data')

In [11]:

# Load the tokenizer associated with the pre-trained model specified by MODEL_ID using the AutoTokenizer class.
# The 'trust_remote_code=True' argument allows the execution of code from the model card (if any).
#tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Load the dataset specified by DATASET_NAME using the load_dataset function.
# The 'split="train"' argument specifies that we want to load the training split of the dataset.
#dataset = load_dataset(DATASET_NAME, split="train")

# Get the ID of the end-of-sentence (EOS) token from the tokenizer and store it in EOS_TOKEN.
# This token is used to mark the end of a sentence in the input data.
EOS_TOKEN=tokenizer.eos_token_id
     

# This line simply prints the contents of the 'dataset' variable.
# 'dataset' is expected to be a Dataset object loaded from the 'datasets' library.
# Printing it will display information about the dataset such as the number of samples, the features, and a few example data points.
dataset
     

# Select a subset of the data for faster processing
#dataset = dataset.select(range(100))
     

# This line simply prints the contents of the 'dataset' variable.
# 'dataset' is expected to be a Dataset object loaded from the 'datasets' library.
# Printing it will display information about the dataset such as the number of samples, the features, and a few example data points.
#dataset


Dataset({
    features: ['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation'],
    num_rows: 100000
})

In [12]:
#tokenizer
#tokenizer.save_pretrained('./phi3')

In [13]:
def filter_func(example):
    # Replace 'column_name' and 'value' with the actual column name and value
    return (example['sql_complexity'] == 'window functions' 
            or example['sql_complexity'] == 'multiple_joins'
            or example['sql_complexity'] == 'subqueries'
            )
dataset = dataset.filter(filter_func)

In [14]:
dataset

Dataset({
    features: ['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation'],
    num_rows: 13264
})

In [15]:


def formatting_prompts_func(sample):
    """Given a sample dictionary with keys "title" and "abstract" format into a prompt.

    Args:
      sample: A sample dictionary from a Hugging Face dataset.

    Returns:
      sample: sample dictionary with "text" key for the formatted prompt.
    """
    #sample['text']=f"[INST] <> Write SQL code to answer the question based on the context. Please wrap your code answer using ```: <> {sample['question']} {sample['context']} [/INST] {sample['answer']}"
    sample['text']=f"""
<|system|>
You are a helpful assistant who is a SQL expert. Your task is to write a SQL query based on the given context and question.<|end|>
<|user|>
Context: {sample['sql_context']}
Question: {sample['sql_prompt']}<|end|>
<|assistant|>
SQL Query:
{sample['sql']}
<|end|>
"""
    return sample
    #return {"text":sample['text']}

In [16]:
     

# Define a function to format the prompts in the dataset.
# This function takes a batch of examples and returns a dictionary with the key 'text' and the value being a list of formatted texts.
# def formatting_prompts_func(examples):
#     # Extract the conversations from the examples.
#     convos = examples["conversations"]
#     # Initialize an empty list to store the formatted texts.
#     texts = []
#     # Define a dictionary to map the 'from' field in the conversation to a prefix.
#     mapper = {"system": "system\n", "human": "\nuser\n", "gpt": "\nassistant\n"}
#     # Define a dictionary to map the 'from' field in the conversation to a suffix.
#     end_mapper = {"system": "", "human": "", "gpt": ""}
#     # Iterate over each conversation.
#     for convo in convos:
#         # Format the conversation by joining each turn with its corresponding prefix and suffix.
#         # Append the EOS token to the end of the conversation.
#         text = "".join(f"{mapper[(turn := x['from'])]} {x['value']}\n{end_mapper[turn]}" for x in convo)
#         texts.append(f"{text}{EOS_TOKEN}")
#     # Return the formatted texts.
#     return {"text": texts}

# Apply the formatting function to the dataset using the map method.
# The 'batched=True' argument means that the function is applied to batches of examples.
dataset = dataset.map(formatting_prompts_func,
                      remove_columns=['id','domain','domain_description',
                                      'sql_complexity','sql_complexity_description',
                                      'sql_task_type','sql_task_type_description',
                                      'sql_explanation','sql_prompt', 'sql_context', 'sql',])#, batched=True)

# Print the 9th example from the 'text' field of the dataset to check the result.
print(dataset['text'][8])
     



<|system|>
You are a helpful assistant who is a SQL expert. Your task is to write a SQL query based on the given context and question.<|end|>
<|user|>
Context: CREATE TABLE cotton_source (brand VARCHAR(255), country VARCHAR(255), quantity INT); INSERT INTO cotton_source (brand, country, quantity) VALUES ('BrandA', 'USA', 1500), ('BrandB', 'USA', 2000), ('BrandC', 'China', 1000);
Question: What is the total quantity of cotton sourced from the United States by brands that have committed to fair labor practices?<|end|>
<|assistant|>
SQL Query:
SELECT SUM(quantity) FROM cotton_source WHERE country = 'USA' AND brand IN (SELECT brand FROM fair_labor WHERE commitment = 'yes');
<|end|>



In [17]:
dataset

Dataset({
    features: ['text'],
    num_rows: 13264
})

In [18]:
# from peft import LoraConfig
# peft_config = LoraConfig(
#       lora_alpha=16,
#       lora_dropout=0.1,
#       r=64,
#       bias="none",
#       task_type="CAUSAL_LM",
#       use_dora=True,
#       target_modules=[
#         "model.layers.*.self_attn.qkv_proj.*",
#         "model.layers.*.self_attn.o_proj.*",
#         "model.layers.*.mlp.gate_up_proj.*",
#         "model.layers.*.mlp.down_proj.*",
#     ]
# )

In [19]:
from peft import LoraConfig
peft_config = LoraConfig(
      lora_alpha=16,
      lora_dropout=0.1,
      r=64,
      bias="none",
      task_type="CAUSAL_LM",
      use_dora=True,
      target_modules=[
        "qkv_proj",
        "o_proj",
        "mlp.gate_up_proj",
        "mlp.down_proj",
    ]
)

In [20]:
for name in model.state_dict():
    print(name)

model.embed_tokens.weight
model.layers.0.self_attn.o_proj.weight
model.layers.0.self_attn.qkv_proj.weight
model.layers.0.mlp.gate_up_proj.weight
model.layers.0.mlp.down_proj.weight
model.layers.0.input_layernorm.weight
model.layers.0.post_attention_layernorm.weight
model.layers.1.self_attn.o_proj.weight
model.layers.1.self_attn.qkv_proj.weight
model.layers.1.mlp.gate_up_proj.weight
model.layers.1.mlp.down_proj.weight
model.layers.1.input_layernorm.weight
model.layers.1.post_attention_layernorm.weight
model.layers.2.self_attn.o_proj.weight
model.layers.2.self_attn.qkv_proj.weight
model.layers.2.mlp.gate_up_proj.weight
model.layers.2.mlp.down_proj.weight
model.layers.2.input_layernorm.weight
model.layers.2.post_attention_layernorm.weight
model.layers.3.self_attn.o_proj.weight
model.layers.3.self_attn.qkv_proj.weight
model.layers.3.mlp.gate_up_proj.weight
model.layers.3.mlp.down_proj.weight
model.layers.3.input_layernorm.weight
model.layers.3.post_attention_layernorm.weight
model.layers.4

In [22]:
model.config.use_cache = False
# Create a TrainingArguments object, which is used to define the parameters for model training.

args = TrainingArguments(
    # 'evaluation_strategy' is set to "steps", which means evaluation is done at each logging step.
    evaluation_strategy="steps",

    # 'per_device_train_batch_size' is set to 7, which means each training batch will contain 7 samples per device.
    per_device_train_batch_size=2, #7,

    # 'gradient_accumulation_steps' is set to 4, which means gradients are accumulated for 4 steps before performing a backward/update pass.
    gradient_accumulation_steps=16,

    # 'gradient_checkpointing' is set to True, which means model gradients are stored in memory during training to reduce memory usage.
    gradient_checkpointing=True,

    # 'learning_rate' is set to 1e-4, which is the learning rate for the optimizer.
    learning_rate=1e-4,

    # 'fp16' is set to True if bfloat16 is not supported, which means the model will use 16-bit floating point precision for training if possible.
    fp16 = not torch.cuda.is_bf16_supported(),

    # 'bf16' is set to True if bfloat16 is supported, which means the model will use bfloat16 precision for training if possible.
    bf16 = torch.cuda.is_bf16_supported(),

    # 'max_steps' is set to -1, which means there is no maximum number of training steps.
    max_steps=10, #-1

    # 'num_train_epochs' is set to 3, which means the training process will go through the entire dataset 3 times.
    num_train_epochs=3,

    # 'save_strategy' is set to "epoch", which means the model is saved at the end of each epoch.
    save_strategy="epoch",

    # 'logging_steps' is set to 10, which means logging is done every 10 steps.
    logging_steps=10,

    # 'output_dir' is set to NEW_MODEL_NAME, which is the directory where the model and its configuration will be saved.
    output_dir=NEW_MODEL_NAME,

    # 'optim' is set to "paged_adamw_32bit", which is the optimizer to be used for training.
    optim="paged_adamw_32bit",

    # 'lr_scheduler_type' is set to "linear", which means the learning rate scheduler type is linear.
    lr_scheduler_type="linear",
    
)
     

# Create an instance of the SFTTrainer class, which is used to fine-tune the model.

trainer = SFTTrainer(
    # 'model' is the pre-trained model that will be fine-tuned.
    model=model,

    # 'args' are the training arguments that specify the training parameters.
    args=args,

    # 'train_dataset' is the dataset that will be used for training.
    train_dataset=dataset,

    # 'dataset_text_field' is the key in the dataset that contains the text data.
    dataset_text_field="text",

    # 'max_seq_length' is the maximum length of the sequences that the model will handle.
    max_seq_length=128,

    # 'formatting_func' is the function that will be used to format the prompts in the dataset.
    formatting_func=formatting_prompts_func,
    peft_config=peft_config,
    
)
     

# 'device' is set to 'cuda', which means the CUDA device will be used for computations if available.
device = 'cuda'

# Import the 'gc' module, which provides an interface to the garbage collector.
import gc

# Import the 'os' module, which provides a way of using operating system dependent functionality.
import os

# Call the 'collect' method of the 'gc' module to start a garbage collection, which can help free up memory.
gc.collect()

# Call the 'empty_cache' method of 'torch.cuda' to release all unused cached memory from PyTorch so that it can be used by other GPU applications.
torch.cuda.empty_cache()
     

# Call the 'train' method of the 'trainer' object to start the training process.
# This method will fine-tune the model on the training dataset according to the parameters specified in the 'args' object.
trainer.train()
     
