In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 langchain

In [1]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, BitsAndBytesConfig, Trainer, pipeline
from peft import LoraConfig
from datasets import Dataset
from langchain.prompts.prompt import PromptTemplate

from trl import SFTTrainer
from peft import PeftModel

import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
model_name = "NousResearch/Llama-2-7b-chat-hf"
new_model = "./finetune_models/llama2"

# 1. Load based model

In [None]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [None]:
# based_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map={"":0})

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map={"":0}
)

# 2. Prepare Dataset

Dataset used to fine-tuning Llama must following structure :
```
<s>[INST] <<SYS>>
{{ system_prompt }}
<</SYS>>

{{ user_message }} [/INST]
```

**For example :**

```CMD
<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>

There's a llama in my garden 😱 What should I do? [/INST]

```

In [2]:
df_data = pd.read_csv("./data/finetune_data_llama2b.csv")

In [3]:
df_data.head()

Unnamed: 0,instruction,question,sql_tbl_1,sql_tbl_2,answer
0,\nYou are an agent designed to interact with a...,Calculate the distance between the 'user' and ...,\n CREATE TABLE user (\n user_id INT...,\n CREATE TABLE user_reference (\n u...,"SELECT target_id, reference_id, distancetype, ..."
1,\nYou are an agent designed to interact with a...,Determine the distance between the 'item' and ...,\n CREATE TABLE item (\n item_id INT...,\n CREATE TABLE item_reference (\n i...,"SELECT target_id, reference_id, distancetype, ..."
2,\nYou are an agent designed to interact with a...,Find the distance between the 'inventory' and ...,\n CREATE TABLE inventory (\n produc...,\n CREATE TABLE inventory_reference (\n ...,"SELECT target_id, reference_id, distancetype, ..."
3,\nYou are an agent designed to interact with a...,Compute the distance between the 'user' and 'u...,\n CREATE TABLE user (\n user_id INT...,\n CREATE TABLE user_reference (\n u...,"SELECT target_id, reference_id, distancetype, ..."
4,\nYou are an agent designed to interact with a...,Calculate the distance between the 'transactio...,\n CREATE TABLE transaction (\n tran...,\n CREATE TABLE transaction_reference (\n ...,"SELECT target_id, reference_id, distancetype, ..."


**Convert prompt template**

In [4]:
prompt_template = PromptTemplate(
    input_variables=["instruction", "sql_table_1", "sql_table_2", "question", "answer"], template="<s>[INST] <<SYS>>{instruction}. Here is structure of table 1 {sql_table_1}, table 2 {sql_table_2}<</SYS>>{question}[/INST]{answer}</s>"
)

prompt_data = []
for i, row in df_data.iterrows():
  prompt_data.append(prompt_template.format(instruction=row.instruction, sql_table_1=row.sql_tbl_1, sql_table_2=row.sql_tbl_2, question=row.question, answer=row.answer))

dataset = Dataset.from_dict({"inputs": prompt_data})

In [5]:
print(dataset[2].get('inputs'))

<s>[INST] <<SYS>>
You are an agent designed to interact with a SQL database.
Given an input question, 
create a syntactically correct teradatasql query to run, then look at the results of the query and return the answer.
. Here is structure of table 1 
    CREATE TABLE inventory (
        product_id INT PRIMARY KEY,
        name VARCHAR(255),
        description TEXT,
        quantity INT,
        price DECIMAL(10, 2)
    );
    , table 2 
    CREATE TABLE inventory_reference (
        product_id INT PRIMARY KEY,
        name VARCHAR(255),
        description TEXT,
        quantity INT,
        price DECIMAL(10, 2)
    );
    <</SYS>>Find the distance between the 'inventory' and 'inventory_reference' tables using TD_VectorDistance.[/INST]SELECT target_id, reference_id, distancetype, cast(distance as decimal(36,8)) as distance FROM TD_VECTORDISTANCE (
    ON inventory AS TargetTable
    ON inventory_reference AS ReferenceTable DIMENSION
    USING
        TargetIDColumn('product_id')
   

In [7]:
dataset.to_json("./data/finetune_data.json")

Creating json from Arrow format: 100%|████████████████████████████████████████████████████| 1/1 [00:00<00:00, 89.52ba/s]


107132

# 3. FineTuning

In [17]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model.config.use_cache = False
model.config.pretraining_tp = 1

In [18]:
peft_params = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

In [19]:
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    logging_steps=1,
    learning_rate=2e-4,
    fp16=True
)

In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_params,
    dataset_text_field="inputs",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)

In [21]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.5748
2,2.5211
3,2.2155
4,2.0287
5,1.845
6,1.6819
7,1.5846
8,1.4631
9,1.3215
10,1.2217


TrainOutput(global_step=95, training_loss=0.30292751963593456, metrics={'train_runtime': 559.4676, 'train_samples_per_second': 0.679, 'train_steps_per_second': 0.17, 'total_flos': 3329827315974144.0, 'train_loss': 0.30292751963593456, 'epoch': 5.0})

# 4. Save model

In [22]:
new_model = "./drive/MyDrive/finetune_model/llama2_taradata"

trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

('./drive/MyDrive/finetune_model/llama2_taradata/tokenizer_config.json',
 './drive/MyDrive/finetune_model/llama2_taradata/special_tokens_map.json',
 './drive/MyDrive/finetune_model/llama2_taradata/tokenizer.model',
 './drive/MyDrive/finetune_model/llama2_taradata/added_tokens.json',
 './drive/MyDrive/finetune_model/llama2_taradata/tokenizer.json')

# 5. Reload and predict

If you're using Google Colab T4, you need to restart the notebook to free up memory used during the fine-tuning step. After that, please rerun from this point onward.

In [4]:
model_name = "NousResearch/Llama-2-7b-chat-hf"
new_model = "./drive/MyDrive/finetune_model/llama2_taradata"

In [None]:
# 1. Load based model
base_model = AutoModelForCausalLM.from_pretrained(model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)

In [12]:
# 2. Load new fine-tuned model. Then merge this two model
finetuned_model = PeftModel.from_pretrained(base_model, new_model)
merged_model = finetuned_model.merge_and_unload()

In [None]:
# 3. Load tokenizer,
tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [19]:
# 4. Define pipeline
pipe = pipeline(task="text-generation", model=merged_model, tokenizer=tokenizer, max_length=2000)

In [20]:
prompt_template = PromptTemplate(
    input_variables=["instruction", "sql_table_1", "sql_table_2", "question"], template="<s>[INST] <<SYS>>{instruction}. Here is structure of table 1 {sql_table_1}, table 2 {sql_table_2}<</SYS>>{question}[/INST]")


instruction = """You are an agent designed to interact with a SQL database.\nGiven an input question,
create a syntactically correct teradatasql query to run, then look at the results of the query and return the answer."""

question = "Calculate the separation between the 'item' and 'item_reference' tables using TD_VectorDistance."

sql_table_1 = """
    CREATE TABLE item (
        item_id INT PRIMARY KEY,
        name VARCHAR(255),
        description TEXT,
        price DECIMAL(10, 2),
        stock_quantity INT,
        category VARCHAR(100),
        weight INT,
        color_id INT,
        height INT,
        width INT
    );
"""

sql_table_2 = """
    CREATE TABLE item_reference (
        item_id INT PRIMARY KEY,
        name VARCHAR(255),
        description TEXT,
        price DECIMAL(10, 2),
        stock_quantity INT,
        category VARCHAR(100),
        weight INT,
        color_id INT,
        height INT,
        width INT
    );
"""

input = prompt_template.format(instruction=instruction, sql_table_1=sql_table_1, sql_table_2=sql_table_2, question=question)

In [21]:
input

"<s>[INST] <<SYS>>You are an agent designed to interact with a SQL database.\nGiven an input question, \ncreate a syntactically correct teradatasql query to run, then look at the results of the query and return the answer.. Here is structure of table 1 \n    CREATE TABLE item (\n        item_id INT PRIMARY KEY,\n        name VARCHAR(255),\n        description TEXT,\n        price DECIMAL(10, 2),\n        stock_quantity INT,\n        category VARCHAR(100),\n        weight INT,\n        color_id INT,\n        height INT,\n        width INT\n    );\n, table 2 \n    CREATE TABLE item_reference (\n        item_id INT PRIMARY KEY,\n        name VARCHAR(255),\n        description TEXT,\n        price DECIMAL(10, 2),\n        stock_quantity INT,\n        category VARCHAR(100),\n        weight INT,\n        color_id INT,\n        height INT,\n        width INT\n    );\n<</SYS>>Calculate the separation between the 'item' and 'item_reference' tables using TD_VectorDistance.[/INST]"

In [24]:
# 5. Run prompt and pipeline
result = pipe(input)

In [29]:
print(result[0]['generated_text'].split("/INST")[1][1:])

SELECT target_id, reference_id, distancetype, cast(distance as decimal(36,8)) as distance FROM TD_VECTORDISTANCE (
    ON item AS TargetTable
    ON item_reference AS ReferenceTable DIMENSION
    USING
        TargetIDColumn('item_id')
        TargetFeatureColumns('price','stock_quantity','category','weight','color_id','height','width')
        RefIDColumn('item_id')
        RefFeatureColumns('price','stock_quantity','category','weight','color_id','height','width')
        DistanceMeasure('euclidean','cosine','manhattan')
        topk(2)
) AS dt order by 3,1,2,4;</s>

The TD_VectorDistance function is used in the query to calculate the distance between the 'item' and 'item_reference' tables. The TargetIDColumn, TargetFeatureColumns, RefIDColumn, RefFeatureColumns, DistanceMeasure, and topk parameters are used to define how the distance is calculated. The results are then organized in a table with the target ID, reference ID, distance type, and distance values. 

The TargetIDColumn para