<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/upload_model_hf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin


class MyModel(nn.Module, PyTorchModelHubMixin):
    def __init__(self, config: dict):
        super().__init__()
        self.param = nn.Parameter(torch.rand(config["num_channels"], config["hidden_size"]))
        self.linear = nn.Linear(config["hidden_size"], config["num_classes"])

    def forward(self, x):
        return self.linear(x + self.param)

# create model
config = {"num_channels": 3, "hidden_size": 32, "num_classes": 10}
model = MyModel(config=config)

# save locally
model.save_pretrained("my-awesome-model-poc", config=config)

# push to the hub
model.push_to_hub("my-awesome-model-poc", config=config)

# reload
model = MyModel.from_pretrained("frankmorales2020/my-awesome-model-poc")

pytorch_model.bin:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

In [2]:
!pip install colab-env --quiet

import colab_env
import os

access_token_write = os.getenv("HUGGINGFACE_ACCESS_TOKEN_WRITE")

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for colab-env (setup.py) ... [?25l[?25hdone
Mounted at /content/gdrive


In [3]:
#!pip install huggingface_hub --quiet
from huggingface_hub import HfApi

api = HfApi()
api.get_token_permission(token=access_token_write)
#api.set_access_token(access_token)


# frankmorales2020/Mistral-7B-text-to-sql Good

repo_id = "my-awesome-model-poc"
#api.create_repo(repo_id=repo_id, private=False)

api.delete_repo(repo_id=repo_id)

#api.upload_folder(
#    folder_path="./model",
#    repo_id=repo_id,
#    repo_type="model",
#)


In [None]:
# Install Pytorch & other libraries
!pip install torch tensorboard --quiet

# Install Hugging Face libraries
!pip install  --upgrade transformers datasets accelerate evaluate bitsandbytes --quiet

#FlashAttention only supports Ampere GPUs or newer. #NEED A100 IN GOOGLE COLAB
#!pip install -U transformers
!pip install -U flash-attn --no-build-isolation --quiet


! pip install peft --quiet
! pip install datasets trl ninja packaging --quiet

# Uncomment only if you're using A100 GPU
#!pip install flash-attn --no-build-isolation
!pip install diffusers safetensors  --quiet
!pip install colab-env --quiet


In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)

In [19]:
#frankmorales2020/Mistral-7B-text-to-sql
# Chose the base model you want
model_name = "frankmorales2020/Mistral-7B-text-to-sql"
# set device
device = 'cuda'
#v Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# We redefine the pad_token and pad_token_id with out of vocabulary token (unk_token)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
from datasets import load_dataset

# Convert dataset to OAI messages
system_message = """You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
SCHEMA:
{schema}"""

def create_conversation(sample):
  return {
    "messages": [
      {"role": "system", "content": system_message.format(schema=sample["context"])},
      {"role": "user", "content": sample["question"]},
      {"role": "assistant", "content": sample["answer"]}
    ]
  }

# Load dataset from the hub
dataset = load_dataset("b-mc2/sql-create-context", split="train")
dataset = dataset.shuffle().select(range(12500))

# Convert dataset to OAI messages
dataset = dataset.map(create_conversation, remove_columns=dataset.features,batched=False)
# split dataset into 10,000 training samples and 2,500 test samples
dataset = dataset.train_test_split(test_size=2500/12500)

print(dataset["train"][345]["messages"])

# save datasets to disk
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

Downloading readme:   0%|          | 0.00/4.43k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.8M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

[{'content': 'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE table_name_28 (to_par INTEGER, total VARCHAR)', 'role': 'system'}, {'content': 'What is the to par for the player with total of 155?', 'role': 'user'}, {'content': 'SELECT AVG(to_par) FROM table_name_28 WHERE total = 155', 'role': 'assistant'}]


Creating json from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

1181532

In [9]:
from datasets import load_dataset
from random import randint

# Load our test dataset and Tokenizer again
tokenizer = AutoTokenizer.from_pretrained("frankmorales2020/Mistral-7B-text-to-sql")
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
rand_idx = randint(0, len(eval_dataset))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
# generate the same prompt as for the first local test
prompt = tokenizer.apply_chat_template(eval_dataset[rand_idx]["messages"][:2], tokenize=False, add_generation_prompt=True)
request= {"inputs":prompt,"parameters":{"temperature":0.2, "top_p": 0.95, "max_new_tokens": 256}}


In [11]:
! pip install peft --quiet
! pip install datasets trl ninja packaging --quiet

Parameter-Efficient Fine-Tuning (PEFT)

In [12]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline

peft_model_id = "frankmorales2020/Mistral-7B-text-to-sql"
# peft_model_id = args.output_dir

# Load Model with PEFT adapter
model = AutoPeftModelForCausalLM.from_pretrained(
  peft_model_id,
  device_map="auto",
  torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
# load into pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

adapter_config.json:   0%|          | 0.00/710 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


adapter_model.safetensors:   0%|          | 0.00/1.87G [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'M

In [13]:
from datasets import load_dataset
from random import randint


# Load our test dataset
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
rand_idx = randint(0, len(eval_dataset))

# Test on sample
prompt = pipe.tokenizer.apply_chat_template(eval_dataset[rand_idx]["messages"][:2], tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)

print(f"Query:\n{eval_dataset[rand_idx]['messages'][1]['content']}")
print(f"Original Answer:\n{eval_dataset[rand_idx]['messages'][2]['content']}")
print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")



Query:
Who is the home team that played at MCG?
Original Answer:
SELECT home_team FROM table_name_19 WHERE venue = "mcg"
Generated Answer:
SELECT home_team FROM table_name_19 WHERE venue = "mcg"


https://huggingface.co/frankmorales2020

In [14]:
from tqdm import tqdm

# Load our test dataset
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
rand_idx = randint(0, len(eval_dataset))

def evaluate(sample):
    prompt = pipe.tokenizer.apply_chat_template(sample["messages"][:2], tokenize=False, add_generation_prompt=True)
    outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)
    predicted_answer = outputs[0]['generated_text'][len(prompt):].strip()
    print()
    print()
    print('Question: %s\n'%sample["messages"][1]["content"])
    #print(sample["messages"][2]["content"])
    #print('Predicted Answer: %s'%sample["messages"][2]["content"])
    print()
    if predicted_answer == sample["messages"][2]["content"]:
        #print('Success!')
        print('Success Answer: %s'%sample["messages"][2]["content"])
        return 1
    else:
        print('Failed Answer: %s'%sample["messages"][2]["content"])
        return 0

success_rate = []
number_of_eval_samples = 10
# iterate over eval dataset and predict
for s in tqdm(eval_dataset.shuffle().select(range(number_of_eval_samples))):
    success_rate.append(evaluate(s))

# compute accuracy
accuracy = sum(success_rate)/len(success_rate)

print(f"Accuracy: {accuracy*100:.2f}%")

 10%|█         | 1/10 [00:01<00:16,  1.83s/it]



Question: For Tie #2, who was the home team?


Failed Answer: SELECT home_team FROM table_name_33 WHERE tie_no = "2"


 20%|██        | 2/10 [00:03<00:16,  2.03s/it]



Question: How many times did episode 6 originally air?


Success Answer: SELECT COUNT(original_air_date) FROM table_21550897_1 WHERE _number = 6


 30%|███       | 3/10 [00:06<00:14,  2.08s/it]



Question: How many points did Bob Gerard Racing have in 1965?


Success Answer: SELECT points FROM table_name_37 WHERE entrant = "bob gerard racing" AND year = 1965


 40%|████      | 4/10 [00:07<00:11,  1.93s/it]



Question: What was the overall draft pick of the player who was a db and attended college in iowa?


Success Answer: SELECT overall FROM table_name_82 WHERE college = "iowa" AND position = "db"


 50%|█████     | 5/10 [00:09<00:09,  1.88s/it]



Question: What is Venue, when Date is "January 6, 1995"?


Success Answer: SELECT venue FROM table_name_35 WHERE date = "january 6, 1995"


 60%|██████    | 6/10 [00:11<00:07,  1.92s/it]



Question: What is the average decile of the school with a state authority and a roll number of 888?


Success Answer: SELECT AVG(decile) FROM table_name_62 WHERE authority = "state" AND roll = 888


 70%|███████   | 7/10 [00:14<00:06,  2.11s/it]



Question: When did the term begin that ended January 3, 1995?


Success Answer: SELECT term_began FROM table_224672_2 WHERE term_ended = "January 3, 1995"


 80%|████████  | 8/10 [00:16<00:04,  2.09s/it]



Question: Which exaltation has a domicile of Saturn and Capricorn as a sign?


Success Answer: SELECT exaltation FROM table_name_10 WHERE domicile = "saturn" AND sign = "capricorn"


 90%|█████████ | 9/10 [00:18<00:02,  2.14s/it]



Question: Name the fsb for atom z540


Success Answer: SELECT fsb FROM table_16729930_11 WHERE model_number = "Atom Z540"


100%|██████████| 10/10 [00:19<00:00,  1.98s/it]



Question: What is the promotion when the event was Acid-fest?


Success Answer: SELECT promotion FROM table_name_7 WHERE event = "acid-fest"
Accuracy: 90.00%





When evaluated on 10 samples from the evaluation dataset, our model achieved an impressive accuracy of 90.00%.


In [18]:
from tqdm import tqdm

# Load our test dataset
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
rand_idx = randint(0, len(eval_dataset))

def evaluate(sample):
    prompt = pipe.tokenizer.apply_chat_template(sample["messages"][:2], tokenize=False, add_generation_prompt=True)
    outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)
    predicted_answer = outputs[0]['generated_text'][len(prompt):].strip()
    if predicted_answer == sample["messages"][2]["content"]:
        return 1
    else:
        return 0

success_rate = []
number_of_eval_samples = 1000
# iterate over eval dataset and predict
for s in tqdm(eval_dataset.shuffle().select(range(number_of_eval_samples))):
    success_rate.append(evaluate(s))

# compute accuracy
accuracy = sum(success_rate)/len(success_rate)

print(f"Accuracy: {accuracy*100:.2f}%")

100%|██████████| 1000/1000 [33:33<00:00,  2.01s/it]

Accuracy: 76.30%






When evaluated on 1000 samples from the evaluation dataset, our model achieved an impressive accuracy of 76.30%. However, there's room for improvement. We could enhance the model's performance by exploring techniques like few-shot learning, RAG, and Self-healing to generate the SQL query.