In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [1]:
from datasets import load_dataset

# Convert dataset to OAI messages
system_message = """You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.
SCHEMA:
{schema}"""

def create_conversation(sample):
  return {
    "messages": [
      {"role": "system", "content": system_message.format(schema=sample["context"])},
      {"role": "user", "content": sample["question"]},
      {"role": "assistant", "content": sample["answer"]}
    ]
  }

# Load dataset from the hub
dataset = load_dataset("b-mc2/sql-create-context", split="train")
dataset = dataset.shuffle().select(range(12500))

# Convert dataset to OAI messages
dataset = dataset.map(create_conversation, remove_columns=dataset.features,batched=False)
# split dataset into 10,000 training samples and 2,500 test samples
dataset = dataset.train_test_split(test_size=2500/12500)

print(dataset["train"][345]["messages"])

# save datasets to disk
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 12500/12500 [00:00<00:00, 24457.95 examples/s]


[{'content': 'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE table_20704243_6 (directed_by VARCHAR, original_air_date VARCHAR)', 'role': 'system'}, {'content': 'Who directed the episode that aired on july15,2012?', 'role': 'user'}, {'content': 'SELECT directed_by FROM table_20704243_6 WHERE original_air_date = "July15,2012"', 'role': 'assistant'}]


Creating json from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 105.05ba/s]
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 131.13ba/s]


1192140

In [8]:
dataset["train"].to_json("train_dataset.json", orient="records")
dataset["test"].to_json("test_dataset.json", orient="records")

Creating json from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 117.84ba/s]
Creating json from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 138.31ba/s]


1192140

In [9]:
from datasets import load_dataset

# Load jsonl data from disk
dataset = load_dataset("json", data_files="train_dataset.json", split="train")

Generating train split: 10000 examples [00:00, 1053951.15 examples/s]


#### Create a Huggingface dataset from list of dicts

In [2]:
#from unsloth import FastLanguageModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
MAX_SEQ_LENGTH = 3047
LOAD_IN_4BIT = True # Use 4bit quantization to reduce memory usage. Can be False.

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = "mistralai/Mistral-7B-v0.1"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    #device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)
tokenizer = AutoTokenizer.from_pretrained(model_id, pad_token = "[PAD]")
tokenizer.padding_side = 'right' # to prevent warnings

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
gpu_id = 0
device = torch.device(f'cuda:{gpu_id}' if torch.cuda.is_available() else 'cpu')
model.to(device)

You shouldn't move a model when it is dispatched on multiple devices.


ValueError: `.to` is not supported for `4-bit` or `8-bit` bitsandbytes models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct `dtype`.

In [3]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Will add EOS_TOKEN, otherwise your generation will go on forever!
def format_prompts(samples):
    instructions = samples["instruction"]
    inputs       = samples["input"]
    outputs      = samples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN # Input is empty as no completion history
        texts.append(text)
    return { "text" : texts, }
pass

train_dataset_in_prompt_format = train_dataset.map(format_prompts, batched = True,)
test_dataset_in_prompt_format = test_dataset.map(format_prompts, batched = True,)
print(train_dataset_in_prompt_format[0]['text'])


NameError: name 'train_dataset' is not defined

#### Load the base pre-trained model and tokenizer
- Unsloth has it's own from_pretrained method.
- "load_in_4bit" indicates that the model will be quantized with bitsandbytes NormalFloat4 data type. This is the standard data type for QLoRA fine-tuning

In [11]:
from peft import LoraConfig

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
        lora_alpha=128,
        lora_dropout=0.05,
        r=256,
        bias="none",
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",]
)

#### Transform our prompt/response dataset into Alpaca prompt template format 

#### Split the dataset into training and testing sets
Note by default train_test_split shuffles the data when splitting for random split.

In [12]:
from transformers import TrainingArguments
MAX_STEPS=100
training_arguments = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,                     # number of training epochs
    per_device_train_batch_size=3,          # batch size per device during training
    gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=True,                       # push model to hub
)

In [1]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    #eval_dataset = test_dataset_in_prompt_format,
    #dataset_text_field = "text",
    max_seq_length = MAX_SEQ_LENGTH,
    tokenizer = tokenizer,
    args = training_arguments,
    peft_config=peft_config,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)


NameError: name 'model' is not defined

#### Show current memory stats

In [16]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3090. Max memory = 23.691 GB.
6.166 GB of memory reserved.


#### Train the model

In [17]:
trainer_stats = trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss


KeyboardInterrupt: 

#### Show final memory and time stats

In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

#### Inference
Infer from the model using the earlier defined Alpca prompt format, leaving response blank

In [None]:
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence for the next 10 numbers.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # response, leaving blank for generation!
    )
]*1, return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)

#### Inference with text streamer


In [None]:
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence for the next 10 numbers.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # response, leaving blank for generation!
    )
]*1, return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

#### Saving only the LoRA adapters and NOT the full model
Using Huggingface's push_to_hub for an online save or save_pretrained for a local save.

In [None]:
if False: model.save_pretrained("mistral_lora_model") # local saving
if False: model.push_to_hub("your_name/lora_model", token = "...") # Huggingface hub Online saving

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
HF_TOKEN = os.getenv('HF_TOKEN')

#### Example saving as transformers architecture for VLLM
Note unsloth push_to_hub_merged is same as HF push_to_hub, with additional perf features

Be aware if I save here, then it merges into n-bit then clears the LoRAs, so will get a NoneType error if later saving the save_pretrained_gguf format

In [None]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

#### GGUF / llama.cpp Conversion
Unsloth provides native GGUF/llama.cpp save. Clones llama.cpp and default save it to q8_0. Other quants include q4_k_m. Use save_pretrained_gguf for local saving and push_to_hub_gguf for uploading to HF.

In [None]:
### GGUF / llama.cpp Conversion
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model_q4_k_m_gguf", tokenizer, quantization_method = "q4_k_m")
if True: model.push_to_hub_gguf("CorticalStack/travel-mistral-7B-GGUF", tokenizer, quantization_method = "q4_k_m", token = HF_TOKEN)