#### Upload custom .jsonl collection of city information extraction instructions and outputs

In [1]:
import json
file_path = '../cities.jsonl'

data = []
with open(file_path, 'r') as file:
    for line in file:
        json_line = json.loads(line.strip())
        data.append(json_line)

print(data[0])

{'prompt': 'What are some of the best museums to visit in Paris?', 'response': 'Paris is a city rich in museums, catering to a wide range of interests. Here are some you might want to consider:\n\n1. The Louvre: The world\'s largest art museum and a historic monument in Paris, home to thousands of works of art, including the Mona Lisa.\n\n2. Musée d\'Orsay: Located in a former railway station, this museum houses a large collection of Impressionist and Post-Impressionist masterpieces.\n\n3. Centre Pompidou: Known for its high-tech architecture, this museum houses the National Museum of Modern Art.\n\n4. Musée de l\'Orangerie: Located in the Tuileries Gardens, it\'s famous for Monet\'s "Water Lilies" series.\n\n5. Musée Rodin: Dedicated to the works of the French sculptor Auguste Rodin.\n\n6. Musée du Quai Branly: Features indigenous art, cultures and civilizations from Africa, Asia, Oceania, and the Americas.\n\n7. Musée Picasso: Home to an impressive collection of more than 5,000 works

#### Transform city list of dictionaries into a list of dicts, which is suitable for Huggingface Dataset.from_dict()

In [2]:
transformed_data = {}
for key in data[0].keys():  # initialize keys
    transformed_data[key] = []

for item in data:
    for key in item:
        transformed_data[key].append(item[key])

#### Create a Huggingface dataset from list of dicts

In [3]:
from datasets import Dataset
dataset = Dataset.from_dict(transformed_data)
print(transformed_data)

{'prompt': ['What are some of the best museums to visit in Paris?', 'What is the local cuisine like in Paris?', 'What are some popular outdoor activities in Paris?', 'What is the currency used in Paris and where can I exchange my money?', 'What are some of the most famous landmarks in Paris?', 'What is the best time of year to visit Paris?', 'What is the weather like in Paris throughout the year?', 'What are some popular outdoor activities in Paris?', 'What are some popular markets in Paris that I should visit?'], 'response': ['Paris is a city rich in museums, catering to a wide range of interests. Here are some you might want to consider:\n\n1. The Louvre: The world\'s largest art museum and a historic monument in Paris, home to thousands of works of art, including the Mona Lisa.\n\n2. Musée d\'Orsay: Located in a former railway station, this museum houses a large collection of Impressionist and Post-Impressionist masterpieces.\n\n3. Centre Pompidou: Known for its high-tech architectu

#### Load the base pre-trained model and tokenizer
- Unsloth has it's own from_pretrained method.
- "load_in_4bit" indicates that the model will be quantized with bitsandbytes NormalFloat4 data type. This is the standard data type for QLoRA fine-tuning

In [4]:
from unsloth import FastLanguageModel
import torch
MAX_SEQ_LENGTH = 2048
LOAD_IN_4BIT = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-bnb-4bit", # "unsloth/mistral-7b" for 16bit loading
    max_seq_length = MAX_SEQ_LENGTH,
    dtype = None, # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = LOAD_IN_4BIT # Use 4bit quantization to reduce memory usage. Can be False.
)

ImportError: tokenizers>=0.11.1,!=0.11.3,<0.14 is required for a normal functioning of this module, but found tokenizers==0.15.0.
Try: pip install transformers -U or pip install -e '.[dev]' if you're working with git main

#### Transform our prompt/response dataset into Alpaca prompt template format 
Note by default, the Mistral tokenizer only adds <s> (BOS token) to the prompt but not </s> (EOS token), so (EOS) token added at end of prompt

In [None]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Will add EOS_TOKEN, otherwise your generation will go on forever!
def format_prompts(samples):
    instructions = samples["prompt"]
    responses = samples["response"]
    texts = []
    for prompt, response in zip(instructions, responses):
        text = alpaca_prompt.format(prompt, "", response) + EOS_TOKEN # Input is empty as no completion history
        texts.append(text)
    return { "text" : texts, }
pass

dataset_in_prompt_format = dataset.map(format_prompts, batched = True,)
print(dataset_in_prompt_format[0]['text'])


#### Split the dataset into training and testing sets

In [None]:
train_test_split = dataset_in_prompt_format.train_test_split(test_size=0.1)  # 10% for testing
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

#### Do model patching and add fast LoRA weights
r and lora_aplha are the most important parameters in LoRA configuration.

** r is the rank of the LoRA matrices:
- A higher r-value means more trainable parameters, allowing for more expressivity. But, on the negative side, there is a compute tradeoff, and may also lead to overfitting.
- A lower r-value means less trainable parameters, it can reduce overfitting at the cost of expressiveness.


** lora_aplha is a scaling factor for LoRA weights:
- Higher alpha will put more emphasis on LoRA weights.
- Lower alpha will put reduced emphasis on LoRA weights, hence model will be more dependent on its original weights.


** Important tips:
- Golden rule: lora_aplha = 2*r, i.e., if r=128 and lora_aplha should be 256
- Both r and lora_aplha should be in 2**x value, a good range for selection will be [8, 16, 32, 64, 128, 256, 512]
- If your fine-tuning data is very different from the pre-training data of your model, I recommend selecting r and lora_aplha from the higher values from the above range and vice versa.

In [None]:

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = True,
    random_state = 3407,
    max_seq_length = MAX_SEQ_LENGTH,
)

#### Define training arguments
Train for MAX_STEPS with a total batch size of 24 (per_device_train_batch_size*gradient_accumulation_steps)

In [None]:
MAX_STEPS=100
from transformers import TrainingArguments
training_arguments = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="steps",
        do_eval=True,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=6,
        per_device_eval_batch_size=4,
        log_level="debug",
        save_steps=100,
        logging_steps=25, 
        learning_rate=2e-4,
        eval_steps=50,
        optim='adamw_8bit',
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        weight_decay=0.1,
        max_steps=MAX_STEPS,
        warmup_ratio=0.01,
        lr_scheduler_type="linear",
)

#### Construct the model trainer
- Will train the model with TRL (Transformer Reinforcement Learning), with the SFT (Supervised Fine Tuning) trainer
- Use the text column of the dataset for training

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    dataset_text_field = "text",
    max_seq_length = MAX_SEQ_LENGTH,
    tokenizer = tokenizer,
    args = training_arguments,
)


#### Show current memory stats

In [None]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

#### Train the model

In [None]:
trainer_stats = trainer.train()

#### Show final memory and time stats

In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

#### Inference
Infer from the model using the earlier defined Alpca prompt format, leaving response blank

In [None]:
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence for the next 10 numbers.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # response, leaving blank for generation!
    )
]*1, return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)

#### Inference with text streamer


In [None]:
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence for the next 10 numbers.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # response, leaving blank for generation!
    )
]*1, return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

#### Saving only the LoRA adapters and NOT the full model
Using Huggingface's push_to_hub for an online save or save_pretrained for a local save.

In [None]:
model.save_pretrained("mistral_lora_model") # local saving
# model.push_to_hub("your_name/lora_model", token = "...") # Huggingface hub Online saving

#### Example saving to float16 for VLLM
Be aware if I save here, then it merges into n-bit then clears the LoRAs, so will get a NoneType error if later saving the save_pretrained_gguf format

In [None]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

#### GGUF / llama.cpp Conversion
Unsloth provides native GGUF/llama.cpp save. Clones llama.cpp and default save it to q8_0. Other quants include q4_k_m. Use save_pretrained_gguf for local saving and push_to_hub_gguf for uploading to HF.

In [None]:
### GGUF / llama.cpp Conversion
# Save to 8bit Q8_0
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if True: model.save_pretrained_gguf("model_q4_k_m_gguf", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

#### Now, use the model-unsloth.gguf file or model-unsloth-Q4_K_M.gguf file in llama.cpp or a UI based system like GPT4All

In [None]:
from peft import PeftModel
model = PeftModel.from_pretrained(model, "mistral_lora_model")

In [None]:
from typing import List

def get_response(query:str, input="")->List[str]:
  inputs = tokenizer(
       [
    alpaca_prompt.format(
        query, # instruction
        input, # input
        "", # output
    )
    ]*1, return_tensors = "pt").to("cuda")
  outputs = model.generate(**inputs, max_new_tokens = 1024, use_cache = True)
  return tokenizer.batch_decode(outputs)

query = "Give me some highlights of the following city"
input = "Paris"
resp = get_response(query, input)
def format_msg(message):
    split_msg = message.split("### ")
    final_str = split_msg[1]+split_msg[3]
    return final_str
print(format_msg(resp[0]))

In [None]:
from optimum.gptq import GPTQQuantizer, load_quantized_model
quantizer = GPTQQuantizer(bits=4, dataset="c4", block_name_to_quantize = "model.decoder.layers", model_seqlen = MAX_SEQ_LENGTH)

In [None]:
save_folder = "./quantized_test/"
quantizer.save(model,save_folder)