# Finetune LLama-3-8b

In [1]:
# %%capture
# # Installs Unsloth, Xformers (Flash Attention) and all other packages!
# ! pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# # We have to check which Torch version for Xformers (2.3 -> 0.0.27)
# from torch import __version__; from packaging.version import Version as V
# xformers = "xformers==0.0.27" if V(__version__) < V("2.4.0") else "xformers"
# ! pip install --no-deps {xformers} trl peft accelerate bitsandbytes triton

In [2]:
from unsloth import FastLanguageModel 
import torch
from transformers import TextStreamer

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


## Prompt

In [3]:
max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit", 
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",      
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct", 
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Meta-Llama-3-8B",
    max_seq_length=max_seq_length, # 4096 # Choose any! We auto support RoPE Scaling internally!
    dtype=dtype, # dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit=load_in_4bit # load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf 
)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.691 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [4]:
# model_peft
model = FastLanguageModel.get_peft_model(
    model,
    r=16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "embed_tokens", "lm_head"],
    lora_alpha=16,
    lora_dropout=0, # Supports any, but = 0 is optimized
    bias="none", # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth", # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None, # And LoftQ
)

Unsloth: Offloading input_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)


Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Casting embed_tokens to float32
Unsloth: Casting lm_head to float32


### Data Prep

In [5]:
# Config data following the Llama3.1-instruct format string
PROMPT_DICT = "llama-3.1-8b" 
if "llama-3.1-8b" == PROMPT_DICT: # llama31 và llama3 dùng chung format promp
    PROMPT_DICT = {
        "prompt_input": (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
            "Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            "<|eot_id|><|start_header_id|>Instruction<|end_header_id|>:\n{}\n\n"
            "<|eot_id|><|start_header_id|>Input<|end_header_id|>:\n{}\n\n"
            "<|eot_id|><|start_header_id|>Response<|end_header_id|>:"
        ), 
        "prompt_no_input": (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
            "Below is an instruction that describes a task. " 
            "Write a response that appropriately completes the request.\n\n"
            "<|eot_id|><|start_header_id|>Instruction<|end_header_id|>:\n{}\n\n"
            "<|eot_id|><|start_header_id|>Response<|end_header_id|>:\n{}\n\n"
        ), 
    }

if "llama-2-7b" == PROMPT_DICT:
    PROMPT_DICT = {
        "prompt_input": (
            "Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
        ), 
        "prompt_no_input": (
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction: \n{instruction}\n\n### Response:"
        ), 
    }
    
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    outputs      = examples["output"]
    texts = []
    for instruction, output in zip(instructions, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = PROMPT_DICT["prompt_no_input"].format(instruction, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}


from datasets import load_dataset
dataset = load_dataset("chwenjun225/Instruction_top_5_insurance_brands_june_news_and_twitter_only", split="train")
dataset = dataset.map(formatting_prompts_func, batched=True)

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

import os
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False, # Can make training 5x faster for short sequences.
    args=TrainingArguments(
        num_train_epochs=5, # Choose 1 to 5 passes
        max_grad_norm=0.3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=500,
        save_steps=500,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="./results",
        report_to="tensorboard"
    ),
)

Map (num_proc=2):   0%|          | 0/5434 [00:00<?, ? examples/s]

In [7]:
# Nếu tiếp tục train sử dụng --> trainer_stats = trainer.train("checkpoint-9500") 
trainer_stats = trainer.train()

Counting untrained tokens:   0%|          | 0/5434 [00:00<?, ? examples/s]

Unsloth: Setting embed_tokens & lm_head untrained tokens to mean(trained) to counteract NaNs during training.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 5,434 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 3,395
 "-____-"     Number of trainable parameters = 1,092,616,192


Step,Training Loss
500,0.7823
1000,0.3725
1500,0.2527
2000,0.1478
2500,0.0823
3000,0.0513


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [8]:
inference_instruction = conversation = """
Extract information that you have learned from this source text:  
MUSIC
Pucker up! Kiss to open final 'End of the Road' tour in Cincinnati 💋
Portrait of Luann GibbsLuann Gibbs
Cincinnati Enquirer

The final leg of the Kiss "End of the Road" tour begins in Cincinnati. The iconic band are wrapping up a 50-year career with a North American tour that starts at Heritage Bank Center in Cincinnati, and ends at New York City's Madison Square Garden. Tickets go on sale Friday, June 9, 2023.
The end of the road begins in Cincinnati. The legendary rock 'n' roll band Kiss is closing out a 50-year career, but before the band packs away its iconic makeup and wild costumes, the boys are taking one last ride around the world with a final tour, fittingly titled the "End of the Road" tour. It will span 50 dates around the world, and the North American leg kicks off Oct. 19 right here in Cincinnati.

Tickets go on sale Friday, June 9, for the show, which will take place at Heritage Bank Center (100 Broadway, Downtown). The tour wraps up in December with a massive final show at Madison Square Garden in New York City.

Concert dates:Cincinnati's full 2023 concert calendar 🎵

Kiss was formed in New York City in 1973 by members Paul Stanley, Gene Simmons, Ace Frehley and Peter Criss. With greasepaint makeup and outrageous costumes, the bandmembers took on the personae of comic book-style characters, and their "shock-rock" style live performances have been known to feature fire-breathing, blood-spitting, levitating drum kits and pyrotechnics. Considered one of the most influential rock bands of all time and one of the best-selling bands of all time, Kiss has sold more than 75 million records worldwide, earned 30 gold albums, and all four original members have been inducted into the Rock and Roll Hall of Fame.

The current lineup includes Stanley, Simmons, guitarist Tommy Thayer and drummer Eric Singer.

Need a break? Play the USA TODAY Daily Crossword Puzzle.

Kiss 2023 North American End of the Road tour dates:
Oct. 19: Cincinnati, Heritage Bank Center
Oct. 20: Detroit, Little Caesars Arena
Oct. 22: Cleveland, Rocket Mortgage FieldHouse
Oct. 23: Nashville, Bridgestone Arena
Oct. 25: St. Louis, Enterprise Center
Oct. 27: Fort Worth, Texas, Dickies Arena           
Oct. 29: Austin, Moody Center
Nov. 1: Palm Springs, Calif. Acrisure Arena
Nov. 3: Los Angeles, Hollywood Bowl
Nov. 6: Seattle, Climate Pledge Arena
Nov. 8: Vancouver, Rogers Arena
Nov. 10: Edmonton, Alberta, Rogers Place
Nov. 12: Calgary, Alberta, Scotiabank Saddledome
Nov. 13: Saskatoon, Saskatchewan, SaskTel Centre
Nov. 15: Winnipeg, Manitoba, Canada Life Centre
Nov. 18: Montreal, Quebec, Centre Bell
Nov. 19: Quebec, Videotron Centre
Nov. 21: Ottawa, Ontario, Canadian Tire Centre
Nov. 22: Toronto, Ontario, Scotiabank Arena
Nov. 24: Knoxville, Tenn., Thompson-Boling Arena
Nov. 25: Indianapolis, Gainbridge Fieldhouse
Nov. 27: Rosemont, Illinois, Allstate Arena
Nov. 29: Baltimore, CFG Bank Arena
Dec. 1: New York City, Madison Square Garden
Dec. 2: New York City, Madison Square Garden
"""

In [9]:
from transformers import TextStreamer

my_prompt = PROMPT_DICT['prompt_no_input']

# my_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer([
    my_prompt.format(
        inference_instruction, 
        ""
    )], return_tensors = "pt").to("cuda")

text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=4096, use_cache=True)

<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>Below is an instruction that describes a task. Write a response that appropriately completes the request.

<|eot_id|><|start_header_id|>Instruction<|end_header_id|>:

Extract information that you have learned from this source text:  
MUSIC
Pucker up! Kiss to open final 'End of the Road' tour in Cincinnati 💋
Portrait of Luann GibbsLuann Gibbs
Cincinnati Enquirer

The final leg of the Kiss "End of the Road" tour begins in Cincinnati. The iconic band are wrapping up a 50-year career with a North American tour that starts at Heritage Bank Center in Cincinnati, and ends at New York City's Madison Square Garden. Tickets go on sale Friday, June 9, 2023.
The end of the road begins in Cincinnati. The legendary rock 'n' roll band Kiss is closing out a 50-year career, but before the band packs away its iconic makeup and wild costumes, the boys are taking one last ride around the world with a final tour, fittingly titled t

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

#### Reload merge_and_unload PeftModel

Chỉ sử dụng cái này để tạo file `config.json` và các tệp như `model-00001-of-00007.safetensors` sau khi đã lưu `adapter_json` file

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM 
from peft import PeftModel, get_peft_config, get_peft_model 

In [2]:
# "unsloth/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B")
model = PeftModel.from_pretrained(model, "./results/llama-3-8b-json_extract-lora_adapter")
model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# Local saving
model.save_pretrained("./results/llama-3-8b-json_extract-lora_adapter") 
tokenizer.save_pretrained("./results/llama-3-8b-json_extract-lora_adapter")

('./results/llama-3-8b-json_extract-lora_adapter/tokenizer_config.json',
 './results/llama-3-8b-json_extract-lora_adapter/special_tokens_map.json',
 './results/llama-3-8b-json_extract-lora_adapter/tokenizer.json')

In [4]:
# Online saving
model.push_to_hub("chwenjun225/llama-3-8b-json_extract-lora_adapter", token="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")
tokenizer.push_to_hub("chwenjun225/llama-3-8b-json_extract-lora_adapter", token="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")

model-00004-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00007.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

model-00002-of-00007.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

model-00003-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00007.safetensors:   0%|          | 0.00/4.89G [00:00<?, ?B/s]

Upload 7 LFS files:   0%|          | 0/7 [00:00<?, ?it/s]

'(MaxRetryError("HTTPSConnectionPool(host='hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com', port=443): Max retries exceeded with url: /repos/df/85/df852441e5931d5de5ba3f82947b5e0a8db9bbc2f6e057b531984d31ef3d1b09/143cb2e9a8caa0533bf96accbfcf1e6582946416301bcfed70d34a596d757c80?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20240915%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240915T062557Z&X-Amz-Expires=86400&X-Amz-Signature=09fabceeb03f9a475b1d6eac66a5944181cfe3660f8ca639fc90c354a1b5a6cc&X-Amz-SignedHeaders=host&partNumber=17&uploadId=wtpQGFJYfJ5lHD0y.OQ_lk07LLwLB9OeMORfX6yE9S2nXen1Ei0IOhJM0DfKBpPZbPzhADb7Debg5CsRskmtLpuHHqQwwGxK1aTmnRPR6hGPSsJ1vE7fa6ScjtrrzaPi&x-id=UploadPart (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2427)')))"), '(Request ID: 708d5b2b-9351-4c43-a793-bdf04774e246)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/df/85/

model-00007-of-00007.safetensors:   0%|          | 0.00/2.57G [00:00<?, ?B/s]

'(MaxRetryError("HTTPSConnectionPool(host='hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com', port=443): Max retries exceeded with url: /repos/df/85/df852441e5931d5de5ba3f82947b5e0a8db9bbc2f6e057b531984d31ef3d1b09/143cb2e9a8caa0533bf96accbfcf1e6582946416301bcfed70d34a596d757c80?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20240915%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240915T062557Z&X-Amz-Expires=86400&X-Amz-Signature=c9cfc64cc6f71cbf81f23f46a035f6cef869b697c2b43363645fcc638257dbdd&X-Amz-SignedHeaders=host&partNumber=183&uploadId=wtpQGFJYfJ5lHD0y.OQ_lk07LLwLB9OeMORfX6yE9S2nXen1Ei0IOhJM0DfKBpPZbPzhADb7Debg5CsRskmtLpuHHqQwwGxK1aTmnRPR6hGPSsJ1vE7fa6ScjtrrzaPi&x-id=UploadPart (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:2427)')))"), '(Request ID: 7c484a3c-bb1e-4e06-b131-73e07994ae62)')' thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3-accelerate.amazonaws.com/repos/df/85

RuntimeError: Error while uploading 'model-00001-of-00007.safetensors' to the Hub.

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

### Inference after save as lora adapter - `lora_llama3-8b`

In [6]:
# Config data following the Llama3.1-instruct format string
PROMPT_DICT = "llama-3.1-8b" # Llama3.1 and Llama3 dùng chung prompt format 
if "llama-3.1-8b" == PROMPT_DICT:
    PROMPT_DICT = {
        "prompt_input": (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
            "Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            "<|eot_id|><|start_header_id|>Instruction<|end_header_id|>:\n{}\n\n"
            "<|eot_id|><|start_header_id|>Input<|end_header_id|>:\n{}\n\n"
            "<|eot_id|><|start_header_id|>Response<|end_header_id|>:"
        ), 
        "prompt_no_input": (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
            "Below is an instruction that describes a task. " 
            "Write a response that appropriately completes the request.\n\n"
            "<|eot_id|><|start_header_id|>Instruction<|end_header_id|>:\n{}\n\n"
            "<|eot_id|><|start_header_id|>Response<|end_header_id|>:\n{}\n\n"
        ), 
    }

if "llama-2-7b" == PROMPT_DICT:
    PROMPT_DICT = {
        "prompt_input": (
            "Below is an instruction that describes a task, paired with an input that provides further context. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
        ), 
        "prompt_no_input": (
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n\n"
            "### Instruction: \n{instruction}\n\n### Response:"
        ), 
    }

inference_instruction = conversation = """
Extract information that you have learned from this source text:  
MUSIC
Pucker up! Kiss to open final 'End of the Road' tour in Cincinnati 💋
Portrait of Luann GibbsLuann Gibbs
Cincinnati Enquirer

The final leg of the Kiss "End of the Road" tour begins in Cincinnati. The iconic band are wrapping up a 50-year career with a North American tour that starts at Heritage Bank Center in Cincinnati, and ends at New York City's Madison Square Garden. Tickets go on sale Friday, June 9, 2023.
The end of the road begins in Cincinnati. The legendary rock 'n' roll band Kiss is closing out a 50-year career, but before the band packs away its iconic makeup and wild costumes, the boys are taking one last ride around the world with a final tour, fittingly titled the "End of the Road" tour. It will span 50 dates around the world, and the North American leg kicks off Oct. 19 right here in Cincinnati.

Tickets go on sale Friday, June 9, for the show, which will take place at Heritage Bank Center (100 Broadway, Downtown). The tour wraps up in December with a massive final show at Madison Square Garden in New York City.

Concert dates:Cincinnati's full 2023 concert calendar 🎵

Kiss was formed in New York City in 1973 by members Paul Stanley, Gene Simmons, Ace Frehley and Peter Criss. With greasepaint makeup and outrageous costumes, the bandmembers took on the personae of comic book-style characters, and their "shock-rock" style live performances have been known to feature fire-breathing, blood-spitting, levitating drum kits and pyrotechnics. Considered one of the most influential rock bands of all time and one of the best-selling bands of all time, Kiss has sold more than 75 million records worldwide, earned 30 gold albums, and all four original members have been inducted into the Rock and Roll Hall of Fame.

The current lineup includes Stanley, Simmons, guitarist Tommy Thayer and drummer Eric Singer.

Need a break? Play the USA TODAY Daily Crossword Puzzle.

Kiss 2023 North American End of the Road tour dates:
Oct. 19: Cincinnati, Heritage Bank Center
Oct. 20: Detroit, Little Caesars Arena
Oct. 22: Cleveland, Rocket Mortgage FieldHouse
Oct. 23: Nashville, Bridgestone Arena
Oct. 25: St. Louis, Enterprise Center
Oct. 27: Fort Worth, Texas, Dickies Arena           
Oct. 29: Austin, Moody Center
Nov. 1: Palm Springs, Calif. Acrisure Arena
Nov. 3: Los Angeles, Hollywood Bowl
Nov. 6: Seattle, Climate Pledge Arena
Nov. 8: Vancouver, Rogers Arena
Nov. 10: Edmonton, Alberta, Rogers Place
Nov. 12: Calgary, Alberta, Scotiabank Saddledome
Nov. 13: Saskatoon, Saskatchewan, SaskTel Centre
Nov. 15: Winnipeg, Manitoba, Canada Life Centre
Nov. 18: Montreal, Quebec, Centre Bell
Nov. 19: Quebec, Videotron Centre
Nov. 21: Ottawa, Ontario, Canadian Tire Centre
Nov. 22: Toronto, Ontario, Scotiabank Arena
Nov. 24: Knoxville, Tenn., Thompson-Boling Arena
Nov. 25: Indianapolis, Gainbridge Fieldhouse
Nov. 27: Rosemont, Illinois, Allstate Arena
Nov. 29: Baltimore, CFG Bank Arena
Dec. 1: New York City, Madison Square Garden
Dec. 2: New York City, Madison Square Garden
"""

max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

if True:
    from transformers import TextStreamer
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        # model_name="chwenjun225/llama-3-8b-json_extract-lora_adapter", # Test model on HF
        model_name="./results/llama-3-8b-json_extract-lora_adapter", # Test model local 
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

my_prompt = PROMPT_DICT['prompt_no_input']

inputs = tokenizer([
    my_prompt.format(inference_instruction, "")
    ], return_tensors="pt").to("cuda")

text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=4096, use_cache=True)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


RuntimeError: Unsloth: Your repo has a LoRA adapter and a base model.
You have 2 files `config.json` and `adapter_config.json`.
We must only allow one config file.
Please separate the LoRA and base models to 2 repos.

### Evaluation using `llm-evaluation-harness`

In [None]:
import os
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"

In [5]:
# # Local eval `llama3_8b_instruct_lora_5epo_merged16bit_gguf`
# !lm_eval --model hf --model_args pretrained="/rhome/f111169109/alby-the-scraper/llama3/web_crawler_llama3_8b_instruct_lora_5epo_merged16bit_gguf" --tasks lambada_openai,hellaswag,piqa,arc_easy,arc_challenge,winogrande,openbookqa --device cuda --batch_size auto 

# # Local eval `llama3_8b_instruct_lora_5epo_merged16bit`
# !lm_eval --model hf --model_args pretrained="/rhome/f111169109/alby-the-scraper/notebooks/llama3/web_crawler_llama3_8b_instruct_lora_5epo_merged16bit" --tasks lambada_openai,hellaswag,piqa,arc_easy,arc_challenge,winogrande,openbookqa --device cuda --batch_size auto 

# Local eval `chwenjun225/llama-3-8b-json_extract-lora_adapter`
!lm_eval --model hf \
--model_args pretrained="./results/llama-3-8b-json_extract-lora_adapter" \
--tasks lambada_openai,hellaswag,piqa,arc_easy,arc_challenge,winogrande,openbookqa \
--device cuda \
--batch_size auto 

2024-09-15:23:39:35,020 INFO     [__main__.py:279] Verbosity set to INFO
2024-09-15:23:39:35,490 INFO     [__init__.py:491] `group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. `tag` will be used to allow to call a collection of tasks just like `group`. `group` will be removed in order to not cause confusion with the new ConfigurableGroup which will be the official way to create groups with addition of group-wide configurations.
2024-09-15:23:39:40,102 INFO     [__main__.py:383] Selected Tasks: ['arc_challenge', 'arc_easy', 'hellaswag', 'lambada_openai', 'openbookqa', 'piqa', 'winogrande']
2024-09-15:23:39:40,109 INFO     [evaluator.py:161] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234
2024-09-15:23:39:40,109 INFO     [evaluator.py:198] Initializing hf model, with arguments: {'pretrained': './results/llama-3-8b-json_extract-lora_adapter'}
2024-09-15:23:39:40,192 INFO     [huggingface.py:130] 

### Release VRAM

In [None]:
# nvidia-smi | grep 'python' | awk '{ print $5 }' | xargs -n1 kill -9

In [None]:
# zip -r llama-3-8b-json_extract-lora_adapter.zip llama-3-8b-json_extract-lora_adapter