In [3]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048
dtype = None
load_in_4bit = True

# 4bit pre-quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
]

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50)  # Adjust max_new_tokens as needed
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)


from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("philschmid/guanaco-sharegpt-style", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)


from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"from": "human", "value": "Here is a sentence, please extract the keyword of the sentence into (entity,entity,relation) format.  Sentence: Recent methods for learning vector space representations of words have succeeded in capturing syntactic regularities."},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True)
print(tokenizer.batch_decode(outputs))



==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: NVIDIA H100 PCIe. Max memory: 79.109 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0. CUDA = 9.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
Downloading readme: 100%|██████████| 442/442 [00:00<00:00, 4.89MB/s]
Downloading data: 100%|██████████| 8.24M/8.24M [00:00<00:00, 19.0MB/s]
Generating train split: 100%|██████████| 9033/9033 [00:00<00:00, 217466.02 examples/s]
Map: 100%|██████████| 9033/9033 [00:00<00:00, 31225.55 examples/s]
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHere is a sentence, please extract the keyword of the sentence into (entity,entity,relation) format.  Sentence: Recent methods for learning vector space representations of words have succeeded in capturing syntactic regularities.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHere is the extracted keyword in (entity, entity, relation) format:\n\n* Entity 1: Recent methods\n* Entity 2: vector space representations of words\n* Relation: succeeded in capturing\n\nNote that "syntactic regularities" is not an entity, but rather a concept or a property, so']


In [6]:




messages = [
    {"from": "human", "value": "Here is a sentence, please extract the keyword of the sentence into (entity,entity,relation) format.  Sample: Sentence: Apple invest meta for VR. Output: (Apple,Meta,Invest) \n Here is the Sentence:I am well positioned to advance the proposed endeavor. Directly give the output in output format"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True)
print(tokenizer.batch_decode(outputs))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHere is a sentence, please extract the keyword of the sentence into (entity,entity,relation) format.  Sample: Sentence: Apple invest meta for VR. Output: (Apple,Meta,Invest) \n Here is the Sentence:I am well positioned to advance the proposed endeavor. Directly give the output in output format<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe output is:\n\n(I,Proposed Endeavor,Positioned)<|eot_id|>']


: 