## Installations

In [2]:
!python --version

Python 3.10.13


In [None]:
# !pip install torch==2.5.1 transformers==4.46.2 datasets wandb huggingface_hub python-dotenv --no-cache-dir | tail -n 1 
# !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install --no-deps trl peft accelerate bitsandbytes xformers==0.0.28.post3 --no-cache | tail -n 1 

In [7]:
import torch

# Print Torch version (greater than 2.4 for xformers)
print(torch.__version__)

# Check if CUDA (GPU) is available
print("CUDA available:", torch.cuda.is_available())

# Print the number of GPUs
print("Number of GPUs:", torch.cuda.device_count())

# Print the name of the current GPU device
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))

# Print the current GPU device index
print("Current GPU Device:", torch.cuda.current_device())

2.4.1+cu121
CUDA available: True
Number of GPUs: 1
GPU Name: NVIDIA L40S
Current GPU Device: 0


## Setting Up Weights & Biases

In [9]:
import os
import wandb
from dotenv import load_dotenv
load_dotenv()

def setup_wandb(project_name: str, run_name: str):
    # Set up your API KEY
    try:
        api_key = os.getenv("WANDB_API_KEY")
        wandb.login(key=api_key)
        print("Successfully logged into WandB.")
    except KeyError:
        raise EnvironmentError("WANDB_API_KEY is not set in the environment variables.")
    except Exception as e:
        print(f"Error logging into WandB: {e}")
    
    # Optional: Log models
    os.environ["WANDB_LOG_MODEL"] = "checkpoint"
    os.environ["WANDB_WATCH"] = "all"
    os.environ["WANDB_SILENT"] = "true"
    
    # Initialize the WandB run
    try:
        wandb.init(project=project_name, name=run_name)
        print(f"WandB run initialized: Project - {project_name}, Run - {run_name}")
    except Exception as e:
        print(f"Error initializing WandB run: {e}")

# Run it as a function
setup_wandb(project_name="llama-3.1-8B-ft-lora-fc", run_name="run-v1")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mgautamgc75[0m ([33mgautamgc75-org[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/wsuser/.netrc


Successfully logged into WandB.


WandB run initialized: Project - llama-3.1-8B-ft-lora-fc, Run - run-v1


## HuggingFace Authentication

In [10]:
from huggingface_hub import login

hf_token = os.getenv("HUGGINGFACE_TOKEN")
if hf_token is None:
    raise EnvironmentError("HUGGINGFACE_TOKEN is not set in the environment variables.")
login(hf_token)

## Loading the Base Model

In [None]:
import torch
from unsloth import FastLanguageModel

max_seq_length = 2048     # Unsloth auto supports RoPE Scaling internally!
dtype = None              # None for auto detection
load_in_4bit = False      # Use 4bit quantization to reduce memory usage. 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct",  
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

## Configuring LoRA for Fine-Tuning

In [12]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,              # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,    # Supports any, but = 0 is optimized
    bias = "none",       # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth",     # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.10.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Loading and Processing the Dataset

In [13]:
from datasets import load_dataset

# Loading the dataset
dataset = load_dataset("Salesforce/xlam-function-calling-60k", split="train", token=hf_token)

# Selecting a subset of 15K samples for fine-tuning
dataset = dataset.select(range(20000))
print(f"Using a sample size of {len(dataset)} for fine-tuning.")

Using a sample size of 20000 for fine-tuning.


In [20]:
from unsloth.chat_templates import get_chat_template

# Initialize the tokenizer with the chat template and mapping
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3",  # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    map_eos_token = True,        # Maps <|im_end|> to <|eot_id|> instead
)

def formatting_prompts_func(examples):
    convos = []
    
    # Iterate through each item in the batch (examples are structured as lists of values)
    for query, tools, answers in zip(examples['query'], examples['tools'], examples['answers']):
        tool_user = {
            "content": f"You are a helpful assistant with access to the following tools or function calls. Your task is to produce a sequence of tools or function calls necessary to generate response to the user utterance. Use the following tools or function calls as required:\n{tools}",
            "role": "system"
        }
        ques_user = {
            "content": f"{query}",
            "role": "user"
        }
        assistant = {
            "content": f"{answers}",
            "role": "assistant"
        }

        # Combine the user1, user2, and assistant into a single conversation structure
        convos.append([tool_user, ques_user, assistant])

    # Apply the chat template tokenizer for each conversation
    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
    return {"text": texts}

In [None]:
dataset = dataset.map(formatting_prompts_func, batched = True,)
len(dataset)

In [22]:
print(dataset[0]["text"])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant with access to the following tools or function calls. Your task is to produce a sequence of tools or function calls necessary to generate response to the user utterance. Use the following tools or function calls as required:
[{"name": "live_giveaways_by_type", "description": "Retrieve live giveaways from the GamerPower API based on the specified type.", "parameters": {"type": {"description": "The type of giveaways to retrieve (e.g., game, loot, beta).", "type": "str", "default": "game"}}}]<|eot_id|><|start_header_id|>user<|end_header_id|>

Where can I find live giveaways for beta access and games?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

[{"name": "live_giveaways_by_type", "arguments": {"type": "beta"}}, {"name": "live_giveaways_by_type", "arguments": {"type": "game"}}]<|eot_id|>


## Training with SFTTrainer and Unsloth

In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
        per_device_train_batch_size = 4,
        gradient_accumulation_steps = 2,    
        warmup_steps = 5,
        learning_rate = 2e-4,
        num_train_epochs = 3,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "wandb",  
        logging_steps = 1,  
        logging_strategy = "steps",
        save_strategy = "no",
        load_best_model_at_end = True,
        save_only_model = False
    )

In [None]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,       
    args = args
)

In [28]:
# Show current memory stats

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA L40S. Max memory = 44.521 GB.
15.887 GB of memory reserved.


In [29]:
import torch
import gc

# Clear PyTorch's cache
torch.cuda.empty_cache()

# Run Python's garbage collector
gc.collect()

807

In [30]:
from unsloth import unsloth_train
trainer_stats = unsloth_train(trainer)   # << Fixed gradient accumulation
# trainer_stats = trainer.train()        # << Buggy if using gradient accumulation

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 20,000 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 2
\        /    Total batch size = 8 | Total steps = 7,500
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss


Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
wandb.finish()

In [32]:
print(trainer_stats)

TrainOutput(global_step=7500, training_loss=0.14057622442046802, metrics={'train_runtime': 18099.7207, 'train_samples_per_second': 3.315, 'train_steps_per_second': 0.414, 'total_flos': 1.8451314530340372e+18, 'train_loss': 0.14057622442046802, 'epoch': 3.0})


In [33]:
trainer_stats.metrics

{'train_runtime': 18099.7207,
 'train_samples_per_second': 3.315,
 'train_steps_per_second': 0.414,
 'total_flos': 1.8451314530340372e+18,
 'train_loss': 0.14057622442046802,
 'epoch': 3.0}

In [34]:
# Show final memory and time stats

used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

18099.7207 seconds used for training.
301.66 minutes used for training.
Peak reserved memory = 39.855 GB.
Peak reserved memory for training = 23.968 GB.
Peak reserved memory % of max memory = 89.52 %.
Peak reserved memory for training % of max memory = 53.835 %.


## Saving the Model

After training, the fine-tuned model is saved locally and pushed to Hugging Face's hub for further access and deployment. However, this only saves the LoRA adapters.

In [None]:
# Local saving
model.save_pretrained("Llama3-FineTune-Function-Calling-LoRA-Model-V2") 
tokenizer.save_pretrained("Llama3-FineTune-Function-Calling-LoRA-Model-V2")

# Online saving
model.push_to_hub("gautamgc17/Llama3-FineTune-Function-Calling-LoRA-Model-V2", token = hf_token)
tokenizer.push_to_hub("gautamgc17/Llama3-FineTune-Function-Calling-LoRA-Model-V2", token = hf_token) 

## Saving to float16 for VLLM
For merging the LoRA adapters with the base model and save the model to 16-bit precision for optimized performance with vLLM, use:

In [None]:
# Merge to 16bit
model.save_pretrained_merged("Llama3-FineTune-Function-Calling-Model-V2", tokenizer, save_method = "merged_16bit")
model.push_to_hub_merged("gautamgc17/Llama3-FineTune-Function-Calling-Model-V2", tokenizer, save_method = "merged_16bit", token = hf_token)

## Evaluations

In [9]:
from unsloth import FastLanguageModel
from transformers import TextStreamer

max_seq_length = 2048     
dtype = None              
load_in_4bit = False     

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Llama3-FineTune-Function-Calling-Model-V2",  
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model)   # Enable native 2x faster inference

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.10.7: Fast Llama patching. Transformers = 4.46.1.
   \\   /|    GPU: NVIDIA L40S. Max memory: 44.521 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaExtendedRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
  

In [22]:
WEATHER_API_KEY = os.getenv("WEATHER_API_KEY")
NASA_API_KEY = os.getenv("NASA_API_KEY")
STOCK_API_KEY = os.getenv("STOCK_API_KEY")

In [67]:
import re
import json
import requests
from datetime import datetime
import nasapy


def get_current_date() -> str:
    """
    Fetches the current date in the format YYYY-MM-DD.
    Returns:
        str: A string representing the current date.
    """
    print("Getting the current date")
    
    try:
        current_date = datetime.now().strftime("%Y-%m-%d")
        return current_date
    except Exception as e:
        print(f"Error fetching current date: {e}")
        return "NA"
    
    
def get_current_weather(location: str) -> dict:
    """
    Fetches the current weather for a given location (default: San Francisco).
    Args:
        location (str): The name of the city for which to retrieve the weather information.
    Returns:
        dict: A dictionary containing weather information such as temperature, weather description, and humidity.
    """
    print(f"Getting current weather for {location}")
    
    try:
        weather_url = f"http://api.openweathermap.org/data/2.5/weather?q={location}&appid={WEATHER_API_KEY}&units=metric"
        weather_data = requests.get(weather_url)
        data = weather_data.json()
        weather_description = data["weather"][0]["description"]
        temperature = data["main"]["temp"]
        humidity = data["main"]["humidity"]
        return {
            "description": weather_description,
            "temperature": temperature,
            "humidity": humidity
        }
    except Exception as e:
        print(f"Error fetching weather data: {e}")
        return {"weather": "NA"}
    
    
def celsius_to_fahrenheit(celsius: float):
    """
    Converts a temperature from Celsius to Fahrenheit.
    
    Args:
        celsius (float): Temperature in degrees Celsius.
        
    Returns:
        float: Temperature in degrees Fahrenheit.
    """
    print(f"Converting {celsius}°C to Fahrenheit")
    
    try:
        fahrenheit = (celsius * 9/5) + 32
        return fahrenheit
    except Exception as e:
        print(f"Error converting temperature: {e}")
        return None
    
    
def get_nasa_picture_of_the_day(date: str) -> dict:
    """
    Fetches NASA's Picture of the Day information for a given date.
    
    Args:
        date (str): The date for which to retrieve the picture in 'YYYY-MM-DD' format.
        
    Returns:
        dict: A dictionary containing the title, explanation, and URL of the image or video.
    """
    print(f"Getting NASA's Picture of the Day for {date}")
    
    try:
        nasa = nasapy.Nasa(key = NASA_API_KEY)
        apod = nasa.picture_of_the_day(date = date, hd=True)
        title = apod.get("title", "No Title")
        explanation = apod.get("explanation", "No Explanation")
        url = apod.get("url", "No URL")
        return {
            "title": title,
            "explanation": explanation,
            "url": url
        }
    except Exception as e:
        print(f"Error fetching NASA's Picture of the Day: {e}")
        return {"error": "Unable to fetch NASA Picture of the Day"}
    
    
def get_stock_price(ticker: str, date: str) -> tuple[str, str]:
    """
    Retrieves the lowest and highest stock prices for a given ticker and date.
    Args:
        ticker (str): The stock ticker symbol, e.g., "IBM".
        date (str): The date in "YYYY-MM-DD" format for which you want to get stock prices.
    Returns:
        tuple: A tuple containing the low and high stock prices on the given date, or ("none", "none") if not found.
    """
    print(f"Getting stock price for {ticker} on {date}")
    try:
        stock_url = f"https://www.alphavantage.co/query?function=TIME_SERIES_DAILY&symbol={ticker}&apikey={STOCK_API_KEY}"
        stock_data = requests.get(stock_url)
        stock_low = stock_data.json()["Time Series (Daily)"][date]["3. low"]
        stock_high = stock_data.json()["Time Series (Daily)"][date]["2. high"]
        return stock_low, stock_high
    except Exception as e:
        print(f"Error fetching stock data: {e}")
        return "none", "none"
    
    
available_function_calls = {"get_current_date": get_current_date, "get_current_weather": get_current_weather, "celsius_to_fahrenheit": celsius_to_fahrenheit,
                      "get_nasa_picture_of_the_day": get_nasa_picture_of_the_day, "get_stock_price": get_stock_price}

In [68]:
functions = [
    {
        "name": "get_current_date",
        "description": "Fetches the current date in the format YYYY-MM-DD.",
        "parameters": {
            "type": "object",
            "properties": {},
            "required": [],
        },
    },
    {
        "name": "get_current_weather",
        "description": "Get the current weather",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and country code, e.g. San Francisco, US",
                }
            },
            "required": ["location"],
        },
    },
    {
        "name": "celsius_to_fahrenheit",
        "description": "Converts a temperature from Celsius to Fahrenheit.",
        "parameters": {
            "type": "object",
            "properties": {
                "celsius": {
                    "type": "number",
                    "description": "Temperature in degrees Celsius.",
                }
            },
            "required": ["celsius"],
        }
    },
    {
        "name": "get_nasa_picture_of_the_day",
        "description": "Fetches NASA's Picture of the Day information for a given date.",
        "parameters": {
            "type": "object",
            "properties": {
                "date": {
                    "type": "string",
                    "description": "Date in YYYY-MM-DD format for which to retrieve the picture.",
                }
            },
            "required": ["date"],
        },
    },
    {
        "name": "get_stock_price",
        "description": "Retrieves the lowest and highest stock price for a given ticker symbol and date. The ticker symbol must be a valid symbol for a publicly traded company on a major US stock exchange like NYSE or NASDAQ. The tool will return the latest trade price in USD.",
        "parameters": {
            "type": "object",
            "properties": {
                "ticker": {
                    "type": "string",
                    "description": "The stock ticker symbol, e.g. AAPL for Apple Inc.",
                },
                "date": {
                    "type": "string",
                    "description": "Date in YYYY-MM-DD format",
                }
            },
            "required": ["ticker", "date"],
        },
    }
]


available_tools_list = {
    "functions_str": [json.dumps(x) for x in functions],
}

In [69]:
query = "What is the current weather at the headquarters of IBM? Also, can you provide the stock prices for the company on October 29, 2024?"

chat = [
    {"role":"system","content": f"You are a helpful assistant with access to the following function calls. Your task is to produce a sequence of function calls necessary to generate response to the user utterance. Use the following function calls as required.\n{available_tools_list}"},
    {"role": "user", "content": query }
]

In [70]:
inputs = tokenizer.apply_chat_template(
    chat,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

In [71]:
outputs = model.generate(input_ids = inputs, max_new_tokens = 1024, use_cache = True)
response = tokenizer.batch_decode(outputs)[0]
print(response)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant with access to the following function calls. Your task is to produce a sequence of function calls necessary to generate response to the user utterance. Use the following function calls as required.
{'functions_str': ['{"name": "get_current_date", "description": "Fetches the current date in the format YYYY-MM-DD.", "parameters": {"type": "object", "properties": {}, "required": []}}', '{"name": "get_current_weather", "description": "Get the current weather", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city and country code, e.g. San Francisco, US"}}, "required": ["location"]}}', '{"name": "celsius_to_fahrenheit", "description": "Converts a temperature from Celsius to Fahrenheit.", "parameters": {"type": "object", "properties": {"celsius": {"type": "number", "description": "Temperature in degrees Celsius."}}, "required": ["celsius"]}}', '{"name": 

In [72]:
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 512, use_cache = True, pad_token_id = tokenizer.eos_token_id)

[{"name": "get_current_weather", "arguments": {"location": "Armonk, US"}}, {"name": "get_stock_price", "arguments": {"ticker": "IBM", "date": "2024-10-29"}}]<|eot_id|>


In [73]:
def extract_content(text):
    # Define the regex pattern to extract the content
    pattern = r"<\|start_header_id\|>assistant<\|end_header_id\|>(.*?)<\|eot_id\|>"
    match = re.search(pattern, text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return None  

parsed_response = json.loads(extract_content(response))
parsed_response

[{'name': 'get_current_weather', 'arguments': {'location': 'Armonk, US'}},
 {'name': 'get_stock_price',
  'arguments': {'ticker': 'IBM', 'date': '2024-10-29'}}]

In [74]:
if parsed_response:
    new_system_content = "You are a helpful assistant. Answer the user query based on the response of the specific function call or tool provided to you as context. Generate a precise answer for given user query, synthesizing the provided information."

    for res in parsed_response:
        obtained_function = res.get("name")
        arguments = res.get("arguments")
        function_description = next(item['description'] for item in functions if item['name'] == obtained_function)
        function_to_call = available_function_calls[obtained_function]
        response = function_to_call(**arguments)
        print(response, "\n")
        
        chat.append({
            "role": "tool",
            "content": f"The tool - '{obtained_function}' with the function definition - '{function_description}' and function arguments - '{arguments}' yielded the following response: {response}\n."
        })

        for message in chat:
            if message['role'] == 'system':
                message['content'] = new_system_content
                
    inputs = tokenizer.apply_chat_template(
        chat,
        tokenize = True,
        add_generation_prompt = True, 
        return_tensors = "pt").to("cuda")
    text_streamer = TextStreamer(tokenizer, skip_prompt = True)
    _ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 512, use_cache = True, pad_token_id = tokenizer.eos_token_id)
else:
    print("No function call found in the response")

Getting current weather for Armonk, US
{'description': 'clear sky', 'temperature': 11.23, 'humidity': 37} 

Getting stock price for IBM on 2024-10-29
('230.2600', '232.4200') 

The current weather at IBM's headquarters in Armonk, USA is a clear sky with a temperature of 11.23 degrees Celsius and a humidity of 37%. Additionally, the stock price for IBM on October 29, 2024 was between $230.26 and $232.42.<|eot_id|>


In [75]:
chat

[{'role': 'system',
  'content': 'You are a helpful assistant. Answer the user query based on the response of the specific function call or tool provided to you as context. Generate a precise answer for given user query, synthesizing the provided information.'},
 {'role': 'user',
  'content': 'What is the current weather at the headquarters of IBM? Also, can you provide the stock prices for the company on October 29, 2024?'},
 {'role': 'tool',
  'content': "The tool - 'get_current_weather' with the function definition - 'Get the current weather' and function arguments = '{'location': 'Armonk, US'}' yielded the following response: {'description': 'clear sky', 'temperature': 11.23, 'humidity': 37}\n."},
 {'role': 'tool',
  'content': "The tool - 'get_stock_price' with the function definition - 'Retrieves the lowest and highest stock price for a given ticker symbol and date. The ticker symbol must be a valid symbol for a publicly traded company on a major US stock exchange like NYSE or NA

In [76]:
formatted_prompt = tokenizer.apply_chat_template(
    chat,
    tokenize = False,
    add_generation_prompt = True, 
    return_tensors = "pt"
)
print(formatted_prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant. Answer the user query based on the response of the specific function call or tool provided to you as context. Generate a precise answer for given user query, synthesizing the provided information.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the current weather at the headquarters of IBM? Also, can you provide the stock prices for the company on October 29, 2024?<|eot_id|><|start_header_id|>tool<|end_header_id|>

The tool - 'get_current_weather' with the function definition - 'Get the current weather' and function arguments = '{'location': 'Armonk, US'}' yielded the following response: {'description': 'clear sky', 'temperature': 11.23, 'humidity': 37}
.<|eot_id|><|start_header_id|>tool<|end_header_id|>

The tool - 'get_stock_price' with the function definition - 'Retrieves the lowest and highest stock price for a given ticker symbol and date. The ticker symbol must be a valid symbol 

## vLLM Inference

In [None]:
from vllm import LLM
from vllm.sampling_params import SamplingParams

model_name = "gautamgc17/Llama3-FineTune-Function-Calling-Model-V2"
sampling_params = SamplingParams(max_tokens=768)

llm = LLM(
    model=model_name,
    max_model_len=2048,
    tokenizer_mode="auto",
    tensor_parallel_size=1,
    enforce_eager=True,
    gpu_memory_utilization=0.95
)

In [39]:
llm_prompt = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant with access to the following function calls. Your task is to produce a sequence of function calls necessary to generate response to the user utterance. Use the following function calls as required.
{available_tools_list}<|eot_id|><|start_header_id|>user<|end_header_id|>

{query}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

input_prompt = llm_prompt.format(available_tools_list=available_tools_list, query=query)

output = llm.generate([input_prompt], sampling_params)
generated_text = output[0].outputs[0].text
print(f"Generated text: {generated_text!r}")

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it, est. speed input: 386.51 toks/s, output: 42.50 toks/s]

Generated text: '[{"name": "get_current_weather", "arguments": {"location": "Armonk, US"}}, {"name": "get_stock_price", "arguments": {"ticker": "IBM", "date": "2024-10-29"}}]'



