In [None]:
%pip install python-dotenv seaborn matplotlib pandas numpy locust==2.18.1 orjson==3.9.10
from dotenv import load_dotenv
load_dotenv()
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import json
import subprocess
import numpy as np
import time

# LLM Bench Guide

This guide explains how to use the benchmarking tools to evaluate LLM performance.

### Metrics Collected
The `load_test.py` script measures:

1. Average Time to First Token
2. Average Token Latency
3. Average Token Count
4. Average Total Response Time
5. Request Count
6. Queries Per Second (QPS)
7. Latency Percentiles (p50, p90, p99, p99.9)
   - Time to First Token
   - Total Response Time

### Benchmark Scenarios
We will create the following benchmark suites in this notebook. Many other types of tests can also be run; please refer to README.md for more details:

1. Single Model Performance Analysis
2. Comparing Performance of different Models and Providers
3. Token Length Impact Study

In [71]:
'''Helper functions, you can ignore this'''

#Function to utilize subprocess to run the locust script
def execute_subprocess(cmd):
    """
    Run a benchmark command and stream its combined stdout/stderr as it runs.

    The command is echoed before it is started, with the value of any
    ``--api-key`` argument masked so credentials never end up in the
    notebook output.

    Args:
        cmd (list): Command and its arguments, passed to subprocess.Popen as-is

    Returns:
        bool: True if the process exited with return code 0, False otherwise
    """
    # Build a printable copy of the command with the API key redacted.
    # Both "--api-key <value>" and "--api-key=<value>" forms are handled.
    printable = []
    mask_next = False
    for arg in cmd:
        text = str(arg)
        if mask_next:
            printable.append("****")
            mask_next = False
        elif text == "--api-key":
            printable.append(text)
            mask_next = True
        elif text.startswith("--api-key="):
            printable.append("--api-key=****")
        else:
            printable.append(text)
    print(f"\nExecuting benchmark: {' '.join(printable)}\n")

    # The context manager closes the pipe and reaps the child on exit,
    # even if printing is interrupted.
    with subprocess.Popen(
        cmd,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        bufsize=1,  # line-buffered so output shows up as it is produced
    ) as process:
        # Display output in real-time; iteration stops at EOF, so there is no
        # busy-waiting if the pipe closes before the process has exited.
        for line in process.stdout:
            print(line.strip())
        return_code = process.wait()

    if return_code != 0:
        print(f"Benchmark failed with return code: {return_code}")
        return False
    return True


# Notebook-only IPython magics: select the inline matplotlib backend
%matplotlib inline
# Optional: for higher resolution plots
%config InlineBackend.figure_format = 'retina'


def visualize_comparative_results(stat_result_paths, results_dir):
    """
    Create comparative visualizations for multiple model benchmark results

    Draws a 2x2 figure (response times, throughput, token latency and
    response-time percentiles), saves it to
    ``{results_dir}/comparative_performance_metrics.png``, shows it, then
    prints a per-model summary and writes it to
    ``{results_dir}/comparative_summary.csv``.

    Results are keyed by the ``model`` value of each config; when several
    entries share the same model value only the first matching row is used.

    Args:
        stat_result_paths (list): List of dicts containing paths to stats files and configs
            [{"path": "path/to/stats.csv", "config": {"provider": "...", "model": "..."}}, ...]
        results_dir (str): Directory to save the comparative visualizations

    Returns:
        list: One dict per model with the keys "Model", "Provider",
            "Average QPS", "Average Response Time", "99th Percentile Latency"
            and "Average Tokens per Request"

    Raises:
        ValueError: If no results are given or none of them contains a POST row
    """
    if not stat_result_paths:
        raise ValueError("No benchmark results to visualize: stat_result_paths is empty")

    def metric_average(metric_rows, metric_name):
        # 'Average Response Time' of the first row named metric_name, or NaN
        # when the run did not report that metric (instead of an IndexError).
        matches = metric_rows[metric_rows['Name'] == metric_name]['Average Response Time']
        return matches.iloc[0] if len(matches) else np.nan

    # Read and combine all CSV files with model information
    dfs = []
    for result in stat_result_paths:
        df = pd.read_csv(result['path'])
        df['model'] = result['config']['model']
        df['provider'] = result['config']['provider']
        dfs.append(df)
    combined_df = pd.concat(dfs, ignore_index=True)

    # Request rows and custom metric rows are used by several charts below
    post_data = combined_df[combined_df['Type'] == 'POST']
    metric_data = combined_df[combined_df['Type'] == 'METRIC']

    # Get unique models and calculate dynamic bar width
    models = post_data['model'].unique()
    num_models = len(models)
    if num_models == 0:
        # Without this guard the bar width computation divides by zero
        raise ValueError("No POST rows found in the stats files: nothing to plot")
    labels = [model.split('/')[-1] for model in models]  # short names for legends/ticks
    bar_width = min(0.35, 0.8 / num_models)  # Dynamically reduce bar width as models increase

    # Offsets that center each group of bars on its tick
    half_span = (bar_width * (num_models - 1)) / 2
    positions = np.linspace(-half_span, half_span, num_models)

    # Set style for better visualizations
    plt.style.use('ggplot')
    fig = plt.figure(figsize=(20, 15))

    # 1. Response Time Distribution Comparison
    plt.subplot(2, 2, 1)
    metrics_to_plot = ['Average Response Time', 'Median Response Time']
    x = np.arange(len(metrics_to_plot))
    for position, model, label in zip(positions, models, labels):
        model_data = post_data[post_data['model'] == model][metrics_to_plot]
        plt.bar(x + position, model_data.iloc[0], bar_width, label=label)

    plt.title('Response Time Comparison')
    plt.xlabel('Metrics')
    plt.ylabel('Time (ms)')
    plt.xticks(x, metrics_to_plot, rotation=45)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # 2. QPS Comparison
    plt.subplot(2, 2, 2)
    x = np.arange(num_models)
    qps_values = [
        post_data[post_data['model'] == model]['Requests/s'].iloc[0]
        for model in models
    ]
    plt.bar(x, qps_values, width=0.6)  # Single bars can be wider
    plt.title('Throughput Comparison')
    plt.xlabel('Model')
    plt.ylabel('Requests per Second')
    plt.xticks(x, labels, rotation=45)

    # 3. Token Latency Comparison
    plt.subplot(2, 2, 3)
    token_metrics = ['latency_per_token', 'overall_latency_per_token']
    x = np.arange(len(token_metrics))
    for position, model, label in zip(positions, models, labels):
        model_metrics = metric_data[metric_data['model'] == model]
        values = [metric_average(model_metrics, metric) for metric in token_metrics]
        plt.bar(x + position, values, bar_width, label=label)

    plt.title('Token Latency Comparison')
    plt.xlabel('Metrics')
    plt.ylabel('Time (ms)')
    plt.xticks(x, ['Per Token', 'Overall Per Token'], rotation=45)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # 4. Percentile Distribution Comparison
    plt.subplot(2, 2, 4)
    percentiles = ['50%', '75%', '90%', '95%', '99%', '99.9%']
    for model, label in zip(models, labels):
        model_data = post_data[post_data['model'] == model]
        plt.plot(percentiles, model_data[percentiles].iloc[0], marker='o', label=label)

    plt.title('Response Time Percentiles Comparison')
    plt.xlabel('Percentile')
    plt.ylabel('Response Time (ms)')
    plt.xticks(rotation=45)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # Adjust layout to prevent overlapping
    plt.tight_layout()
    # Save the figure
    plt.savefig(f'{results_dir}/comparative_performance_metrics.png',
                bbox_inches='tight',  # Ensure the legend is included in the saved figure
                dpi=300)  # Higher resolution
    # Display in notebook
    plt.show()
    # Close the figure
    plt.close(fig)

    # Generate summary statistics (one row per model)
    summary_stats = []
    for model, label in zip(models, labels):
        model_rows = combined_df[combined_df['model'] == model]
        post_row = model_rows[model_rows['Type'] == 'POST'].iloc[0]
        model_metrics = model_rows[model_rows['Type'] == 'METRIC']

        summary_stats.append({
            "Model": label,
            "Provider": model_rows['provider'].iloc[0],
            "Average QPS": post_row['Requests/s'],
            "Average Response Time": post_row['Average Response Time'],
            "99th Percentile Latency": post_row['99%'],
            "Average Tokens per Request": metric_average(model_metrics, 'num_tokens')
        })

    # Print comparative summary
    print("\nComparative Summary:")
    print("-" * 80)
    summary_df = pd.DataFrame(summary_stats)
    print(summary_df.to_string(index=False))

    # Save summary to CSV
    summary_df.to_csv(f'{results_dir}/comparative_summary.csv', index=False)

    return summary_stats

## Single Model and Provider Performance Analysis

Evaluate the performance metrics of a single model from one provider. This is the most basic benchmark that can be run with the load_test.py script.

In [None]:
'''
Make sure to create a .env file in the root directory and add your API keys.
For this example, we will use the Fireworks API key.

Add the following to your .env file:

FIREWORKS_API_KEY=<your_fireworks_api_key>.

Alternatively you can edit the following script flags for custom configurations.
'''


provider_name = "fireworks"
model_name = "accounts/fireworks/models/llama-v3p2-3b-instruct"
h = "https://api.fireworks.ai/inference" #host url
api_key = os.getenv("FIREWORKS_API_KEY")
if api_key is None:
    # Fail early with a clear message; a None entry in the command would
    # otherwise surface as a TypeError when the command is built/run
    raise ValueError("FIREWORKS_API_KEY is not set. Add it to your .env file before running this cell.")

t = "5s" #test duration, kept short for a quick demo; increase (e.g. "1m") for more stable numbers

'''
Choose ONE of the following two modes by commenting/uncommenting:
'''
# MODE 1: Fixed Queries Per Second (QPS)
# Use this mode to maintain a steady rate of requests
qps = 5  # Target requests per second
u = 100   # Number of users (keep high enough to achieve target QPS)
s = 100   # Spawn rate (keep high enough to achieve target QPS)

# MODE 2: Fixed Concurrency
# Use this mode to maintain a steady number of concurrent requests
# Comment out Mode 1 above and uncomment below to use this mode
'''
qps = None # No QPS target in fixed concurrency mode; --qps is then left out of the command
u = 5      # Number of concurrent workers
s = 5      # Rate of spawning new workers (workers/second). Look through README.md for more details on spawn rate
'''



# Create results directory of name {provider}_{model}_analysis_{TIMESTAMP}
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")

# Fireworks model ids carry an "accounts/fireworks/models/" prefix that is dropped from the directory name
edited_model_name = model_name.replace("/", "_") if provider_name != "fireworks" else model_name.replace("accounts/fireworks/models/", "").replace("/", "_")



results_dir = f"results/{provider_name}_{edited_model_name}_analysis_{timestamp}"
os.makedirs(results_dir, exist_ok=True)

# Construct the command
cmd = [
    "locust",
    "--headless",       # Run without web UI
    "--only-summary",   # Only show summary stats
    "-H", h,           # Host URL
    "--provider", provider_name,
    "--model", model_name,
    "--api-key", api_key,
    "-t", t,           # Test duration
    "--html", f"{results_dir}/report.html",  # Generate HTML report
    "--csv", f"{results_dir}/stats",        # Generate CSV stats
]

# Load shape: users and spawn rate apply to both modes
cmd.extend([
    "-u", str(u),      # Number of users
    "-r", str(s),      # Spawn rate
])
# --qps is only passed in Mode 1 (Fixed QPS); Mode 2 sets qps = None
if qps is not None:
    cmd.extend(["--qps", str(qps)])  # Target QPS

# Add load_test.py as the locust file
locust_file = os.path.join(os.path.dirname(os.getcwd()), "llm_bench", "load_test.py")
cmd.extend(["-f", locust_file])

#call our helper function to execute the command
success = execute_subprocess(cmd)

#Visualize the results
if success:
    time.sleep(1)
    stat_result_paths = [{"path": f'{results_dir}/stats_stats.csv', "config": {"provider": provider_name, "model": model_name}}]
    visualize_comparative_results(stat_result_paths, results_dir)

## Comparing different Models and Providers

Evaluate performance metrics of different models and providers.

In [None]:
'''
Edit the provider_configs list to add more providers and models.

Add any needed api keys to the .env file. This example uses the Fireworks API key again.
'''

provider_configs = [
    {"provider": "fireworks", "model": "accounts/fireworks/models/llama-v3p2-3b-instruct", "host": "https://api.fireworks.ai/inference", "api_key": os.getenv("FIREWORKS_API_KEY")},
    {"provider": "fireworks", "model": "accounts/fireworks/models/mistral-small-24b-instruct-2501", "host": "https://api.fireworks.ai/inference", "api_key": os.getenv("FIREWORKS_API_KEY")},
    #... add more providers and models here
]

# Fail early with a clear message; a None api_key would otherwise surface
# as a TypeError when the command is built/run
for config in provider_configs:
    if config["api_key"] is None:
        raise ValueError(f"API key for provider '{config['provider']}' is not set. Add it to your .env file before running this cell.")

# some starter configs and flags
t = "5s" #test duration, kept short for a quick demo; increase (e.g. "1m") for more stable numbers
qps = 5  # Target requests per second (set to None to omit --qps for fixed concurrency mode)
u = 100   # Number of users (keep high enough to achieve target QPS)
s = 100   # Spawn rate (keep high enough to achieve target QPS)

# Create results directory of name different_models_and_providers_analysis_{TIMESTAMP}
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
results_dir = f"results/different_models_and_providers_analysis_{timestamp}"
os.makedirs(results_dir, exist_ok=True)

# load_test.py is the locust file for every run
locust_file = os.path.join(os.path.dirname(os.getcwd()), "llm_bench", "load_test.py")

# Stats files that were actually produced, collected for the visualization
stat_result_paths = []

for index, config in enumerate(provider_configs):
    # Fireworks model ids carry an "accounts/fireworks/models/" prefix that is dropped from the directory name
    edited_model_name = config["model"].replace("/", "_") if config["provider"] != "fireworks" else config["model"].replace("accounts/fireworks/models/", "").replace("/", "_")

    # The index suffix keeps directories distinct even for repeated provider/model pairs.
    # Single quotes inside the f-string keep this valid on Python versions before 3.12.
    provider_model_path = f"{results_dir}/{config['provider']}_{edited_model_name}_{index}"

    os.makedirs(provider_model_path, exist_ok=True)

    # Construct the command
    cmd = [
        "locust",
        "--headless",       # Run without web UI
        "--only-summary",   # Only show summary stats
        "-H", config["host"],           # Host URL
        "--provider", config["provider"],
        "--model", config["model"],
        "--api-key", config["api_key"],
        "-t", t,           # Test duration
        "--html", f"{provider_model_path}/report.html",  # Generate HTML report
        "--csv", f"{provider_model_path}/stats",        # Generate CSV stats
    ]

    # Load shape: users and spawn rate always apply; --qps only in fixed QPS mode
    cmd.extend([
        "-u", str(u),      # Number of users
        "-r", str(s),      # Spawn rate
    ])
    if qps is not None:
        cmd.extend(["--qps", str(qps)])  # Target QPS

    # Add load_test.py as the locust file
    cmd.extend(["-f", locust_file])

    #call our helper function to execute the command
    execute_subprocess(cmd)

    # Only compare runs that produced a stats file, so one broken run
    # does not prevent visualizing the others
    stats_path = f"{provider_model_path}/stats_stats.csv"
    if os.path.isfile(stats_path):
        stat_result_paths.append({"path": stats_path, "config": config})
    else:
        print(f"No stats file for {config['provider']} / {config['model']}; leaving it out of the comparison")

#Visualize the results
if stat_result_paths:
    time.sleep(1)
    visualize_comparative_results(stat_result_paths, results_dir)
else:
    print("No benchmark produced results; nothing to visualize.")



## Token Length Analysis

Evaluates model performance across different output lengths by testing the same model 
with varying maximum output token limits (`--max-tokens`), from short to long responses.

In [None]:
max_token_lengths = [30,64,128,256] # --max-tokens flag
max_token_lengths_distribution = "uniform" # --max-tokens-distribution flag check readme.md for more details and other options


provider_name = "fireworks"
model_name = "accounts/fireworks/models/llama-v3p2-3b-instruct"
h = "https://api.fireworks.ai/inference" #host url
api_key = os.getenv("FIREWORKS_API_KEY")
if api_key is None:
    # Fail early with a clear message; a None entry in the command would
    # otherwise surface as a TypeError when the command is built/run
    raise ValueError("FIREWORKS_API_KEY is not set. Add it to your .env file before running this cell.")

t = "5s" #test duration, kept short for a quick demo; increase (e.g. "1m") for more stable numbers
qps = 5  # Target requests per second (set to None to omit --qps for fixed concurrency mode)
u = 100   # Number of users (keep high enough to achieve target QPS)
s = 100   # Spawn rate (keep high enough to achieve target QPS)

# Create results directory of name token_length_analysis_{TIMESTAMP}
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M")
results_dir = f"results/token_length_analysis_{timestamp}"
os.makedirs(results_dir, exist_ok=True)

# Same for every run, so computed once outside the loop.
# Fireworks model ids carry an "accounts/fireworks/models/" prefix that is dropped from the directory name
edited_model_name = model_name.replace("/", "_") if provider_name != "fireworks" else model_name.replace("accounts/fireworks/models/", "").replace("/", "_")
locust_file = os.path.join(os.path.dirname(os.getcwd()), "llm_bench", "load_test.py")

# Stats files that were actually produced, collected for the visualization
stat_result_paths = []

for token_length in max_token_lengths:
    token_length_path = f"{results_dir}/{provider_name}_{edited_model_name}_{token_length}"
    os.makedirs(token_length_path, exist_ok=True)

    # Construct the command
    cmd = [
        "locust",
        "--headless",       # Run without web UI
        "--only-summary",   # Only show summary stats
        "-H", h,           # Host URL
        "--provider", provider_name,
        "--model", model_name,
        "--api-key", api_key,
        "-t", t,           # Test duration
        "--max-tokens", str(token_length),
        "--max-tokens-distribution", max_token_lengths_distribution,
        "--html", f"{token_length_path}/report.html",  # Generate HTML report
        "--csv", f"{token_length_path}/stats",        # Generate CSV stats
    ]

    # Load shape: users and spawn rate always apply; --qps only in fixed QPS mode
    cmd.extend([
        "-u", str(u),      # Number of users
        "-r", str(s),      # Spawn rate
    ])
    if qps is not None:
        cmd.extend(["--qps", str(qps)])  # Target QPS

    # Add load_test.py as the locust file
    cmd.extend(["-f", locust_file])

    #call our helper function to execute the command
    execute_subprocess(cmd)

    # Only compare runs that produced a stats file, so one broken run
    # does not prevent visualizing the others
    stats_path = f"{token_length_path}/stats_stats.csv"
    if os.path.isfile(stats_path):
        # The token length is appended to the model name so each run gets its
        # own label; provider/model come from the variables above so the
        # labels follow any edits made to them
        stat_result_paths.append({
            "path": stats_path,
            "config": {"provider": provider_name, "model": f"{model_name}_{token_length}"},
        })
    else:
        print(f"No stats file for max tokens {token_length}; leaving it out of the comparison")

#Visualize the results
if stat_result_paths:
    time.sleep(1)
    visualize_comparative_results(stat_result_paths, results_dir)
else:
    print("No benchmark produced results; nothing to visualize.")