In [4]:
pip install ollama torch pandas matplotlib tqdm psutil pynvml

Note: you may need to restart the kernel to use updated packages.


In [1]:
# Setup: imports, directory paths, run configuration, and prompt loading

In [7]:
# Imports 
import time
import csv
import random
from datetime import datetime
from pathlib import Path
import concurrent.futures

import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import ollama 

# Filesystem layout: prompts are read from ./prompts, results go to ./results
BASE_DIR = Path.cwd()
PROMPT_DIR = BASE_DIR.joinpath("prompts")
RESULTS_DIR = BASE_DIR.joinpath("results")
RESULTS_DIR.mkdir(exist_ok=True)

# Per-prompt latency logs
HOMOGENEOUS_CSV = RESULTS_DIR.joinpath("3060_llama_latency.csv")
MIXED_CSV = RESULTS_DIR.joinpath("llama_latency_mixed.csv")

# Per-batch summary logs
BATCH_SUMMARY_HOMOGENEOUS = RESULTS_DIR.joinpath("3060_llama_batch_summary.csv")
BATCH_SUMMARY_MIXED = RESULTS_DIR.joinpath("llama_batch_summary_mixed.csv")


# RUN MODE -- keep exactly one of the two configurations below active.
#
# Mixed batch (all prompt types), "real world" workload:
# csv_file, csv_type, homogeneous_key = MIXED_CSV, "mixed", None
#
# Homogeneous batch (a single type/length), controlled experiment workload.
# homogeneous_key is one of the entries in homogeneous_keys (simple_short, moderate_long, ...).
csv_file, csv_type, homogeneous_key = HOMOGENEOUS_CSV, "homogeneous", "simple_short"

print("Prompt directory:", PROMPT_DIR)
print("Results will be saved to:", csv_file)

# Every type/length combination, in the order used for overnight runs
homogeneous_keys = [
    "simple_short",
    "simple_long",
    "moderate_short",
    "moderate_long",
    "complex_short",
    "complex_long",
]

# One text file per key, named <key>.txt inside PROMPT_DIR
prompt_files = {name: PROMPT_DIR / f"{name}.txt" for name in homogeneous_keys}

# Read each file as one prompt per non-blank line; a missing file yields an empty list
prompts = {}
for prompt_key, prompt_path in prompt_files.items():
    if not prompt_path.exists():
        print(f"File not found: {prompt_path}")
        prompts[prompt_key] = []
        continue
    with prompt_path.open("r", encoding="utf-8") as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        prompts[prompt_key] = [text for text in stripped_lines if text]

# Quick check
for prompt_key, loaded in prompts.items():
    print(prompt_key, len(loaded), "prompts loaded")

# Alias under which the benchmark functions receive the loaded prompts
prompt_bank = prompts

Prompt directory: C:\Users\chees\CECS 530 Project\prompts
Results will be saved to: C:\Users\chees\CECS 530 Project\results\3060_llama_latency.csv
simple_short 128 prompts loaded
simple_long 128 prompts loaded
moderate_short 128 prompts loaded
moderate_long 128 prompts loaded
complex_short 128 prompts loaded
complex_long 128 prompts loaded


In [8]:
# Benchmarking code with a warmup cycle

In [9]:
# Warm-up function
def warmup_model(model_name, prompt="Hello! This is a warmup run."):
    """Send one throwaway chat request to the model before benchmarking.

    Args:
        model_name: Model identifier passed to ``ollama.chat``.
        prompt: Text sent as the single user message.

    The reply is discarded; only the wall-clock duration of the request is
    printed. Returns None.
    """
    print("\nWarming up model (this may take a few seconds)...")
    warmup_messages = [{"role": "user", "content": prompt}]
    began_at = time.time()
    ollama.chat(model=model_name, messages=warmup_messages)
    print(f"Warm-up complete in {time.time() - began_at:.2f} seconds.")

# GPU stats function
import subprocess, re
def get_gpu_info():
    """Return name, memory used/total, and utilization % of the first GPU.

    Runs ``nvidia-smi`` with a CSV query (no header, no units) and parses the
    first row of its output.

    Returns:
        dict with keys ``gpu_name`` (str), ``gpu_mem_used_MB`` (int),
        ``gpu_mem_total_MB`` (int) and ``gpu_util_percent`` (int). If
        nvidia-smi cannot be run or its output cannot be parsed, the error is
        printed and a placeholder dict ("N/A" and zeros) is returned instead,
        so a benchmark run is never aborted by missing GPU stats.
    """
    try:
        output = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=name,memory.used,memory.total,utilization.gpu", "--format=csv,noheader,nounits"],
            encoding="utf-8"
        ).strip()
        # nvidia-smi prints one CSV row per GPU. Unpacking the whole output
        # into four fields fails on multi-GPU hosts, so use the first row only.
        first_row = output.splitlines()[0]
        # Split from the right so a comma inside the GPU name cannot shift
        # the three numeric columns.
        name, mem_used, mem_total, util = (field.strip() for field in first_row.rsplit(",", 3))
        return {
            "gpu_name": name,
            "gpu_mem_used_MB": int(mem_used),
            "gpu_mem_total_MB": int(mem_total),
            "gpu_util_percent": int(util)
        }
    except Exception as e:
        # Deliberately best-effort: report the problem and fall back to placeholders.
        print("Could not read GPU stats:", e)
        return {
            "gpu_name": "N/A",
            "gpu_mem_used_MB": 0,
            "gpu_mem_total_MB": 0,
            "gpu_util_percent": 0
        }

# Single prompt benchmark function
def run_single_prompt(model_name, prompt, prompt_type, prompt_length):
    """Stream one chat completion and return its latency metrics.

    Args:
        model_name: Model identifier passed to ``ollama.chat``.
        prompt: Text sent as the single user message.
        prompt_type: Label copied into the result row (e.g. "simple").
        prompt_length: Label copied into the result row (e.g. "short").

    Returns:
        dict with one result row:
          - ``timestamp``: ISO timestamp taken when the row is built.
          - ``prompt_type`` / ``prompt_length`` / ``model_name``: echoed inputs.
          - ``input_tokens``: whitespace-separated word count of ``prompt``
            (an approximation, not a tokenizer count).
          - ``output_tokens``: number of streamed chunks that carried text
            (approximates generated tokens, assuming one token per chunk).
          - ``prompt_eval_time``: seconds from sending the request to the first
            chunk with text (0 if no text arrived).
          - ``output_eval_time``: seconds from that first chunk to end of stream.
          - ``total_time``: seconds from sending the request to end of stream.
          - ``avg_token_latency``: ``output_eval_time / output_tokens``.
          - ``throughput``: ``output_tokens / output_eval_time`` (0 if undefined).
    """
    # perf_counter is monotonic and high-resolution; time.time() can be coarse
    # (notably on Windows) and may jump if the system clock is adjusted.
    request_start = time.perf_counter()
    stream = ollama.chat(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )

    first_token_time = None
    token_count = 0
    for chunk in stream:
        if "message" not in chunk:
            continue
        # The closing "done" chunk carries an empty message; counting it would
        # inflate output_tokens by one per request.
        if not chunk["message"].get("content"):
            continue
        token_count += 1
        if first_token_time is None:
            first_token_time = time.perf_counter()

    # Sample the end time once so total_time == prompt_eval_time + output_eval_time.
    request_end = time.perf_counter()
    if first_token_time is None:
        prompt_eval_time = 0
        output_eval_time = 0
    else:
        prompt_eval_time = first_token_time - request_start
        output_eval_time = request_end - first_token_time
    total_time = request_end - request_start

    avg_token_latency = output_eval_time / max(token_count, 1)
    throughput = token_count / output_eval_time if output_eval_time > 0 else 0

    return {
        "timestamp": datetime.now().isoformat(),
        "prompt_type": prompt_type,
        "prompt_length": prompt_length,
        "input_tokens": len(prompt.split()),
        "output_tokens": token_count,
        "prompt_eval_time": prompt_eval_time,
        "output_eval_time": output_eval_time,
        "total_time": total_time,
        "avg_token_latency": avg_token_latency,
        "throughput": throughput,
        "model_name": model_name
    }

# Batch runner with GPU logging
def _select_batch_prompts(batch_size, prompt_bank, csv_type, homogeneous_key):
    """Pick ``batch_size`` (prompt, type, length) tuples and the summary labels for the batch."""
    if csv_type == "homogeneous" and homogeneous_key is not None:
        # Controlled experiment: sample only from the requested type/length key.
        candidates = prompt_bank[homogeneous_key]
        prompt_type, prompt_length = homogeneous_key.split("_")
        picked = random.sample(candidates, k=min(len(candidates), batch_size))
        selected = [(prompt, prompt_type, prompt_length) for prompt in picked]
        labels = (prompt_type, prompt_length)
    else:
        # Simulated "real world" workload: up to two prompts from every key.
        selected = []
        for key, candidates in prompt_bank.items():
            prompt_type, prompt_length = key.split("_")
            for prompt in random.sample(candidates, k=min(2, len(candidates))):
                selected.append((prompt, prompt_type, prompt_length))
        # Without shuffling, truncating to a small batch_size would always keep
        # only the first keys and the batch would not be mixed at all.
        random.shuffle(selected)
        # A mixed batch has no single type/length; do not report the last loop key.
        labels = ("mixed", "mixed")

    if not selected:
        # The padding loop below can never make progress from an empty pool.
        raise ValueError(
            "No prompts available to build a batch; check that the prompt files were loaded."
        )

    # Duplicate to match batch size if needed
    while len(selected) < batch_size:
        shortfall = batch_size - len(selected)
        selected.extend(random.sample(selected, k=min(len(selected), shortfall)))
    return selected[:batch_size], labels


def run_batch(model_name, batch_size, prompt_bank, csv_type="mixed", homogeneous_key=homogeneous_key):
    """Run ``batch_size`` prompts concurrently and log a per-batch summary row.

    Args:
        model_name: Model identifier forwarded to ``run_single_prompt``.
        batch_size: Number of prompts to run; also the thread-pool size.
        prompt_bank: Mapping of "<type>_<length>" keys to lists of prompt strings.
        csv_type: "homogeneous" samples from ``homogeneous_key`` only; any other
            value samples up to two prompts from every key in ``prompt_bank``.
        homogeneous_key: Key used for homogeneous runs. The default is the value
            the module-level ``homogeneous_key`` had when this ``def`` was
            executed, so pass it explicitly after changing the configuration.

    Returns:
        list of per-prompt result dicts (in completion order), each extended
        with the GPU stats sampled for the batch.

    Raises:
        ValueError: if ``batch_size`` is less than 1 or no prompts are available.
        KeyError: if ``homogeneous_key`` is not present in ``prompt_bank``.

    Side effects:
        Appends one summary row to BATCH_SUMMARY_HOMOGENEOUS or
        BATCH_SUMMARY_MIXED (header written only when the file does not exist
        yet) and prints a one-line progress message.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    selected_prompts, (prompt_type, prompt_length) = _select_batch_prompts(
        batch_size, prompt_bank, csv_type, homogeneous_key
    )

    # Run batch concurrently: one worker thread per prompt.
    results = []
    start_batch = time.perf_counter()  # monotonic clock for measuring a duration
    with concurrent.futures.ThreadPoolExecutor(max_workers=batch_size) as executor:
        futures = [
            executor.submit(run_single_prompt, model_name, prompt, ptype, plen)
            for (prompt, ptype, plen) in selected_prompts
        ]
        for future in concurrent.futures.as_completed(futures):
            results.append(future.result())
    batch_time = time.perf_counter() - start_batch

    # GPU stats are sampled once, after the batch has finished, and copied onto
    # every row; they describe that instant rather than a peak or an average.
    gpu_stats = get_gpu_info()
    for r in results:
        r.update({
            "gpu_name": gpu_stats["gpu_name"],
            "gpu_mem_used_MB": gpu_stats["gpu_mem_used_MB"],
            "gpu_mem_total_MB": gpu_stats["gpu_mem_total_MB"],
            "gpu_util_percent": gpu_stats["gpu_util_percent"]
        })

    # Batch summary
    total_output_tokens = sum(r["output_tokens"] for r in results)

    # Throughput = total tokens generated / total batch time
    avg_throughput = total_output_tokens / batch_time if batch_time > 0 else 0

    # Average token latency = total batch time / total tokens generated
    avg_token_latency = batch_time / total_output_tokens if total_output_tokens > 0 else 0

    batch_summary = {
        "timestamp": datetime.now().isoformat(),
        "batch_size": batch_size,
        "prompt_type": prompt_type,
        "prompt_length": prompt_length,
        "total_output_tokens": total_output_tokens,
        "batch_time": batch_time,
        "avg_token_latency": avg_token_latency,
        "avg_throughput": avg_throughput,
        "gpu_name": gpu_stats["gpu_name"],
        "gpu_mem_used_MB": gpu_stats["gpu_mem_used_MB"],
        "gpu_mem_total_MB": gpu_stats["gpu_mem_total_MB"],
        "gpu_util_percent": gpu_stats["gpu_util_percent"]
    }

    # Write batch summary to correct CSV
    if csv_type == "homogeneous":
        batch_csv_file = BATCH_SUMMARY_HOMOGENEOUS
    else:
        batch_csv_file = BATCH_SUMMARY_MIXED

    pd.DataFrame([batch_summary]).to_csv(batch_csv_file, mode='a', header=not batch_csv_file.exists(), index=False)

    print(f"Batch size {batch_size} finished in {batch_time:.2f}s with {len(results)} prompts.")
    return results

# Full experiment loop
def run_full_experiment(model_name, batch_sizes, prompt_bank, csv_file, csv_type="mixed", homogeneous_key=homogeneous_key):
    """Run one batch per entry in ``batch_sizes`` and log every per-prompt row.

    Args:
        model_name: Model identifier forwarded to ``run_batch``.
        batch_sizes: Iterable of batch sizes, run in the given order.
        prompt_bank: Mapping of "<type>_<length>" keys to lists of prompt strings.
        csv_file: Path (``str`` or ``pathlib.Path``) of the per-prompt CSV log.
        csv_type: Forwarded to ``run_batch`` ("homogeneous" or mixed).
        homogeneous_key: Forwarded to ``run_batch``. The default is the value
            the module-level ``homogeneous_key`` had when this ``def`` was
            executed, so pass it explicitly after changing the configuration.

    Returns:
        pandas.DataFrame with one row per prompt across all batches, including
        a ``batch_size`` column.

    Side effects:
        After each batch, appends that batch's rows to ``csv_file`` (header
        written only when the file does not exist yet), so completed batches
        are already on disk if a later batch fails.
    """
    # Accept plain string paths too; the header check below needs Path.exists().
    csv_path = Path(csv_file)
    all_results = []

    for batch_size in batch_sizes:
        print(f"\n=== Running batch size {batch_size} ===")
        batch_results = run_batch(model_name, batch_size, prompt_bank, csv_type, homogeneous_key=homogeneous_key)
        for r in batch_results:
            r["batch_size"] = batch_size
        all_results.extend(batch_results)

        # Incremental per-prompt CSV
        pd.DataFrame(batch_results).to_csv(csv_path, mode='a', header=not csv_path.exists(), index=False)

    print("\n✅ Experiment completed.")
    return pd.DataFrame(all_results)


# BENCHMARKING PARAMETERS
model_name = "llama3"  # change as needed
batch_sizes = [1, 8, 32, 128]

# Issue one untimed request before any measured batch
warmup_model(model_name)

# Single pass: one run per batch size, using the run mode configured earlier
run_settings = dict(
    model_name=model_name,
    batch_sizes=batch_sizes,
    prompt_bank=prompt_bank,
    csv_file=csv_file,
    csv_type=csv_type,
    homogeneous_key=homogeneous_key,
)
df_results = run_full_experiment(**run_settings)

# Multi-pass alternative (disabled): repeat the homogeneous experiment for
# every prompt type. Uncomment to use; num_repeats is the number of passes
# per prompt type.
#
# num_repeats = 5   # adjust as needed
# for key in homogeneous_keys:
#     print(f"\n=== Running homogeneous tests for {key} ===\n")
#     for i in range(num_repeats):
#         print(f"Run {i+1}/{num_repeats} for {key}")
#         df_results = run_full_experiment(
#             **{**run_settings, "csv_type": "homogeneous", "homogeneous_key": key}
#         )
#         print(f"Completed {key} run {i+1}/{num_repeats}\n")

# Preview the first rows, then report how many prompt runs were logged
display(df_results.head())
print(f"Total runs logged: {len(df_results)}")



Warming up model (this may take a few seconds)...
Warm-up complete in 7.09 seconds.

=== Running batch size 1 ===
Batch size 1 finished in 0.95s with 1 prompts.

=== Running batch size 8 ===
Batch size 8 finished in 11.31s with 8 prompts.

=== Running batch size 32 ===
Batch size 32 finished in 24.36s with 32 prompts.

=== Running batch size 128 ===
Batch size 128 finished in 123.28s with 128 prompts.

Experiment completed.


Unnamed: 0,timestamp,prompt_type,prompt_length,input_tokens,output_tokens,prompt_eval_time,output_eval_time,total_time,avg_token_latency,throughput,model_name,gpu_name,gpu_mem_used_MB,gpu_mem_total_MB,gpu_util_percent,batch_size
0,2025-11-05T11:24:46.445971,simple,short,7,62,0.14651,0.801081,0.947592,0.012921,77.395379,llama3,NVIDIA GeForce RTX 3060 Ti,6553,8192,93,1
1,2025-11-05T11:24:46.859090,simple,short,7,13,0.196511,0.158616,0.355127,0.012201,81.958911,llama3,NVIDIA GeForce RTX 3060 Ti,6559,8192,94,8
2,2025-11-05T11:24:46.979990,simple,short,7,7,0.394022,0.079504,0.473526,0.011358,88.046134,llama3,NVIDIA GeForce RTX 3060 Ti,6559,8192,94,8
3,2025-11-05T11:24:47.122496,simple,short,7,10,0.501026,0.118505,0.619531,0.011851,84.384624,llama3,NVIDIA GeForce RTX 3060 Ti,6559,8192,94,8
4,2025-11-05T11:24:47.277139,simple,short,5,11,0.641034,0.131141,0.772175,0.011922,83.879217,llama3,NVIDIA GeForce RTX 3060 Ti,6559,8192,94,8


Total runs logged: 169
