In [3]:
import time
import asyncio
import pandas as pd
from typing import List, Dict, Any
from llama_index.llms.openai_like import OpenAILike
from llama_index.core.llms import ChatMessage

class ModelBenchmark:
    """Measure response time and approximate throughput of OpenAI-compatible chat endpoints.

    One ``OpenAILike`` client is created per configured model. Each prompt is
    sent to every model several times; per-prompt averages are collected into
    a ``pandas.DataFrame`` that can be rendered as a text report.
    """

    def __init__(self, model_configs: Dict[str, Dict[str, Any]]):
        """
        Initialize model benchmark class

        Args:
            model_configs: Model configuration dictionary, format as {
                "model_name_1": {"api_base": "http://localhost:8001", "api_key": "empty"},
                "model_name_2": {"api_base": "http://localhost:8002", "api_key": "empty"}
            }
            The dictionary key is also sent to the endpoint as the model name.
            "api_base" is required; "api_key" falls back to "sk-xxx".
        """
        self.models = {}
        for model_name, config in model_configs.items():
            self.models[model_name] = OpenAILike(
                model=model_name,
                api_base=config["api_base"],
                api_key=config.get("api_key", "sk-xxx"),
                is_chat_model=True,
                max_retries=3,
                timeout=100,
                # Forwarded verbatim in the request body.
                # NOTE(review): presumably switches off the model's "thinking"
                # output on the serving side - confirm against the server docs.
                additional_kwargs={"extra_body": {"chat_template_kwargs": {"enable_thinking": False}}}
            )

    def create_test_messages(self, prompt: str) -> "List[ChatMessage]":
        """Create test conversation messages: a fixed system message followed by the user prompt"""
        return [
            ChatMessage(role="system", content="You are a helpful AI assistant."),
            ChatMessage(role="user", content=prompt)
        ]

    async def test_single_model(self, model_name: str, prompt: str, num_runs: int = 5) -> Dict[str, Any]:
        """Test single model performance

        Sends the same conversation ``num_runs`` times, one request at a time,
        pausing one second between consecutive runs.

        Args:
            model_name: Key of the model in ``self.models``.
            prompt: User prompt to send.
            num_runs: Number of requests to issue.

        Returns:
            Dict with keys "model", "total_time" (seconds, successful runs
            only), "total_tokens", "response_times", "throughputs" and
            "success_count". Failed runs are printed and otherwise skipped;
            no exception is propagated from a failed request.
        """
        print(f"Testing model: {model_name}")

        results = {
            "model": model_name,
            "total_time": 0,
            "total_tokens": 0,
            "response_times": [],
            "throughputs": [],
            "success_count": 0
        }

        messages = self.create_test_messages(prompt)

        for i in range(num_runs):
            try:
                # perf_counter is monotonic, so the interval cannot be skewed
                # by wall-clock adjustments while the request is in flight.
                start_time = time.perf_counter()
                # Use the chat API so the system message is actually sent.
                # acomplete(prompt, messages) would bind the message list to
                # its `formatted` flag and drop the conversation.
                response = await self.models[model_name].achat(messages)
                elapsed_time = time.perf_counter() - start_time

                # Calculate token count (approximate value: whitespace-separated words)
                tokens = len((response.message.content or "").split())

                results["total_time"] += elapsed_time
                results["total_tokens"] += tokens
                results["response_times"].append(elapsed_time)
                results["throughputs"].append(tokens / elapsed_time if elapsed_time > 0 else 0.0)
                results["success_count"] += 1

                print(f"  Run {i+1}/{num_runs}: {elapsed_time:.2f}s, {tokens} tokens")

            except Exception as e:
                # Benchmark boundary: record the failure and keep going so one
                # bad request does not abort the whole comparison.
                print(f"  Run {i+1}/{num_runs} failed: {str(e)}")

            # Add short delay between runs (nothing to wait for after the last one)
            if i < num_runs - 1:
                await asyncio.sleep(1)

        return results

    async def compare_models(self, prompts: List[str], num_runs: int = 5) -> pd.DataFrame:
        """Compare performance of multiple models

        Prompts are processed one after another. For each prompt, all
        configured models are exercised concurrently via ``asyncio.gather``.
        NOTE(review): if the endpoints share hardware, the concurrent requests
        may influence each other's timings - confirm this is intended.

        Args:
            prompts: Prompts to benchmark.
            num_runs: Requests per model per prompt.

        Returns:
            DataFrame with one row per prompt and, per model, the columns
            "<model>_avg_time", "<model>_avg_throughput" and
            "<model>_success_rate". The first two are None when every run of
            that model failed for the prompt.
        """
        all_results = []

        for prompt in prompts:
            print(f"\nTest prompt: '{prompt[:50]}...'")

            prompt_results = {"prompt": prompt}

            # Run tests for each model
            tasks = [
                self.test_single_model(model_name, prompt, num_runs)
                for model_name in self.models
            ]
            model_results = await asyncio.gather(*tasks)

            # Process results
            for result in model_results:
                model_name = result["model"]
                if result["success_count"] > 0:
                    avg_time = result["total_time"] / result["success_count"]
                    avg_throughput = result["total_tokens"] / result["total_time"] if result["total_time"] > 0 else 0

                    prompt_results[f"{model_name}_avg_time"] = avg_time
                    prompt_results[f"{model_name}_avg_throughput"] = avg_throughput
                    prompt_results[f"{model_name}_success_rate"] = result["success_count"] / num_runs
                else:
                    prompt_results[f"{model_name}_avg_time"] = None
                    prompt_results[f"{model_name}_avg_throughput"] = None
                    prompt_results[f"{model_name}_success_rate"] = 0

            all_results.append(prompt_results)

        return pd.DataFrame(all_results)

    def generate_report(self, results_df: pd.DataFrame) -> str:
        """Generate performance comparison report

        Args:
            results_df: DataFrame in the layout produced by ``compare_models``.

        Returns:
            Multi-line text with one section per prompt followed by the
            per-model averages over all prompts. Models without any
            successful run are reported as such instead of being formatted
            as numbers.
        """
        report = []
        report.append("=" * 60)
        report.append("Large Language Model Inference Speed Comparison Report")
        report.append("=" * 60)

        # Number prompts by position so a non-default DataFrame index still works.
        for position, (_, row) in enumerate(results_df.iterrows(), start=1):
            report.append(f"\nPrompt {position}: '{row['prompt'][:50]}...'")
            report.append("-" * 40)

            for model_name in self.models:
                time_col = f"{model_name}_avg_time"
                if time_col not in row:
                    continue

                avg_time = row[time_col]
                avg_throughput = row[f"{model_name}_avg_throughput"]
                success_rate = float(row[f"{model_name}_success_rate"])

                # None/NaN marks "every run failed"; formatting None with
                # ':.2f' would raise TypeError.
                if pd.isna(avg_time) or pd.isna(avg_throughput):
                    report.append(
                        f"{model_name}: no successful runs, "
                        f"Success rate: {success_rate:.0%}"
                    )
                    continue

                report.append(
                    f"{model_name}: "
                    f"{avg_time:.2f}s, "
                    f"{avg_throughput:.1f} tokens/s, "
                    f"Success rate: {success_rate:.0%}"
                )

        # Calculate overall averages
        report.append("\n" + "=" * 60)
        report.append("Overall Performance Comparison")
        report.append("=" * 60)

        for model_name in self.models:
            time_col = f"{model_name}_avg_time"
            throughput_col = f"{model_name}_avg_throughput"

            if time_col in results_df.columns:
                # Coerce so an all-None (object dtype) column averages to NaN
                # instead of failing; prompts without data are ignored.
                avg_time = pd.to_numeric(results_df[time_col], errors="coerce").mean()
                avg_throughput = pd.to_numeric(results_df[throughput_col], errors="coerce").mean()

                if pd.isna(avg_time) or pd.isna(avg_throughput):
                    report.append(f"{model_name}: no successful runs")
                else:
                    report.append(f"{model_name}: Average response time {avg_time:.2f}s, Average throughput {avg_throughput:.1f} tokens/s")

        return "\n".join(report)

In [None]:
# Configure your models
model_configs = {
    "qwen3": {
        "api_base": "http://192.168.100.30:16001/v1",
        "api_key": "empty"
    },
    "qwen3-vllm": {
        "api_base": "http://192.168.100.30:16002/v1",
        "api_key": "empty"
    }
}

# Test prompts
test_prompts = [
    "Please explain the basic concepts of machine learning",
    "Write a short story about artificial intelligence",
    "How to improve programming skills? Please provide specific suggestions",
    "Write a quicksort algorithm in Python",
    "Discuss the impact of climate change on the global economy"
]

# Create benchmark instance
benchmark = ModelBenchmark(model_configs)

# Run tests
print("Starting model performance comparison test...")
results = await benchmark.compare_models(test_prompts, num_runs=3)

# Generate report
report = benchmark.generate_report(results)
print(report)

# Save results to CSV
results.to_csv("model_benchmark_results.csv", index=False)
print("\nDetailed results saved to model_benchmark_results.csv")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Read the CSV file
df = pd.read_csv('model_benchmark_results.csv')

# Set up the plotting style
plt.style.use('seaborn-v0_8')
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Qwen3 Model Benchmark Comparison: sglang vs vLLM', fontsize=16)

# Prepare data for plotting
# NOTE(review): the plots label the "qwen3" columns as sglang and the
# "qwen3-vllm" columns as vLLM - confirm this matches the deployed endpoints.
models = ['sglang', 'vLLM']
x_pos = np.arange(len(df))
width = 0.35
prompt_labels = [f'Prompt {i+1}' for i in range(len(df))]


def _grouped_bars(ax, metric, ylabel, title, colors):
    """Draw the sglang-vs-vLLM grouped bars for one metric column suffix; return both bar containers."""
    left = ax.bar(x_pos - width/2, df[f'qwen3_{metric}'], width, label='sglang', color=colors[0])
    right = ax.bar(x_pos + width/2, df[f'qwen3-vllm_{metric}'], width, label='vLLM', color=colors[1])
    ax.set_xlabel('Prompts')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(prompt_labels, rotation=45, ha='right')
    ax.legend()
    return left, right


def _label_bars(ax, bars):
    """Write each bar's height just above it, skipping bars without a value."""
    for bar in bars:
        height = bar.get_height()
        if np.isnan(height):
            # No successful run for this prompt: nothing meaningful to print.
            continue
        ax.annotate(f'{height:.1f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom', fontsize=8)


def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the ratio is undefined (NaN or non-positive denominator)."""
    if not (np.isfinite(numerator) and np.isfinite(denominator)) or denominator <= 0:
        return 0.0
    return numerator / denominator


# 1. Plot Average Time Comparison
ax1 = axes[0, 0]
bars1, bars2 = _grouped_bars(ax1, 'avg_time', 'Average Time (seconds)',
                             'Average Response Time Comparison', ('skyblue', 'lightcoral'))

# Add value labels on bars
_label_bars(ax1, bars1)
_label_bars(ax1, bars2)

# 2. Plot Throughput Comparison
ax2 = axes[0, 1]
bars1, bars2 = _grouped_bars(ax2, 'avg_throughput', 'Throughput (tokens/second)',
                             'Average Throughput Comparison', ('lightgreen', 'orange'))

# 3. Plot Success Rate
ax3 = axes[1, 0]
success_sglang = df['qwen3_success_rate']
success_vllm = df['qwen3-vllm_success_rate']
bars1, bars2 = _grouped_bars(ax3, 'success_rate', 'Success Rate',
                             'Success Rate Comparison', ('gold', 'mediumpurple'))
ax3.set_ylim([0, 1.1])

# 4. Performance Summary Radar Chart
# Calculate average metrics for each model
avg_time_sglang = df['qwen3_avg_time'].mean()
avg_time_vllm = df['qwen3-vllm_avg_time'].mean()
avg_throughput_sglang = df['qwen3_avg_throughput'].mean()
avg_throughput_vllm = df['qwen3-vllm_avg_throughput'].mean()

# Normalize data for radar chart (higher is better, best model scores 1.0).
# For time, lower is better, so score each model as fastest_time / its_time.
# (Scoring as (max - t) / max would pin the slower model to 0 no matter how
# close the two are, and would not be comparable to the throughput ratio.)
min_time = min((t for t in (avg_time_sglang, avg_time_vllm) if np.isfinite(t)), default=0.0)
norm_time_sglang = _ratio(min_time, avg_time_sglang)
norm_time_vllm = _ratio(min_time, avg_time_vllm)

# For throughput, higher is better
max_throughput = max((t for t in (avg_throughput_sglang, avg_throughput_vllm) if np.isfinite(t)), default=0.0)
norm_throughput_sglang = _ratio(avg_throughput_sglang, max_throughput)
norm_throughput_vllm = _ratio(avg_throughput_vllm, max_throughput)

# Success rate is already normalized
avg_success_sglang = df['qwen3_success_rate'].mean()
avg_success_vllm = df['qwen3-vllm_success_rate'].mean()

# Radar chart data
categories = ['Response Time', 'Throughput', 'Success Rate']
sglang_values = [norm_time_sglang, norm_throughput_sglang, avg_success_sglang]
vllm_values = [norm_time_vllm, norm_throughput_vllm, avg_success_vllm]

angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
sglang_values += sglang_values[:1]  # Close the loop
vllm_values += vllm_values[:1]
angles += angles[:1]

# Replace the Cartesian placeholder created by plt.subplots with a polar axes.
# Without the explicit remove(), the empty Cartesian frame stays visible
# underneath the radar chart on current Matplotlib versions.
axes[1, 1].remove()
ax4 = fig.add_subplot(2, 2, 4, projection='polar')
ax4.plot(angles, sglang_values, 'o-', linewidth=2, label='sglang', color='blue')
ax4.fill(angles, sglang_values, alpha=0.25, color='blue')
ax4.plot(angles, vllm_values, 'o-', linewidth=2, label='vLLM', color='red')
ax4.fill(angles, vllm_values, alpha=0.25, color='red')
ax4.set_ylim(0, 1)  # every plotted score is a ratio in [0, 1]
ax4.set_xticks(angles[:-1])
ax4.set_xticklabels(categories)
ax4.set_title('Performance Summary')
ax4.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))

plt.tight_layout()
plt.show()

# Print summary statistics
print("Summary Statistics:")
print("==================")
print(f"Average response time (sglang): {avg_time_sglang:.2f} seconds")
print(f"Average response time (vLLM): {avg_time_vllm:.2f} seconds")
print(f"Average throughput (sglang): {avg_throughput_sglang:.2f} tokens/second")
print(f"Average throughput (vLLM): {avg_throughput_vllm:.2f} tokens/second")
print(f"Average success rate (sglang): {avg_success_sglang:.2f}")
print(f"Average success rate (vLLM): {avg_success_vllm:.2f}")