In [None]:
import time
import tiktoken
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import openai
import os  # stdlib; used only to read the API key from the environment

# Read the key from the OPENAI_API_KEY environment variable instead of pasting it
# into the notebook; falls back to the previous empty-string default when unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
!pip install tiktoken
!pip install matplotlib
!pip install tenacity
!pip install pandas

In [None]:
from datetime import datetime
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

In [None]:
# Shared tiktoken tokenizer; count_tokens() uses it to measure generated text.
# NOTE(review): cl100k_base is assumed to match the benchmarked chat models — confirm if `models` changes.
encoding = tiktoken.get_encoding("cl100k_base")

### Define parameters

In [1]:
# Models whose latency is measured.
models = [
    'gpt-3.5-turbo',
    'gpt-4',
]

# Prompts sent to every model (chosen to provoke long answers).
prompts = [
    'describe the roman empire in as much detail as possible',
    'who do you think will win in a cage fight, mark zuckerberg or elon musk? provide a detailed analysis',
    'recite the constitution 10 times',
    'repeat the word bubble 500 times',
    'create a complete application for transcribing audio from a given youtube link, parsing speakers as well as times stamps of each word. create a front end that allows the user to search over the content',
]

# max_tokens caps to sweep over in the max_tokens benchmark.
max_tokens_list = [1, 100, 200, 300, 400, 500]

In [None]:
def count_tokens(text):
    """Return the number of tokens `text` encodes to under the module-level `encoding`."""
    token_ids = encoding.encode(text)
    return len(token_ids)

### This section benchmarks latency by setting the `max_tokens` parameter to different levels. Because a response can stop before reaching `max_tokens`, `get_latency` counts the tokens that were actually generated, and `benchmark_max_tokens` groups each latency under that actual token count.

In [None]:


def run_model_max_tokens(model_id, max_tokens, prompt, temperature=0):
    """Request one chat completion capped at `max_tokens` and return its text.

    The conversation is a fixed "Be verbose" system message followed by
    `prompt` as the user message.

    Args:
        model_id: Model name passed to the API as `model`.
        max_tokens: Upper bound on generated tokens, passed through unchanged.
        prompt: Text of the user message.
        temperature: Sampling temperature, passed through unchanged (default 0).

    Returns:
        The `message.content` of the first choice in the response.
    """
    system_message = {"role": "system", "content": "Be verbose"}
    user_message = {"role": "user", "content": prompt}

    response = openai.ChatCompletion.create(
        model=model_id,
        messages=[system_message, user_message],
        max_tokens=max_tokens,
        temperature=temperature,
    )

    first_choice = response.choices[0]
    return first_choice.message.content

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def get_latency(model_id, max_tokens, prompt):
    """Time a single capped completion and count the tokens it produced.

    The call is wrapped in a tenacity retry (random exponential wait with
    min=1 / max=60, stopping after 6 attempts). The timer starts inside the
    function, so every attempt is timed on its own.

    Returns:
        A `(elapsed_seconds, actual_tokens)` tuple, where `actual_tokens` is
        the token count of the returned text as measured by `count_tokens`.
    """
    started = time.perf_counter()
    completion_text = run_model_max_tokens(model_id, max_tokens, prompt)
    finished = time.perf_counter()  # stop the clock before tokenizing the output

    return finished - started, count_tokens(completion_text)

def benchmark_max_tokens(models, max_tokens_list, prompts):
    """Measure completion latency for every model / max_tokens / prompt combination.

    Each latency is filed under the number of tokens that were actually
    generated (as reported by `get_latency`), not under the requested
    `max_tokens` value.

    Args:
        models: Iterable of model ids to benchmark.
        max_tokens_list: Iterable of `max_tokens` caps to try.
        prompts: Iterable of prompt strings.

    Returns:
        A dict of the form `{model_id: {"<n>_tokens": [latency_seconds, ...]}}`.
    """
    results = {}

    for model_id in models:
        # setdefault (instead of assigning a fresh dict) keeps earlier samples
        # when the same model id is listed more than once, matching
        # benchmark_model_streaming.
        model_results = results.setdefault(model_id, {})

        for max_tokens in max_tokens_list:
            for prompt in prompts:
                latency, actual_tokens = get_latency(model_id, max_tokens, prompt)
                model_results.setdefault(f"{actual_tokens}_tokens", []).append(latency)

    return results

### This section benchmarks latency by streaming. It records both the content and the time elapsed since the request started for each chunk of text that's streamed. After streaming stops, the cumulative number of tokens received up to each chunk is calculated.

In [None]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def run_model_streaming(model, input_prompt, max_chunks=50):
    """Stream a chat completion and timestamp every chunk that carries text.

    The call is wrapped in a tenacity retry (random exponential wait with
    min=1 / max=60, stopping after 6 attempts). The timer starts inside the
    function, so every attempt is timed on its own.

    Args:
        model: Model name passed to the API as `model`.
        input_prompt: Text of the single user message.
        max_chunks: 0-based index of the last stream chunk to read. Reading
            stops once that chunk has been processed (default 50, i.e. at
            most 51 chunks are consumed).

    Returns:
        A list of `(content, seconds_since_start)` tuples, one per chunk
        that contained content, in arrival order.
    """
    performance_start_time = time.perf_counter()
    received_chunks_and_times = []

    completion_stream = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "user", "content": input_prompt},
        ],
        stream=True,
    )

    for chunk_index, response_chunk in enumerate(completion_stream):
        time_since_start = time.perf_counter() - performance_start_time
        choice = response_chunk.choices[0]

        # Chunks without text (e.g. a role-only opening chunk or the closing
        # chunk) are skipped rather than ending the loop; breaking here would
        # return an empty result whenever the first chunk has no content.
        has_content = (
            'delta' in choice
            and 'content' in choice.delta
            and choice.delta.content is not None
        )
        if has_content:
            received_chunks_and_times.append((choice.delta.content, time_since_start))

        # Bound the cost of a run: only the start of the stream is sampled.
        if chunk_index >= max_chunks:
            break

    return received_chunks_and_times

def count_total_tokens(temp_chunks):
    """Attach a running token total to each streamed chunk.

    `temp_chunks` is rewritten in place: every `(content, elapsed_time)`
    entry becomes `(content, elapsed_time, cumulative_tokens)`, where
    `cumulative_tokens` is the sum of `count_tokens(content)` over this and
    all preceding entries.

    Returns:
        A `(temp_chunks, total_token_count)` tuple: the same (now updated)
        list and the final running total.
    """
    running_total = 0

    for position in range(len(temp_chunks)):
        content, elapsed_time = temp_chunks[position]
        running_total += count_tokens(content)
        temp_chunks[position] = (content, elapsed_time, running_total)

    return temp_chunks, running_total


def benchmark_model_streaming(models, prompts):
    """Measure streaming latency for every model / prompt combination.

    For each streamed chunk, the time since the request started is filed
    under the cumulative number of tokens received up to and including that
    chunk.

    Args:
        models: Iterable of model ids to benchmark.
        prompts: Iterable of prompt strings.

    Returns:
        A dict of the form `{model: {"<n>_tokens": [elapsed_seconds, ...]}}`.
    """
    results = {}

    for model in models:
        per_model = results.setdefault(model, {})

        for prompt in prompts:
            streamed = run_model_streaming(model, prompt)
            stamped_chunks, _ = count_total_tokens(streamed)

            for _content, elapsed_time, cumulative_tokens in stamped_chunks:
                bucket = per_model.setdefault(f"{cumulative_tokens}_tokens", [])
                bucket.append(elapsed_time)

    return results



### Call the functions above

In [None]:
# Run ONE of the two benchmarks; both return {model: {"<n>_tokens": [seconds, ...]}}.
# The function defined above is `benchmark_max_tokens` (the previous call used an
# undefined name and raised NameError).
results = benchmark_max_tokens(models, max_tokens_list, prompts)  # alternative: benchmark_model_streaming(models, prompts)

### Plot the results

In [None]:
use_boxplot = False  # set to True to overlay a box plot at each token count

# Flatten {model: {"<n>_tokens": [times]}} into one row per measurement.
records = [
    {
        'Model': model,
        'Tokens': int(token_key.split('_')[0]),
        'Time': latency,
    }
    for model, timings in results.items()
    for token_key, latencies in timings.items()
    for latency in latencies
]

# Sorting by token count makes the per-model median line run left to right.
df = pd.DataFrame(records).sort_values(by=['Model', 'Tokens'])

fig, ax = plt.subplots(figsize=(12, 8))
model_names = df['Model'].unique()
palette = plt.cm.viridis(np.linspace(0, 1, len(model_names)))

for model, color in zip(model_names, palette):
    per_model = df[df['Model'] == model]
    ax.scatter(per_model['Tokens'], per_model['Time'], label=model, color=color, alpha=0.5)

    # One series of timings per distinct token count for this model.
    token_values = per_model['Tokens'].unique()
    times_by_tokens = [
        per_model.loc[per_model['Tokens'] == n_tokens, 'Time']
        for n_tokens in token_values
    ]

    # Optional box plot at each token count.
    if use_boxplot:
        ax.boxplot(times_by_tokens, positions=token_values, widths=40,
                   medianprops={'color': color})

    # Dashed line through the median latency at each token count.
    median_times = [np.median(times) for times in times_by_tokens]
    ax.plot(token_values, median_times, color=color, linestyle='--')

ax.set_xlabel('Number of Tokens')
ax.set_ylabel('Time (s)')
ax.set_title('Model Performance by Token Count')
ax.legend()
ax.grid(True)
plt.show()
