In [1]:
from collections import defaultdict, namedtuple
import os, itertools, json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from dataclasses import dataclass

def get_tpot(df_original):
    """Summarize time-per-output-token (TPOT) across requests.

    Rows are kept only when ``is_warmup_request == 0`` and
    ``decoding_step_idx >= 0``. For every ``request_guid`` the spread between
    its latest and earliest ``timestamp`` is divided by the number of kept
    rows for that request, then by 1000 (presumably microseconds to
    milliseconds -- confirm the timestamp unit against the profiler).

    Note that the divisor is the row count itself, not ``count - 1``.

    Args:
        df_original: DataFrame with ``request_guid``, ``is_warmup_request``,
            ``decoding_step_idx`` and ``timestamp`` columns. It is not modified.

    Returns:
        Tuple ``(mean, median, 99th percentile)`` of the per-request values.
    """
    keep = (df_original["is_warmup_request"] == 0) & (df_original["decoding_step_idx"] >= 0)
    stamps_by_request = df_original[keep].groupby("request_guid")["timestamp"]

    # All three aggregates are indexed by request_guid, so the arithmetic
    # below lines up request by request.
    elapsed = stamps_by_request.max() - stamps_by_request.min()
    rows_per_request = stamps_by_request.size()
    tpots = elapsed / rows_per_request / 1000

    return tpots.mean(), tpots.median(), tpots.quantile(0.99)

def get_throughput(df_original):
    """Rows per second over the span of the kept rows.

    A row is kept when ``is_warmup_request == 0`` and
    ``decoding_step_idx >= 0``. The result is the number of kept rows divided
    by the gap between the largest and smallest kept ``timestamp``, with the
    gap scaled by 1,000,000 (microseconds to seconds).

    Args:
        df_original: DataFrame with ``is_warmup_request``,
            ``decoding_step_idx`` and ``timestamp`` columns. It is not modified.

    Returns:
        Kept-row count divided by the elapsed time in seconds.
    """
    keep = (df_original["is_warmup_request"] == 0) & (df_original["decoding_step_idx"] >= 0)
    stamps = df_original.loc[keep, "timestamp"]

    elapsed_sec = (stamps.max() - stamps.min()) / 1_000_000
    # One kept row is counted as one output token.
    return len(stamps) / elapsed_sec
def get_ttft(df_original):
    """Summarize time-to-first-token (TTFT) across requests, in milliseconds.

    For each non-warmup ``request_guid`` the value is the ``timestamp`` of its
    ``decoding_step_idx == 0`` row minus the ``timestamp`` of its
    ``decoding_step_idx == -1`` row, divided by 1000 (microseconds to
    milliseconds). If a step appears more than once for a request, the first
    occurrence is used.

    The subtraction is done on Series indexed by ``request_guid``. This keeps
    the request id out of the arithmetic (so non-numeric ids work) and avoids
    relying on the column layout of a ``groupby(...).apply(...)`` result.
    Requests that lack either of the two rows are skipped instead of raising.

    Args:
        df_original: DataFrame with ``request_guid``, ``is_warmup_request``,
            ``decoding_step_idx`` and ``timestamp`` columns. It is not modified.

    Returns:
        Tuple ``(mean, median, 99th percentile)`` of the per-request values;
        all three are NaN when no request has both rows.
    """
    requests = df_original[df_original["is_warmup_request"] == 0]

    def _first_timestamp(step_idx):
        # First timestamp logged at `step_idx` for every request, keyed by guid.
        rows = requests[requests["decoding_step_idx"] == step_idx]
        return rows.drop_duplicates(subset="request_guid").set_index("request_guid")["timestamp"]

    # Index alignment leaves NaN for requests missing one of the two rows.
    ttft = (_first_timestamp(0) - _first_timestamp(-1)).dropna() / 1000
    return ttft.mean(), ttft.median(), ttft.quantile(0.99)

def get_queueing_time(df_original):
    """Summarize per-request queueing time, in milliseconds.

    For each non-warmup ``request_guid`` the value is the ``timestamp`` of its
    ``decoding_step_idx == -1`` row minus the ``timestamp`` of its
    ``decoding_step_idx == -2`` row, divided by 1000 (presumably microseconds
    to milliseconds -- confirm the timestamp unit against the profiler). If a
    step appears more than once for a request, the first occurrence is used.

    The subtraction is done on Series indexed by ``request_guid``. This keeps
    the request id out of the arithmetic (so non-numeric ids work) and avoids
    relying on the column layout of a ``groupby(...).apply(...)`` result.
    Requests that lack either of the two rows are skipped instead of raising.

    Args:
        df_original: DataFrame with ``request_guid``, ``is_warmup_request``,
            ``decoding_step_idx`` and ``timestamp`` columns. It is not modified.

    Returns:
        Tuple ``(mean, median, 99th percentile)`` of the per-request values;
        all three are NaN when no request has both rows.
    """
    requests = df_original[df_original["is_warmup_request"] == 0]

    def _first_timestamp(step_idx):
        # First timestamp logged at `step_idx` for every request, keyed by guid.
        rows = requests[requests["decoding_step_idx"] == step_idx]
        return rows.drop_duplicates(subset="request_guid").set_index("request_guid")["timestamp"]

    # Index alignment leaves NaN for requests missing one of the two rows.
    queueing_time = (_first_timestamp(-1) - _first_timestamp(-2)).dropna() / 1000
    return queueing_time.mean(), queueing_time.median(), queueing_time.quantile(0.99)
def get_ft_throughput(df_original):
    """Finetuning forward tokens per second over the span of the kept rows.

    Rows with ``is_warmup_step != 0`` are dropped. The result is the sum of
    ``num_finetuning_fwd_tokens`` over the kept rows divided by the gap between
    the largest and smallest kept ``timestamp``, with the gap scaled by
    1,000,000 (microseconds to seconds).

    Args:
        df_original: DataFrame with ``is_warmup_step``, ``timestamp`` and
            ``num_finetuning_fwd_tokens`` columns. It is not modified.

    Returns:
        Summed finetuning forward tokens divided by elapsed seconds.
    """
    steps = df_original.loc[df_original["is_warmup_step"] == 0]

    stamps = steps["timestamp"]
    elapsed_sec = (stamps.max() - stamps.min()) / 1_000_000
    finetuning_tokens = steps["num_finetuning_fwd_tokens"].sum()
    return finetuning_tokens / elapsed_sec

In [2]:
# Co-serving runs: for every model, collect per-QPS inference metrics (TPOT,
# throughput, queueing time, TTFT) from the request-level profile and the
# finetuning throughput from the step-level profile, then print one summary
# block per model.
directory = "/global/homes/g/goliaro/flexllm/benchmarking/output/e2e/coserving/profiling"
models=["meta-llama/Llama-3.1-8B-Instruct", "Qwen/Qwen2.5-14B-Instruct", "Qwen/Qwen2.5-32B-Instruct"]
# tp_degrees[i] and kv_cache_slots_values[i] are the settings paired with models[i].
tp_degrees=[1, 2, 4]
kv_cache_slots_values=[70000, 70000, 60000]
qps_values=["1.0","2.0","3.0","4.0","5.0"]
for i, model in enumerate(models):
    # File names embed the model id lower-cased with "/" replaced by "_".
    model_ = model.replace("/", "_").lower()
    tp_degree = tp_degrees[i]
    kv_cache_slots = kv_cache_slots_values[i]
    # One formatted entry per QPS value, in the order of qps_values.
    mean_tpots=[]
    p99_tpots=[]
    inf_throughputs=[]
    mean_queue_times=[]
    p99_queue_times=[]
    mean_ttfts=[]
    p99_ttfts=[]
    ft_throughputs=[]
    for qps in qps_values:
        filepath= os.path.join(directory, f"inference_request_profiling_sharegpt_8192_{qps}_qps_{model_}_tensor_parallelism_{tp_degree}_max_requests_per_batch_256_max_tokens_per_batch_256_num_kv_cache_slots_{kv_cache_slots}_qps_0.000000_num_warmup_requests_0.csv")
        if os.path.exists(filepath):
            df = pd.read_csv(filepath)
            tpot_mean, tpot_median, tpot_p99 = get_tpot(df)
            throughput = get_throughput(df)
            # Both helpers return (mean, median, p99); only mean and p99 are reported.
            queueing_time = get_queueing_time(df)
            ttft = get_ttft(df)
            mean_tpots.append(f"{tpot_mean:.3f}")
            p99_tpots.append(f"{tpot_p99:.3f}")
            inf_throughputs.append(f"{throughput:.3f}")
            mean_queue_times.append(f"{queueing_time[0]:.3f}")
            p99_queue_times.append(f"{queueing_time[2]:.3f}")
            mean_ttfts.append(f"{ttft[0]:.3f}")
            p99_ttfts.append(f"{ttft[2]:.3f}")
        else:
            # A missing run is reported as 0.000 so every list stays aligned with qps_values.
            print(f"File {filepath} does not exist.")
            mean_tpots.append(f"{0:.3f}")
            p99_tpots.append(f"{0:.3f}")
            inf_throughputs.append(f"{0:.3f}")
            mean_queue_times.append(f"{0:.3f}")
            p99_queue_times.append(f"{0:.3f}")
            mean_ttfts.append(f"{0:.3f}")
            p99_ttfts.append(f"{0:.3f}")
        step_filepath=os.path.join(directory, f"step_profiling_sharegpt_8192_{qps}_qps_{model_}_tensor_parallelism_{tp_degree}_max_requests_per_batch_256_max_tokens_per_batch_256_num_kv_cache_slots_{kv_cache_slots}_qps_0.000000_num_warmup_requests_0.csv")
        if os.path.exists(step_filepath):
            df = pd.read_csv(step_filepath)
            ft_throughput = get_ft_throughput(df)
            ft_throughputs.append(f"{ft_throughput:.3f}")
        else:
            print(f"File {step_filepath} does not exist.")
            # Report 0.000 here; `ft_throughput` is either undefined or left
            # over from an earlier run at this point and must not be reused.
            ft_throughputs.append(f"{0:.3f}")
    print(f"Model: {model}, TP Degree: {tp_degree}, KV Cache Slots: {kv_cache_slots}")
    print(f"QPS: {qps_values}")
    print(f"TPOT - Mean: {mean_tpots}, P99: {p99_tpots}")
    print(f"Inf Throughput: {inf_throughputs}")
    print(f"Queueing Time - Mean: {mean_queue_times}, P99: {p99_queue_times}")
    print(f"TTFT - Mean: {mean_ttfts}, P99: {p99_ttfts}")
    print(f"Finetuning Throughput: {ft_throughputs}")
    print("-" * 50)


Model: meta-llama/Llama-3.1-8B-Instruct, TP Degree: 1, KV Cache Slots: 70000
QPS: ['1.0', '2.0', '3.0', '4.0', '5.0']
TPOT - Mean: ['32.567', '33.594', '34.854', '36.053', '36.674'], P99: ['42.962', '43.232', '45.365', '45.409', '48.638']
Inf Throughput: ['259.807', '513.973', '751.252', '1059.269', '1307.249']
Queueing Time - Mean: ['40.472', '77.131', '104.183', '139.490', '181.138'], P99: ['335.125', '678.591', '883.131', '965.131', '1210.971']
TTFT - Mean: ['68.906', '76.272', '81.292', '85.394', '87.066'], P99: ['453.013', '598.573', '639.797', '621.888', '617.447']
Finetuning Throughput: ['2351.923', '2228.795', '2093.183', '1937.874', '1798.804']
--------------------------------------------------
Model: Qwen/Qwen2.5-14B-Instruct, TP Degree: 2, KV Cache Slots: 70000
QPS: ['1.0', '2.0', '3.0', '4.0', '5.0']
TPOT - Mean: ['57.926', '60.943', '64.268', '66.207', '66.455'], P99: ['66.487', '69.214', '72.123', '75.286', '73.150']
Inf Throughput: ['258.493', '504.365', '604.072', '887.

## LLAMA-Factory

8B model: 3810.46/2 = 1,905.23 tok/s

14B model: 1048.954/2 = 524.477 tok/s

32B model: 469.974/2 = 234.987 tok/s


## VLLM

In [3]:
# vLLM baseline: read one benchmark-result JSON per (model, QPS) pair and print
# the per-model TPOT, output throughput and TTFT series.
directory = "/global/homes/g/goliaro/flexllm/benchmarking/output/vllm"
models = [
    "meta-llama/Llama-3.1-8B-Instruct",
    "Qwen/Qwen2.5-14B-Instruct",
    "Qwen/Qwen2.5-32B-Instruct",
]
qps_values = ["2.0", "4.0", "6.0", "8.0", "10.0", "12.0", "16.0", "20.0"]

for model in models:
    # File names embed the model id lower-cased with "/" replaced by "_".
    model_ = model.replace("/", "_").lower()
    mean_tpots, p99_tpots = [], []
    inf_throughputs = []
    mean_ttfts, p99_ttfts = [], []
    ft_throughputs = []  # never filled in this cell; kept so the same names stay defined
    # Each result list is paired with the JSON key it is filled from.
    series_and_keys = (
        (mean_tpots, 'mean_tpot_ms'),
        (p99_tpots, 'p99_tpot_ms'),
        (inf_throughputs, 'output_throughput'),
        (mean_ttfts, 'mean_ttft_ms'),
        (p99_ttfts, 'p99_ttft_ms'),
    )

    for qps in qps_values:
        filepath = os.path.join(directory, f"results_sharegpt_eager_v1_{model_}_bz_256_max_num_batched_tokens_256_{qps}_qps.json")
        if not os.path.exists(filepath):
            # Fall back to the alternative spelling with a trailing underscore.
            filepath = os.path.join(directory, f"results_sharegpt_eager_v1_{model_}_bz_256_max_num_batched_tokens_256_{qps}_qps_.json")

        if not os.path.exists(filepath):
            print(f"File {filepath} does not exist.")
            # A missing run is reported as 0.000 in every series.
            for series, _ in series_and_keys:
                series.append(f"{0:.3f}")
            continue

        with open(filepath, 'r') as f:
            data = json.load(f)
        for series, key in series_and_keys:
            series.append(f"{data[key]:.3f}")

    print(f"Model: {model}")
    print(f"QPS: {qps_values}")
    print(f"TPOT - Mean: {mean_tpots}, P99: {p99_tpots}")
    print(f"Inf Throughput: {inf_throughputs}")
    print(f"TTFT - Mean: {mean_ttfts}, P99: {p99_ttfts}")
    print("-" * 50)


Model: meta-llama/Llama-3.1-8B-Instruct
QPS: ['2.0', '4.0', '6.0', '8.0', '10.0', '12.0', '16.0', '20.0']
TPOT - Mean: ['12.986', '15.489', '18.313', '21.861', '29.843', '39.383', '38.819', '38.527'], P99: ['16.120', '21.730', '27.480', '32.325', '42.202', '47.474', '46.787', '46.994']
Inf Throughput: ['524.688', '1074.685', '1609.861', '2149.054', '2659.603', '2835.174', '2929.336', '3004.973']
TTFT - Mean: ['77.664', '126.601', '189.721', '292.819', '1381.504', '28487.625', '60710.476', '75967.820'], P99: ['378.144', '559.422', '953.589', '1369.075', '7167.328', '56380.458', '139531.039', '182959.385']
--------------------------------------------------
Model: Qwen/Qwen2.5-14B-Instruct
QPS: ['2.0', '4.0', '6.0', '8.0', '10.0', '12.0', '16.0', '20.0']
TPOT - Mean: ['15.922', '18.594', '23.196', '36.465', '48.333', '49.917', '49.695', '49.282'], P99: ['20.015', '26.887', '36.449', '53.607', '58.592', '59.535', '59.593', '59.558']
Inf Throughput: ['526.911', '1080.292', '1615.444', '2159

## Temporal Sharing

In [4]:
# Temporal-sharing runs: for every model, collect per-QPS inference metrics from
# the request-level profile and finetuning throughput from the step-level
# profile, then print one summary block per model.
directory = "/global/homes/g/goliaro/flexllm/benchmarking/output/e2e/temporal_sharing/profiling"
models = [
    "meta-llama/Llama-3.1-8B-Instruct",
    "Qwen/Qwen2.5-14B-Instruct",
    "Qwen/Qwen2.5-32B-Instruct",
]
# Entry k of the next two lists is the setting paired with models[k].
tp_degrees = [1, 2, 4]
kv_cache_slots_values = [70000, 70000, 60000]
qps_values = ["1.0", "2.0", "3.0", "4.0", "5.0"]

for model, tp_degree, kv_cache_slots in zip(models, tp_degrees, kv_cache_slots_values):
    # File names embed the model id lower-cased with "/" replaced by "_".
    model_ = model.replace("/", "_").lower()
    mean_tpots, p99_tpots = [], []
    inf_throughputs = []
    mean_queue_times, p99_queue_times = [], []
    mean_ttfts, p99_ttfts = [], []
    ft_throughputs = []
    # Request-level series, in the order their values are produced below.
    request_series = (
        mean_tpots, p99_tpots, inf_throughputs,
        mean_queue_times, p99_queue_times,
        mean_ttfts, p99_ttfts,
    )

    for qps in qps_values:
        filepath = os.path.join(directory, f"inference_request_profiling_sharegpt_8192_{qps}_qps_{model_}_tensor_parallelism_{tp_degree}_max_requests_per_batch_256_max_tokens_per_batch_256_num_kv_cache_slots_{kv_cache_slots}_qps_0.000000_num_warmup_requests_0.csv")
        if not os.path.exists(filepath):
            print(f"File {filepath} does not exist.")
            # A missing run is reported as 0.000 in every request-level series.
            for series in request_series:
                series.append(f"{0:.3f}")
        else:
            df = pd.read_csv(filepath)
            tpot_mean, tpot_median, tpot_p99 = get_tpot(df)
            throughput = get_throughput(df)
            # Both helpers return (mean, median, p99); the median is not reported.
            queue_mean, _, queue_p99 = get_queueing_time(df)
            ttft_mean, _, ttft_p99 = get_ttft(df)
            values = (tpot_mean, tpot_p99, throughput, queue_mean, queue_p99, ttft_mean, ttft_p99)
            for series, value in zip(request_series, values):
                series.append(f"{value:.3f}")

        step_filepath = os.path.join(directory, f"step_profiling_sharegpt_8192_{qps}_qps_{model_}_tensor_parallelism_{tp_degree}_max_requests_per_batch_256_max_tokens_per_batch_256_num_kv_cache_slots_{kv_cache_slots}_qps_0.000000_num_warmup_requests_0.csv")
        if not os.path.exists(step_filepath):
            print(f"File {step_filepath} does not exist.")
            ft_throughputs.append(f"{0:.3f}")
        else:
            df = pd.read_csv(step_filepath)
            ft_throughput = get_ft_throughput(df)
            ft_throughputs.append(f"{ft_throughput:.3f}")

    print(f"Model: {model}, TP Degree: {tp_degree}, KV Cache Slots: {kv_cache_slots}")
    print(f"QPS: {qps_values}")
    print(f"TPOT - Mean: {mean_tpots}, P99: {p99_tpots}")
    print(f"Inf Throughput: {inf_throughputs}")
    print(f"Queueing Time - Mean: {mean_queue_times}, P99: {p99_queue_times}")
    print(f"TTFT - Mean: {mean_ttfts}, P99: {p99_ttfts}")
    print(f"Finetuning Throughput: {ft_throughputs}")
    print("-" * 50)


Model: meta-llama/Llama-3.1-8B-Instruct, TP Degree: 1, KV Cache Slots: 70000
QPS: ['1.0', '2.0', '3.0', '4.0', '5.0']
TPOT - Mean: ['643.117', '640.100', '639.872', '641.125', '642.798'], P99: ['731.466', '712.521', '719.272', '718.240', '733.149']
Inf Throughput: ['132.148', '154.654', '156.939', '157.898', '157.601']
Queueing Time - Mean: ['74469.599', '374115.479', '422726.020', '451226.948', '468695.820'], P99: ['201956.365', '801683.773', '946641.186', '1014430.762', '1052682.907']
TTFT - Mean: ['1120.833', '1092.522', '1112.511', '1119.881', '1124.694'], P99: ['10846.146', '13594.622', '13668.549', '13461.397', '13007.038']
Finetuning Throughput: ['5648.021', '5652.325', '5649.419', '5645.506', '5638.343']
--------------------------------------------------
Model: Qwen/Qwen2.5-14B-Instruct, TP Degree: 2, KV Cache Slots: 70000
QPS: ['1.0', '2.0', '3.0', '4.0', '5.0']
TPOT - Mean: ['841.062', '840.547', '841.311', '838.705', '842.178'], P99: ['931.530', '949.894', '948.095', '931.68

## Spatial Sharing

In [9]:
# Spatial-sharing runs: for every model, collect per-QPS inference metrics from
# the request-level profile and finetuning throughput from the step-level
# profile, then print one summary block per model.
directory = "/global/homes/g/goliaro/flexllm/benchmarking/output/e2e/spatial_sharing/profiling"
models = [
    "meta-llama/Llama-3.1-8B-Instruct",
    "Qwen/Qwen2.5-14B-Instruct",
    "Qwen/Qwen2.5-32B-Instruct",
]
# Entry k of the next two lists is the setting paired with models[k].
tp_degrees = [1, 2, 4]
kv_cache_slots_values = [70000, 70000, 60000]
qps_values = ["1.0", "2.0", "3.0", "4.0", "5.0"]

for model, tp_degree, kv_cache_slots in zip(models, tp_degrees, kv_cache_slots_values):
    # File names embed the model id lower-cased with "/" replaced by "_".
    model_ = model.replace("/", "_").lower()
    mean_tpots, p99_tpots = [], []
    inf_throughputs = []
    mean_queue_times, p99_queue_times = [], []
    mean_ttfts, p99_ttfts = [], []
    ft_throughputs = []
    # Request-level series, in the order their values are produced below.
    request_series = (
        mean_tpots, p99_tpots, inf_throughputs,
        mean_queue_times, p99_queue_times,
        mean_ttfts, p99_ttfts,
    )

    for qps in qps_values:
        filepath = os.path.join(directory, f"inference_request_profiling_sharegpt_8192_{qps}_qps_{model_}_tensor_parallelism_{tp_degree}_max_requests_per_batch_256_max_tokens_per_batch_256_num_kv_cache_slots_{kv_cache_slots}_qps_0.000000_num_warmup_requests_0.csv")
        if not os.path.exists(filepath):
            print(f"File {filepath} does not exist.")
            # A missing run is reported as 0.000 in every request-level series.
            for series in request_series:
                series.append(f"{0:.3f}")
        else:
            df = pd.read_csv(filepath)
            tpot_mean, tpot_median, tpot_p99 = get_tpot(df)
            throughput = get_throughput(df)
            # Both helpers return (mean, median, p99); the median is not reported.
            queue_mean, _, queue_p99 = get_queueing_time(df)
            ttft_mean, _, ttft_p99 = get_ttft(df)
            values = (tpot_mean, tpot_p99, throughput, queue_mean, queue_p99, ttft_mean, ttft_p99)
            for series, value in zip(request_series, values):
                series.append(f"{value:.3f}")

        step_filepath = os.path.join(directory, f"step_profiling_sharegpt_8192_{qps}_qps_{model_}_tensor_parallelism_{tp_degree}_max_requests_per_batch_256_max_tokens_per_batch_256_num_kv_cache_slots_{kv_cache_slots}_qps_0.000000_num_warmup_requests_0.csv")
        if not os.path.exists(step_filepath):
            print(f"File {step_filepath} does not exist.")
            ft_throughputs.append(f"{0:.3f}")
        else:
            df = pd.read_csv(step_filepath)
            ft_throughput = get_ft_throughput(df)
            ft_throughputs.append(f"{ft_throughput:.3f}")

    print(f"Model: {model}, TP Degree: {tp_degree}, KV Cache Slots: {kv_cache_slots}")
    print(f"QPS: {qps_values}")
    print(f"TPOT - Mean: {mean_tpots}, P99: {p99_tpots}")
    print(f"Inf Throughput: {inf_throughputs}")
    print(f"Finetuning Throughput: {ft_throughputs}")
    print(f"Queueing Time - Mean: {mean_queue_times}, P99: {p99_queue_times}")
    print(f"TTFT - Mean: {mean_ttfts}, P99: {p99_ttfts}")
    print("-" * 50)


File /global/homes/g/goliaro/flexllm/benchmarking/output/e2e/spatial_sharing/profiling/inference_request_profiling_sharegpt_8192_2.0_qps_meta-llama_llama-3.1-8b-instruct_tensor_parallelism_1_max_requests_per_batch_256_max_tokens_per_batch_256_num_kv_cache_slots_70000_qps_0.000000_num_warmup_requests_0.csv does not exist.
File /global/homes/g/goliaro/flexllm/benchmarking/output/e2e/spatial_sharing/profiling/step_profiling_sharegpt_8192_2.0_qps_meta-llama_llama-3.1-8b-instruct_tensor_parallelism_1_max_requests_per_batch_256_max_tokens_per_batch_256_num_kv_cache_slots_70000_qps_0.000000_num_warmup_requests_0.csv does not exist.
Model: meta-llama/Llama-3.1-8B-Instruct, TP Degree: 1, KV Cache Slots: 70000
QPS: ['1.0', '2.0', '3.0', '4.0', '5.0']
TPOT - Mean: ['328.604', '0.000', '336.463', '336.421', '335.331'], P99: ['396.033', '0.000', '404.990', '398.794', '389.157']
Inf Throughput: ['198.467', '0.000', '155.932', '296.234', '299.148']
Finetuning Throughput: ['5495.135', '0.000', '5522.6

In [6]:
# Step-level profile of one temporal-sharing run (Llama-3.1-8B, 1.0 QPS).
fp = "/global/homes/g/goliaro/flexllm/benchmarking/output/e2e/temporal_sharing/profiling/step_profiling_sharegpt_8192_1.0_qps_meta-llama_llama-3.1-8b-instruct_tensor_parallelism_1_max_requests_per_batch_256_max_tokens_per_batch_256_num_kv_cache_slots_70000_qps_0.000000_num_warmup_requests_0.csv"
df_temporal = pd.read_csv(fp)

# Flag the rows where num_prefilling_tokens and num_decoding_tokens are both 0,
# then report how many there are and what share of all rows they make up.
mask = df_temporal["num_prefilling_tokens"].eq(0) & df_temporal["num_decoding_tokens"].eq(0)
count = mask.sum()
total = len(df_temporal)
pct = count / total * 100
print(f"Rows with both tokens zero: {count} of {total} ({pct:.2f}%)")


Rows with both tokens zero: 3716 of 5543 (67.04%)


## Spatial Sharing (limited)

In [10]:
# Spatial-sharing (limited) runs: for every model, collect per-QPS inference
# metrics from the request-level profile and finetuning throughput from the
# step-level profile, then print one summary block per model.
directory = "/global/homes/g/goliaro/flexllm/benchmarking/output/e2e/spatial_sharing_limited/profiling"
models = [
    "meta-llama/Llama-3.1-8B-Instruct",
    "Qwen/Qwen2.5-14B-Instruct",
    "Qwen/Qwen2.5-32B-Instruct",
]
# Entry k of the next two lists is the setting paired with models[k].
tp_degrees = [1, 2, 4]
kv_cache_slots_values = [70000, 70000, 60000]
qps_values = ["1.0", "2.0", "3.0", "4.0", "5.0"]

for model, tp_degree, kv_cache_slots in zip(models, tp_degrees, kv_cache_slots_values):
    # File names embed the model id lower-cased with "/" replaced by "_".
    model_ = model.replace("/", "_").lower()
    mean_tpots, p99_tpots = [], []
    inf_throughputs = []
    mean_queue_times, p99_queue_times = [], []
    mean_ttfts, p99_ttfts = [], []
    ft_throughputs = []
    # Request-level series, in the order their values are produced below.
    request_series = (
        mean_tpots, p99_tpots, inf_throughputs,
        mean_queue_times, p99_queue_times,
        mean_ttfts, p99_ttfts,
    )

    for qps in qps_values:
        filepath = os.path.join(directory, f"inference_request_profiling_sharegpt_8192_{qps}_qps_{model_}_tensor_parallelism_{tp_degree}_max_requests_per_batch_256_max_tokens_per_batch_256_num_kv_cache_slots_{kv_cache_slots}_qps_0.000000_num_warmup_requests_0.csv")
        if not os.path.exists(filepath):
            print(f"File {filepath} does not exist.")
            # A missing run is reported as 0.000 in every request-level series.
            for series in request_series:
                series.append(f"{0:.3f}")
        else:
            df = pd.read_csv(filepath)
            tpot_mean, tpot_median, tpot_p99 = get_tpot(df)
            throughput = get_throughput(df)
            # Both helpers return (mean, median, p99); the median is not reported.
            queue_mean, _, queue_p99 = get_queueing_time(df)
            ttft_mean, _, ttft_p99 = get_ttft(df)
            values = (tpot_mean, tpot_p99, throughput, queue_mean, queue_p99, ttft_mean, ttft_p99)
            for series, value in zip(request_series, values):
                series.append(f"{value:.3f}")

        step_filepath = os.path.join(directory, f"step_profiling_sharegpt_8192_{qps}_qps_{model_}_tensor_parallelism_{tp_degree}_max_requests_per_batch_256_max_tokens_per_batch_256_num_kv_cache_slots_{kv_cache_slots}_qps_0.000000_num_warmup_requests_0.csv")
        if not os.path.exists(step_filepath):
            print(f"File {step_filepath} does not exist.")
            ft_throughputs.append(f"{0:.3f}")
        else:
            df = pd.read_csv(step_filepath)
            ft_throughput = get_ft_throughput(df)
            ft_throughputs.append(f"{ft_throughput:.3f}")

    print(f"Model: {model}, TP Degree: {tp_degree}, KV Cache Slots: {kv_cache_slots}")
    print(f"QPS: {qps_values}")
    print(f"TPOT - Mean: {mean_tpots}, P99: {p99_tpots}")
    print(f"Inf Throughput: {inf_throughputs}")
    print(f"Finetuning Throughput: {ft_throughputs}")
    print(f"Queueing Time - Mean: {mean_queue_times}, P99: {p99_queue_times}")
    print(f"TTFT - Mean: {mean_ttfts}, P99: {p99_ttfts}")
    print("-" * 50)


Model: meta-llama/Llama-3.1-8B-Instruct, TP Degree: 1, KV Cache Slots: 70000
QPS: ['1.0', '2.0', '3.0', '4.0', '5.0']
TPOT - Mean: ['35.994', '36.126', '37.553', '38.689', '39.309'], P99: ['46.417', '46.933', '48.007', '51.054', '49.291']
Inf Throughput: ['259.261', '513.103', '730.142', '985.756', '1085.179']
Finetuning Throughput: ['2108.519', '2040.275', '1925.944', '1786.629', '1690.768']
Queueing Time - Mean: ['41.999', '76.036', '106.728', '147.297', '198.028'], P99: ['376.811', '702.075', '1048.795', '1174.243', '1337.289']
TTFT - Mean: ['71.663', '77.403', '81.689', '86.242', '87.656'], P99: ['446.426', '573.454', '519.176', '601.988', '590.755']
--------------------------------------------------
Model: Qwen/Qwen2.5-14B-Instruct, TP Degree: 2, KV Cache Slots: 70000
QPS: ['1.0', '2.0', '3.0', '4.0', '5.0']
TPOT - Mean: ['61.578', '65.044', '68.632', '69.282', '69.205'], P99: ['73.073', '76.011', '77.754', '78.423', '77.526']
Inf Throughput: ['258.217', '503.114', '588.029', '757

## Temporal Sharing (limited)

In [11]:
# Temporal-sharing (limited) runs: for every model, collect per-QPS inference
# metrics from the request-level profile and finetuning throughput from the
# step-level profile, then print one summary block per model.
directory = "/global/homes/g/goliaro/flexllm/benchmarking/output/e2e/temporal_sharing_limited/profiling"
models = [
    "meta-llama/Llama-3.1-8B-Instruct",
    "Qwen/Qwen2.5-14B-Instruct",
    "Qwen/Qwen2.5-32B-Instruct",
]
# Entry k of the next two lists is the setting paired with models[k].
tp_degrees = [1, 2, 4]
kv_cache_slots_values = [70000, 70000, 60000]
qps_values = ["1.0", "2.0", "3.0", "4.0", "5.0"]

for model, tp_degree, kv_cache_slots in zip(models, tp_degrees, kv_cache_slots_values):
    # File names embed the model id lower-cased with "/" replaced by "_".
    model_ = model.replace("/", "_").lower()
    mean_tpots, p99_tpots = [], []
    inf_throughputs = []
    mean_queue_times, p99_queue_times = [], []
    mean_ttfts, p99_ttfts = [], []
    ft_throughputs = []
    # Request-level series, in the order their values are produced below.
    request_series = (
        mean_tpots, p99_tpots, inf_throughputs,
        mean_queue_times, p99_queue_times,
        mean_ttfts, p99_ttfts,
    )

    for qps in qps_values:
        filepath = os.path.join(directory, f"inference_request_profiling_sharegpt_8192_{qps}_qps_{model_}_tensor_parallelism_{tp_degree}_max_requests_per_batch_256_max_tokens_per_batch_256_num_kv_cache_slots_{kv_cache_slots}_qps_0.000000_num_warmup_requests_0.csv")
        if not os.path.exists(filepath):
            print(f"File {filepath} does not exist.")
            # A missing run is reported as 0.000 in every request-level series.
            for series in request_series:
                series.append(f"{0:.3f}")
        else:
            df = pd.read_csv(filepath)
            tpot_mean, tpot_median, tpot_p99 = get_tpot(df)
            throughput = get_throughput(df)
            # Both helpers return (mean, median, p99); the median is not reported.
            queue_mean, _, queue_p99 = get_queueing_time(df)
            ttft_mean, _, ttft_p99 = get_ttft(df)
            values = (tpot_mean, tpot_p99, throughput, queue_mean, queue_p99, ttft_mean, ttft_p99)
            for series, value in zip(request_series, values):
                series.append(f"{value:.3f}")

        step_filepath = os.path.join(directory, f"step_profiling_sharegpt_8192_{qps}_qps_{model_}_tensor_parallelism_{tp_degree}_max_requests_per_batch_256_max_tokens_per_batch_256_num_kv_cache_slots_{kv_cache_slots}_qps_0.000000_num_warmup_requests_0.csv")
        if not os.path.exists(step_filepath):
            print(f"File {step_filepath} does not exist.")
            ft_throughputs.append(f"{0:.3f}")
        else:
            df = pd.read_csv(step_filepath)
            ft_throughput = get_ft_throughput(df)
            ft_throughputs.append(f"{ft_throughput:.3f}")

    print(f"Model: {model}, TP Degree: {tp_degree}, KV Cache Slots: {kv_cache_slots}")
    print(f"QPS: {qps_values}")
    print(f"TPOT - Mean: {mean_tpots}, P99: {p99_tpots}")
    print(f"Inf Throughput: {inf_throughputs}")
    print(f"Finetuning Throughput: {ft_throughputs}")
    print(f"Queueing Time - Mean: {mean_queue_times}, P99: {p99_queue_times}")
    print(f"TTFT - Mean: {mean_ttfts}, P99: {p99_ttfts}")
    print("-" * 50)


Model: meta-llama/Llama-3.1-8B-Instruct, TP Degree: 1, KV Cache Slots: 70000
QPS: ['1.0', '2.0', '3.0', '4.0', '5.0']
TPOT - Mean: ['36.505', '37.869', '39.223', '40.523', '41.344'], P99: ['47.739', '48.968', '50.725', '50.362', '52.389']
Inf Throughput: ['258.758', '511.104', '720.476', '964.375', '1060.604']
Finetuning Throughput: ['2506.852', '2375.358', '2267.167', '2110.475', '2048.672']
Queueing Time - Mean: ['43.129', '79.445', '117.823', '161.835', '240.805'], P99: ['361.234', '650.009', '1136.216', '1259.362', '1661.896']
TTFT - Mean: ['70.914', '77.695', '85.546', '88.537', '92.039'], P99: ['473.582', '536.361', '634.297', '629.615', '670.324']
--------------------------------------------------
Model: Qwen/Qwen2.5-14B-Instruct, TP Degree: 2, KV Cache Slots: 70000
QPS: ['1.0', '2.0', '3.0', '4.0', '5.0']
TPOT - Mean: ['66.200', '69.697', '74.052', '76.365', '78.048'], P99: ['76.833', '79.636', '84.867', '85.766', '87.552']
Inf Throughput: ['256.806', '498.716', '564.383', '726