<!-- du4://thèse/cai/results.ipynb?d=20251024?loc=ttum?hPa=1020 -->

# Confidential Artificial Intelligence: What's the Catch?
### _Performance and costs_

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns

from dataclasses import dataclass
from enum import Enum
from functools import cached_property
from pathlib import Path
from typing import List, Tuple

In [None]:
# Global plot styling for every figure drawn after this point: "ticks" style,
# "paper" context, then the "colorblind" palette as the default colour cycle.
sns.set_theme(style="ticks", context="paper")
sns.set_palette("colorblind")

In [None]:
class TEE_Mode(Enum):
    """Whether a condition was measured with the TEE switched on or off."""
    # The values are the folder names ("tee_on" / "tee_off") looked for on disk.
    TEE_ON = "tee_on"
    TEE_OFF = "tee_off"

@dataclass
class Experiment:
    """A directory of benchmark results laid out as ``<condition>/<tee_on|tee_off>/``.

    Attributes:
        path: Root directory of the experiment.
    """

    path: Path

    @property
    def name(self) -> str:
        """Experiment name: the last component of ``path`` without its suffix."""
        return self.path.stem

    @cached_property
    def conditions(self) -> List["Condition"]:
        """Discover all conditions stored two levels below ``path``.

        Only directories named after a ``TEE_Mode`` value are kept; the
        condition name is the name of the parent directory. Candidates are
        sorted so the result does not depend on the file-system listing
        order, which keeps labels and plot order reproducible.
        """
        tee_dir_names = {mode.value for mode in TEE_Mode}
        return [
            Condition(q, q.parent.name, TEE_Mode(q.name))
            for q in sorted(self.path.glob("*/*"))
            if q.is_dir() and q.name in tee_dir_names
        ]

    def get_all_conditions_names(self, sort_by_model_size: bool = False) -> List[str]:
        """Return the distinct condition names, in first-seen order.

        Args:
            sort_by_model_size: When true, order the names by the integer
                model size parsed from each condition (stable sort).
        """
        conditions = self.conditions
        if sort_by_model_size:
            # sorted() builds a new list, so the cached one is left untouched.
            conditions = sorted(conditions, key=lambda c: int(c.model_size))
        # dict.fromkeys de-duplicates while keeping order.
        return list(dict.fromkeys(c.name for c in conditions))

    def get_conditions(self, tee_mode: TEE_Mode) -> List["Condition"]:
        """Return every condition measured in the given TEE mode."""
        return [c for c in self.conditions if c.tee_mode == tee_mode]

    def get_condition(self, name: str, tee_mode: TEE_Mode):
        """Return the first condition matching ``name`` and ``tee_mode``.

        Returns:
            The matching ``Condition``, or ``None`` when there is none.
        """
        for candidate in self.conditions:
            if candidate.name == name and candidate.tee_mode == tee_mode:
                return candidate
        return None

    def get_all_runs(self) -> List["Run"]:
        """Return the runs of every condition as one flat list."""
        return [r for c in self.conditions for r in c.runs]

    def get_runs(self, tee_mode: TEE_Mode) -> List["Run"]:
        """Return the runs of the conditions measured in the given TEE mode."""
        # Filter conditions first so runs of the other mode are never loaded.
        return [r for c in self.get_conditions(tee_mode) for r in c.runs]

    def __str__(self):
        """One-line summary: name, number of conditions and total run count."""
        name = self.name
        nb_conditions = len(self.conditions)
        nb_total_runs = len(self.get_all_runs())
        return f"Experiment: {name}, Conditions: {nb_conditions} ({nb_total_runs} total measurements)"

@dataclass
class Condition:
    """One condition of an experiment in one TEE mode, with its repeated runs.

    Attributes:
        path: Directory holding the ``*repetition_*`` result files.
        name: Label of the condition.
        tee_mode: TEE mode the condition was measured in.
    """

    path: Path
    name: str
    tee_mode: TEE_Mode

    @staticmethod
    def _flatten(values) -> list:
        """Flatten arbitrarily nested lists/tuples into one flat list."""
        flat = []
        for value in values:
            if isinstance(value, (list, tuple)):
                flat.extend(Condition._flatten(value))
            else:
                flat.append(value)
        return flat

    @property
    def model_name(self) -> List[str]:
        """Name of the parent directory, split on underscores."""
        return self.path.parent.name.split("_")

    @property
    def model_size(self) -> str:
        """Digits preceding the first ``b``/``B`` in the model name (e.g. ``"8"``).

        Raises:
            ValueError: If the model name contains no such size token.
        """
        joined_name = "_".join(self.model_name)
        match = re.search(r"(\d+)[bB]", joined_name)
        if match is None:
            raise ValueError(f"No model size such as '8B' found in {joined_name!r}")
        return match.group(1)

    @cached_property
    def runs(self) -> List["Run"]:
        """Build one ``Run`` per ``*repetition_*.json`` file, in sorted order.

        The power CSV of a run is expected next to the JSON file, named
        ``<json stem>_power_metrics.csv``.

        Raises:
            AssertionError: If no result file is found, or the numbers of
                ``.json`` and ``.csv`` files differ.
        """
        run_paths = list(self.path.glob("*repetition_*"))
        json_files = sorted(p for p in run_paths if p.suffix == ".json")
        csv_files = sorted(p for p in run_paths if p.suffix == ".csv")

        assert len(run_paths) > 0, "Empty results"
        assert len(json_files) == len(csv_files), f"Mismatch: {len(json_files)} .json vs. {len(csv_files)} .csv: {run_paths}"

        return [
            Run(idx, json_file, self.path / f"{json_file.stem}_power_metrics.csv")
            for idx, json_file in enumerate(json_files)
        ]

    def get_all_runs(self) -> List["Run"]:
        """Return every run of this condition."""
        return self.runs

    def get_run(self, index: int) -> "Run":
        """Return the run at position ``index``."""
        return self.runs[index]

    def get_median_throughput_with_std(self) -> Tuple[float, float]:
        """Return (median, population std) of ``output_throughput`` across runs."""
        output_throughputs = [
            rep.get_vllm_key("output_throughput") for rep in self.runs
        ]
        return np.median(output_throughputs), np.std(output_throughputs)

    def get_median_ttft_with_std_and_p95(self):
        """Return the raw ``ttfts`` entry of every run (one item per run).

        Despite its name, this method does not aggregate yet.
        """
        # TODO: return (median, std, p95) once compared to the vLLM output.
        return [rep.get_vllm_key("ttfts") for rep in self.runs]

    def get_median_itl_with_std_and_p95(self) -> Tuple[float, float, float]:
        """Return (median, population std, 95th percentile) of all ``itls`` values.

        The values of all runs are pooled. They are flattened first because
        the per-run entries may be nested lists of unequal length, which
        NumPy cannot turn into a regular array.
        """
        latencies = self._flatten([rep.get_vllm_key("itls") for rep in self.runs])
        # TODO: compare to VLLM output
        return np.median(latencies), np.std(latencies), np.percentile(latencies, 95)

@dataclass
class Run:
    """A single repetition: one vLLM result file and its power-metrics CSV.

    Attributes:
        index: Position of the run within its condition.
        path_vllm_json: JSON file with the vLLM benchmark results.
        path_power_csv: CSV file with the GPU power metrics.
    """

    index: int
    path_vllm_json: Path
    path_power_csv: Path

    @cached_property
    def vllm_metrics(self) -> dict:
        """Parsed content of the JSON file (read on first access, then cached)."""
        raw_text = self.path_vllm_json.read_text()
        return json.loads(raw_text)

    @cached_property
    def gpu_metrics(self) -> pd.DataFrame:
        """Content of the power CSV as a DataFrame (read once, then cached)."""
        return pd.read_csv(self.path_power_csv)

    def get_vllm_key(self, key: str):
        """Return ``vllm_metrics[key]``; a missing key raises ``KeyError``."""
        return self.vllm_metrics[key]

    # The properties below are named shortcuts for individual JSON entries.

    @property
    def dataset(self) -> str:
        return self.get_vllm_key("dataset")

    @property
    def model_id(self) -> str:
        return self.get_vllm_key("model")

    @property
    def input_length(self) -> int:
        return self.get_vllm_key("input_length")

    @property
    def output_length(self) -> int:
        return self.get_vllm_key("output_length")

    @property
    def concurrency(self) -> int:
        return self.get_vllm_key("concurrency")

    @property
    def temperature(self) -> float:
        return self.get_vllm_key("temperature")

## 0. Data summary

In [None]:
# Root directory with one sub-folder per experiment
data_path = Path("data") / "calibration"  # ← FIXME
# One Experiment object per sub-folder
exp_throughput_latency = Experiment(data_path / "throughput_latency")
exp_saturation_point = Experiment(data_path / "saturation_point")
exp_sequence_overhead = Experiment(data_path / "sequence_overhead")
exp_energy = Experiment(data_path / "energy")
# Every experiment, in the order used by the summary below
all_exps = (
    exp_throughput_latency,
    exp_saturation_point,
    exp_sequence_overhead,
    exp_energy,
)

In [None]:
def format_seconds_long(seconds: float) -> str:
    """Format a duration in seconds as ``HH:MM:SS``.

    Fractional seconds are truncated and hours are not capped at 24.
    A negative duration is rendered as its magnitude with a leading ``-``;
    floor-based ``divmod`` on a negative value would otherwise produce
    meaningless fields.

    Args:
        seconds: Duration in seconds.

    Returns:
        The formatted duration, e.g. ``"01:01:01"`` for 3661.
    """
    whole_seconds = int(abs(seconds))
    # No sign when the value truncates to zero, so -0.4 s stays "00:00:00".
    sign = "-" if seconds < 0 and whole_seconds else ""
    hours, remainder = divmod(whole_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{sign}{hours:02d}:{minutes:02d}:{secs:02d}"

# Running totals over every experiment
nb_total_runs, duration_total = 0, 0

print(f"• Number of experiments: {len(all_exps)}")
for exp in all_exps:
    print(f"  • {exp!s}")
    all_runs = exp.get_all_runs()
    nb_total_runs = nb_total_runs + len(all_runs)
    # Add the "duration" entry of each run to the running total
    for m in all_runs:
        duration_total = duration_total + m.get_vllm_key("duration")

total_hours = duration_total / 3600
print(f"• Total measurements: {nb_total_runs}")
print(f"• Total duration: {format_seconds_long(duration_total)}")
# The price estimate multiplies the hours by a rate of 7
print(f"• Estimated Azure price: {round(total_hours * 7, 3)} €")


## 1. Throughput and Latency

### 1.1. Data summary

In [None]:
rows = []  # One dict per run; turned into a DataFrame at the end
for c in exp_throughput_latency.conditions:
    for m in c.runs:
        # Every entry of "errors" must be an empty string
        run_errors = m.get_vllm_key("errors")
        assert all(e == "" for e in run_errors), (
            f"vLLM reported an error during measurement. Check .json {m.path_vllm_json}"
        )
        duration = m.get_vllm_key("duration")
        row = {
            # Condition
            "condition": c.name,
            "tee_mode": c.tee_mode.value,
            # Measurement
            "Measurement #": m.index,
            "duration (s)": round(duration),
            # Throughput
            "output throughput (tok/s)": m.get_vllm_key("output_throughput"),
            "total token throughput (tok/s)": m.get_vllm_key("total_token_throughput"),
            # Latency TODO
            # Cloud
            "Azure cost (€)": round(duration / 3600 * 7, 3),
        }
        rows.append(row)

pd.DataFrame(rows)

### 1.2. Throughput

In [None]:
condition_labels = []
throughput_medians_tee_on, tee_on_stds = [], []
throughput_medians_tee_off, tee_off_stds = [], []

# For each TEE mode: the lists receiving the median and the std (TEE on is queried first)
throughput_collectors = (
    (TEE_Mode.TEE_ON, throughput_medians_tee_on, tee_on_stds),
    (TEE_Mode.TEE_OFF, throughput_medians_tee_off, tee_off_stds),
)

for c in exp_throughput_latency.get_all_conditions_names(sort_by_model_size=True):
    condition_labels.append(c)
    for mode, medians_out, stds_out in throughput_collectors:
        mode_condition = exp_throughput_latency.get_condition(c, mode)
        mode_median, mode_std = mode_condition.get_median_throughput_with_std()
        medians_out.append(mode_median)
        stds_out.append(mode_std)

x = np.arange(len(condition_labels))
width = 0.20  # Width of the bars

# Options shared by both bar groups
bar_options = dict(capsize=5, alpha=0.8)

fig, ax = plt.subplots(figsize=(10, 6))
# TEE-off bars sit left of each tick, TEE-on bars right of it
bars1 = ax.bar(
    x - width / 2,
    throughput_medians_tee_off,
    width,
    yerr=tee_off_stds,
    label="TEE off",
    **bar_options,
)
bars2 = ax.bar(
    x + width / 2,
    throughput_medians_tee_on,
    width,
    yerr=tee_on_stds,
    label="TEE On",
    **bar_options,
)

# Axis labels, title, ticks and legend
ax.set_title("Throughput Comparison: TEE Off vs TEE On")
ax.set_xlabel("Model")
ax.set_ylabel("Output throughput (tok/s)")
ax.set_xticks(x)
ax.set_xticklabels(condition_labels, rotation=45, ha="right")
ax.legend()

plt.show()

### 1.3. Latency

In [None]:
def get_ttft_latency_metrics(condition: Condition):
    """Summarise the ``ttfts`` values of a condition across its runs.

    Each run is first reduced to its median and its 95th percentile; those
    per-run figures are then aggregated over the runs.

    Returns:
        A 4-tuple ``(median of run medians, sample std of run medians,
        median of run p95s, sample std of run p95s)``; both standard
        deviations use ``ddof=1``.
    """
    # NOTE(review): results keep the unit of the stored "ttfts" values — confirm
    # it matches the unit shown on the plot.
    per_run_ttfts = [run.get_vllm_key("ttfts") for run in condition.get_all_runs()]
    per_run_median = [np.median(values) for values in per_run_ttfts]
    per_run_p95 = [np.percentile(values, 95) for values in per_run_ttfts]
    return (
        np.median(per_run_median),
        np.std(per_run_median, ddof=1),
        np.median(per_run_p95),
        np.std(per_run_p95, ddof=1),
    )

#### 1.3.1. Prefill: Time to First Token (TTFT)

In [None]:
# NOTE(review): `models`, `medians`, `stds`, `p95s` and `std_p95s` are filled here but
# never read again in this cell — confirm whether they are still needed.
models, medians, stds, p95s, std_p95s = [], [], [], [], []

for c in exp_throughput_latency.conditions:
    models.append(c.model_name)
    ttft_stats = get_ttft_latency_metrics(c)
    for bucket, value in zip((medians, stds, p95s, std_p95s), ttft_stats):
        bucket.append(value)

condition_labels = []
latency_medians_tee_on, tee_on_stds = [], []
latency_medians_tee_off, tee_off_stds = [], []
p95_latencies_tee_on = []
p95_latencies_tee_off = []

# For each TEE mode: the lists receiving median, std and p95 (TEE on is queried first)
ttft_collectors = (
    (TEE_Mode.TEE_ON, latency_medians_tee_on, tee_on_stds, p95_latencies_tee_on),
    (TEE_Mode.TEE_OFF, latency_medians_tee_off, tee_off_stds, p95_latencies_tee_off),
)

for c in exp_throughput_latency.get_all_conditions_names(sort_by_model_size=True):
    condition_labels.append(c)
    for mode, medians_out, stds_out, p95s_out in ttft_collectors:
        mode_condition = exp_throughput_latency.get_condition(c, mode)
        # The std of the p95 values is returned as well but not plotted
        mode_median, mode_std, mode_p95, mode_std_p95 = get_ttft_latency_metrics(mode_condition)
        medians_out.append(mode_median)
        stds_out.append(mode_std)
        p95s_out.append(mode_p95)

y = np.arange(len(condition_labels))
height = 0.20  # Height of the bars

# Options shared by both bar groups
bar_options = dict(capsize=5, alpha=0.8)

fig, ax = plt.subplots(figsize=(10, 6))
# TEE-off bars sit below each tick position, TEE-on bars above it
bars1 = ax.barh(
    y - height / 2,
    latency_medians_tee_off,
    height,
    xerr=tee_off_stds,
    label="TEE off",
    **bar_options,
)
bars2 = ax.barh(
    y + height / 2,
    latency_medians_tee_on,
    height,
    xerr=tee_on_stds,
    label="TEE On",
    **bar_options,
)

# p95 whiskers: a thin line from the bar end (median) to a dot at the p95 value
whisker_groups = (
    (-height / 2, latency_medians_tee_off, p95_latencies_tee_off),
    (+height / 2, latency_medians_tee_on, p95_latencies_tee_on),
)
for offset, bar_ends, p95_marks in whisker_groups:
    for i, (bar_end, p95_mark) in enumerate(zip(bar_ends, p95_marks)):
        row_y = i + offset
        ax.plot([bar_end, p95_mark], [row_y, row_y], color="black", lw=1.5)
        ax.scatter(p95_mark, row_y, color="black", s=20, zorder=3)

# Axis labels, title, ticks and legend
# NOTE(review): the axis says "ms" but the plotted values are the raw "ttfts"
# entries, unscaled — vLLM presumably stores them in seconds; confirm the unit.
ax.set_title("Latency Comparison: TEE Off vs TEE On")
ax.set_xlabel("Time to First Token (ms)")
ax.set_ylabel("Model")
ax.set_yticks(y)
ax.set_yticklabels(condition_labels)
ax.legend()
#ax.invert_yaxis()  # highest on top

plt.show()

In [None]:
# NOTE(review): looks like a leftover lookup. The function defined below takes its own
# `condition` parameter, and "Gemma3 1B" differs from names such as "gemma-3-1b-it"
# used elsewhere in this notebook; get_condition returns None when nothing matches — confirm.
condition = exp_throughput_latency.get_condition("Gemma3 1B", TEE_Mode.TEE_ON)


def get_decode_latency_metrics(condition: Condition):
    """Estimate the decode latency of a condition across its runs.

    For each run, the ``median_tpot_ms`` and ``p95_tpot_ms`` entries are
    multiplied by the mean of ``output_lens``; the per-run estimates are
    then aggregated over the runs.

    Returns:
        A 4-tuple ``(median of run estimates, sample std of run estimates,
        median of run p95 estimates, sample std of run p95 estimates)``;
        both standard deviations use ``ddof=1``.
    """
    estimated_medians, estimated_p95s = [], []

    for run in condition.get_all_runs():
        # Aggregated per-token figures reported for the run
        tpot_median = run.get_vllm_key("median_tpot_ms")
        tpot_p95 = run.get_vllm_key("p95_tpot_ms")

        # Average number of output tokens in this run
        avg_output_len = np.asarray(run.get_vllm_key("output_lens")).mean()

        # Scale the per-token figures to a whole response
        estimated_medians.append(tpot_median * avg_output_len)
        estimated_p95s.append(tpot_p95 * avg_output_len)

    return (
        np.median(estimated_medians),
        np.std(estimated_medians, ddof=1),
        np.median(estimated_p95s),
        np.std(estimated_p95s, ddof=1),
    )

#### 1.3.2. Decode latency

In [None]:
condition_labels_decode = []
decode_latency_medians_tee_on, decode_tee_on_stds = [], []
decode_latency_medians_tee_off, decode_tee_off_stds = [], []
p95_decode_latencies_tee_on = []
p95_decode_latencies_tee_off = []

# For each TEE mode: the lists receiving median, std and p95 (TEE on is queried first)
decode_collectors = (
    (TEE_Mode.TEE_ON, decode_latency_medians_tee_on, decode_tee_on_stds, p95_decode_latencies_tee_on),
    (TEE_Mode.TEE_OFF, decode_latency_medians_tee_off, decode_tee_off_stds, p95_decode_latencies_tee_off),
)

for c in exp_throughput_latency.get_all_conditions_names(sort_by_model_size=True):
    condition_labels_decode.append(c)
    for mode, medians_out, stds_out, p95s_out in decode_collectors:
        mode_condition = exp_throughput_latency.get_condition(c, mode)
        # The std of the p95 values is returned as well but not plotted
        mode_median, mode_std, mode_p95, mode_std_p95 = get_decode_latency_metrics(mode_condition)
        medians_out.append(mode_median)
        stds_out.append(mode_std)
        p95s_out.append(mode_p95)

y = np.arange(len(condition_labels_decode))
height = 0.20  # Height of the bars

# Options shared by both bar groups
bar_options = dict(capsize=5, alpha=0.8)

fig, ax = plt.subplots(figsize=(10, 6))
# TEE-off bars sit below each tick position, TEE-on bars above it
bars1 = ax.barh(
    y - height / 2,
    decode_latency_medians_tee_off,
    height,
    xerr=decode_tee_off_stds,
    label="TEE off",
    **bar_options,
)
bars2 = ax.barh(
    y + height / 2,
    decode_latency_medians_tee_on,
    height,
    xerr=decode_tee_on_stds,
    label="TEE On",
    **bar_options,
)

# p95 whiskers: a thin line from the bar end (median) to a dot at the p95 value
whisker_groups = (
    (-height / 2, decode_latency_medians_tee_off, p95_decode_latencies_tee_off),
    (+height / 2, decode_latency_medians_tee_on, p95_decode_latencies_tee_on),
)
for offset, bar_ends, p95_marks in whisker_groups:
    for i, (bar_end, p95_mark) in enumerate(zip(bar_ends, p95_marks)):
        row_y = i + offset
        ax.plot([bar_end, p95_mark], [row_y, row_y], color="black", lw=1.5)
        ax.scatter(p95_mark, row_y, color="black", s=20, zorder=3)

# Axis labels, title, ticks and legend
ax.set_title("Decode Latency Comparison: TEE Off vs TEE On")
ax.set_xlabel("Decode Latency (ms)")
ax.set_ylabel("Model")
ax.set_yticks(y)
ax.set_yticklabels(condition_labels_decode)
ax.legend()
#ax.invert_yaxis()  # highest on top

plt.show()

## 2. Saturation point

In [None]:
# vllm_data.get("max_concurrent_requests")

## 3. Sequence length overhead

## 4. Energy efficiency

In [None]:
# Clean columns for plotting
def pre_process_gpu_metrics_for_condition(m: Run, max_watts: float = 700) -> pd.DataFrame:
    """Add clean, numeric plotting columns to the GPU metrics of a run.

    The new columns are written into ``m.gpu_metrics`` itself, so they stay
    on the run's cached DataFrame; calling the function again recomputes
    the same columns.

    Args:
        m: Run whose GPU metrics are processed.
        max_watts: Power draw treated as 100 % when computing
            ``power_draw_percent``. Defaults to 700.

    Returns:
        The run's DataFrame with the columns ``power_draw_watts``,
        ``power_draw_percent``, ``utilization_gpu_percent``,
        ``utilization_memory_percent`` and ``temperature_gpu_celsius`` added.
    """
    gpu_metrics = m.gpu_metrics

    def _strip_unit(column: str, unit: str) -> pd.Series:
        # Drop the trailing unit, then the surrounding blanks, and convert to float.
        return gpu_metrics[column].str.rstrip(unit).str.strip().astype(float)

    # The source column names start with a blank, exactly as they appear in the CSV header.
    gpu_metrics["power_draw_watts"] = _strip_unit(" power.draw [W]", "W")
    gpu_metrics["power_draw_percent"] = gpu_metrics["power_draw_watts"] / max_watts * 100
    gpu_metrics["utilization_gpu_percent"] = _strip_unit(" utilization.gpu [%]", "%")
    gpu_metrics["utilization_memory_percent"] = _strip_unit(" utilization.memory [%]", "%")
    # The temperature column is copied as-is, without any unit stripping.
    gpu_metrics["temperature_gpu_celsius"] = gpu_metrics[" temperature.gpu"]
    return gpu_metrics

In [None]:
def plot_gpu_metrics_for_condition(condition: Condition):
    """Draw one subplot per run of ``condition`` showing its GPU metrics.

    Subplots are laid out on a grid of at most three columns. Each one plots
    power draw, GPU utilization, memory utilization and temperature against
    the sample index, on a y-axis fixed to 0-100. The figure is shown with
    ``plt.show()``; nothing is returned.
    """
    measurements = condition.get_all_runs()
    n_measurements = len(measurements)

    # Grid shape: up to three columns, as many rows as needed
    n_cols = min(3, n_measurements)
    n_rows = -(-n_measurements // n_cols)  # Ceiling division

    # squeeze=False always yields a 2-D array, so one flatten covers every grid shape
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False
    )
    axes = axes.flatten()

    # (DataFrame column, legend label) in drawing order
    plotted_series = (
        ("power_draw_percent", "Power Draw (%)"),
        ("utilization_gpu_percent", "GPU Utilization (%)"),
        ("utilization_memory_percent", "Memory Utilization (%)"),
        ("temperature_gpu_celsius", "Temperature (°C)"),
    )

    for ax, measurement in zip(axes, measurements):
        gpu_metrics = pre_process_gpu_metrics_for_condition(measurement)

        for column, label in plotted_series:
            ax.plot(gpu_metrics.index, gpu_metrics[column], linewidth=1.5, alpha=0.8, label=label)

        ax.set_ylim(0, 100)
        ax.set_yticks(range(0, 101, 10))
        ax.set_xlabel("Sample Index")
        ax.set_ylabel("Value")
        ax.set_title(f"GPU Usage Over Time - Measurement {measurement.index}")
        ax.legend(fontsize=8)
        ax.grid(True, alpha=0.3)

    # Grid cells without a run stay empty: hide them
    for spare_ax in axes[n_measurements:]:
        spare_ax.set_visible(False)

    # Figure-level title
    fig.suptitle(
        f"GPU Metrics: {condition.name} ({condition.tee_mode.value})",
        fontsize=14,
        fontweight="bold",
        y=0.995,
    )

    plt.tight_layout()
    plt.show()

In [None]:
# TEE-on condition of each model, looked up by condition name
gemma = exp_throughput_latency.get_condition("gemma-3-1b-it", TEE_Mode.TEE_ON)
llama = exp_throughput_latency.get_condition("Llama-3.1-8B-Instruct", TEE_Mode.TEE_ON)
mistral = exp_throughput_latency.get_condition("Mistral-Small-24B-Instruct-2501", TEE_Mode.TEE_ON)
qwen = exp_throughput_latency.get_condition("Qwen3-32B", TEE_Mode.TEE_ON)

# One figure per model, in the order the conditions were fetched
for tee_on_condition in (gemma, llama, mistral, qwen):
    plot_gpu_metrics_for_condition(tee_on_condition)

## 5. Price of operations