<!-- du4://thèse/cai/results.ipynb?d=20251024?loc=ttum?hPa=1020 -->

# Confidential Artificial Intelligence: What's the Catch?
### _Performance and costs_

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns

from dataclasses import dataclass
from enum import Enum
from functools import cached_property
from pathlib import Path
from typing import List, Tuple

In [None]:
sns.set_theme(style="ticks", context="paper")
sns.set_palette("colorblind")

In [None]:
class TEE_Mode(Enum):
    """Whether a benchmark condition was run with the TEE switched on or off.

    The member values are the result sub-directory names ("tee_on" /
    "tee_off") that `Experiment.conditions` matches against.
    """

    TEE_ON = "tee_on"
    TEE_OFF = "tee_off"

@dataclass
class Experiment:
    """One benchmark experiment, backed by a directory of result files.

    The directory is expected to be laid out as
    ``<path>/<condition name>/<tee_on|tee_off>/``; every such leaf directory
    becomes one `Condition`.
    """

    path: Path

    @property
    def name(self) -> str:
        """Name of the experiment: last component of `path`, without suffix."""
        return self.path.stem

    @cached_property
    def conditions(self) -> List["Condition"]:
        """All conditions found on disk, discovered once and then cached.

        The glob result is sorted because directory iteration order depends
        on the file system; sorting keeps tables and plots reproducible.
        """
        tee_dir_names = {mode.value for mode in TEE_Mode}
        return [
            # The filter below guarantees q.name is a valid TEE_Mode value.
            Condition(q, q.parent.name, TEE_Mode(q.name))
            for q in sorted(self.path.glob("*/*"))
            if q.is_dir() and q.name in tee_dir_names
        ]

    def get_all_conditions_names(self, sort_by_model_size: bool = False) -> List[str]:
        """Return the distinct condition names, in first-seen order.

        Args:
            sort_by_model_size: when true, order the names by the integer
                value of `Condition.model_size` (ascending, stable).
        """
        conditions = self.conditions
        if sort_by_model_size:
            conditions = sorted(conditions, key=lambda c: int(c.model_size))
        # dict.fromkeys de-duplicates while preserving order (each name
        # normally appears twice: once per TEE mode).
        return list(dict.fromkeys(c.name for c in conditions))

    def get_conditions(self, tee_mode: TEE_Mode) -> List["Condition"]:
        """Return the conditions that were run in the given TEE mode."""
        return [c for c in self.conditions if c.tee_mode == tee_mode]

    def get_condition(self, name: str, tee_mode: TEE_Mode):
        """Return the first condition matching `name` and `tee_mode`, or None."""
        return next(
            (c for c in self.conditions if c.name == name and c.tee_mode == tee_mode),
            None,
        )

    def get_all_measurements(self) -> List["Measurement"]:
        """Return the measurements of every condition, flattened."""
        return [r for c in self.conditions for r in c.measurements]

    def get_measurements(self, tee_mode: TEE_Mode) -> List["Measurement"]:
        """Return the measurements of the conditions run in `tee_mode`.

        Conditions are filtered *before* their measurements are accessed, so
        result files of the other TEE mode are neither listed nor validated.
        """
        return [r for c in self.get_conditions(tee_mode) for r in c.measurements]

    def __str__(self):
        name = self.name
        nb_conditions = len(self.conditions)
        nb_total_measurements = len(self.get_all_measurements())
        return f"Experiment: {name}, Conditions: {nb_conditions} ({nb_total_measurements} total measurements)"

@dataclass
class Condition:
    """One benchmarked configuration: a model directory combined with a TEE mode.

    Attributes:
        path: Directory holding the result files of this condition.
        name: Label of the condition.
        tee_mode: TEE mode the measurements were taken in.
    """

    path: Path
    name: str
    tee_mode: TEE_Mode

    @property
    def model_name(self) -> List[str]:
        """Parent directory name split on underscores (a list of parts)."""
        return self.path.parent.name.split("_")

    @property
    def model_size(self) -> str:
        """Digits preceding the first ``b``/``B`` in the parent directory name.

        For example ``"gemma-3-1b-it"`` gives ``"1"``.

        NOTE(review): only whole digits are captured, so a fractional size
        such as ``0.5B`` would yield ``"5"`` — confirm no such model is used.

        Raises:
            ValueError: if the name contains no ``<digits>b`` token.
        """
        joined_name = "_".join(self.model_name)
        match = re.search(r"(\d+)[bB]", joined_name)
        if match is None:
            raise ValueError(f"Cannot infer model size from directory name {joined_name!r}")
        return match.group(1)

    @staticmethod
    def _natural_sort_key(file: Path) -> list:
        """Sort key ordering embedded numbers numerically (2 before 10)."""
        # re.split with a capturing group alternates text / digits / text ...,
        # so odd positions are always digit runs and keys stay comparable.
        parts = re.split(r"(\d+)", file.name)
        return [int(part) if i % 2 else part for i, part in enumerate(parts)]

    @cached_property
    def measurements(self) -> List["Measurement"]:
        """Measurements found in `path`, discovered once and then cached.

        Every ``*repetition_*.json`` file is paired with the CSV named
        ``<json stem>_power_metrics.csv`` in the same directory. Files are
        ordered naturally, so indices follow the repetition numbers.

        Raises:
            FileNotFoundError: if the directory holds no result files, or a
                JSON file has no matching power-metrics CSV.
            ValueError: if the numbers of JSON and CSV files differ.
        """
        measurements_paths = list(self.path.glob("*repetition_*"))  # FIXME repetition -> measurement
        if not measurements_paths:
            raise FileNotFoundError(f"Empty results: no '*repetition_*' files in {self.path}")

        json_files = sorted(
            (r for r in measurements_paths if r.suffix == ".json"),
            key=self._natural_sort_key,
        )
        csv_files = {r for r in measurements_paths if r.suffix == ".csv"}
        if len(json_files) != len(csv_files):
            raise ValueError(
                f"Mismatch: {len(json_files)} .json vs. {len(csv_files)} .csv: {measurements_paths}"
            )

        measurements = []
        for idx, json_file in enumerate(json_files):
            power_csv = self.path / f"{json_file.stem}_power_metrics.csv"
            # Equal counts do not prove correct pairing; check each pair.
            if power_csv not in csv_files:
                raise FileNotFoundError(
                    f"Missing power metrics for {json_file.name}: expected {power_csv}"
                )
            measurements.append(Measurement(idx, json_file, power_csv))
        return measurements

    def get_all_measurements(self) -> List["Measurement"]:
        """Return every measurement of this condition."""
        return self.measurements

    def get_measurement(self, index: int) -> "Measurement":
        """Return the measurement at position `index`."""
        return self.measurements[index]

    def get_median_throughput_with_std(self) -> Tuple[float, float]:
        """Return ``(median, standard deviation)`` of the output throughput.

        The standard deviation is NumPy's default (population, ``ddof=0``).
        """
        output_throughputs = [
            rep.get_vllm_key("output_throughput") for rep in self.measurements
        ]
        return float(np.median(output_throughputs)), float(np.std(output_throughputs))

@dataclass
class Measurement:
    """One benchmark repetition: a vLLM result file plus its GPU metrics log.

    Attributes:
        index: Position of the measurement within its condition.
        path_vllm_json: JSON file with the vLLM benchmark results.
        path_power_csv: CSV file with the sampled GPU metrics.
    """

    index: int
    path_vllm_json: Path
    path_power_csv: Path

    @cached_property
    def vllm_results(self) -> dict:
        """Parsed content of `path_vllm_json` (read once, then cached)."""
        # Passing bytes lets json.loads detect UTF-8/16/32 itself instead of
        # decoding with the platform's locale encoding, as read_text() would.
        return json.loads(self.path_vllm_json.read_bytes())

    @cached_property
    def gpu_metrics(self) -> pd.DataFrame:
        """Content of `path_power_csv` as a DataFrame (read once, then cached).

        Column names are kept exactly as written in the file, including any
        leading spaces.
        """
        return pd.read_csv(self.path_power_csv)

    @property
    def dataset(self) -> str:
        """Value of the ``"dataset"`` key of the vLLM results."""
        return self.vllm_results["dataset"]

    @property
    def model_id(self) -> str:
        """Value of the ``"model"`` key of the vLLM results."""
        return self.vllm_results["model"]

    @property
    def input_length(self) -> int:
        """Value of the ``"input_length"`` key of the vLLM results."""
        return self.vllm_results["input_length"]

    @property
    def output_length(self) -> int:
        """Value of the ``"output_length"`` key of the vLLM results."""
        return self.vllm_results["output_length"]

    @property
    def concurrency(self) -> int:
        """Value of the ``"concurrency"`` key of the vLLM results."""
        return self.vllm_results["concurrency"]

    @property
    def temperature(self) -> float:
        """Value of the ``"temperature"`` key of the vLLM results."""
        return self.vllm_results["temperature"]

    def get_vllm_key(self, key: str):
        """Return ``vllm_results[key]``.

        Raises:
            KeyError: if the result file has no such key.
        """
        return self.vllm_results[key]

## 0. Data summary

In [None]:
# Root directory under which every experiment stores its results
data_path = Path("data") / "calibration"  # ← FIXME
# One Experiment per sub-directory of the data root
exp_throughput_latency = Experiment(data_path / "throughput_latency")
exp_saturation_point = Experiment(data_path / "saturation_point")
exp_sequence_overhead = Experiment(data_path / "sequence_overhead")
exp_energy = Experiment(data_path / "energy")
# Every experiment, gathered for the global summary
all_exps = (
    exp_throughput_latency,
    exp_saturation_point,
    exp_sequence_overhead,
    exp_energy,
)

In [None]:
def format_seconds_long(seconds: float) -> str:
    """Format a duration given in seconds as ``HH:MM:SS``.

    The fractional part is truncated. Hours are not wrapped at 24, so long
    durations simply produce more than two hour digits.
    """
    whole_seconds = int(seconds)
    hours = whole_seconds // 3600
    minutes = (whole_seconds % 3600) // 60
    secs = (whole_seconds % 3600) % 60
    return "{:02d}:{:02d}:{:02d}".format(hours, minutes, secs)

# Running totals across all experiments
nb_total_measurements = 0
duration_total = 0

print(f"• Number of experiments: {len(all_exps)}")
for exp in all_exps:
    print(f"  • {exp}")
    all_measurements = exp.get_all_measurements()
    nb_total_measurements += len(all_measurements)
    # Accumulate the "duration" reported by each measurement
    for measurement in all_measurements:
        duration_total += measurement.get_vllm_key("duration")

# The price estimate multiplies the total hours by a hard-coded 7 (€ per hour)
total_hours = duration_total / 3600
estimated_price = round(total_hours * 7, 3)

print(f"• Total measurements: {nb_total_measurements}")
print(f"• Total duration: {format_seconds_long(duration_total)}")
print(f"• Estimated Azure price: {estimated_price} €")


## 1. Throughput and Latency

### 1.1. Data summary

In [None]:
rows = []  # One dict per measurement, turned into a table at the end
for c in exp_throughput_latency.conditions:
    for m in c.measurements:
        # Every entry of the "errors" list must be an empty string
        reported_errors = m.get_vllm_key("errors")
        assert not any(e != "" for e in reported_errors), (
            f"vLLM reported an error during measurement. Check .json {m.path_vllm_json}"
        )

        duration_s = m.get_vllm_key("duration")
        row = {
            # Condition
            "condition": c.name,
            "tee_mode": c.tee_mode.value,
            # Measurement
            "Measurement #": m.index,
            "duration (s)": round(duration_s),
            # Throughput
            "output throughput (tok/s)": m.get_vllm_key("output_throughput"),
            "total token throughput (tok/s)": m.get_vllm_key("total_token_throughput"),
            # Latency TODO
            # Cloud: hours multiplied by a hard-coded 7 (€ per hour)
            "Azure cost (€)": round(duration_s / 3600 * 7, 3),
        }
        rows.append(row)

pd.DataFrame(rows)

### 1.2. Throughput

In [None]:
condition_labels = []
tee_on_medians, tee_on_stds = [], []
tee_off_medians, tee_off_stds = [], []

# Both TEE modes go through the same code path; each mode fills its own lists.
stats_by_mode = {
    TEE_Mode.TEE_ON: (tee_on_medians, tee_on_stds),
    TEE_Mode.TEE_OFF: (tee_off_medians, tee_off_stds),
}

for condition_name in exp_throughput_latency.get_all_conditions_names(sort_by_model_size=True):
    for mode, (medians, stds) in stats_by_mode.items():
        condition = exp_throughput_latency.get_condition(condition_name, mode)
        # get_condition returns None when a model lacks results for one mode;
        # fail with an explicit message instead of an AttributeError on None.
        if condition is None:
            raise ValueError(
                f"Condition {condition_name!r} has no {mode.value!r} results "
                f"in experiment {exp_throughput_latency.name!r}"
            )
        median, std = condition.get_median_throughput_with_std()
        medians.append(median)
        stds.append(std)
    condition_labels.append(condition_name)

# Plot grouped bar chart with error bars
x = np.arange(len(condition_labels))
width = 0.20  # Width of the bars

fig, ax = plt.subplots(figsize=(10, 6))
# One bar series per TEE mode, shifted half a bar width left / right of the tick
for offset, medians, stds, label in (
    (-width / 2, tee_on_medians, tee_on_stds, "TEE On"),
    (+width / 2, tee_off_medians, tee_off_stds, "TEE Off"),
):
    ax.bar(
        x + offset,
        medians,
        width,
        yerr=stds,
        label=label,
        capsize=5,
        alpha=0.8,
    )

# Add labels, title and legend
ax.set_xlabel("Conditions")
ax.set_ylabel("Throughput")
ax.set_title("Throughput Comparison: TEE On vs TEE Off")
ax.set_xticks(x)
ax.set_xticklabels(condition_labels, rotation=45, ha="right")
ax.legend()

plt.show()

### 1.3. Latency

In [None]:
# TODO

## 2. Saturation point

In [None]:
# vllm_data.get("max_concurrent_requests")

## 3. Sequence length overhead

## 4. Energy efficiency

In [None]:
# Clean columns for plotting
def pre_process_gpu_metrics_for_condition(m: "Measurement", max_watts: float = 700):
    """Return the GPU metrics of one measurement with plot-ready numeric columns.

    The original signature had the parameter name and its annotation swapped
    (``Measurement: m``), so the body silently read a global ``m`` instead of
    the argument. The argument is now actually used.

    Args:
        m: Measurement whose ``gpu_metrics`` DataFrame is processed.
        max_watts: Power corresponding to 100 % in ``power_draw_percent``.
            NOTE(review): 700 is presumably the GPU's power limit — confirm.

    Returns:
        A copy of ``m.gpu_metrics`` with the added columns
        ``power_draw_watts``, ``power_draw_percent``,
        ``utilization_gpu_percent``, ``utilization_memory_percent`` and
        ``temperature_gpu_celsius``. The cached DataFrame is left untouched.
    """

    def _strip_unit(column, unit):
        # Drop the trailing unit and surrounding blanks, then parse as float.
        return column.str.rstrip(unit).str.strip().astype(float)

    # Work on a copy: m.gpu_metrics is cached and shared between callers.
    gpu_metrics = m.gpu_metrics.copy()
    gpu_metrics["power_draw_watts"] = _strip_unit(gpu_metrics[" power.draw [W]"], "W")
    gpu_metrics["power_draw_percent"] = gpu_metrics["power_draw_watts"] / max_watts * 100
    gpu_metrics["utilization_gpu_percent"] = _strip_unit(gpu_metrics[" utilization.gpu [%]"], "%")
    gpu_metrics["utilization_memory_percent"] = _strip_unit(gpu_metrics[" utilization.memory [%]"], "%")
    gpu_metrics["temperature_gpu_celsius"] = gpu_metrics[" temperature.gpu"]
    return gpu_metrics

In [None]:
def plot_gpu_metrics_for_condition(condition: Condition):
    """Draw one panel of GPU metric curves per measurement of `condition`.

    Panels are laid out on a grid of at most three columns. Each panel plots
    power draw, GPU utilization, memory utilization and temperature against
    the sample index, on a shared 0-100 y-axis.
    """
    runs = condition.get_all_measurements()
    run_count = len(runs)

    # Grid layout: up to three panels per row, one extra row for a remainder
    n_cols = min(3, run_count)
    full_rows, leftover = divmod(run_count, n_cols)
    n_rows = full_rows + (1 if leftover else 0)

    # squeeze=False always yields a 2-D array, so a single panel needs no special case
    fig, grid = plt.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False
    )
    panels = grid.flatten()

    # (column, legend label) of every curve, in drawing order
    curves = (
        ("power_draw_percent", "Power Draw (%)"),
        ("utilization_gpu_percent", "GPU Utilization (%)"),
        ("utilization_memory_percent", "Memory Utilization (%)"),
        ("temperature_gpu_celsius", "Temperature (°C)"),
    )

    for panel, run in zip(panels, runs):
        metrics = pre_process_gpu_metrics_for_condition(run)

        for column, legend_label in curves:
            panel.plot(
                metrics.index,
                metrics[column],
                linewidth=1.5,
                alpha=0.8,
                label=legend_label,
            )

        panel.set_ylim(0, 100)
        panel.set_yticks(range(0, 101, 10))
        panel.set_xlabel("Sample Index")
        panel.set_ylabel("Value")
        panel.set_title(f"GPU Usage Over Time - Measurement {run.index}")
        panel.legend(fontsize=8)
        panel.grid(True, alpha=0.3)

    # Grid cells beyond the last measurement stay empty and are hidden
    for spare_panel in panels[run_count:]:
        spare_panel.set_visible(False)

    # Figure-level title naming the condition and its TEE mode
    fig.suptitle(
        f"GPU Metrics: {condition.name} ({condition.tee_mode.value})",
        fontsize=14,
        fontweight='bold',
        y=0.995,
    )

    plt.tight_layout()
    plt.show()

In [None]:
# GPU traces of gemma-3-1b-it, measured with the TEE on
c = exp_throughput_latency.get_condition(
    name="gemma-3-1b-it",
    tee_mode=TEE_Mode.TEE_ON,
)
plot_gpu_metrics_for_condition(condition=c)

In [None]:
# GPU traces of Llama-3.1-8B-Instruct, measured with the TEE off
c = exp_throughput_latency.get_condition(
    name="Llama-3.1-8B-Instruct",
    tee_mode=TEE_Mode.TEE_OFF,
)
plot_gpu_metrics_for_condition(condition=c)

## 5. Price of operations