<!-- du4://thèse/cai/results.ipynb?d=20251024?loc=ttum?hPa=1020 -->

# Confidential Artificial Intelligence: What's the Catch?
### _Performance and costs_

In [None]:
import json
import matplotlib.pyplot as plt
import math
import numpy as np
import pandas as pd
import re
import seaborn as sns

from dataclasses import dataclass, field
from functools import cached_property
from pathlib import Path
from typing import List, Tuple

In [None]:
# Global plot styling for every figure drawn below: the seaborn "ticks" style
# scaled for the "paper" context, followed by the "colorblind" palette.
# The palette is set in a separate call on purpose (it is not passed to
# set_theme), exactly as in the original cell.
sns.set_theme(context="paper", style="ticks")
sns.set_palette(palette="colorblind")

In [None]:
# Root directory under which the experiment result folders are looked up.
# FIXME (flag kept from the original cell): the author marked this location
# as needing attention -- confirm it points at the right data directory.
data_path = Path("data") / "ko"

In [None]:
@dataclass
class Experiment:
    """An experiment directory and the conditions found beneath it.

    Conditions are discovered two levels below ``path``: every directory
    matching ``<path>/<anything>/tee_on`` or ``<path>/<anything>/tee_off``
    becomes one :class:`Condition`.
    """

    path: Path  # Root directory of the experiment

    @property
    def name(self) -> str:
        """Return the name of the experiment directory."""
        # ``Path.name`` rather than ``Path.stem``: the path is a directory, so
        # a dot in its name (e.g. "run_v1.5") is not an extension to strip.
        return self.path.name

    @cached_property
    def conditions(self) -> List["Condition"]:
        """Return every condition found on disk (computed once, then cached).

        Each condition is named after the parent of its ``tee_on`` /
        ``tee_off`` directory, and its ``tee_on`` flag is true only for
        ``tee_on`` directories.
        """
        # ``glob`` yields entries in an unspecified order; sorting keeps the
        # condition order (and tie-breaks when sorting by model size)
        # reproducible across machines and file systems.
        return [
            Condition(q, q.parent.name, q.name == "tee_on")
            for q in sorted(self.path.glob("*/*"))
            if q.is_dir() and q.name in {"tee_on", "tee_off"}
        ]

    def get_all_conditions(self):
        """Return all conditions, TEE on and off alike."""
        return self.conditions

    def get_all_conditions_names(self, sort_by_model_size: bool = False):
        """Return the distinct condition names, in first-seen order.

        Args:
            sort_by_model_size: When true, conditions are first ordered by
                ``int(condition.model_size)`` (ascending) before the names
                are de-duplicated.

        Returns:
            A list of unique condition names.
        """
        ordered = self.conditions
        if sort_by_model_size:
            # ``sorted`` is stable, so equal sizes keep their discovery order.
            ordered = sorted(ordered, key=lambda c: int(c.model_size))
        # ``dict.fromkeys`` drops duplicates while preserving order (the same
        # name normally appears once per TEE state).
        return list(dict.fromkeys(c.name for c in ordered))

    def get_conditions(self, tee_on: bool = False):
        """Return the conditions whose ``tee_on`` flag equals ``tee_on``."""
        return [c for c in self.conditions if c.tee_on == tee_on]

    def get_condition(self, name: str, tee_on: bool = False) -> "Condition | None":
        """Return the first condition matching ``name`` and ``tee_on``.

        Returns:
            The matching condition, or ``None`` when there is no match;
            callers must handle the ``None`` case before using the result.
        """
        return next(
            (c for c in self.conditions if c.name == name and c.tee_on == tee_on),
            None,
        )

    def __str__(self):
        """Return a one-line summary: name, absolute path and condition names."""
        return f"Experiment: {self.name}, Path: {self.path.absolute()}, {len(self.conditions)} conditions: {[c.name for c in self.conditions]}"

@dataclass
class Condition:
    """One result directory of an experiment: a model with TEE on or off."""

    path: Path  # Directory holding the repetition result files
    name: str  # Condition label (Experiment passes the parent directory name)
    tee_on: bool  # Experiment sets this to True for "tee_on" directories

    @property
    def model_name(self) -> List[str]:
        """Return the parent directory name split on underscores."""
        return self.path.parent.name.split("_")

    @property
    def model_size(self) -> str:
        """Return the digits preceding the first ``b``/``B`` in the model directory name.

        Only whole numbers are recognised by the pattern (``8B`` gives
        ``"8"``); the digits are returned as a string.

        Raises:
            ValueError: If the directory name contains no ``<digits>b`` token.
        """
        directory_name = self.path.parent.name
        match = re.search(r"(\d+)[bB]", directory_name)
        if match is None:
            # Fail with an explicit message instead of an AttributeError on None.
            raise ValueError(
                f"Cannot find a model size (e.g. '8B') in directory name {directory_name!r}"
            )
        return match.group(1)

    @staticmethod
    def _repetition_sort_key(file_path: Path) -> Tuple[int, int, str]:
        """Sort key ordering files by the number that follows ``repetition_``."""
        match = re.search(r"repetition_(\d+)", file_path.name)
        if match is None:
            # No number found: place after the numbered files, ordered by name.
            return (1, 0, file_path.name)
        return (0, int(match.group(1)), file_path.name)

    @cached_property
    def repetitions(self) -> List["Repetition"]:
        """Return one ``Repetition`` per JSON result file (computed once, then cached).

        Files are picked up with the ``*repetition_*`` glob. Each JSON file is
        paired with ``<json stem>_power_metrics.csv`` in the same directory.
        JSON files are ordered by repetition number, so that
        ``repetition_10`` comes after ``repetition_2`` and each
        ``Repetition.index`` follows the numeric order.

        Raises:
            ValueError: If no repetition file is found, or if the number of
                ``.json`` files differs from the number of ``.csv`` files.
        """
        repetition_files = list(self.path.glob("*repetition_*"))
        if not repetition_files:
            raise ValueError(f"Empty results: no repetition files in {self.path}")

        json_files = sorted(
            (f for f in repetition_files if f.suffix == ".json"),
            key=self._repetition_sort_key,
        )
        csv_count = sum(1 for f in repetition_files if f.suffix == ".csv")
        if len(json_files) != csv_count:
            raise ValueError(
                f"Mismatch: {len(json_files)} .json vs. {csv_count} .csv in {self.path}"
            )

        return [
            Repetition(
                idx, json_file, self.path / f"{json_file.stem}_power_metrics.csv"
            )
            for idx, json_file in enumerate(json_files)
        ]

    def get_all_repetitions(self) -> List["Repetition"]:
        """Return every repetition of this condition."""
        return self.repetitions

    def get_repetition(self, tee_on: bool, index: int) -> "Repetition":
        """Return the repetition at position ``index``.

        Args:
            tee_on: Unused; a condition already has a single TEE state. The
                parameter is kept so existing callers keep working.
            index: Position in ``self.repetitions``.
        """
        return self.repetitions[index]

    def get_median_throughput_with_std(self) -> Tuple[float, float]:
        """Return the median and standard deviation of ``output_throughput``.

        The values are read from every repetition's result file; the
        standard deviation is ``numpy.std`` with its default settings.
        """
        output_throughputs = [
            rep.get_vllm_key("output_throughput") for rep in self.repetitions
        ]
        return np.median(output_throughputs), np.std(output_throughputs)

@dataclass
class Repetition:
    """A single repetition of a condition: one JSON result file plus one CSV file."""

    index: int  # Position of this repetition within its condition
    path_vllm_json: Path  # JSON file holding the benchmark results
    path_power_csv: Path  # CSV file holding the power measurements

    @cached_property
    def vllm_results(self) -> dict:
        """Return the parsed JSON result file (read once, then cached)."""
        # JSON is UTF-8 by specification; decode explicitly instead of relying
        # on the platform's locale encoding.
        return json.loads(self.path_vllm_json.read_text(encoding="utf-8"))

    @cached_property
    def power_results(self) -> pd.DataFrame:
        """Return the power CSV as a DataFrame (read once, then cached)."""
        return pd.read_csv(self.path_power_csv)

    @property
    def dataset(self) -> str:
        """Return the ``"dataset"`` entry of the JSON results."""
        return self.vllm_results["dataset"]

    @property
    def model_id(self) -> str:
        """Return the ``"model"`` entry of the JSON results."""
        return self.vllm_results["model"]

    @property
    def input_length(self) -> int:
        """Return the ``"input_length"`` entry of the JSON results."""
        return self.vllm_results["input_length"]

    @property
    def output_length(self) -> int:
        """Return the ``"output_length"`` entry of the JSON results."""
        return self.vllm_results["output_length"]

    @property
    def concurrency(self) -> int:
        """Return the ``"concurrency"`` entry of the JSON results."""
        return self.vllm_results["concurrency"]

    @property
    def temperature(self) -> float:
        """Return the ``"temperature"`` entry of the JSON results."""
        return self.vllm_results["temperature"]

    def get_vllm_key(self, key: str):
        """Return the JSON result stored under ``key``.

        Raises:
            KeyError: If ``key`` is not present in the results.
        """
        return self.vllm_results[key]

## 0. Data summary

- Number of runs
- Total accumulated time (+ estimated cost)

## 1. Throughput and Latency

In [None]:
# Experiment rooted at the "throughput_latency" sub-folder of the data directory.
experiment_throughput_latency = Experiment(data_path / "throughput_latency")

### Data summary

In [None]:
# Summary table: one record per (condition, repetition) pair, displayed as a
# DataFrame by the last expression of the cell.
rows = []
for condition in experiment_throughput_latency.get_all_conditions():
    repetitions_paths = condition.get_all_repetitions()

    for rep in repetitions_paths:
        # Every entry of the "errors" list must be an empty string.
        assert all(e == "" for e in rep.get_vllm_key("errors")), (
            "vLLM reported an error. Check .json."
        )

        # Looked up once; used for both the duration and the cost columns.
        duration_s = rep.get_vllm_key("duration")
        record = {
            # Identification of the run
            "condition": condition.name,
            "tee_on": condition.tee_on,
            "repetition #": rep.index,
            "duration (s)": round(duration_s),
            # Throughput
            "output throughput (tok/s)": rep.get_vllm_key("output_throughput"),
            "total token throughput (tok/s)": rep.get_vllm_key(
                "total_token_throughput"
            ),
            # Cost: duration converted to hours, times 7
            # (presumably the hourly price in EUR -- TODO confirm).
            "Azure cost (€)": round(duration_s / 3600 * 7, 3),
        }
        rows.append(record)

pd.DataFrame(rows)

In [None]:
# List the distinct condition names, smallest model first, one per line.
sorted_condition_names = experiment_throughput_latency.get_all_conditions_names(
    sort_by_model_size=True
)
for condition in sorted_condition_names:
    print(condition)

In [None]:
# Median output throughput and its standard deviation per condition,
# collected separately for the TEE-on and TEE-off runs.
condition_labels = []
tee_on_medians, tee_on_stds = [], []
tee_off_medians, tee_off_stds = [], []

# (TEE state, target list for medians, target list for stds); TEE-on first.
series_by_tee_state = (
    (True, tee_on_medians, tee_on_stds),
    (False, tee_off_medians, tee_off_stds),
)

for condition in experiment_throughput_latency.get_all_conditions_names(sort_by_model_size=True):
    condition_labels.append(condition)
    for tee_state, medians, stds in series_by_tee_state:
        selected = experiment_throughput_latency.get_condition(condition, tee_state)
        median, std = selected.get_median_throughput_with_std()
        medians.append(median)
        stds.append(std)

# Grouped bar chart: one pair of bars per condition, error bars show the std.
x = np.arange(len(condition_labels))
width = 0.20  # Width of a single bar

fig, ax = plt.subplots(figsize=(10, 6))
shared_bar_options = {"capsize": 5, "alpha": 0.8}
# TEE-on bars sit half a bar-width left of each tick, TEE-off half a width right.
bars1 = ax.bar(
    x - width / 2,
    tee_on_medians,
    width,
    yerr=tee_on_stds,
    label="TEE On",
    **shared_bar_options,
)
bars2 = ax.bar(
    x + width / 2,
    tee_off_medians,
    width,
    yerr=tee_off_stds,
    label="TEE Off",
    **shared_bar_options,
)

# Axis labels, title, tick labels and legend
ax.set(
    xlabel="Conditions",
    ylabel="Throughput",
    title="Throughput Comparison: TEE On vs TEE Off",
)
ax.set_xticks(x)
ax.set_xticklabels(condition_labels, rotation=45, ha="right")
ax.legend()

plt.show()


## 2. Saturation point

In [None]:
# vllm_data.get("max_concurrent_requests")

## 3. Sequence length overhead

## 5. Energy efficiency

## 4. Price of operations