In [1]:
import numpy as np

from parameters import *
from utils import *

# Hardware acquisition cost

ai_accelerator_cost = hardware_quantity * hardware_price

server_cost = ai_accelerator_cost * server_cost_overhead

cluster_cost = server_cost * cluster_interconnect_overhead

90% credible intervals:
- hardware_quantity: 0.8x to 1.25x of the central value. Generally confident, as it was usually reported directly by the developers. Could be slightly inaccurate if it doesn't account for hardware failures during training, or if it describes the number of chips available in a cluster rather than the number of chips actually employed for training.
- hardware_price: 0.5x to 2x of the central value. We know of two reports of NVIDIA's profit margin for Hopper GPUs: 1000% and 340%. So the minimum sale price could be 10 to 30% of the value we used. But this excludes R&D cost, and there should always be some profit, so we set the minimum higher than this. Big customers could be getting much bigger discounts compared to the prices we use. On the other hand, demand could have led to price spikes that weren't captured in our sparse data.
- server_cost_overhead: 1.3 to 2.1. The three actual values we calculated ranged from 1.54 (P100) to 1.69 (V100). A cost breakdown of a DGX H100 by Semianalysis (2023) https://www.semianalysis.com/p/ai-server-cost-analysis-memory-is implies a ratio of approximately 1.4 (total cost divided by "8 GPU + 4 NVSwitch Baseboard" cost).
- cluster_interconnect_overhead: 1.07 to 1.32. Prices of interconnect components are plausibly off by a factor of 2x due to variation in the brand, the merchant, and supply/demand.

In [6]:
# Monte Carlo estimate of the relative uncertainty in hardware acquisition cost.
# Each iteration draws one multiplier per model for every cost factor, multiplies
# them together, and records the median and the 5th/95th percentiles across models.
NUM_SAMPLES = 10000
num_ml_models = 47

hardware_acquisition_cost_median = np.zeros((NUM_SAMPLES,))
hardware_acquisition_cost_90_ci = np.zeros(shape=(NUM_SAMPLES, 2))

for i in range(NUM_SAMPLES):
    # TODO: special cases where hardware quantity is less confident e.g. GPT-4
    # The bounds below are the 90% credible intervals listed in the markdown above.
    hardware_quantity = lognorm_from_90_ci(0.8, 1.25, num_ml_models)
    hardware_price = lognorm_from_90_ci(0.5, 2, num_ml_models)
    server_cost_overhead = lognorm_from_90_ci(1.3, 2.1, num_ml_models)
    cluster_interconnect_overhead = lognorm_from_90_ci(1.07, 1.32, num_ml_models)

    # Accumulate the product one factor at a time:
    # accelerators -> servers -> whole cluster.
    hardware_acquisition_cost = hardware_quantity * hardware_price
    hardware_acquisition_cost = hardware_acquisition_cost * server_cost_overhead
    hardware_acquisition_cost = hardware_acquisition_cost * cluster_interconnect_overhead

    # Summary statistics across the models for this iteration.
    hardware_acquisition_cost_median[i] = np.median(hardware_acquisition_cost)
    hardware_acquisition_cost_90_ci[i, :] = np.percentile(hardware_acquisition_cost, [5, 95])


In [7]:
# Typical 5th/95th percentile bounds, expressed as multiples of the typical median.
np.divide(
    np.median(hardware_acquisition_cost_90_ci, axis=0),
    np.median(hardware_acquisition_cost_median),
)

array([0.48646084, 2.06352952])

# Amortized hardware CapEx

depreciation = 10 ** (ML_GPU_PRICE_PERFORMANCE_OOMS_PER_YEAR * years_since)

training_chip_hours = hardware_quantity * training_time

OR

training_chip_hours = training_compute / (peak_flop_per_second * hardware_utilization)

In the amortized hardware CapEx + energy approach, 32 values use the first method, 15 values use the second method.

amortized_hardware_capex = server_cost_overhead * cluster_interconnect_overhead * (hardware_price / depreciation) * ML_GPU_PRICE_PERFORMANCE_OOMS_PER_YEAR * np.log(10) / HOURS_PER_YEAR * training_chip_hours

90% credible intervals:

- ML_GPU_PRICE_PERFORMANCE_OOMS_PER_YEAR: 0.10 to 0.18, based on Hobbhahn et al. (2023)
- years_since: -120 days to +120 days from the central value.
  - years_since measures the difference between the time the hardware could first be acquired at large scale, and the model training start date. The default is at least 90 days between hardware release date and hardware acquisition date, and 60 days + training time between training start date and publication date. However, shipping could be almost immediate relative to the release date, especially if customers can pre-order. Preparing results after training may only take about 30 days, especially if the evaluations are ready to run and the model is announced before a detailed report is released. In total that's (90 - 0) + (60 - 30) = 120 fewer days than the default. We keep the bounds symmetric to fit a normal distribution, since years_since is used as an exponent in the formula.
- training_time: 0.5x to 2x of the central value. It was usually reported directly by the developers, but sometimes it's reported imprecisely - e.g. "2 days" is plausibly anywhere between 24 and 72 hours; "weeks" could mean 2 weeks or 6 weeks.
- training_compute: 0.33x to 3x of the central value - a rule of thumb based on the "Confident" Confidence level in the database.
- peak_flop_per_second: 0.5x to 2x of the central value. This allows for the possibility that we got the number format wrong.
- hardware_utilization: normal, 0.1 to 0.65. The range of values in our dataset is 0.19 to 0.56; we go a bit wider to be conservative.

In [18]:
# Monte Carlo estimate of the relative uncertainty in amortized hardware CapEx.
# Each iteration draws one value per model for every uncertain input, combines
# them, and records the median and the 5th/95th percentiles across models.
num_with_imputed_chip_hours = 15
num_with_direct_chip_hours = num_ml_models - num_with_imputed_chip_hours

# Hardware utilization is a fraction of peak throughput, so valid values lie in (0, 1].
# The floor is a guard against non-physical draws, not an empirical estimate; it is far
# below the smallest utilization observed in the dataset (0.19).
MIN_HARDWARE_UTILIZATION = 0.01
MAX_HARDWARE_UTILIZATION = 1.0

amortized_hardware_capex_medians = np.zeros(NUM_SAMPLES)
amortized_hardware_capex_90_cis = np.zeros((NUM_SAMPLES, 2))
for i in range(NUM_SAMPLES):
    # Depreciation factor: price-performance trend (OOMs per year) raised over the
    # error in the hardware-availability-to-training-start gap (days -> years).
    # NOTE(review): norm_from_ci(low, high, 90, n) presumably draws n samples from a
    # normal distribution whose 90% interval is [low, high] - confirm against utils.
    price_performance = norm_from_ci(0.10, 0.18, 90, num_ml_models)
    years_since = norm_from_ci(-120, 120, 90, num_ml_models) / DAYS_PER_YEAR
    depreciation = 10 ** (price_performance * years_since)

    # Cost factors shared by all models, using the same intervals as the
    # hardware acquisition cost analysis.
    hardware_price = lognorm_from_90_ci(0.5, 2, num_ml_models)
    server_cost_overhead = lognorm_from_90_ci(1.3, 2.1, num_ml_models)
    cluster_interconnect_overhead = lognorm_from_90_ci(1.07, 1.32, num_ml_models)

    # Method 2: chip-hours imputed from training compute and realised throughput.
    training_compute = lognorm_from_90_ci(1/3, 3, num_with_imputed_chip_hours)
    peak_flop_per_second = lognorm_from_90_ci(0.5, 2, num_with_imputed_chip_hours)
    hardware_utilization = norm_from_ci(0.1, 0.65, 90, num_with_imputed_chip_hours)
    # A normal with a 90% interval of [0.1, 0.65] puts roughly 1% of its mass at or
    # below zero. Dividing by such a draw yields negative or unbounded chip-hours,
    # so constrain the samples to the physically meaningful range first.
    hardware_utilization = np.clip(
        hardware_utilization, MIN_HARDWARE_UTILIZATION, MAX_HARDWARE_UTILIZATION
    )
    realised_flop_per_second = peak_flop_per_second * hardware_utilization
    imputed_chip_hours = training_compute / realised_flop_per_second

    # Method 1: chip-hours computed directly from chip count and training time.
    hardware_quantity = lognorm_from_90_ci(0.8, 1.25, num_with_direct_chip_hours)
    training_time = lognorm_from_90_ci(0.5, 2, num_with_direct_chip_hours)
    direct_chip_hours = training_time * hardware_quantity

    # Imputed models come first, then direct ones; the total length is num_ml_models.
    training_chip_hours = np.concatenate([imputed_chip_hours, direct_chip_hours])
    # The markdown formula also multiplies by np.log(10) / HOURS_PER_YEAR. That factor
    # is identical for every draw, so it cancels once the percentiles are divided by
    # the median, and it is left out here.
    amortized_hardware_capex = server_cost_overhead * cluster_interconnect_overhead * (hardware_price / depreciation) * price_performance * training_chip_hours

    amortized_hardware_capex_medians[i] = np.median(amortized_hardware_capex)
    amortized_hardware_capex_90_cis[i] = np.percentile(amortized_hardware_capex, [5, 95])


Median: 3.1 [90% CI: -18, 7.8]
Median: 0.96 [90% CI: 0.45, 1.9]
Median: 0.4 [90% CI: 0.094, 0.56]
Median: 3.1 [90% CI: -18, 7.8]
Median: 0.87 [90% CI: 0.4, 2.6]
Median: 0.33 [90% CI: 0.065, 0.58]
Median: 3.1 [90% CI: -18, 7.8]


KeyboardInterrupt: 

In [17]:
# Typical 5th/95th percentile bounds, expressed as multiples of the typical median.
np.divide(
    np.median(amortized_hardware_capex_90_cis, axis=0),
    np.median(amortized_hardware_capex_medians),
)

array([0.30794631, 6.01740447])