In [1]:
import numpy as np

from parameters import *
from utils import *

# Hardware acquisition cost

ai_accelerator_cost = hardware_quantity * hardware_price

server_cost = ai_accelerator_cost * server_cost_overhead

cluster_cost = server_cost * cluster_interconnect_overhead

90% credible intervals:
- hardware_quantity: 0.8x to 1.25x of the central value. Generally confident, as it was usually reported directly by the developers. Could be slightly inaccurate if it doesn't account for hardware failures during training, or if it describes the number of chips available in a cluster rather than the number of chips actually employed for training.
- hardware_price: 0.5x to 2x of the central value. We know of two reports of NVIDIA's profit margin for Hopper GPUs: 1000% and 340%. So the minimum sale price could be 10 to 30% of the value we used. But this excludes R&D cost, and there should always be some profit, so we set the minimum higher than this. Big customers could be getting much bigger discounts compared to the prices we use. On the other hand, demand could have led to price spikes that weren't captured in our sparse data.
- server_cost_overhead: 1.3 to 2.1. The three actual values we calculated ranged from 1.54 (P100) to 1.69 (V100). A cost breakdown of a DGX H100 by Semianalysis (2023) https://www.semianalysis.com/p/ai-server-cost-analysis-memory-is implies a ratio of approximately 1.4 (total cost divided by "8 GPU + 4 NVSwitch Baseboard" cost).
- cluster_interconnect_overhead: 1.07 to 1.32. Prices of interconnect components are plausibly off by a factor of 2x due to variation in the brand, the merchant, and supply/demand.

In [17]:
NUM_SAMPLES = 1000

# Draw each input from the 90% credible interval listed in the notes above this cell.
hardware_quantity = lognorm_from_90_ci(0.8, 1.25, NUM_SAMPLES)
hardware_price = lognorm_from_90_ci(0.5, 2, NUM_SAMPLES)
server_cost_overhead = lognorm_from_90_ci(1.3, 2.1, NUM_SAMPLES)
cluster_interconnect_overhead = lognorm_from_90_ci(1.07, 1.32, NUM_SAMPLES)

# Build the cost up in the same steps as the formula: accelerators -> servers -> cluster.
accelerator_cost_samples = hardware_quantity * hardware_price
server_cost_samples = accelerator_cost_samples * server_cost_overhead
hardware_acquisition_cost = server_cost_samples * cluster_interconnect_overhead

# Divide by the median so the reported interval reads as a multiple of the central estimate.
relative_acquisition_cost = hardware_acquisition_cost / np.median(hardware_acquisition_cost)
print_median_and_ci(relative_acquisition_cost)


Median: 1 [90% CI: 0.46, 2.2]


# Amortized hardware CapEx

depreciation = 10 ** (ML_GPU_PRICE_PERFORMANCE_OOMS_PER_YEAR * years_since)

training_chip_hours = hardware_quantity * training_time

OR

training_chip_hours = training_compute / (peak_flop_per_second * hardware_utilization)

In the amortized hardware CapEx + energy approach, 32 values use the first method, 15 values use the second method.

amortized_hardware_capex = server_cost_overhead * cluster_interconnect_overhead * (hardware_price / depreciation) * ML_GPU_PRICE_PERFORMANCE_OOMS_PER_YEAR * np.log(10) / HOURS_PER_YEAR * training_chip_hours

90% credible intervals:

- ML_GPU_PRICE_PERFORMANCE_OOMS_PER_YEAR: 0.10 to 0.18, based on Hobbhahn et al. (2023)
- years_since: -120 days to +120 days from the central value.
  - years_since measures the difference between the time the hardware could first be acquired at large scale, and the model training start date. The default is at least 90 days between hardware release date and hardware acquisition date, and 60 days + training time between training start date and publication date. However, shipping could be almost immediate relative to the release date, especially if customers can pre-order. Preparing results after training may only take about 30 days, especially if the evaluations are ready to run and the model is announced before a detailed report is released. In total that's (90 - 0) + (60 - 30) = 120 fewer days than the default. We keep the bounds symmetric to fit a normal distribution, since years_since is used as an exponent in the formula.
- training_time: 0.5x to 2x of the central value. It was usually reported directly by the developers, but sometimes it's reported imprecisely - e.g. "2 days" is plausibly anywhere between 24 and 72 hours; "weeks" could mean 2 weeks or 6 weeks.
- training_compute: 0.33x to 3x of the central value - a rule of thumb based on the "Confident" Confidence level in the database.
- peak_flop_per_second: 0.5x to 2x of the central value. This allows for the possibility that we got the number format wrong.
- hardware_utilization: normal, 0.1 to 0.65. The range of values in our dataset is 0.19 to 0.56; we go a bit wider to be conservative.

In [4]:
# Number of models whose chip-hours are imputed from
# training_compute / (peak_flop_per_second * hardware_utilization).
num_with_imputed_chip_hours = 15
# The remaining models get chip-hours directly from hardware_quantity * training_time.
num_with_direct_chip_hours = num_ml_models - num_with_imputed_chip_hours

In [5]:
def amortized_hardware_capex_sample():
    """Draw one set of relative amortized hardware CapEx values, one per model.

    Each input is sampled from its 90% credible interval. The first
    ``num_with_imputed_chip_hours`` entries use chip-hours imputed from
    training compute; the remaining entries use chip-hours computed from
    hardware quantity and training time.

    The constant factors ``np.log(10) / HOURS_PER_YEAR`` of the full formula
    are left out here. The result is therefore only meaningful up to a
    constant scale, e.g. after dividing by a median.

    Returns:
        Array with the sampled cost for each model (presumably of length
        ``num_ml_models``, assuming the sampling helpers return arrays of the
        requested size).
    """
    # Hardware depreciation: price-performance trend compounded over the time gap.
    ooms_per_year = norm_from_ci(0.10, 0.18, 90, num_ml_models)
    elapsed_years = norm_from_ci(-120, 120, 90, num_ml_models) / DAYS_PER_YEAR
    depreciation_factor = 10 ** (ooms_per_year * elapsed_years)

    # Price and overhead multipliers.
    price = lognorm_from_90_ci(0.5, 2, num_ml_models)
    server_overhead = lognorm_from_90_ci(1.3, 2.1, num_ml_models)
    interconnect_overhead = lognorm_from_90_ci(1.07, 1.32, num_ml_models)

    # Chip-hours imputed from compute and realised throughput.
    compute = lognorm_from_90_ci(1/3, 3, num_with_imputed_chip_hours)
    peak_rate = lognorm_from_90_ci(0.5, 2, num_with_imputed_chip_hours)
    utilization = norm_from_ci(0.1, 0.65, 90, num_with_imputed_chip_hours, clip=[0.01, 0.99])
    utilization_relative = utilization / 0.37  # relative to average value
    chip_hours_imputed = compute / (peak_rate * utilization_relative)

    # Chip-hours taken directly from chip count and training duration.
    chip_count = lognorm_from_90_ci(0.8, 1.25, num_with_direct_chip_hours)
    duration = lognorm_from_90_ci(0.5, 2, num_with_direct_chip_hours)
    chip_hours_direct = duration * chip_count

    chip_hours = np.concatenate([chip_hours_imputed, chip_hours_direct])
    depreciated_price = price / depreciation_factor

    return server_overhead * interconnect_overhead * depreciated_price * ooms_per_year * chip_hours

In [6]:
# Repeat the per-model sampling NUM_SAMPLES times and keep, for each repetition,
# the median and the 5th/95th percentiles across models.
capex_draws = [amortized_hardware_capex_sample() for _ in range(NUM_SAMPLES)]
amortized_hardware_capex_medians = np.array(
    [np.median(draw) for draw in capex_draws], dtype=float
)
amortized_hardware_capex_90_cis = np.array(
    [np.percentile(draw, [5, 95]) for draw in capex_draws], dtype=float
).reshape(NUM_SAMPLES, 2)


In [15]:
# Typical 90% interval bounds, expressed as multiples of the typical median.
typical_capex_ci = np.median(amortized_hardware_capex_90_cis, axis=0)
typical_capex_median = np.median(amortized_hardware_capex_medians)
typical_capex_ci / typical_capex_median

array([0.30220936, 3.66144434])

# Energy

Energy cost rate ($/kWh) ∗ Hardware peak TDP (kW) ∗ Average power to TDP ratio (%) ∗ Data center PUE ∗ Number of chip-hours (h)

- Energy cost rate: 0.6x to 1.7x of central value (based on variation among likely data center states; see code below)
- Hardware peak TDP (kW): assumed accurate based on hardware specifications.
- Average power to TDP ratio: 
  - TPUs: 0.3 to 0.62
  - GPUs: 0.56 to 1
- Data center PUE
  - 1.05 to 1.16 for hyperscalers (50 of 64 selected models)
  - 1.12 to 1.4 for non-hyperscalers

Uncertainty in energy prices

In [8]:
from energy import *
# Spread of electricity prices across the states considered likely data center locations,
# expressed relative to their mean: (highest / mean, lowest / mean).
likely_datacenter_states = ['California', 'Nevada', 'Oregon', 'Washington']
energy_prices = [
    US_STATE_ENERGY_PRICES_PER_KWH[state_name]
    for state_name in likely_datacenter_states
]
mean = np.mean(energy_prices)
highest_relative_price = np.max(energy_prices) / mean
lowest_relative_price = np.min(energy_prices) / mean
(highest_relative_price, lowest_relative_price)

(1.727914785226352, 0.6247539654972791)

In [9]:
# Group sizes used by energy_sample(). Each pair of groups is concatenated into one
# per-model array, so each pair must add up to num_ml_models.

# Chip-hours: imputed from training compute vs. taken from hardware quantity * training time.
num_with_imputed_chip_hours = 15
num_with_direct_chip_hours = num_ml_models - num_with_imputed_chip_hours

# Hardware type: determines which average-power-to-TDP interval is sampled.
num_with_TPUs = 22
num_with_GPUs = 23

# Operator type: determines which PUE interval is sampled.
num_hyperscalers = 35  # 50/64 initially, scaled down to 45 models
num_non_hyperscalers = num_ml_models - num_hyperscalers

# Fail early with a clear message instead of a broadcasting error (or a negative
# sample size) inside energy_sample() when the dataset size changes.
assert 0 <= num_with_imputed_chip_hours <= num_ml_models, (
    "num_with_imputed_chip_hours must be between 0 and num_ml_models"
)
assert num_with_TPUs + num_with_GPUs == num_ml_models, (
    "num_with_TPUs + num_with_GPUs must equal num_ml_models"
)
assert 0 <= num_hyperscalers <= num_ml_models, (
    "num_hyperscalers must be between 0 and num_ml_models"
)

In [10]:
def energy_sample():
    """Draw one set of relative energy cost values, one per model.

    The cost is the product of energy price, TDP, average power-to-TDP ratio,
    PUE and chip-hours. TDP is fixed at 1, i.e. it contributes no uncertainty.
    The power ratios and hardware utilization are divided by their central
    estimates so that they act as relative multipliers.

    Group-specific inputs are concatenated in a fixed order: imputed then
    direct chip-hours, TPU then GPU power ratios, hyperscaler then
    non-hyperscaler PUE.

    Returns:
        Array with the sampled energy cost for each model (presumably of length
        ``num_ml_models``, assuming the sampling helpers return arrays of the
        requested size).
    """
    # Chip-hours imputed from compute and realised throughput.
    compute = lognorm_from_90_ci(1/3, 3, num_with_imputed_chip_hours)
    peak_rate = lognorm_from_90_ci(0.5, 2, num_with_imputed_chip_hours)
    utilization = norm_from_ci(0.1, 0.65, 90, num_with_imputed_chip_hours, clip=[0.01, 0.99])
    utilization_relative = utilization / 0.37  # relative to average value
    chip_hours_imputed = compute / (peak_rate * utilization_relative)

    # Chip-hours taken directly from chip count and training duration.
    chip_count = lognorm_from_90_ci(0.8, 1.25, num_with_direct_chip_hours)
    duration = lognorm_from_90_ci(0.5, 2, num_with_direct_chip_hours)
    chip_hours_direct = duration * chip_count

    chip_hours = np.concatenate([chip_hours_imputed, chip_hours_direct])

    # Average power draw as a fraction of TDP, relative to the central estimate per hardware type.
    power_ratio_tpu = lognorm_from_90_ci(0.3, 0.62, num_with_TPUs) / 0.43
    power_ratio_gpu = lognorm_from_90_ci(0.56, 1.00, num_with_GPUs) / 0.75
    power_ratio = np.concatenate([power_ratio_tpu, power_ratio_gpu])

    # Data center power usage effectiveness, by operator type.
    pue = np.concatenate([
        lognorm_from_90_ci(1.05, 1.16, num_hyperscalers),
        lognorm_from_90_ci(1.12, 1.4, num_non_hyperscalers),
    ])

    relative_tdp = 1

    price_per_kwh = lognorm_from_90_ci(0.6, 1.7, num_ml_models)

    return price_per_kwh * relative_tdp * power_ratio * pue * chip_hours

In [11]:
# Repeat the per-model sampling NUM_SAMPLES times and keep, for each repetition,
# the median and the 5th/95th percentiles across models.
energy_draws = [energy_sample() for _ in range(NUM_SAMPLES)]
energy_medians = np.array(
    [np.median(draw) for draw in energy_draws], dtype=float
)
energy_90_cis = np.array(
    [np.percentile(draw, [5, 95]) for draw in energy_draws], dtype=float
).reshape(NUM_SAMPLES, 2)

In [12]:
# Typical 90% interval bounds, expressed as multiples of the typical median.
typical_energy_ci = np.median(energy_90_cis, axis=0)
typical_energy_median = np.median(energy_medians)
typical_energy_ci / typical_energy_median

array([0.33353193, 3.27406934])

# Amortized hardware CapEx + energy

In [13]:
# Combine the two cost components per repetition, then summarise as in the sections above.
# NOTE(review): amortized_hardware_capex_sample() and energy_sample() each draw their own
# chip-hours, so the two terms are added as if independent even though both scale with
# chip-hours. The terms also have different central scales (the CapEx term carries the
# ~0.1-0.18 price-performance factor). Confirm both simplifications are intended.
combined_draws = [
    amortized_hardware_capex_sample() + energy_sample()
    for _ in range(NUM_SAMPLES)
]
amortized_hardware_capex_energy_medians = np.array(
    [np.median(draw) for draw in combined_draws], dtype=float
)
amortized_hardware_capex_energy_90_cis = np.array(
    [np.percentile(draw, [5, 95]) for draw in combined_draws], dtype=float
).reshape(NUM_SAMPLES, 2)

# Typical 90% interval bounds, expressed as multiples of the typical median.
typical_combined_ci = np.median(amortized_hardware_capex_energy_90_cis, axis=0)
typical_combined_ci / np.median(amortized_hardware_capex_energy_medians)

array([0.42441896, 2.93423517])

The overall uncertainty is a factor of about 3.