# Setup

In [1]:
from datetime import datetime
import numpy as np
import os
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from parameters import *
from plotting import *
from utils import *

In [2]:
results_dir = f'results/full-costs-amortized-hardware/'
os.makedirs(results_dir, exist_ok=True)

Sample size for Monte Carlo

In [3]:
N = 1000

In [4]:
cost_data_upfront_capex = {}
cost_data_amortized_capex = {}

# GPT-3

## Project duration

Training end time
- [Shelvane (2022, Ch. 2 p.3, or p.66)](https://uploads-ssl.webflow.com/614b70a71b9f71c9c240c7a7/6262a1a55526a373cc93207d_Shevlane%20dissertation%20preprint.pdf): “[An OpenAI staff member] told me: ‘GPT-3 existed for a long time before the paper came out. We delayed the paper. That was one of the things we could do for AGI stuff. But it’s months, it doesn't really count.’”
- Paper publication date: 2020-May-28
- [Shevlane (2022)](https://uploads-ssl.webflow.com/614b70a71b9f71c9c240c7a7/6262a1a55526a373cc93207d_Shevlane%20dissertation%20preprint.pdf). Ch.2 p.4 or p. 67: CAMERON said "Firstly, [the idea for a commercial API for GPT-3] started out as a research API. It probably was . . . early January 2020." This is a strong indication that GPT-3 had finished training by early January 2020
- It’s plausible that GPT-3 was produced soon after the Microsoft deal was announced in July 2019. Supposing the announcement coincided with Microsoft giving OpenAI access to the necessary compute, and OpenAI already having almost everything about GPT-3 planned in advance, and it took less than 1 month to train (reported training time is ~15 days), then GPT-3 could have been produced in August 2019.
- So I take August to January as the 90% CI.

In [5]:
training_end_low = datetime(2019, 8, 1)
training_end_high = datetime(2020, 1, 1)


Project start time
- Lower bound on start time: when GPT-2 was published, 2019-Feb-14
  - It could have been earlier than this, but that seems unlikely
- Upper bound on start time: 2019-Nov-01, two months before the upper bound on training completion
  - I think they’d need six weeks at the absolute minimum, to prepare training data and scale up the experiments successfully


In [6]:
project_start_low = datetime(2019, 2, 14)
project_start_high = datetime(2019, 11, 1)


In [7]:
dt2float = lambda x: x.year + x.month/12 + x.day/365
float2dt = lambda x: datetime(int(x), int((x-int(x))*12), int(((x-int(x))*12-int((x-int(x))*12))*365))

In [8]:
min_project_duration = 15 / 365  # training duration
project_duration = []
while len(project_duration) < N:
    project_start = lognorm_from_90_ci(dt2float(project_start_low), dt2float(project_start_high), 1)
    training_end = lognorm_from_90_ci(dt2float(training_end_low), dt2float(training_end_high), 1)
    project_duration_sample = training_end[0] - project_start[0]
    if project_duration_sample > min_project_duration:
        project_duration.append(project_duration_sample)
project_duration = np.array(project_duration)
print_median_and_ci(project_duration * 12)

Median: 4.3 [90% CI: 1, 8.9]


## GPU hardware

Price at release (2017 Q3) for a NVIDIA DGX-1 with 8x V100 GPUs was $149,000. This includes CPUs, node-level interconnect hardware, and cooling equipment inside the server.
The cluster used to train GPT-3 was reportedly 10,000 V100 GPUs. Assume that they used DGX-1 servers. 10,000 / 8 = 1250 DGX-1 servers.
1250 * $149,000 = $186,250,000.


In [9]:
num_gpus = 10000
price_per_server = 149e3
gpus_per_server = 8
num_servers = num_gpus / gpus_per_server
gpu_hardware_cost = price_per_server * num_servers
printg(gpu_hardware_cost)

1.8625e+08


## Other hardware

The NVIDIA DGX-1 server already covers CPUs and node-level interconnect
We assume that the biggest cost on top of that is cluster-level interconnect (transceivers, switches, and cables). 
Assume $11 per Gbps of server-level interconnect
Unsure if $11 is accurate for V100. $11 may only be accurate for current generation hardware.
How fast is V100 interconnect? With NVLink it’s 300 Gbps per GPU (https://www.nvidia.com/en-gb/data-center/tesla-v100/). It’s likely that OpenAI was using NVLink by the time they trained GPT-3.
$11/Gbps * 10000 GPUs * 300 Gbps = $33M


In [10]:
cluster_interconnect_cost_per_gbps = 11
cluster_interconnect_bandwidth = 300
cluster_interconnect_cost = cluster_interconnect_cost_per_gbps * cluster_interconnect_bandwidth * num_gpus
printg(cluster_interconnect_cost)

3.3e+07


## Energy

- Total energy consumption of training: 1287 MWh (Patterson et al. 2021)
- I don’t see the location of the Microsoft datacenter. Will assume it was Washington.
- According to this [2018 Industrial Energy Data Book](https://www.nrel.gov/docs/fy20osti/73901.pdf), industrial energy retailed for $0.046/kWh in Washington, the cheapest rate of any state in the US
- In the most expensive state, Hawaii, energy prices are 4x higher.

In [11]:
experiment_factor = 2.0
energy_consumption_kwh = 1287e3 * experiment_factor
price_per_kwh = 0.083
energy_cost = energy_consumption_kwh * price_per_kwh
printg(energy_cost)

213642


## Personnel

Full-time equivalents (FTEs)
- 31 authors
- Guess: lognormal, 90% CI: 30% to 150% of #authors FTEs (i.e. ~10 to 46 people)

Salary
- Current data at https://www.levels.fyi/companies/openai/salaries/software-engineer?country=254 suggests - $500k to $1M is typical (including stock). Given recent investments in OpenAI, this may have increased a lot since 2019-2020, especially the stock component.
- Guess at the salary distribution: lognormal, 90% CI: $300K to $2M


In [12]:
total_salaries = np.zeros(N)
for i in range(N):
    ftes = int(lognorm_from_90_ci(0.3 * 31, 1.5 * 31, 1)[0])
    salary = lognorm_from_90_ci(3e5, 2e6, ftes)
    total_salaries[i] = salary.sum()

In [13]:
personnel_cost = total_salaries * project_duration

In [14]:
print_median_and_ci(personnel_cost)

Median: 6.1e+06 [90% CI: 1.3e+06, 2.1e+07]


## Overall

In [15]:
cost_data_upfront_capex['GPT-3'] = {
    'GPU servers': gpu_hardware_cost,
    'Other hardware': cluster_interconnect_cost,
    'Energy': energy_cost,
    'Personnel': personnel_cost.mean(),
}

In [16]:
v100_server_lifetime = lognorm_from_90_ci(2, 7, N)  # years

In [17]:
cost_data_amortized_capex['GPT-3'] = {
    'GPU servers': np.mean(gpu_hardware_cost * project_duration / v100_server_lifetime),
    'Other hardware': np.mean(cluster_interconnect_cost * project_duration / v100_server_lifetime),  # assume same lifetime for interconnect
    'Energy': energy_cost,
    'Personnel': personnel_cost.mean(),
}

# OPT-175B

## GPU hardware

$167000 for DGX A100, 8 GPU, 640GB. In Jan 2022. From a reseller.
https://web.archive.org/web/20220120191032/https://www.dihuni.com/product/dgx-a100-comparable-dihuni-optiready-ai-a100-sxm4-8nve-8-x-ampere-a100-sxm4-nvlink-epyc-deep-learning-server/

In [18]:
num_gpus = 992
price_per_server = 167e3
gpus_per_server = 8
num_servers = num_gpus / gpus_per_server
gpu_hardware_cost = price_per_server * num_servers
printg(gpu_hardware_cost)

2.0708e+07


## Other hardware

The NVIDIA DGX A100 server already covers CPUs and node-level interconnect
We assume that the biggest cost on top of that is cluster-level interconnect (transceivers, switches, and cables). 
Assume $11 per Gbps of server-level interconnect
How fast is A100 interconnect? With NVLink it’s 600 Gbps per GPU (https://www.nvidia.com/en-gb/data-center/tesla-v100/).


In [19]:
cluster_interconnect_cost_per_gbps = 11
cluster_interconnect_bandwidth = 600
cluster_interconnect_cost = cluster_interconnect_cost_per_gbps * cluster_interconnect_bandwidth * num_gpus
printg(cluster_interconnect_cost)

6.5472e+06


## Energy

https://arxiv.org/abs/2205.01068: "[...] OPT-175B was developed with an estimated carbon emissions footprint (CO2eq) of 75 tons [...] With ablations, baselines and downtime, our own estimates of total cost is roughly 2× higher."

- I haven't found numbers for carbon intensity, or how they calculate it
- According to https://arxiv.org/pdf/2104.10350.pdf (Table 1), Gross CO2e/KWh (kg/KWh) for the “Google Iowa Council Bluffs” data center in 2020 was 0.478 CO2e/kWh (kg/kWh)
- Energy use:
  - 992 A100 GPU units
  - 300W per unit (https://www.nvidia.com/en-us/data-center/a100/)
  - Training time 33 days (https://github.com/facebookresearch/metaseq/blob/d703cf1ae1e0faaff6c20629398dfbe02b98cf77/projects/OPT/chronicles/final_update.md)
  - Energy = 992 units * 0.3 kW/unit * 33 days * 24h/day ~= 236000 kWh
  - 75000 kg / 236000 kWh ~= 0.318 CO2e/kWh (kg/kWh)
  - Sanity check: this is similar to the Google data center figure of 0.478

In [20]:
training_time = 33 * 24
power_per_gpu = 0.3
experiment_factor = 2.0
energy_consumption_kwh = training_time * num_gpus * power_per_gpu * experiment_factor
price_per_kwh = 0.083
energy_cost = energy_consumption_kwh * price_per_kwh
energy_cost

39126.0672

## Personnel

2021-Oct-20: First date in [OPT logbook](https://github.com/facebookresearch/metaseq/blob/main/projects/OPT/chronicles/OPT175B_Logbook.pdf)

2022-Jan-06: Date in the logbook where the final training run is stated to have completed

In [21]:
project_duration = 78 / DAYS_PER_YEAR

In [22]:
num_authors = 19

In [23]:
total_salaries = np.zeros(N)
for i in range(N):
    ftes = int(lognorm_from_90_ci(0.3 * num_authors, 1.5 * num_authors, 1)[0])
    salary = lognorm_from_90_ci(3e5, 2e6, ftes)
    total_salaries[i] = salary.sum()

In [24]:
personnel_cost = total_salaries * project_duration

In [25]:
print_median_and_ci(personnel_cost)

Median: 2.3e+06 [90% CI: 8.6e+05, 5.8e+06]


## Overall

In [26]:
cost_data_upfront_capex['OPT-175B'] = {
    'GPU servers': gpu_hardware_cost,
    'Other hardware': cluster_interconnect_cost,
    'Energy': energy_cost,
    'Personnel': personnel_cost.mean(),
}

In [27]:
a100_server_lifetime = lognorm_from_90_ci(2, 7, N)  # years

In [28]:
cost_data_amortized_capex['OPT-175B'] = {
    'GPU servers': np.mean(gpu_hardware_cost * project_duration / a100_server_lifetime),
    'Other hardware': np.mean(cluster_interconnect_cost * project_duration / a100_server_lifetime),
    'Energy': energy_cost,
    'Personnel': personnel_cost.mean(),
}
cost_data_amortized_capex['OPT-175B']

{'GPU servers': 1269351.2408554377,
 'Other hardware': 401327.8174680666,
 'Energy': 39126.0672,
 'Personnel': 2676758.3713047686}

# GPT-4

Use the [estimate by Yafah Edelman](https://www.lesswrong.com/posts/nXcHe7t4rqHMjhzau/report-on-frontier-model-training#Cost_Breakdown_of_ML_Training) as a placeholder

In [29]:
total_cost = 5e8
cost_data_upfront_capex['GPT-4'] = {
    'GPU servers': 0.569 * total_cost,
    'Other hardware': 0.244 * total_cost,
    'Energy': 0.01 * total_cost,
    'Personnel': 0.178 * total_cost,
}

In [30]:
# Amortized capex
upfront_hardware_capex = (0.569 + 0.244) * total_cost
training_time = 2280 / HOURS_PER_YEAR  # in years, from PCD database
experiment_factor = 2
amortized_server_capex = upfront_hardware_capex * (training_time * experiment_factor) / a100_server_lifetime
print_median_and_ci(amortized_server_capex)
print_median_and_ci(amortized_server_capex + 0.01 * total_cost + 0.178 * total_cost)

Median: 5.6e+07 [90% CI: 3.1e+07, 1e+08]
Median: 1.5e+08 [90% CI: 1.2e+08, 2e+08]


In [31]:
# Yafah's labour cost estimate was based on 1 year of spending, but we're only counting 6 months of training + experiments
adjusted_personnel_cost = (training_time * experiment_factor) * 0.178 * total_cost
adjusted_personnel_cost

46297056.81040383

In [32]:
cost_data_amortized_capex['GPT-4'] = {
    'GPU servers': amortized_server_capex.mean() * 0.569,
    'Other hardware': amortized_server_capex.mean() * 0.244,
    'Energy': 0.01 * total_cost,
    'Personnel': adjusted_personnel_cost,
}

# Gemini Ultra

See https://colab.research.google.com/drive/1XEKlSo-3DCFp686yGOwwfS6_DEHsFimd#scrollTo=yqWMux2iZL8L

In [33]:
# Lognormal distribution parameters for project duration
p_5th_project_duration_years = 7 / 12
p_95th_project_duration_years = 20 / 12

project_duration = lognorm_from_90_ci(p_5th_project_duration_years, p_95th_project_duration_years, N)

project_duration_hours = project_duration * HOURS_PER_YEAR

print_median_and_ci(project_duration, ci=[5, 95])

Median: 1 [90% CI: 0.58, 1.7]


In [34]:
chips_per_pod = 4096
number_of_pods = np.random.uniform(12, 19, N)
number_of_chips = number_of_pods * chips_per_pod
print_median_and_ci(number_of_chips)

Median: 6.4e+04 [90% CI: 5.1e+04, 7.6e+04]


In [35]:
cost_per_kwh = 0.083  # default assumption for US data centers
power_per_chip = 0.3  # Unsure - TPUv3 is under 300W for most models in https://arxiv.org/pdf/2104.10350.pdf, but this is TPUv4. See also https://cloud.google.com/tpu/docs/v4.
energy_cost = number_of_chips * power_per_chip * project_duration_hours * cost_per_kwh
print_median_and_ci(energy_cost)

Median: 1.4e+07 [90% CI: 7.5e+06, 2.4e+07]


In [36]:
hardware_cost = 6.7e8
# Use GPU/non-GPU fractions from Yafah Edelman as a placeholder
cost_data_upfront_capex['Gemini Ultra'] = {
    'GPU servers': 0.7 * hardware_cost,
    'Other hardware': 0.3 * hardware_cost,
    'Energy': energy_cost.mean(),
    'Personnel': 1.6e8,
}

In [37]:
tpuv4_server_lifetime = lognorm_from_90_ci(3, 8, N)  # years

In [38]:
cost_data_amortized_capex['Gemini Ultra'] = {
    'GPU servers': 0.7 * np.mean(hardware_cost * project_duration / tpuv4_server_lifetime),
    'Other hardware': 0.3 * np.mean(hardware_cost * project_duration / tpuv4_server_lifetime),
    'Energy': energy_cost.mean(),
    'Personnel': 1.6e8,
}

In [39]:
cost_data_amortized_capex

{'GPT-3': {'GPU servers': 19873148.362262834,
  'Other hardware': 3521148.434655965,
  'Energy': 213642.0,
  'Personnel': 7908516.951958606},
 'OPT-175B': {'GPU servers': 1269351.2408554377,
  'Other hardware': 401327.8174680666,
  'Energy': 39126.0672,
  'Personnel': 2676758.3713047686},
 'GPT-4': {'GPU servers': 34536273.121728174,
  'Other hardware': 14809930.829001185,
  'Energy': 5000000.0,
  'Personnel': 46297056.81040383},
 'Gemini Ultra': {'GPU servers': 104630802.00613347,
  'Other hardware': 44841772.288342915,
  'Energy': 14454554.150147015,
  'Personnel': 160000000.0}}

# Plots

In [40]:
# Prepare data for Plotly
categories = list(next(iter(cost_data_amortized_capex.values())).keys())
models = [k for k in cost_data_amortized_capex.keys()] # if 'OPT' not in k]

In [41]:
total_costs = {model:sum([cost_data_amortized_capex[model][category] for category in categories]) for model in models}
total_costs

{'GPT-3': 31516455.748877406,
 'OPT-175B': 4386563.496828273,
 'GPT-4': 100643260.7611332,
 'Gemini Ultra': 323927128.4446234}

In [42]:
# Creating the figure
fig = go.Figure()

for category in ['GPU servers', 'Other hardware', 'Personnel', 'Energy']:
    fig.add_trace(go.Bar(
        name=category,
        x=list(models),
        y=[cost_data_amortized_capex[model][category] for model in models],
        # text=[human_format(cost_data[model][category]) for model in models],
        # text=[f'{cost_data_amortized_capex[model][category] / total_costs[model] * 100:.1f}%' for model in models],
        textposition='auto',
    ))

# Log y axis
# fig.update_yaxes(type='log')
# fig.update_yaxes(range=[7, 9])

# y ticks
fig.update_yaxes(range=[0, 1e9])
fig.update_yaxes(tickvals=[0, 2e8, 4e8, 6e8, 8e8, 1e9], ticktext=['0', '200M', '400M', '600M', '800M', '1B'])

# Update the layout
fig.update_layout(
    barmode='stack',
    title='Full capital and operational costs of experiments and training',
    xaxis_title='Models',
    yaxis_title='Cost',
    legend_title='Cost Components',
    width=600,
    height=400,
)

# Save plot
save_plot(fig, results_dir, 'full_costs_stacked')

# Show the figure
fig.show()

In [43]:
# Creating the figure
fig = go.Figure()

for category in ['GPU servers', 'Other hardware', 'Personnel', 'Energy']:
    proportions = [cost_data_amortized_capex[model][category] / total_costs[model] * 100 for model in models]
    fig.add_trace(go.Bar(
        name=category,
        x=list(models),
        y=proportions,
        # text=[human_format(cost_data[model][category]) for model in models],
        # text=[f'{pp:.1f}%' for pp in proportions],
        textposition='auto',
    ))

# y ticks
fig.update_yaxes(range=[0, 100])

# Update the layout
fig.update_layout(
    barmode='stack',
    title='Proportions of cost for experiments and training',
    xaxis_title='Models',
    yaxis_title='Proportion (%)',
    legend_title='Cost Components',
    width=600,
    height=400,
)

# Save plot
save_plot(fig, results_dir, 'cost_proportions_stacked')

# Show the figure
fig.show()