# Setup

In [53]:
from collections import defaultdict
from datetime import datetime
import json
import numpy as np
import os
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from cost import *
from data import *
from energy import *
from inflation import *
from parameters import *
from plotting import *
from utils import *

In [54]:
results_dir = f'results/development-costs/'
os.makedirs(results_dir, exist_ok=True)

Sample size for distributions

In [55]:
N = 1000

Cost of staff compensation
- Use Google Software Engineer as the reference: https://web.archive.org/web/20240515221234/https://www.levels.fyi/companies/google/salaries/software-engineer/locations/san-francisco-bay-area?dma=807
- Approximately 200K to 1.2M


In [56]:
compensation_low = 2e5
compensation_high = 1.2e6


Apply 1.25 - 1.4 overhead factor for benefits, payroll, etc. https://www.sba.gov/blog/how-much-does-employee-cost-you 

In [57]:
compensation_overhead_low = 1.25
compensation_overhead_high = 1.4

Anchor number of FTEs to the author count on the paper.
For models we consider here, the author list is a decent anchor point for the actual contributors, because e.g. engineers are included.

In [58]:
author_factor_low = 0.3
author_factor_high = 1.5

In [59]:
cost_data = {}
cost_data_with_hardware_acq = {}

Default ratio of total experiment + training compute to final training run compute
- Smaller versions of GPT-3 made up ~4.5e22 FLOP (based on the formula compute = 6 * parameters * tokens), compared to 175B using ~3e23 FLOP. See Table 2.1 of the [paper](https://arxiv.org/pdf/2005.14165). That's a factor of ~1.14 to go from 175B to total FLOP. Certainly more than that was needed due to failures and other experiments. So 1.2x seems like a good low-end estimate.
- https://arxiv.org/pdf/2111.00364.pdf, p.3, Fig. 3 caption: "At Facebook, we observe a rough power capacity breakdown of 10:20:70 for AI infrastructures devoted to the three key phases — Experimentation, Training, and Inference". 10:20 ratio means a 1.5x multiplier from training to training+experiment compute. Note however that the "training" category is not just the final training run; it includes additional hyper-parameter tuning and retraining for some models. So the actual multiplier would be higher under our definition.
- BLOOM project - https://arxiv.org/abs/2211.02001, p.8, Table 5. ~63% of compute was spent training smaller models than 176B. That's a multiplier of ~2.7x.
- https://arxiv.org/abs/2205.01068: "[...] OPT-175B was developed with an estimated carbon emissions footprint (CO2eq) of 75 tons [...] With ablations, baselines and downtime, our own estimates of total cost is roughly 2× higher."
- We think the Facebook, BLOOM and OPT-175B cases are the more central examples as they account better for all experiments. A factor close to 2x seems like a reasonable median estimate.
- On the high end, it's plausible that several, full iterations of the full-scale training run are necessary before success - say, 4x.

In [60]:
experiment_factor = lognorm_from_90_ci(1.2, 4, N)
print_median_and_ci(experiment_factor)

Median: 2.2 [90% CI: 1.2, 4.2]


Uncertainty in energy prices

In [61]:
likely_datacenter_states = ['California', 'Nevada', 'Oregon', 'Washington']
energy_prices = [US_STATE_ENERGY_PRICES_PER_KWH[state] for state in likely_datacenter_states]
mean = np.mean(energy_prices)
np.max(energy_prices) / mean, np.min(energy_prices) / mean

(1.727914785226352, 0.6247539654972791)

In [62]:
energy_price_uncertainty_factor = lognorm_from_90_ci(0.6, 1.7, N)

In [63]:
dt2float = lambda x: x.year + x.month/12 + x.day/365.25
float2dt = lambda x: datetime(int(x), int((x-int(x))*12), int(((x-int(x))*12-int((x-int(x))*12))*365.25))

In [64]:
frontier_pcd_df, hardware_df, price_df = load_data_for_cost_estimation(
    compute_threshold_method='top_n', compute_threshold=10,
)

In [65]:
frontier_pcd_df['Publication date'] = pd.to_datetime(frontier_pcd_df['Publication date'])

Models

In [66]:
models = ['GPT-3 175B (davinci)', 'OPT-175B', 'GPT-4', 'Gemini Ultra']

In [67]:
selected_model_df = frontier_pcd_df.loc[[s in models for s in frontier_pcd_df['System']]]

## Hardware

In [68]:
# Unset index
selected_model_df.reset_index(inplace=True)

In [69]:
def _estimate_chip_hours(row):
    return estimate_chip_hours(row, hardware_df)

selected_model_df['Training chip-hours'] = selected_model_df.apply(_estimate_chip_hours, axis=1)
selected_model_df['Training chip-hours']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_model_df['Training chip-hours'] = selected_model_df.apply(_estimate_chip_hours, axis=1)


0    132000000.0
1     57000000.0
2       812544.0
3      3552000.0
Name: Training chip-hours, dtype: float64

In [70]:
selected_model_df.loc[:, 'Original training chip-hours'] = selected_model_df.loc[:, 'Training chip-hours']
hardware_costs = defaultdict(dict)
cost_component_names = [
    'AI accelerator chip cost',
    'Other server components cost',
    'Cluster-level interconnect cost',
    'Energy cost',
]

for percentile in [2.5, 50, 97.5]:
    print(f'\n\n{percentile}th percentile of experiments time')
    experiment_factor_value = np.percentile(experiment_factor, percentile)
    selected_model_df.loc[:, 'Training chip-hours'] = selected_model_df.loc[:, 'Original training chip-hours'] * experiment_factor_value
    cost_df = estimate_hardware_capex_opex(selected_model_df, hardware_df, price_df, separate_components=True)
    cost_df = cost_df.set_index('System', inplace=False)

    for model in models:
        cost_components = {}
        for cost_component_name in cost_component_names:
            cost_components[cost_component_name] = cost_df.loc[model, cost_component_name]
        hardware_costs[model][str(percentile)] = cost_components

hardware_costs



2.5th percentile of experiments time
==== System: Gemini Ultra ====
Estimated the value of Google TPU v4 server, available from 2021-05-20 00:00:00 and used from 2023-05-10 00:00:00: 10527.742991314644

==== System: GPT-4 ====
Estimated the value of NVIDIA A100 SXM4 40 GB server, available from 2020-08-12 00:00:00 and used from 2022-05-12 00:00:00: 14165.054738257799

==== System: OPT-175B ====
Estimated the value of NVIDIA A100 SXM4 80 GB server, available from 2021-02-14 00:00:00 and used from 2022-02-27 23:00:00: 17818.700400260856

==== System: GPT-3 175B (davinci) ====
Estimated the value of NVIDIA Tesla V100 DGXS 32 GB server, available from 2018-06-25 00:00:00 and used from 2019-10-01 00:00:00: 11463.61551009175

==== System: Gemini Ultra ====
Estimated cost: {'AI accelerator chip cost': 33947393.76762005, 'Other server components cost': 21726332.011276826, 'Cluster-level interconnect cost': 8351058.866834526, 'Energy cost': 3190085.818631811}
==== System: GPT-4 ====
Estimated

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_model_df.loc[:, 'Original training chip-hours'] = selected_model_df.loc[:, 'Training chip-hours']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_model_df.loc[:, 'Training chip-hours'] = selected_model_df.loc[:, 'Original training chip-hours'] * experiment_factor_value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.htm

defaultdict(dict,
            {'GPT-3 175B (davinci)': {'2.5': {'AI accelerator chip cost': 965270.3015984484,
               'Other server components cost': 666036.5081029293,
               'Cluster-level interconnect cost': 244696.02145520653,
               'Energy cost': 120987.05188204613},
              '50': {'AI accelerator chip cost': 1939969.396907448,
               'Other server components cost': 1338578.883866139,
               'Cluster-level interconnect cost': 491782.24211603776,
               'Energy cost': 243155.90947380333},
              '97.5': {'AI accelerator chip cost': 4181376.0871243533,
               'Other server components cost': 2885149.5001158034,
               'Cluster-level interconnect cost': 1059978.838086023,
               'Energy cost': 524093.9918627186}},
             'OPT-175B': {'2.5': {'AI accelerator chip cost': 349426.55463209096,
               'Other server components cost': 230621.52605718002,
               'Cluster-level interconne

# GPT-3

## Project duration

Training end time
1. Shevlane (2022)
https://uploads-ssl.webflow.com/614b70a71b9f71c9c240c7a7/6262a1a55526a373cc93207d_Shevlane%20dissertation%20preprint.pdf
p.66 of the PDF: A senior member of OpenAI (specified anonymously on p.27 of the PDF) 
told the author "GPT-3 existed for a long time before the paper came out. We delayed the 
paper. [...] But it's months, it doesn't really count."
p.67 of the PDF: CAMERON said "Firstly, [the idea for a commercial API for GPT-3] 
started out as a research API. It probably was . . . early January 2020."

1. We think it plausibly could have been produced soon after the Microsoft deal was 
announced in July 2019. Supposing the announcement coincided with Microsoft giving 
OpenAI access to the necessary compute, and OpenAI already having almost everything 
about GPT-3 planned in advance, and it took less than 1 month to train, then GPT-3 could 
have been produced in August 2019.

1. So we estimate August to January as our 90% CI, and halfway between (October) as the
central estimate.

In [71]:
training_end_low = datetime(2019, 8, 1)
training_end_high = datetime(2020, 1, 1)


Project start time
- Lower bound on start time: when GPT-2 was published, 2019-Feb-14
  - It could have been earlier than this, but that seems unlikely unless GPT-2 was delayed for months
  - The scaling laws paper was only published in January 2020, and GPT-3 was based on the findings of that paper
- Upper bound on start time: 2019-Nov-01, two months before the upper bound on training completion
  - We think they’d need six weeks at the absolute minimum, to prepare training data and scale up the experiments successfully


In [72]:
project_start_low = datetime(2019, 2, 14)
project_start_high = datetime(2019, 11, 1)


In [73]:
publication_date = frontier_pcd_df.loc[frontier_pcd_df['System'] == 'GPT-3 175B (davinci)', 'Publication date'].values[0]
publication_date

numpy.datetime64('2020-05-28T00:00:00.000000000')

In [74]:
min_project_duration = 15 / 365  # training duration
project_duration = []
while len(project_duration) < N:
    project_start_sample = lognorm_from_90_ci(dt2float(project_start_low), dt2float(project_start_high), 1)[0]
    training_end_sample = lognorm_from_90_ci(dt2float(training_end_low), dt2float(training_end_high), 1)[0]
    experiment_duration_sample = training_end_sample - project_start_sample
    if experiment_duration_sample > min_project_duration:
        project_duration.append(experiment_duration_sample)

project_duration = []
while len(project_duration) < N:
    project_start_sample = lognorm_from_90_ci(dt2float(project_start_low), dt2float(project_start_high), 1)[0]
    # Paper was delayed by "months"
    publication_delay_sample = lognorm_from_90_ci(2*30, 4*30, 1)[0]
    project_end_sample = dt2float(publication_date - pd.to_timedelta(publication_delay_sample, unit='D'))
    project_duration_sample = project_end_sample - project_start_sample
    if project_duration_sample > min_project_duration:
        project_duration.append(project_duration_sample)

project_duration = np.array(project_duration)
project_duration = np.array(project_duration)
print_median_and_ci(project_duration * 12)  # convert years to months
print_median_and_ci(project_duration * 12)

Median: 8.3 [90% CI: 3.9, 13]
Median: 8.3 [90% CI: 3.9, 13]


## Energy

Total energy consumption of training: 1287 MWh ([Patterson et al. 2021](https://arxiv.org/abs/2104.10350), Table 4)

In [75]:
energy_consumption_kwh = 1287e3 * experiment_factor
price_per_kwh = energy_price(2020) * energy_price_uncertainty_factor
energy_cost = energy_consumption_kwh * price_per_kwh
print_median_and_ci(energy_cost)

Median: 1.9e+05 [90% CI: 8.5e+04, 4.1e+05]


## Personnel

In [76]:
total_salaries = np.zeros(N)
num_authors = 31
for i in range(N):
    ftes = int(lognorm_from_90_ci(author_factor_low * num_authors, author_factor_high * num_authors, 1)[0])
    salary = lognorm_from_90_ci(compensation_low, compensation_high, ftes)
    total_salaries[i] = salary.sum()

In [77]:
personnel_cost = total_salaries * project_duration

In [78]:
print_median_and_ci(personnel_cost, ci=[2.5, 97.5])

Median: 7.8e+06 [95% CI: 2.2e+06, 2.5e+07]


## Overall

In [79]:
hardware_costs['GPT-3 175B (davinci)']['2.5']

{'AI accelerator chip cost': 965270.3015984484,
 'Other server components cost': 666036.5081029293,
 'Cluster-level interconnect cost': 244696.02145520653,
 'Energy cost': 120987.05188204613}

In [80]:
def get_hardware_cost_dist(model, component, ci=[2.5, 97.5]):
    return lognorm_from_ci(
        hardware_costs[model][str(ci[0])][component],
        hardware_costs[model][str(ci[1])][component],
        ci[1] - ci[0],
        N
    )

In [81]:
cost_data['GPT-3 175B (davinci)'] = {
    'AI accelerator chip cost': get_hardware_cost_dist('GPT-3 175B (davinci)', 'AI accelerator chip cost'),
    'Other server components cost': get_hardware_cost_dist('GPT-3 175B (davinci)', 'Other server components cost'),
    'Cluster-level interconnect cost': get_hardware_cost_dist('GPT-3 175B (davinci)', 'Cluster-level interconnect cost'),
    'Energy cost': energy_cost,
    'R&D staff cost': personnel_cost,
}

# OPT-175B

## Energy

https://arxiv.org/abs/2205.01068: "[...] OPT-175B was developed with an estimated carbon emissions footprint (CO2eq) of 75 tons [...] With ablations, baselines and downtime, our own estimates of total cost is roughly 2× higher."

- I haven't found numbers for carbon intensity, or how they calculate it
- According to https://arxiv.org/pdf/2104.10350.pdf (Table 1), Gross CO2e/KWh (kg/KWh) for the “Google Iowa Council Bluffs” data center in 2020 was 0.478 CO2e/kWh (kg/kWh)
- Energy use:
  - 992 A100 GPU units
  - 300W per unit (https://www.nvidia.com/en-us/data-center/a100/)
  - Training time 33 days (https://github.com/facebookresearch/metaseq/blob/d703cf1ae1e0faaff6c20629398dfbe02b98cf77/projects/OPT/chronicles/final_update.md)
  - Energy = 992 units * 0.3 kW/unit * 33 days * 24h/day ~= 236000 kWh
  - 75000 kg / 236000 kWh ~= 0.318 CO2e/kWh (kg/kWh)
  - Sanity check: this is not far from the Google data center gross carbon intensity of 0.478 reported by [Patterson et al. (2021)](https://arxiv.org/abs/2104.10350)

In [82]:
training_time_hours = 33 * 24
power_per_gpu = 0.3
num_gpus = 992
energy_consumption_kwh = training_time_hours * num_gpus * power_per_gpu * experiment_factor
price_per_kwh = energy_price(2022) * energy_price_uncertainty_factor
energy_cost = energy_consumption_kwh * price_per_kwh
print_median_and_ci(energy_cost, ci=[2.5, 97.5])

Median: 4.3e+04 [95% CI: 1.7e+04, 1.2e+05]


## Personnel

2021-Oct-20: First date in [OPT logbook](https://github.com/facebookresearch/metaseq/blob/main/projects/OPT/chronicles/OPT175B_Logbook.pdf)

2022-Jan-06: Date in the logbook where the final training run is stated to have completed

2022-May-02: Publication on arxiv

In [83]:
project_duration = 192 / DAYS_PER_YEAR

In [84]:
num_authors = 19

In [85]:
total_salaries = np.zeros(N)
for i in range(N):
    ftes = int(lognorm_from_90_ci(author_factor_low * num_authors, author_factor_high * num_authors, 1)[0])
    salary = lognorm_from_90_ci(compensation_low, compensation_high, ftes)
    total_salaries[i] = salary.sum()

In [86]:
personnel_cost = total_salaries * project_duration

In [87]:
print_median_and_ci(personnel_cost)

Median: 3.7e+06 [90% CI: 1.4e+06, 8.7e+06]


## Overall

In [88]:
cost_data['OPT-175B'] = {
    'AI accelerator chip cost': get_hardware_cost_dist('OPT-175B', 'AI accelerator chip cost'),
    'Other server components cost': get_hardware_cost_dist('OPT-175B', 'Other server components cost'),
    'Cluster-level interconnect cost': get_hardware_cost_dist('OPT-175B', 'Cluster-level interconnect cost'),
    'Energy cost': energy_cost,
    'R&D staff cost': personnel_cost,
}

# GPT-4

## Project duration

https://arxiv.org/abs/2303.08774
"This system card analyzes GPT-4 [...] Since it finished training in August of 2022 [...]"

In [89]:
training_end_low = datetime(2022, 8, 1)
training_end_high = datetime(2022, 8, 31)

Project start time
- We know they started training GPT-3.5 about one year before [announcement](https://openai.com/index/gpt-4-research/) in March 2023, and then trained GPT-4 after that. They built new infrastructure/codebase for and GPT-3.5 was a test run. Maybe some research experiments were going on for a few months in total prior to that.
- Indirect source that final training run of GPT-4 took about three months: [SemiAnalysis](https://www.semianalysis.com/p/gpt-4-architecture-infrastructure). So would have started at the end of May at the latest.


In [90]:
project_start_low = datetime(2022, 1, 1)
project_start_high = datetime(2022, 5, 31)

In [91]:
publication_date = frontier_pcd_df.loc[frontier_pcd_df['System'] == 'GPT-4', 'Publication date'].values[0]
publication_date

numpy.datetime64('2023-03-15T00:00:00.000000000')

In [92]:
min_project_duration = 90 / 365  # training duration
project_duration = []
while len(project_duration) < N:
    project_start_sample = lognorm_from_90_ci(dt2float(project_start_low), dt2float(project_start_high), 1)[0]
    training_end_sample = lognorm_from_90_ci(dt2float(training_end_low), dt2float(training_end_high), 1)[0]
    experiment_duration_sample = training_end_sample - project_start_sample
    if experiment_duration_sample > min_project_duration:
        project_duration.append(experiment_duration_sample)

project_duration = []
while len(project_duration) < N:
    project_start_sample = lognorm_from_90_ci(dt2float(project_start_low), dt2float(project_start_high), 1)[0]
    # Technical report said six months was spent on safety research, risk assessment, and iteration,
    # which covers the time from the end of training to publication
    project_duration_sample = dt2float(pd.to_datetime(publication_date)) - project_start_sample
    if project_duration_sample > min_project_duration:
        project_duration.append(project_duration_sample)

project_duration = np.array(project_duration)
project_duration = np.array(project_duration)
print_median_and_ci(project_duration * 12)  # convert years to months
print_median_and_ci(project_duration * 12)

Median: 12 [90% CI: 9.4, 14]
Median: 12 [90% CI: 9.4, 14]


## Energy

In [93]:
energy_cost = get_hardware_cost_dist('GPT-4', 'Energy cost') * energy_price_uncertainty_factor
print_median_and_ci(energy_cost, ci=[2.5, 97.5])

Median: 7.9e+06 [95% CI: 3.2e+06, 2.1e+07]


## Personnel

In [94]:
# Load file: data/gpt-4_contributions.json
with open('data/gpt-4_contributions.json') as f:
    gpt_4_contributions = json.load(f)
gpt_4_contributions

{'Pretraining': {'Core contributors': ['Christopher Berner',
   'Greg Brockman',
   'Trevor Cai',
   'David Farhi',
   'Chris Hesse',
   'Shantanu Jain',
   'Kyle Kosic',
   'Jakub Pachocki',
   'Alex Paino',
   'Mikhail Pavlov',
   'Michael Petrov',
   'Nick Ryder',
   'Szymon Sidor',
   'Nikolas Tezak',
   'Phil Tillet',
   'Amin Tootoonchian',
   'Qiming Yuan',
   'Wojciech Zaremba'],
  'Compute cluster scaling': ['Christopher Berner',
   'Oleg Boiko',
   'Andrew Cann',
   'Ben Chess',
   'Christian Gibson',
   'Mateusz Litwin',
   'Emy Parparita',
   'Henri Roussez',
   'Eric Sigler',
   'Akila Welihinda'],
  'Data': ['Sandhini Agarwal',
   'Suchir Balaji',
   'Mo Bavarian',
   'Che Chang',
   'Sheila Dunning',
   'Leo Gao',
   'Jonathan Gordon',
   'Peter Hoeschele',
   'Shawn Jain',
   'Shantanu Jain',
   'Roger Jiang',
   'Heewoo Jun',
   'Łukasz Kaiser',
   'Nitish Shirish Keskar',
   'Jong Wook Kim',
   'Aris Konstantinidis',
   'Chak Ming Li',
   'Todor Markov',
   'Bianca Ma

In [95]:
unique_contributors = set()
for category in gpt_4_contributions.values():
    for group in category.values():
        for contributor in group:
            unique_contributors.add(contributor)
len(unique_contributors)

284

In [96]:
num_contributors = len(unique_contributors)

In [97]:
total_salaries = np.zeros(N)
for i in range(N):
    # A given contributor plausibly spent anywhere from 10% to 90% of their time on this
    ftes = lognorm_from_90_ci(0.1, 0.9, num_contributors)
    salary = lognorm_from_90_ci(compensation_low, compensation_high, num_contributors)
    salary *= ftes
    salary *= lognorm_from_90_ci(compensation_overhead_low, compensation_overhead_high, num_contributors)
    total_salaries[i] = salary.sum()

In [98]:
personnel_cost = total_salaries * project_duration

In [99]:
print_median_and_ci(personnel_cost)

Median: 7.9e+07 [90% CI: 6.1e+07, 1e+08]


## Overall

In [100]:
cost_data['GPT-4'] = {
    'AI accelerator chip cost': get_hardware_cost_dist('GPT-4', 'AI accelerator chip cost'),
    'Other server components cost': get_hardware_cost_dist('GPT-4', 'Other server components cost'),
    'Cluster-level interconnect cost': get_hardware_cost_dist('GPT-4', 'Cluster-level interconnect cost'),
    'Energy cost': energy_cost,
    'R&D staff cost': personnel_cost,
}

# Gemini Ultra

See https://colab.research.google.com/drive/1XEKlSo-3DCFp686yGOwwfS6_DEHsFimd#scrollTo=yqWMux2iZL8L

In [101]:
# Lognormal distribution parameters for project duration
p_5th_project_duration_years = 7 / 12
p_95th_project_duration_years = 20 / 12

project_duration = lognorm_from_90_ci(p_5th_project_duration_years, p_95th_project_duration_years, N)

project_duration_hours = project_duration * HOURS_PER_YEAR

print_median_and_ci(project_duration, ci=[5, 95])

Median: 0.99 [90% CI: 0.61, 1.7]


In [102]:
chips_per_pod = 4096
number_of_pods = np.random.uniform(12, 19, N)
number_of_chips = number_of_pods * chips_per_pod
print_median_and_ci(number_of_chips)

Median: 6.3e+04 [90% CI: 5.1e+04, 7.6e+04]


In [103]:
cost_per_kwh = 0.083  # default assumption for US data centers
power_per_chip = 0.3  # Unsure - TPUv3 is under 300W for most models in https://arxiv.org/pdf/2104.10350.pdf, but this is TPUv4. See also https://cloud.google.com/tpu/docs/v4.
energy_cost = number_of_chips * power_per_chip * project_duration_hours * cost_per_kwh
print_median_and_ci(energy_cost)

Median: 1.4e+07 [90% CI: 8e+06, 2.4e+07]


In [104]:
hardware_cost = 6.7e8
# Use GPU/non-GPU fractions from Yafah Edelman as a placeholder
cost_data_upfront_capex['Gemini Ultra'] = {
    'GPU servers': 0.7 * hardware_cost,
    'Other hardware': 0.3 * hardware_cost,
    'Energy': energy_cost.mean(),
    'Personnel': 1.6e8,
}

NameError: name 'cost_data_upfront_capex' is not defined

In [None]:
tpuv4_server_lifetime = lognorm_from_90_ci(3, 8, N)  # years

In [None]:
cost_data_amortized_capex['Gemini Ultra'] = {
    'GPU servers': 0.7 * np.mean(hardware_cost * project_duration / tpuv4_server_lifetime),
    'Other hardware': 0.3 * np.mean(hardware_cost * project_duration / tpuv4_server_lifetime),
    'Energy': energy_cost.mean(),
    'Personnel': 1.6e8,
}

In [None]:
cost_data_amortized_capex

{'GPT-3': {'GPU servers': 21101057.56614471,
  'Other hardware': 3738710.870780002,
  'Energy': 213642.0,
  'Personnel': 8326041.174462708},
 'OPT-175B': {'GPU servers': 1251889.3745022374,
  'Other hardware': 395806.9399623841,
  'Energy': 39126.0672,
  'Personnel': 2621423.5046858904},
 'GPT-4': {'GPU servers': 34061173.90082,
  'Other hardware': 14606197.595430722,
  'Energy': 5000000.0,
  'Personnel': 46297056.81040383},
 'Gemini Ultra': {'GPU servers': 104527413.47373964,
  'Other hardware': 44797462.91731699,
  'Energy': 14313389.058145,
  'Personnel': 160000000.0}}

# Plots

In [None]:
cost_component_names.insert(0, 'R&D staff cost')

In [None]:
# Create a grouped bar chart with error bars
fig = go.Figure()

# Group 1
for component in cost_component_names:
    y_values = [np.median(cost_data[model][component]) for model in models[:3]]
    y_lows = [np.percentile(cost_data[model][component], 2.5) for model in models[:3]]
    y_highs = [np.percentile(cost_data[model][component], 97.5) for model in models[:3]]
    print(component)
    print(y_values, y_lows, y_highs)
    fig.add_trace(go.Bar(
        x=models[:3],
        y=y_values,
        error_y=dict(
            type='data',
            symmetric=False,  # Use asymmetric error bars
            array=[y_highs[i] - y_values[i] for i in range(len(y_values))],  # Upper bounds of the error bars
            arrayminus=[y_values[i] - y_lows[i] for i in range(len(y_values))],  # Lower bounds
            visible=True  # Make error bars visible
        ),
        name=component
    ))

# log y axis
fig.update_yaxes(type="log")

# Customize the layout
fig.update_layout(
    width=800,
    height=600,
    barmode='group',  # Group bars together
    title='Total amortized model development costs',
    xaxis=dict(
        title='Model'
    ),
    yaxis=dict(
        title='Cost (2023 USD, log scale)'
    ),
)

save_plot(fig, results_dir, 'total_amortized_model_development_costs')

# Show the plot
fig.show()


R&D staff cost
[7551285.066947095, 3549416.132660362, 79979768.47244003] [2063527.2665873237, 1160041.5896891947, 57688982.88919383] [25254180.101036076, 9480133.618856251, 103507732.32652797]
AI accelerator chip cost
[1962125.4273683922, 716160.9422062889, 39554705.98455888] [949015.3253523274, 354565.645839486, 19390688.184284717] [3994536.232585356, 1432287.6967039572, 86703737.67547105]
Other server components cost
[1355652.9834082732, 473059.9414767102, 26989831.642662495] [671583.8228633152, 238663.77271750377, 13058409.646629605] [2781528.9606980523, 971121.4656579593, 56573208.66963568]
Cluster-level interconnect cost
[512000.39184307214, 178300.0822160283, 10101705.515745563] [246202.37735644972, 82167.72118769035, 4607323.713346146] [1024343.4824198956, 376570.787157555, 20145542.536439896]
Energy cost
[187204.84389297687, 42765.55728603531, 7617954.038204856] [70265.0027469286, 16051.518420618899, 2839244.964465217] [556057.5915326377, 127027.23012134609, 19221653.343449485]

In [None]:
# Prepare data for Plotly
categories = list(next(iter(cost_data_amortized_capex.values())).keys())
models = [k for k in cost_data_amortized_capex.keys()] # if 'OPT' not in k]

In [None]:
total_costs = {model:sum([cost_data[model][category] for category in categories]) for model in models}
total_costs

{'GPT-3': 33379451.611387417,
 'OPT-175B': 4308245.886350512,
 'GPT-4': 99964428.30665456,
 'Gemini Ultra': 323638265.4492016}

In [None]:
# Creating the figure
fig = go.Figure()

for category in ['GPU servers', 'Other hardware', 'Personnel', 'Energy']:
    proportions = [cost_data_amortized_capex[model][category] / total_costs[model] * 100 for model in models]
    fig.add_trace(go.Bar(
        name=category,
        x=list(models),
        y=proportions,
        # text=[human_format(cost_data[model][category]) for model in models],
        # text=[f'{pp:.1f}%' for pp in proportions],
        textposition='auto',
    ))

# y ticks
fig.update_yaxes(range=[0, 100])

# Update the layout
fig.update_layout(
    barmode='stack',
    title='Proportions of cost for experiments and training',
    xaxis_title='Models',
    yaxis_title='Proportion (%)',
    legend_title='Cost Components',
    width=600,
    height=400,
)

# Save plot
save_plot(fig, results_dir, 'cost_proportions_stacked')

# Show the figure
fig.show()