# Setup

In [1]:
from collections import defaultdict
from datetime import datetime
import json
import numpy as np
import os
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from cost import *
from data import *
from energy import *
from inflation import *
from parameters import *
from plotting import *
from utils import *

In [2]:
results_dir = f'results/development-costs/'
os.makedirs(results_dir, exist_ok=True)

Sample size for distributions

In [3]:
N = 1000

Cost of staff compensation
- Use Google Software Engineer as the reference: https://web.archive.org/web/20240515221234/https://www.levels.fyi/companies/google/salaries/software-engineer/locations/san-francisco-bay-area?dma=807
- Approximately 200K to 1.2M


In [4]:
compensation_low = 2e5
compensation_high = 1.2e6


Apply 1.25 - 1.4 overhead factor for benefits, payroll, etc. https://www.sba.gov/blog/how-much-does-employee-cost-you 

In [5]:
compensation_overhead_low = 1.25
compensation_overhead_high = 1.4

In [6]:
cost_data = {}
cost_data_with_hardware_acq = {}

Default ratio of total experiment + training compute to final training run compute
- Smaller versions of GPT-3 made up ~4.5e22 FLOP (based on the formula compute = 6 * parameters * tokens), compared to 175B using ~3e23 FLOP. See Table 2.1 of the [paper](https://arxiv.org/pdf/2005.14165). That's a factor of ~1.14 to go from 175B to total FLOP. Certainly more than that was needed due to failures and other experiments. So 1.2x seems like a good low-end estimate.
- https://arxiv.org/pdf/2111.00364.pdf, p.3, Fig. 3 caption: "At Facebook, we observe a rough power capacity breakdown of 10:20:70 for AI infrastructures devoted to the three key phases — Experimentation, Training, and Inference". 10:20 ratio means a 1.5x multiplier from training to training+experiment compute. Note however that the "training" category is not just the final training run; it includes additional hyper-parameter tuning and retraining for some models. So the actual multiplier would be higher under our definition.
- BLOOM project - https://arxiv.org/abs/2211.02001, p.8, Table 5. ~63% of compute was spent training smaller models than 176B. That's a multiplier of ~2.7x.
- https://arxiv.org/abs/2205.01068: "[...] OPT-175B was developed with an estimated carbon emissions footprint (CO2eq) of 75 tons [...] With ablations, baselines and downtime, our own estimates of total cost is roughly 2× higher."
- We think the Facebook, BLOOM and OPT-175B cases are the more central examples as they account better for all experiments. A factor close to 2x seems like a reasonable median estimate.
- On the high end, it's plausible that several, full iterations of the full-scale training run are necessary before success - say, 4x.

In [7]:
experiment_factor = lognorm_from_90_ci(1.2, 4, N)
print_median_and_ci(experiment_factor)

Median: 2.2 [90% CI: 1.2, 3.9]


Uncertainty in energy prices

In [8]:
likely_datacenter_states = ['California', 'Nevada', 'Oregon', 'Washington']
energy_prices = [US_STATE_ENERGY_PRICES_PER_KWH[state] for state in likely_datacenter_states]
mean = np.mean(energy_prices)
np.max(energy_prices) / mean, np.min(energy_prices) / mean

(1.727914785226352, 0.6247539654972791)

In [9]:
energy_price_uncertainty_factor = lognorm_from_90_ci(0.6, 1.7, N)

In [10]:
dt2float = lambda x: x.year + x.month/12 + x.day/365.25
float2dt = lambda x: datetime(int(x), int((x-int(x))*12), int(((x-int(x))*12-int((x-int(x))*12))*365.25))

In [11]:
frontier_pcd_df, hardware_df, price_df = load_data_for_cost_estimation(
    compute_threshold_method='top_n', compute_threshold=10,
)

In [12]:
frontier_pcd_df['Publication date'] = pd.to_datetime(frontier_pcd_df['Publication date'])

Models

In [13]:
models = ['GPT-3 175B (davinci)', 'OPT-175B', 'GPT-4', 'Gemini Ultra']

In [14]:
selected_model_df = frontier_pcd_df.loc[[s in models for s in frontier_pcd_df['System']]]

## Hardware

In [15]:
# Unset index
selected_model_df.reset_index(inplace=True)

In [16]:
def _estimate_chip_hours(row):
    return estimate_chip_hours(row, hardware_df)

selected_model_df['Training chip-hours'] = selected_model_df.apply(_estimate_chip_hours, axis=1)
selected_model_df['Training chip-hours']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_model_df['Training chip-hours'] = selected_model_df.apply(_estimate_chip_hours, axis=1)


0    132000000.0
1     57000000.0
2       812544.0
3      3552000.0
Name: Training chip-hours, dtype: float64

In [17]:
selected_model_df.loc[:, 'Original training chip-hours'] = selected_model_df.loc[:, 'Training chip-hours']
hardware_costs = defaultdict(dict)
cost_component_names = [
    'AI accelerator chip cost',
    'Other server components cost',
    'Cluster-level interconnect cost',
    'Energy cost',
]

for percentile in [2.5, 50, 97.5]:
    print(f'\n\n{percentile}th percentile of experiments time')
    experiment_factor_value = np.percentile(experiment_factor, percentile)
    selected_model_df.loc[:, 'Training chip-hours'] = selected_model_df.loc[:, 'Original training chip-hours'] * experiment_factor_value
    cost_df = estimate_hardware_capex_energy(selected_model_df, hardware_df, price_df, separate_components=True)
    cost_df = cost_df.set_index('System', inplace=False)

    for model in models:
        cost_components = {}
        for cost_component_name in cost_component_names:
            cost_components[cost_component_name] = cost_df.loc[model, cost_component_name]
        hardware_costs[model][str(percentile)] = cost_components

hardware_costs



2.5th percentile of experiments time
==== System: Gemini Ultra ====
Estimated the value of Google TPU v4 server, available from 2021-05-20 00:00:00 and used from 2023-05-10 00:00:00: 10527.742991314644

==== System: GPT-4 ====
Estimated the value of NVIDIA A100 SXM4 40 GB server, available from 2020-08-12 00:00:00 and used from 2022-05-12 00:00:00: 14165.054738257799

==== System: OPT-175B ====
Estimated the value of NVIDIA A100 SXM4 80 GB server, available from 2021-02-14 00:00:00 and used from 2022-02-27 23:00:00: 17818.700400260856

==== System: GPT-3 175B (davinci) ====
Estimated the value of NVIDIA Tesla V100 DGXS 32 GB server, available from 2018-06-25 00:00:00 and used from 2019-10-01 00:00:00: 11463.61551009175

==== System: Gemini Ultra ====
Estimated cost: {'AI accelerator chip cost': 34478142.80448855, 'Other server components cost': 22066011.394872673, 'Cluster-level interconnect cost': 10743389.297878629, 'Energy cost': 3239961.0752525907}
==== System: GPT-4 ====
Estimat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_model_df.loc[:, 'Original training chip-hours'] = selected_model_df.loc[:, 'Training chip-hours']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_model_df.loc[:, 'Training chip-hours'] = selected_model_df.loc[:, 'Original training chip-hours'] * experiment_factor_value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.htm

defaultdict(dict,
            {'GPT-3 175B (davinci)': {'2.5': {'AI accelerator chip cost': 980361.7777334974,
               'Other server components cost': 676449.6266361132,
               'Cluster-level interconnect cost': 314794.1668302259,
               'Energy cost': 122878.61863086693},
              '50': {'AI accelerator chip cost': 1972379.8165294528,
               'Other server components cost': 1360942.0734053222,
               'Cluster-level interconnect cost': 633331.159087607,
               'Energy cost': 247218.23389612624},
              '97.5': {'AI accelerator chip cost': 3844958.628958232,
               'Other server components cost': 2653021.4539811797,
               'Cluster-level interconnect cost': 1234616.2157584878,
               'Energy cost': 481927.40246514836}},
             'OPT-175B': {'2.5': {'AI accelerator chip cost': 354889.6487534476,
               'Other server components cost': 234227.16817727534,
               'Cluster-level interconnec

# GPT-3

## Project duration

Training end time
1. Shevlane (2022)
https://uploads-ssl.webflow.com/614b70a71b9f71c9c240c7a7/6262a1a55526a373cc93207d_Shevlane%20dissertation%20preprint.pdf
p.66 of the PDF: A senior member of OpenAI (specified anonymously on p.27 of the PDF) 
told the author "GPT-3 existed for a long time before the paper came out. We delayed the 
paper. [...] But it's months, it doesn't really count."
p.67 of the PDF: CAMERON said "Firstly, [the idea for a commercial API for GPT-3] 
started out as a research API. It probably was . . . early January 2020."

1. We think it plausibly could have been produced soon after the Microsoft deal was 
announced in July 2019. Supposing the announcement coincided with Microsoft giving 
OpenAI access to the necessary compute, and OpenAI already having almost everything 
about GPT-3 planned in advance, and it took less than 1 month to train, then GPT-3 could 
have been produced in August 2019.

1. So we estimate August to January as our 90% CI, and halfway between (October) as the
central estimate.

In [18]:
training_end_low = datetime(2019, 8, 1)
training_end_high = datetime(2020, 1, 1)


Project start time
- Lower bound on start time: when GPT-2 was published, 2019-Feb-14
  - It could have been earlier than this, but that seems unlikely unless GPT-2 was delayed for months
  - The scaling laws paper was only published in January 2020, and GPT-3 was based on the findings of that paper
- Upper bound on start time: 2019-Nov-01, two months before the upper bound on training completion
  - We think they’d need six weeks at the absolute minimum, to prepare training data and scale up the experiments successfully


In [19]:
project_start_low = datetime(2019, 2, 14)
project_start_high = datetime(2019, 11, 1)


In [20]:
publication_date = frontier_pcd_df.loc[frontier_pcd_df['System'] == 'GPT-3 175B (davinci)', 'Publication date'].values[0]
publication_date

numpy.datetime64('2020-05-28T00:00:00.000000000')

In [21]:
min_project_duration = 15 / 365  # training duration
experiments_duration = []
while len(experiments_duration) < N:
    project_start_sample = lognorm_from_90_ci(dt2float(project_start_low), dt2float(project_start_high), 1)[0]
    training_end_sample = lognorm_from_90_ci(dt2float(training_end_low), dt2float(training_end_high), 1)[0]
    experiment_duration_sample = training_end_sample - project_start_sample
    if experiment_duration_sample > min_project_duration:
        experiments_duration.append(experiment_duration_sample)

project_duration = []
while len(project_duration) < N:
    project_start_sample = lognorm_from_90_ci(dt2float(project_start_low), dt2float(project_start_high), 1)[0]
    # Paper was delayed by "months"
    publication_delay_sample = lognorm_from_90_ci(2*30, 4*30, 1)[0]
    project_end_sample = dt2float(publication_date - pd.to_timedelta(publication_delay_sample, unit='D'))
    project_duration_sample = project_end_sample - project_start_sample
    if project_duration_sample > min_project_duration:
        project_duration.append(project_duration_sample)

experiments_duration = np.array(experiments_duration)
project_duration = np.array(project_duration)
print_median_and_ci(experiments_duration * 12)  # convert years to months
print_median_and_ci(project_duration * 12)

Median: 4.3 [90% CI: 0.97, 8.9]
Median: 8.4 [90% CI: 4, 13]


## Energy

Total energy consumption of training: 1287 MWh ([Patterson et al. 2021](https://arxiv.org/abs/2104.10350), Table 4)

In [22]:
energy_consumption_kwh = 1287e3 * experiment_factor
price_per_kwh = energy_price(2020) * energy_price_uncertainty_factor
energy_cost = energy_consumption_kwh * price_per_kwh
print_median_and_ci(energy_cost)

Median: 1.9e+05 [90% CI: 8.8e+04, 4.2e+05]


## Personnel

In [23]:
total_salaries = np.zeros(N)
num_authors = 31
num_contributors = 25  # counting listed contributors in paper
for i in range(N):
    ftes = lognorm_from_90_ci(0.05, 0.8, num_contributors)
    salary = lognorm_from_90_ci(compensation_low, compensation_high, num_contributors)
    salary *= ftes
    salary *= lognorm_from_90_ci(compensation_overhead_low, compensation_overhead_high, num_contributors)
    total_salaries[i] = salary.sum()

In [24]:
personnel_cost = total_salaries * project_duration

In [25]:
print_median_and_ci(personnel_cost, ci=[2.5, 97.5])

Median: 3.5e+06 [95% CI: 1.2e+06, 7.2e+06]


## Overall

In [26]:
hardware_costs['GPT-3 175B (davinci)']['2.5']

{'AI accelerator chip cost': 980361.7777334974,
 'Other server components cost': 676449.6266361132,
 'Cluster-level interconnect cost': 314794.1668302259,
 'Energy cost': 122878.61863086693}

In [27]:
def get_hardware_cost_dist(model, component, ci=[2.5, 97.5]):
    return lognorm_from_ci(
        hardware_costs[model][str(ci[0])][component],
        hardware_costs[model][str(ci[1])][component],
        ci[1] - ci[0],
        N
    )

In [28]:
cost_data['GPT-3 175B (davinci)'] = {
    'AI accelerator chip cost': get_hardware_cost_dist('GPT-3 175B (davinci)', 'AI accelerator chip cost'),
    'Other server components cost': get_hardware_cost_dist('GPT-3 175B (davinci)', 'Other server components cost'),
    'Cluster-level interconnect cost': get_hardware_cost_dist('GPT-3 175B (davinci)', 'Cluster-level interconnect cost'),
    'Energy cost': energy_cost,
    'R&D staff cost': personnel_cost,
}

# OPT-175B

## Energy

https://arxiv.org/abs/2205.01068: "[...] OPT-175B was developed with an estimated carbon emissions footprint (CO2eq) of 75 tons [...] With ablations, baselines and downtime, our own estimates of total cost is roughly 2× higher."

- I haven't found numbers for carbon intensity, or how they calculate it
- According to https://arxiv.org/pdf/2104.10350.pdf (Table 1), Gross CO2e/KWh (kg/KWh) for the “Google Iowa Council Bluffs” data center in 2020 was 0.478 CO2e/kWh (kg/kWh)
- Energy use:
  - 992 A100 GPU units
  - 300W per unit (https://www.nvidia.com/en-us/data-center/a100/)
  - Training time 33 days (https://github.com/facebookresearch/metaseq/blob/d703cf1ae1e0faaff6c20629398dfbe02b98cf77/projects/OPT/chronicles/final_update.md)
  - Energy = 992 units * 0.3 kW/unit * 33 days * 24h/day ~= 236000 kWh
  - 75000 kg / 236000 kWh ~= 0.318 CO2e/kWh (kg/kWh)
  - Sanity check: this is not far from the Google data center gross carbon intensity of 0.478 reported by [Patterson et al. (2021)](https://arxiv.org/abs/2104.10350)

In [29]:
training_time_hours = 33 * 24
power_per_gpu = 0.3
num_gpus = 992
energy_consumption_kwh = training_time_hours * num_gpus * power_per_gpu * experiment_factor
price_per_kwh = energy_price(2022) * energy_price_uncertainty_factor
energy_cost = energy_consumption_kwh * price_per_kwh
print_median_and_ci(energy_cost, ci=[2.5, 97.5])

Median: 4.3e+04 [95% CI: 1.7e+04, 1.1e+05]


## Personnel

2021-Oct-20: First date in [OPT logbook](https://github.com/facebookresearch/metaseq/blob/main/projects/OPT/chronicles/OPT175B_Logbook.pdf)

2022-Jan-06: Date in the logbook where the final training run is stated to have completed

2022-May-02: Publication on arxiv

In [30]:
project_duration = 192 / DAYS_PER_YEAR

In [31]:
num_authors = 19
num_contributors = num_authors

In [32]:
total_salaries = np.zeros(N)
for i in range(N):
    ftes = lognorm_from_90_ci(0.05, 0.8, num_contributors)
    salary = lognorm_from_90_ci(compensation_low, compensation_high, num_contributors)
    salary *= ftes
    salary *= lognorm_from_90_ci(compensation_overhead_low, compensation_overhead_high, num_contributors)
    total_salaries[i] = salary.sum()

In [33]:
personnel_cost = total_salaries * project_duration

In [34]:
print_median_and_ci(personnel_cost, ci=[2.5, 97.5])

Median: 2e+06 [95% CI: 1.2e+06, 3.5e+06]


## Overall

In [35]:
cost_data['OPT-175B'] = {
    'AI accelerator chip cost': get_hardware_cost_dist('OPT-175B', 'AI accelerator chip cost'),
    'Other server components cost': get_hardware_cost_dist('OPT-175B', 'Other server components cost'),
    'Cluster-level interconnect cost': get_hardware_cost_dist('OPT-175B', 'Cluster-level interconnect cost'),
    'Energy cost': energy_cost,
    'R&D staff cost': personnel_cost,
}

# GPT-4

## Project duration

https://arxiv.org/abs/2303.08774
"This system card analyzes GPT-4 [...] Since it finished training in August of 2022 [...]"

In [36]:
training_end_low = datetime(2022, 8, 1)
training_end_high = datetime(2022, 8, 31)

Project start time
- We know they started training GPT-3.5 about one year before [announcement](https://openai.com/index/gpt-4-research/) in March 2023, and then trained GPT-4 after that. They built new infrastructure/codebase for and GPT-3.5 was a test run. Maybe some research experiments were going on for a few months in total prior to that.
- Indirect source that final training run of GPT-4 took about three months: [SemiAnalysis](https://www.semianalysis.com/p/gpt-4-architecture-infrastructure). So would have started at the end of May at the latest.


In [37]:
project_start_low = datetime(2022, 1, 1)
project_start_high = datetime(2022, 5, 31)

In [38]:
publication_date = frontier_pcd_df.loc[frontier_pcd_df['System'] == 'GPT-4', 'Publication date'].values[0]
publication_date

numpy.datetime64('2023-03-15T00:00:00.000000000')

In [39]:
min_project_duration = 90 / 365  # training duration
experiments_duration = []
while len(experiments_duration) < N:
    project_start_sample = lognorm_from_90_ci(dt2float(project_start_low), dt2float(project_start_high), 1)[0]
    training_end_sample = lognorm_from_90_ci(dt2float(training_end_low), dt2float(training_end_high), 1)[0]
    experiment_duration_sample = training_end_sample - project_start_sample
    if experiment_duration_sample > min_project_duration:
        experiments_duration.append(experiment_duration_sample)

project_duration = []
while len(project_duration) < N:
    project_start_sample = lognorm_from_90_ci(dt2float(project_start_low), dt2float(project_start_high), 1)[0]
    # Technical report said six months was spent on safety research, risk assessment, and iteration,
    # which covers the time from the end of training to publication
    project_duration_sample = dt2float(pd.to_datetime(publication_date)) - project_start_sample
    if project_duration_sample > min_project_duration:
        project_duration.append(project_duration_sample)

experiments_duration = np.array(experiments_duration)
project_duration = np.array(project_duration)
print_median_and_ci(experiments_duration * 12)  # convert years to months
print_median_and_ci(project_duration * 12)

Median: 5.3 [90% CI: 3.4, 7.5]
Median: 12 [90% CI: 9.5, 14]


## Energy

In [40]:
energy_cost = get_hardware_cost_dist('GPT-4', 'Energy cost') * energy_price_uncertainty_factor
print_median_and_ci(energy_cost, ci=[2.5, 97.5])

Median: 7.6e+06 [95% CI: 3e+06, 1.9e+07]


## Personnel

In [41]:
# Load file: data/gpt-4_contributions.json
with open('data/gpt-4_contributions.json') as f:
    gpt_4_contributions = json.load(f)
gpt_4_contributions

{'Pretraining': {'Core contributors': ['Christopher Berner',
   'Greg Brockman',
   'Trevor Cai',
   'David Farhi',
   'Chris Hesse',
   'Shantanu Jain',
   'Kyle Kosic',
   'Jakub Pachocki',
   'Alex Paino',
   'Mikhail Pavlov',
   'Michael Petrov',
   'Nick Ryder',
   'Szymon Sidor',
   'Nikolas Tezak',
   'Phil Tillet',
   'Amin Tootoonchian',
   'Qiming Yuan',
   'Wojciech Zaremba'],
  'Compute cluster scaling': ['Christopher Berner',
   'Oleg Boiko',
   'Andrew Cann',
   'Ben Chess',
   'Christian Gibson',
   'Mateusz Litwin',
   'Emy Parparita',
   'Henri Roussez',
   'Eric Sigler',
   'Akila Welihinda'],
  'Data': ['Sandhini Agarwal',
   'Suchir Balaji',
   'Mo Bavarian',
   'Che Chang',
   'Sheila Dunning',
   'Leo Gao',
   'Jonathan Gordon',
   'Peter Hoeschele',
   'Shawn Jain',
   'Shantanu Jain',
   'Roger Jiang',
   'Heewoo Jun',
   'Łukasz Kaiser',
   'Nitish Shirish Keskar',
   'Jong Wook Kim',
   'Aris Konstantinidis',
   'Chak Ming Li',
   'Todor Markov',
   'Bianca Ma

In [42]:
unique_contributors = set()
for category in gpt_4_contributions.values():
    for group in category.values():
        for contributor in group:
            unique_contributors.add(contributor)
len(unique_contributors)

284

In [43]:
num_contributors = len(unique_contributors)

In [44]:
total_salaries = np.zeros(N)
for i in range(N):
    # For each i, each contributor is assigned a random FTE workload, salary, and overhead
    # A given contributor plausibly spent anywhere from 10% to 90% of their time on this
    ftes = lognorm_from_90_ci(0.05, 0.8, num_contributors)
    salary = lognorm_from_90_ci(compensation_low, compensation_high, num_contributors)
    overhead = lognorm_from_90_ci(compensation_overhead_low, compensation_overhead_high, num_contributors)
    salary *= ftes
    salary *= overhead
    total_salaries[i] = salary.sum()

In [45]:
personnel_cost = total_salaries * project_duration

In [46]:
print_median_and_ci(personnel_cost, ci=[2.5, 97.5])

Median: 6e+07 [95% CI: 4.5e+07, 8e+07]


## Overall

In [47]:
cost_data['GPT-4'] = {
    'AI accelerator chip cost': get_hardware_cost_dist('GPT-4', 'AI accelerator chip cost'),
    'Other server components cost': get_hardware_cost_dist('GPT-4', 'Other server components cost'),
    'Cluster-level interconnect cost': get_hardware_cost_dist('GPT-4', 'Cluster-level interconnect cost'),
    'Energy cost': energy_cost,
    'R&D staff cost': personnel_cost,
}

# Gemini Ultra

See https://colab.research.google.com/drive/1XEKlSo-3DCFp686yGOwwfS6_DEHsFimd#scrollTo=yqWMux2iZL8L

## Project duration

In [48]:
amortized_hardware_cost = lognorm_from_90_ci(5.9e6, 1.1e8, N)
server_cost = amortized_hardware_cost / 1.15
ai_accelerator_chip_cost = server_cost / 1.64
other_server_components_cost = server_cost - ai_accelerator_chip_cost
cluster_level_interconnect_cost = amortized_hardware_cost - server_cost

In [49]:
# Training time - from Gemini compute notebook
percentile_5th = 1*31*24*3600 # 5th percentile in seconds
percentile_95th = 6*31*24*3600  # 95th percentile in seconds
training_time_seconds = lognorm_from_90_ci(percentile_5th, percentile_95th, N)
print_median_and_ci(training_time_seconds)

experiments_duration_seconds = training_time_seconds * experiment_factor
experiments_duration_hours = experiments_duration_seconds / 3600
experiments_duration_years = experiments_duration_hours / (365.25 * 24)
print_median_and_ci(experiments_duration_years)

Median: 6.3e+06 [90% CI: 2.7e+06, 1.6e+07]
Median: 0.46 [90% CI: 0.16, 1.3]


## Energy

In [50]:
chips_per_pod = 4096
number_of_pods = np.random.uniform(12, 19, N)
number_of_chips = number_of_pods * chips_per_pod
print_median_and_ci(number_of_chips)

Median: 6.3e+04 [90% CI: 5.1e+04, 7.6e+04]


In [51]:
hardware_model = 'Google TPU v4'
organization = 'Google'
"""
https://cloud.google.com/blog/topics/systems/tpu-v4-enables-performance-energy-and-co2e-efficiency-gains
"Google's Cloud TPU v4 outperforms TPU v3 by 2.1x on average on a per-chip basis and improves performance/Watt by 2.7x."
TPU v3 performance per Watt: 123 TFLOPS / 450W = 0.273 TFLOPS/W
0.273 * 2.7 = 0.738 TFLOPS/W
TPU v4 is 275 TFLOPS => 275 / 0.738 = 373W
"""
chip_TDP_kw = 373 / 1000
# Adjust for whole server power draw (CPUs, memory, cooling)
server_TDP_kw = chip_TDP_kw * chip_to_server_power(hardware_model)
# Adjust for average power draw
server_power_kw = server_TDP_kw * server_TDP_fraction(hardware_model)
# Adjust for data center power distribution and cooling
adj_server_power_kw = server_power_kw * power_usage_effectiveness(organization)
cluster_kwh = adj_server_power_kw * number_of_chips * experiments_duration_hours

cost_per_kwh = energy_price(2023) * energy_price_uncertainty_factor
energy_cost = cluster_kwh * cost_per_kwh
print_median_and_ci(energy_cost)

Median: 5.5e+06 [90% CI: 1.8e+06, 1.9e+07]


## Personnel

In [52]:
# Testing personnel cost using the same method as other models (rather than the method in the Gemini Ultra cost notebook)
num_contributors = 984
total_salaries = np.zeros(N)
for i in range(N):
    # For each i, each contributor is assigned a random FTE workload, salary, and overhead
    # A given contributor plausibly spent anywhere from 10% to 90% of their time on this
    ftes = lognorm_from_90_ci(0.05, 0.8, num_contributors)
    salary = lognorm_from_90_ci(compensation_low, compensation_high, num_contributors)
    overhead = lognorm_from_90_ci(compensation_overhead_low, compensation_overhead_high, num_contributors)
    salary *= ftes
    salary *= overhead
    total_salaries[i] = salary.sum()
personnel_cost = total_salaries * project_duration
print_median_and_ci(personnel_cost, ci=[2.5, 97.5])

Median: 2.1e+08 [95% CI: 1.6e+08, 2.7e+08]


In [53]:
personnel_cost = lognorm_from_90_ci(9.7e7, 2.7e8, N)
print_median_and_ci(personnel_cost)

Median: 1.6e+08 [90% CI: 9.4e+07, 2.7e+08]


## Overall

In [54]:
cost_data['Gemini Ultra'] = {
    'AI accelerator chip cost': ai_accelerator_chip_cost,
    'Other server components cost': other_server_components_cost,
    'Cluster-level interconnect cost': cluster_level_interconnect_cost,
    'Energy cost': energy_cost,
    'R&D staff cost': personnel_cost,
}

# Inflation adjustment

In [55]:
def adjust_value_for_inflation(value, path_to_price_index, publication_date, to_year_month):
    price_index = pd.read_csv(path_to_price_index)
    from_date = str(publication_date)
    from_year_month = from_date.rsplit('-', maxsplit=1)[0] + '-01'
    from_price_index = price_index[price_index['DATE'] == from_year_month]['PCU518210518210'].values[0]
    to_price_index = price_index[price_index['DATE'] == to_year_month]['PCU518210518210'].values[0]
    adjust_factor = to_price_index / from_price_index
    return value * adjust_factor

In [56]:
for model in cost_data:
    for cost_component in cost_data[model]:
        # Adjust for inflation
        publication_date = frontier_pcd_df.loc[frontier_pcd_df['System'] == model, 'Publication date'].values[0]
        value = cost_data[model][cost_component]
        cost_data[model][cost_component] = adjust_value_for_inflation(value, 'data/PCU518210518210.csv', publication_date, '2023-12-01')

# Plots

In [57]:
cost_component_names.insert(0, 'R&D staff cost')

In [58]:
# Create a grouped bar chart with error bars
fig = go.Figure()

# Group 1
for component in cost_component_names:
    y_values = [np.median(cost_data[model][component]) for model in models]
    y_lows = [np.percentile(cost_data[model][component], 2.5) for model in models]
    y_highs = [np.percentile(cost_data[model][component], 97.5) for model in models]
    print(component)
    print(y_values, y_lows, y_highs)
    fig.add_trace(go.Bar(
        x=models,
        y=y_values,
        error_y=dict(
            type='data',
            symmetric=False,  # Use asymmetric error bars
            array=[y_highs[i] - y_values[i] for i in range(len(y_values))],  # Upper bounds of the error bars
            arrayminus=[y_values[i] - y_lows[i] for i in range(len(y_values))],  # Lower bounds
            visible=True  # Make error bars visible
        ),
        name=component
    ))

# log y axis
fig.update_yaxes(type="log")

# Customize the layout
fig.update_layout(
    width=800,
    height=600,
    barmode='group',  # Group bars together
    title='Amortized hardware CapEx, energy, and R&D staff costs for training and experiments',
    xaxis=dict(
        title='Model'
    ),
    yaxis=dict(
        title='Cost (2023 USD, log scale)'
    ),
)

save_plot(fig, results_dir, 'total_amortized_model_development_costs')

# Show the plot
fig.show()


R&D staff cost
[3699812.137482164, 2058461.0466851594, 61136073.66507499, 162115489.99744642] [1267815.5628886884, 1215633.2492341765, 45295091.48163555, 82225245.33071743] [7539275.771880239, 3560786.8746570437, 80427187.91315024, 298569971.04345375]
AI accelerator chip cost
[2001657.3159939395, 723141.5698171624, 38741591.7180551, 13171215.616312604] [975722.0685038089, 371290.71878062835, 20299174.67752671, 2448561.077633404] [3897079.3335759593, 1437471.8235095749, 80852614.60041523, 74810581.06949915]
Other server components cost
[1422302.4023819866, 476465.8893585418, 26342273.202573247, 8429577.994440064] [704670.1666631859, 234539.29188845266, 13422120.797472958, 1567079.0896853781] [2738921.461380745, 954824.2018584389, 51356187.17463251, 47878771.88447944]
Cluster-level interconnect cost
[659275.3315497236, 232346.01601526455, 12522680.66992208, 3240119.041612897] [320191.21767572634, 112767.64822757158, 6424407.682242967, 602346.0250978172] [1239108.8789019284, 456318.759431

In [59]:
total_costs = {}
for model in models:
    total_costs[model] = sum(np.median(cost_data[model][component]) for component in cost_component_names)
total_costs

{'GPT-3 175B (davinci)': 7981506.275015453,
 'OPT-175B': 3534290.0574047626,
 'GPT-4': 146403832.0808149,
 'Gemini Ultra': 192497958.58318412}

In [60]:
for model in models:
    print(model)
    for percentile in [2.5, 50, 97.5]:
        print(f'{percentile}th percentile')
        print(sum(np.percentile(cost_data[model][component], percentile) for component in cost_component_names[1:]))

GPT-3 175B (davinci)
2.5th percentile
2079460.575329677
50th percentile
4281694.137533289
97.5th percentile
8370285.508384831
OPT-175B
2.5th percentile
736035.8928006345
50th percentile
1475829.010719603
97.5th percentile
2958088.7584158536
GPT-4
2.5th percentile
43195343.31743777
50th percentile
85267758.41573991
97.5th percentile
178011266.23202875
Gemini Ultra
2.5th percentile
5994845.533430393
50th percentile
30382468.585737728
97.5th percentile
165922391.18869555


In [61]:
len(price_df['Hardware model'].unique())

24

In [62]:
# Creating the figure
fig = go.Figure()

for component in cost_component_names:
    proportions = [np.median(cost_data[model][component]) / total_costs[model] * 100 for model in models]
    fig.add_trace(go.Bar(
        name=component,
        x=list(models),
        y=proportions,
        text=[f'{np.around(pp/100, 1) * 100:.0f}%' for pp in proportions],
        textposition='auto',
    ))

# y ticks
fig.update_yaxes(range=[0, 100])

# Update the layout
fig.update_layout(
    barmode='stack',
    title='Proportions of cost for training and experiments',
    xaxis_title='Models',
    yaxis_title='Proportion (%)',
    legend_title='Cost components',
    width=600,
    height=400,
)

# Save plot
save_plot(fig, results_dir, 'cost_proportions_stacked')

# Show the figure
fig.show()