# Compare cost estimates and regression results

This notebook is used for sensitivity analysis.

Example usage:
- Run `cost_analysis.ipynb` using the default settings
- Run `cost_analysis.ipynb`, setting `estimation_method = 'cloud'`
- Set `results_name1` and `results_name2` to the names of the folders (under `results/`) containing the results from the two runs above
- Run this notebook to compare the distributions of cost estimates and to test whether the slopes of the two regressions differ significantly.

The end of the notebook compares regression results for different N in top-N model selection, and other model selection methods.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from contextlib import redirect_stdout
import numpy as np
import os
import pandas as pd
import plotly.express as px

from plotting import *
from regression import *
from utils import *

In [3]:
# Names of the two result folders (under results/) that are compared
results_name1 = 'hardware-capex-energy-top_n=10-original'
results_name2 = 'hardware-acquisition-top_n=10-original'

# Input folders, one per result set. The trailing slash matters: file names
# are appended to these strings by plain concatenation.
results_dir1, results_dir2 = (f'results/{name}/' for name in (results_name1, results_name2))

# Output folder for everything this comparison writes; created if missing
results_dir = f'results/{results_name1}-vs-{results_name2}/'
os.makedirs(results_dir, exist_ok=True)

In [4]:
# Read the cost dataset written into each of the two result folders
cost_df1, cost_df2 = (
    pd.read_csv(folder + 'cost_dataset.csv')
    for folder in (results_dir1, results_dir2)
)

In [5]:
# Name of the cost column that is compared between the two result sets
col = 'Cost (inflation-adjusted)'

In [7]:
# Relative error stats: element-wise ratio of the second result set to the
# first. The division aligns the two columns on the frames' row index.
ratios = cost_df2[col].div(cost_df1[col])
valid_ratios = ratios.dropna()

print(f'Ratio stats: {results_name2} / {results_name1}')
print_median_and_ci(valid_ratios, ci=[10, 90])
print(f'Mean: {ratios.mean():.2g}')

Ratio stats: hardware-acquisition-top_n=10-original / hardware-capex-energy-top_n=10-original
Median: 53 [80% CI: 14, 2e+02]
Mean: 88


In [8]:
# Number of rows with a defined (non-NaN) ratio
int(ratios.count())

33

In [9]:
# Spot check for a single system. Note the direction: first result set divided
# by the second, i.e. the inverse of the `ratios` series.
system_name = 'Inflection-2'
cost_in_df1 = cost_df1.loc[cost_df1['System'] == system_name, col]
cost_in_df2 = cost_df2.loc[cost_df2['System'] == system_name, col]
cost_in_df1 / cost_in_df2

1    0.047088
Name: Cost (inflation-adjusted), dtype: float64

In [13]:
# System with the smallest ratio, and its cost under each result set.
# argmin returns a position, hence the iloc lookup.
min_pos = ratios.argmin()
for frame in (cost_df1, cost_df2):
    record = frame.iloc[min_pos]
    print(f"{record['System']}: {record[col]}")

AlphaZero: 229918.61469698732
AlphaZero: 708010.1874542783


In [12]:
# System with the largest ratio, and its cost under each result set.
# argmax returns a position, hence the iloc lookup.
max_pos = ratios.argmax()
for frame in (cost_df1, cost_df2):
    record = frame.iloc[max_pos]
    print(f"{record['System']}: {record[col]}")

BigGAN-deep 512x512: 5170.456705747183
BigGAN-deep 512x512: 2945737.804137869


## Difference in regression

In [31]:
# Convert each frame's publication dates with datetime_to_float_year and store
# the result in a new column, which the slope test uses as its regressor.
for frame in (cost_df1, cost_df2):
    publication_dates = pd.to_datetime(frame['Publication date'])
    frame['Publication date (float)'] = datetime_to_float_year(publication_dates)

In [32]:
# Compare slopes: run the regression slope t-test on the two result sets and
# capture everything it prints in a report file inside the comparison folder.
# results_dir already ends with '/', so join the file name onto it instead of
# inserting another separator.
report_path = os.path.join(results_dir, 'regression_comparison.out')
with open(report_path, 'w') as f, redirect_stdout(f):
    # First line of the report records which comparison it belongs to
    print(results_dir)
    regression_slope_t_test(cost_df1, cost_df2, ['Publication date (float)'], col, logy=True, adj_corr=True)

## Plots

In [33]:
# Tag every row with the name of the result set it came from
for frame, label in ((cost_df1, results_name1), (cost_df2, results_name2)):
    frame['Source'] = label

# Stack both frames so a single figure can colour points by result set
combined_cost_df = pd.concat([cost_df1, cost_df2])

In [34]:
# Scatter of both result sets on one log-scaled cost axis, coloured by source
fig = px.scatter(
    combined_cost_df,
    x='Publication date',
    y=col,
    color='Source',
    hover_data=['System'],
    log_y=True,
)
fig.update_traces(textposition='top center')

# Axes: labels, plus a fixed date window on x
fig.update_xaxes(title_text='Publication date', range=['2015-01-01', '2025-01-01'])
fig.update_yaxes(title_text='Cost (2023 USD)')

# Layout in a single call: untitled horizontal legend anchored bottom-right,
# empty title, fixed 800x600 canvas, font sizes and tight margins
fig.update_layout(
    legend_title_text='',
    legend=dict(orientation='h', yanchor='bottom', y=0.05, xanchor='right', x=0.95),
    title_text='',
    title_font=dict(size=16),
    autosize=False,
    width=800,
    height=600,
    font=dict(size=14),
    margin=dict(l=10, r=10, t=40, b=10),
)

save_plot(fig, results_dir, 'cost_scatter_comparison')

fig.show()

# Top-N comparison

In [35]:
import os
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from plotting import *

In [36]:
# Output folder for the figures saved below; created if missing.
# NOTE: this rebinds results_dir, which the cells above used for the pairwise
# comparison folder, so re-running those cells afterwards targets this folder.
results_dir = 'results/other-comparisons/'
os.makedirs(results_dir, exist_ok=True)

In [37]:
ns = [3, 5, 10, 20]
subplot_titles = [f'Top {n}' for n in ns]
col = 'Cost (inflation-adjusted)'

# 2x2 grid with one panel per value of N
fig = make_subplots(rows=2, cols=2, subplot_titles=subplot_titles)

for i, n in enumerate(ns):
    # 1-based grid position of this panel, reused by every call below
    panel = dict(row=i // 2 + 1, col=i % 2 + 1)

    results_name = f'hardware-capex-energy-top_n={n}-original'
    top_n_results_dir = f'results/{results_name}/'
    cost_df = pd.read_csv(top_n_results_dir + 'cost_dataset.csv')
    predicted_cost_df = pd.read_csv(top_n_results_dir + 'predicted_cost_dataset.csv')

    # Cost estimates drawn as open circles
    fig.add_trace(
        go.Scatter(
            x=cost_df['Publication date'],
            y=cost_df[col],
            mode='markers',
            marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey'), symbol='circle-open'),
            name=f'Top {n}',
        ),
        **panel,
    )

    # Predictions: the 'mean' column is raised as a power of 10 before plotting
    fig.add_trace(
        go.Scatter(
            x=predicted_cost_df['Publication date'],
            y=10**predicted_cost_df['mean'],
            mode='lines',
            name='Predicted',
        ),
        **panel,
    )

    # Slope taken from the first step of the prediction series:
    # change in 'mean' divided by change in the float publication date
    mean_step = predicted_cost_df['mean'].diff().iloc[1]
    date_step = predicted_cost_df['Publication date (float)'].diff().iloc[1]
    slope = mean_step / date_step

    # Annotate the panel with the yearly growth factor
    fig.add_annotation(
        x=predicted_cost_df['Publication date'].iloc[-15],
        y=2.5,
        text=f'{10**slope:.2g}x per year',
        showarrow=False,
        font=dict(size=12),
        **panel,
    )

    # Log y axis with fixed limits (on a log axis plotly reads the range as exponents of 10)
    fig.update_yaxes(type='log', range=[1, 9], **panel)
    # Same color for all traces: #00a5a6
    fig.update_traces(marker=dict(color='#00a5a6'), **panel)
    # Axis titles only on the left column and the bottom row
    if panel['col'] == 1:
        fig.update_yaxes(title_text='Cost (2023 USD, log scale)', **panel)
    if panel['row'] == 2:
        fig.update_xaxes(title_text='Publication date', **panel)

# Figure-wide layout: fixed size, no legend, white template, tight margins
fig.update_layout(
    width=800,
    height=600,
    showlegend=False,
    template='plotly_white',
    margin=dict(l=10, r=10, t=40, b=10),
)

save_plot(fig, results_dir, 'top_n_comparison')

# Show the figure
fig.show()

# Model selection method comparison

In [38]:
import os
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from plotting import *

In [39]:
# Output folder for the model selection comparison figure; created if missing
results_dir = 'results/other-comparisons/'
os.makedirs(results_dir, exist_ok=True)

In [40]:
# Model selection methods to compare; each maps to a results folder named
# 'hardware-capex-energy-<method>-original'. subplot_titles holds the
# human-readable label for the method at the same position.
selection_methods = ['top_n=10', 'window_percentile=80', 'backward_window_percentile=85', 'residual_from_trend=80']
subplot_titles = ['Top-N=10', 'Top 20% of models in year before/after', 'Top 15% of models in year before', 'Top 20% of residuals from trend']
col = 'Cost (inflation-adjusted)'

# Create figure with subplots: 2x2 grid, one panel per selection method
fig = make_subplots(rows=2, cols=2, subplot_titles=subplot_titles)

for i, (selection_method, method_title) in enumerate(zip(selection_methods, subplot_titles)):
    # 1-based grid position of this panel, reused by every call below
    panel = dict(row=i // 2 + 1, col=i % 2 + 1)

    results_name = f'hardware-capex-energy-{selection_method}-original'
    method_results_dir = f'results/{results_name}/'
    cost_df = pd.read_csv(method_results_dir + 'cost_dataset.csv')

    # Cost estimates drawn as open circles. The trace is named after this
    # panel's own method so that hover labels are correct and the cell does
    # not depend on a loop variable left over from another cell.
    fig.add_trace(
        go.Scatter(
            x=cost_df['Publication date'],
            y=cost_df[col],
            mode='markers',
            marker=dict(
                size=8,
                line=dict(width=1, color='DarkSlateGrey'),
                symbol='circle-open',
            ),
            name=method_title,
        ),
        **panel,
    )

    # Predictions: the 'mean' column is raised as a power of 10 before plotting
    predicted_cost_df = pd.read_csv(method_results_dir + 'predicted_cost_dataset.csv')
    fig.add_trace(
        go.Scatter(
            x=predicted_cost_df['Publication date'],
            y=10**predicted_cost_df['mean'],
            mode='lines',
            name='Predicted',
        ),
        **panel,
    )

    # Slope taken from the first step of the prediction series:
    # change in 'mean' divided by change in the float publication date
    slope = predicted_cost_df['mean'].diff().iloc[1] / predicted_cost_df['Publication date (float)'].diff().iloc[1]
    # Annotate the panel with the yearly growth factor
    fig.add_annotation(
        x=predicted_cost_df['Publication date'].iloc[-15],
        y=2.5,
        text=f'{10**slope:.2g}x per year',
        showarrow=False,
        font=dict(
            size=12,
        ),
        **panel,
    )

    # Log y axis with fixed limits (on a log axis plotly reads the range as exponents of 10)
    fig.update_yaxes(type='log', range=[1, 9], **panel)
    # Same date window in every panel
    fig.update_xaxes(range=['2015-01-01', '2025-01-01'], **panel)
    # Same color for all traces: #00a5a6
    fig.update_traces(marker=dict(color='#00a5a6'), **panel)
    # Axis titles only on the left column and the bottom row
    if i % 2 == 0:
        fig.update_yaxes(title_text='Cost (2023 USD, log scale)', **panel)
    if i // 2 == 1:
        fig.update_xaxes(title_text='Publication date', **panel)

# Update layout: fixed size, no legend
fig.update_layout(
    width=800,
    height=600,
    showlegend=False,
)

# plotly white style
fig.update_layout(template='plotly_white')

# margins
fig.update_layout(margin=dict(l=10, r=10, t=40, b=10))

save_plot(fig, results_dir, 'model_selection_comparison')

# Show the figure
fig.show()