In [15]:
# Enable IPython's autoreload extension for this kernel session.
%load_ext autoreload
# Mode 2: re-import modules before each cell runs, so edits to the local
# helper modules imported below are picked up without restarting the kernel.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
import numpy as np
import os
import pandas as pd
import plotly.express as px
import statsmodels.api as sm
from statsmodels.stats.diagnostic import breaks_cusumolsresid

from plotting import *
from regression import *
from utils import *

In [17]:
# Identify the two result runs being compared and where their files live.
# Directory strings keep a trailing slash so a file name can be appended directly.

# Run 1
results_name1 = 'hardware-capex-opex-top_n=10-clip-training-start-date'
results_dir1 = f'results/{results_name1}/'

# Run 2
results_name2 = 'hardware-capex-opex-top_n=10-long-depreciation-time'
results_dir2 = f'results/{results_name2}/'

# Output directory for the comparison itself; created if it does not exist yet.
results_dir = f'results/{results_name1}-vs-{results_name2}/'
os.makedirs(results_dir, exist_ok=True)

In [18]:
# Read the 'price dataset.csv' file produced by each run into its own DataFrame.
cost_df1, cost_df2 = (
    pd.read_csv(os.path.join(run_dir, 'price dataset.csv'))
    for run_dir in (results_dir1, results_dir2)
)

In [19]:
# Name of the cost column that the ratio, regression and plot cells below compare.
col = 'Cost (inflation-adjusted)'

In [20]:
# Relative error stats
ratios = cost_df2[col] / cost_df1[col]
print(f'Ratio stats: {results_name2} / {results_name1}')
print_median_and_ci(ratios.dropna(), ci=[2.5, 97.5])
print(f'Mean: {ratios.mean():.2g}')

Ratio stats: hardware-capex-opex-top_n=10-long-depreciation-time / hardware-capex-opex-top_n=10-clip-training-start-date
Median: 0.98 [95% CI: 0.92, 1]
Mean: 0.96


In [21]:
# Number of rows for which a ratio could be computed (non-NaN entries).
int(ratios.count())

47

In [22]:
# Costs in run 1 and run 2 for the row with the smallest run-2 / run-1 ratio.
min_pos = ratios.argmin()
tuple(frame.iloc[min_pos][col] for frame in (cost_df1, cost_df2))

(75432.13544543555, 69084.70145319075)

In [23]:
# Costs in run 1 and run 2 for the row with the largest run-2 / run-1 ratio.
max_pos = ratios.argmax()
tuple(frame.iloc[max_pos][col] for frame in (cost_df1, cost_df2))

(61697354.64033398, 61697354.64033398)

# Difference in regression

In [24]:
# Add a numeric publication-date column to both frames; it is the regressor
# used by the tests below.
for frame in (cost_df1, cost_df2):
    publication_dates = pd.to_datetime(frame['Publication date'])
    frame['Publication date (float)'] = datetime_to_float_year(publication_dates)

In [25]:
# Compare slopes
regression_slope_t_test(cost_df1, cost_df2, ['Publication date (float)'], col, logy=True)

Slope 1: 0.38 (SE: 0.05)
Slope 2: 0.38 (SE: 0.05)
Test statistic: -0.01
p-value: 1.00


(-0.006117921703975701, 0.9951321757698937)

In [26]:
# Compare slopes and intercepts
chow_test(cost_df1, cost_df2, ['Publication date (float)'], col, logy=True)

Chow Test F-statistic: 0.008307776359542725
p-value: 0.9917273982881428


(0.008307776359542725, 0.9917273982881428)

# Plots

In [27]:
# Add a new column to differentiate the DataFrames
cost_df1['Source'] = results_name1
cost_df2['Source'] = results_name2

# Combine the DataFrames
combined_cost_df = pd.concat([cost_df1, cost_df2])

In [28]:
# Scatter of cost against publication date, one colour per run, log-scaled y.
fig = px.scatter(
    combined_cost_df,
    x='Publication date',
    y=col,
    color='Source',
    hover_data=['System'],
    log_y=True,
)
fig.update_traces(textposition='top center')

# All layout settings in a single call: empty title, horizontal legend placed
# inside the plot area near the bottom right, fixed figure size, fonts, margins.
fig.update_layout(
    title_text='',
    title_font=dict(size=16),
    legend_title_text='',
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=0.05,
        xanchor='right',
        x=0.95,
    ),
    font=dict(size=14),
    autosize=False,
    width=800,
    height=600,
    margin=dict(l=10, r=10, t=40, b=10),
)

# Axis titles, plus a fixed date window on the x axis.
fig.update_xaxes(
    title_text='Publication date',
    range=['2015-01-01', '2025-01-01'],
)
fig.update_yaxes(title_text='Cost (2023 USD)')

# Save the figure into the comparison results directory, then display it.
save_plot(fig, results_dir, 'cost_scatter_comparison')

fig.show()