# Setup

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and code edits are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
from contextlib import redirect_stdout
import numpy as np
import os
import pandas as pd
from cost import *
from prices import *
from inflation import *
from regression import *
from utils import *
import imputation

In [None]:
# Run configuration. These values choose the frontier-model threshold, control
# imputation, select the cost estimators to run, and name the output directory.
compute_threshold_method = 'top_n'  # top_n, window_percentile
compute_threshold = 10  # e.g. 10 to select top 10; 75 to select top 25%
variant = '2025-03-17_exclude_finetunes_at_threshold_stage'  # whatever else distinguishes this run, e.g. 'excluding-AlphaGo'
# Substrings of model names to drop from the results; an empty list excludes nothing.
exclude_models_containing = []  # ['GNMT', 'AlphaZero', 'AlphaGo Master', 'AlphaGo Zero']

# Imputation configuration (matching john_og_version)
enable_imputation = True  # Set to False to disable imputation
imputation_method = 'most_common'  # 'knn', 'most_common', 'none'
knn_neighbors = 5  # Number of neighbors for KNN imputation (if using KNN)

# Run all three cost estimation methods
estimation_methods = ['hardware-capex-energy', 'hardware-acquisition', 'cloud']
# Maps each method name to the estimator function that is called for it
# (presumably provided by the star imports above; verify).
estimation_method_lookup = {
    'hardware-capex-energy': estimate_hardware_capex_energy,
    'hardware-acquisition': estimate_hardware_acquisition_cost,
    'cloud': estimate_cloud_costs,
}

# Output directory for this run, named after the threshold settings and the
# variant label. It is created if it does not exist yet.
results_dir = f'results/all-methods-{compute_threshold_method}={compute_threshold}-{variant}/'
os.makedirs(results_dir, exist_ok=True)

# Load data

In [None]:
# Load the three input tables using the threshold settings configured above.
# They are bound as the frontier model table, the hardware table and the price table.
frontier_pcd_df, hardware_df, price_df = load_data_for_cost_estimation(
    compute_threshold_method=compute_threshold_method, compute_threshold=compute_threshold,
)

In [None]:
# Quick look at the sizes of the three loaded tables.
len(frontier_pcd_df), len(hardware_df), len(price_df)

In [None]:
# Columns whose availability is reported before and after imputation.
_quality_columns = (
    'Training hardware',
    'Hardware quantity',
    'Hardware utilization',
    'Training time (hours)',
)


def _print_known_counts(df):
    """Print, for each tracked column, how many rows hold a non-null value."""
    total = len(df)
    for column in _quality_columns:
        known = df[column].notna().sum()
        print(f"Models with known {column}: {known}/{total}")


print("Data Quality Report (Before Imputation):")
_print_known_counts(frontier_pcd_df)

if not (enable_imputation and imputation_method != 'none'):
    print("\nSkipping imputation (disabled in configuration)")
else:
    print(f"\nApplying {imputation_method} imputation...")
    # Imputation works on a copy; the result then replaces frontier_pcd_df.
    if imputation_method == 'knn':
        frontier_pcd_df = imputation.knn_impute_pcd(
            frontier_pcd_df.copy(), num_neighbors=knn_neighbors
        )
        print(f"Applied KNN imputation with {knn_neighbors} neighbors")
    elif imputation_method == 'most_common':
        frontier_pcd_df = imputation.most_common_impute_training_hardware(
            frontier_pcd_df.copy()
        )
        print("Applied most common value imputation for training hardware")

    # The report is repeated even if the method name matched neither branch above.
    print("\nData Quality Report (After Imputation):")
    _print_known_counts(frontier_pcd_df)

In [None]:
# Resolve the imputation callable and its keyword arguments from the configuration.
# A disabled flag, 'none', or an unrecognised method name all leave the defaults
# in place, which means "no imputation function".
impute_pcd_fn = None
impute_kwargs = {}
if enable_imputation:
    if imputation_method == 'knn':
        impute_pcd_fn = imputation.knn_impute_pcd
        impute_kwargs = {'num_neighbors': knn_neighbors}
    elif imputation_method == 'most_common':
        impute_pcd_fn = imputation.most_common_impute_training_hardware

# Echo the effective settings so the notebook output records what was used.
print(f"Imputation enabled: {enable_imputation}")
if not enable_imputation:
    print("No imputation will be applied")
else:
    print(f"Imputation method: {imputation_method}")
    if imputation_method == 'knn':
        print(f"KNN neighbors: {knn_neighbors}")
    print(f"Imputation function: {impute_pcd_fn}")

In [None]:
# Run every configured estimator, sending each run's stdout to its own log file
# inside results_dir. Results are collected per method in cost_dfs.
cost_dfs = {}
component_cost_df = None

# Imputation arguments are forwarded only when an imputation function is configured;
# otherwise the estimators are called with their original three-argument form.
if impute_pcd_fn is None:
    estimator_kwargs = {}
else:
    estimator_kwargs = {'impute_pcd_fn': impute_pcd_fn, **impute_kwargs}

for estimation_method in estimation_methods:
    print(f"\n=== Running {estimation_method} estimation ===")
    cost_estimation_function = estimation_method_lookup[estimation_method]

    log_path = f'{results_dir}/cost_estimation_{estimation_method}.out'
    with open(log_path, 'w') as log_file, redirect_stdout(log_file):
        # Each estimator receives its own copy of the frontier table.
        cost_df = cost_estimation_function(
            frontier_pcd_df.copy(), hardware_df, price_df, **estimator_kwargs
        )
    cost_dfs[estimation_method] = cost_df

    # The per-component breakdown is produced for hardware-capex-energy only.
    if estimation_method != 'hardware-capex-energy':
        continue

    frontier_pcd_df_copy = frontier_pcd_df.copy()
    component_log_path = f'{results_dir}/component_cost_estimation.out'
    with open(component_log_path, 'w') as log_file, redirect_stdout(log_file):
        component_cost_df = cost_estimation_function(
            frontier_pcd_df_copy, hardware_df, price_df,
            separate_components=True, **estimator_kwargs
        )

print("\nCost estimation completed for all methods")

In [None]:
# Validation: compare the hardware-capex-energy estimates obtained with imputation
# against a baseline run on freshly loaded data without imputation.
if enable_imputation and imputation_method != 'none':
    print("Imputation Impact Assessment:")
    print(f"Imputation method used: {imputation_method}")
    if imputation_method == 'knn':
        print(f"KNN neighbors: {knn_neighbors}")

    # Reload the data so the baseline starts from the table as loaded, not from
    # the imputed frontier_pcd_df.
    original_frontier_df, _, _ = load_data_for_cost_estimation(
        compute_threshold_method=compute_threshold_method,
        compute_threshold=compute_threshold,
    )

    # Count rows lacking any critical input BEFORE the estimator sees the data,
    # so the figure describes the raw table even if the estimator fills in or
    # otherwise modifies the frame it is given.
    models_with_missing_data = (
        original_frontier_df['Training hardware'].isna()
        | original_frontier_df['Hardware quantity'].isna()
        | original_frontier_df['Training time (hours)'].isna()
    ).sum()

    # Baseline run without imputation. The estimator gets a copy, as at every
    # other call site in this notebook, so original_frontier_df stays untouched.
    print("Running baseline cost estimation without imputation for comparison...")
    original_results = estimate_hardware_capex_energy(
        original_frontier_df.copy(), hardware_df, price_df
    )
    original_success_count = original_results['Cost'].notna().sum()

    imputed_results = cost_dfs['hardware-capex-energy']
    hardware_capex_success_count = imputed_results['Cost'].notna().sum()

    print(f"Models with missing critical data (pre-imputation): {models_with_missing_data}")
    print(f"Models with successful cost estimates WITHOUT imputation: {original_success_count}")
    print(f"Models with successful cost estimates WITH imputation (hardware-capex-energy): {hardware_capex_success_count}")

    # Net change in the number of models that received a cost estimate.
    imputation_enabled_count = hardware_capex_success_count - original_success_count
    if imputation_enabled_count > 0:
        print(f"✓ Imputation enabled cost estimation for {imputation_enabled_count} additional models")

        # List the models that have a cost now but had none in the baseline.
        baseline_models = original_results.loc[original_results['Cost'].notna(), 'Model']
        newly_costed = imputed_results['Cost'].notna() & ~imputed_results['Model'].isin(baseline_models)
        imputed_models = imputed_results.loc[newly_costed, 'Model'].tolist()

        if imputed_models:
            print("\nModels that benefited from imputation:")
            for model in imputed_models:
                print(f"  - {model}")
    elif imputation_enabled_count == 0:
        print("= Imputation did not enable cost estimation for any additional models")
    else:
        print(f"! Imputation resulted in {-imputation_enabled_count} fewer successful cost estimates")
else:
    print("Imputation was disabled, so no impact assessment available.")

# Imputation Impact Assessment

# Cost estimation

In [None]:
# Run all three cost estimation methods.
# This cell rebuilds cost_dfs and component_cost_df from scratch, and every later
# cell reads those objects. It therefore forwards the configured imputation
# function (impute_pcd_fn / impute_kwargs), so that rerunning it does not
# silently replace imputation-aware results with non-imputed ones.
cost_dfs = {}
component_cost_df = None

# No extra arguments are passed when imputation is disabled.
imputation_call_kwargs = {}
if impute_pcd_fn is not None:
    imputation_call_kwargs = {'impute_pcd_fn': impute_pcd_fn, **impute_kwargs}

for estimation_method in estimation_methods:
    print(f"\n=== Running {estimation_method} estimation ===")
    cost_estimation_function = estimation_method_lookup[estimation_method]

    # The estimator's stdout goes to a per-method log file in results_dir.
    with open(f'{results_dir}/cost_estimation_{estimation_method}.out', 'w') as f:
        with redirect_stdout(f):
            cost_df = cost_estimation_function(
                frontier_pcd_df.copy(), hardware_df, price_df, **imputation_call_kwargs
            )

    cost_dfs[estimation_method] = cost_df

    # Create component cost breakdown only for hardware-capex-energy method
    if estimation_method == 'hardware-capex-energy':
        frontier_pcd_df_copy = frontier_pcd_df.copy()
        with open(f'{results_dir}/component_cost_estimation.out', 'w') as f:
            with redirect_stdout(f):
                component_cost_df = cost_estimation_function(
                    frontier_pcd_df_copy, hardware_df, price_df,
                    separate_components=True, **imputation_call_kwargs
                )

print("\nCost estimation completed for all methods")

In [None]:
# Summarise each method's results: how many models were processed, how many
# received a cost estimate, how many of those also have a training time and a
# hardware utilization value, and the range of estimated costs.
for method, df in cost_dfs.items():
    costed = df.dropna(subset=['Cost'])  # rows that received a cost estimate
    lowest_cost = df['Cost'].min()
    highest_cost = df['Cost'].max()

    print(f"\n=== {method} results ===")
    print(f"Total models: {len(df)}")
    print(f"Models with cost estimates: {len(costed)}")
    print(f"Models with training time: {costed['Training time (hours)'].notna().sum()}")
    print(f"Models with hardware utilization: {costed['Hardware utilization'].notna().sum()}")
    print(f"Cost range: ${lowest_cost:.0f} - ${highest_cost:.0f}")
    print()

In [None]:
# Use hardware-capex-energy results as the base for further analysis.
# cost_df refers to the same object stored in cost_dfs, not to a copy.
cost_df = cost_dfs['hardware-capex-energy']
cost_df

In [None]:
# Number of models that received a cost estimate.
cost_df['Cost'].notna().sum()

In [None]:
# Among models with a cost estimate, how many also have a training time.
cost_df.dropna(subset=['Cost'])['Training time (hours)'].notna().sum()

In [None]:
# Among models with a cost estimate, how many also have a hardware utilization value.
cost_df.dropna(subset=['Cost'])['Hardware utilization'].notna().sum()

Exclusion

In [None]:
# Last 15 rows (model name and publication date) before any exclusion is applied.
cost_df[['Model', 'Publication date']].tail(15)

In [None]:
# Apply exclusions to all cost dataframes: drop every row whose model name
# contains one of the configured keywords.
for method in estimation_methods:
    for kw in exclude_models_containing:
        # regex=False matches the keyword literally, so names containing
        # characters such as '(' or '+' cannot be misread as a pattern.
        # na=False treats a missing model name as "no match", so such rows are
        # kept rather than being dropped as a side effect of the comparison.
        matches_keyword = cost_dfs[method]['Model'].str.contains(kw, regex=False, na=False)
        cost_dfs[method] = cost_dfs[method][~matches_keyword]

# Show the models after exclusion (using hardware-capex-energy as reference)
cost_dfs['hardware-capex-energy'][['Model', 'Publication date']].tail(15)

Use the cell below to check data availability for a specific system.

In [None]:
# system = 'WizardLM-7B'
# row = cost_df.loc[cost_df['Model'] == system]
# print('Cost:', row['Cost'].values[0])
# print('Training hardware:', row['Training hardware'].values[0])
# print('Training time (hours):', row['Training time (hours)'].values[0])
# print('Hardware quantity:', row['Hardware quantity'].values[0])
# print('Hardware utilization:', row['Hardware utilization'].values[0])

# Apply inflation adjustment

In [None]:
# Show the non-null cost estimates before inflation adjustment
# (hardware-capex-energy method).
cost_dfs['hardware-capex-energy']['Cost'].dropna()

In [None]:
# Inflation-adjust the 'Cost' column of every method's results, using the data
# file and target date below. Each adjusted frame replaces the stored one.
inflation_data_path = 'data/PCU518210518210.csv'
inflation_target_date = '2024-12-01'
for method in estimation_methods:
    unadjusted_df = cost_dfs[method]
    cost_dfs[method] = adjust_column_for_inflation(
        unadjusted_df, 'Cost', inflation_data_path, inflation_target_date
    )

# Point cost_df at the freshly adjusted hardware-capex-energy frame.
cost_df = cost_dfs['hardware-capex-energy']

In [None]:
# Show the non-null inflation-adjusted cost estimates.
cost_df['Cost (inflation-adjusted)'].dropna()

In [None]:
# Sanity check: the inflation-adjusted column must have exactly as many
# non-null values as the raw 'Cost' column.
assert cost_df['Cost (inflation-adjusted)'].notna().sum() == cost_df['Cost'].notna().sum()

In [None]:
# Add a numeric version of the publication date; later cells use the same
# 'Publication date (float)' column name as the feature for predictions.
# Presumably the values are fractional years (judging by the helper's name); verify.
cost_df['Publication date (float)'] = datetime_to_float_year(pd.to_datetime(cost_df['Publication date']))

In [None]:
# Range over which predictions are made.
pred_start_year = 2015
pred_end_year = 2025
# String forms of the range boundaries; not used by any cell visible in this notebook.
pred_start_date = f'{pred_start_year}-01-01'
pred_end_date = f'{pred_end_year}-01-01'

# 100 evenly spaced points from the start year to the end year (both included),
# stored under the feature column name used for predictions.
pred_years = pd.DataFrame({'Publication date (float)': np.linspace(pred_start_year, pred_end_year, 100)})
pred_years

In [None]:

# Compute predictions over the pred_years grid and add a datetime version of
# the numeric publication date.
# NOTE(review): `reg_results` is not assigned in any cell visible in this notebook.
# It must be defined (presumably by a regression-fitting cell) before this cell
# runs, otherwise this raises NameError; confirm.
predicted_cost_df = get_predictions(reg_results, pred_years, ['Publication date (float)'])
predicted_cost_df['Publication date'] = predicted_cost_df['Publication date (float)'].apply(float_year_to_datetime)
predicted_cost_df

In [None]:
# Save prediction dataset - this uses hardware-capex-energy method for regression.
# results_dir already ends with '/', so the file name is appended directly.
predicted_cost_df.to_csv(results_dir + 'predicted_cost_dataset_hardware_capex_energy.csv', index=False)

# Export data

In [None]:
# Create cost_dataset_3_estimates.csv with Model + 3 cost columns
# (one inflation-adjusted cost column per estimation method).
cost_comparison_df = pd.DataFrame()
cost_comparison_df['Model'] = cost_dfs['hardware-capex-energy']['Model']

# Add inflation-adjusted costs from each method.
# Column assignment aligns rows by index label, so this relies on every method's
# frame sharing the index of the hardware-capex-energy frame.
for method in estimation_methods:
    method_df = cost_dfs[method]
    # The frames in cost_dfs normally carry the adjusted column already (added
    # by the inflation-adjustment cell). Adjust here only if it is missing, so
    # the adjustment is never applied twice to the same frame.
    if 'Cost (inflation-adjusted)' not in method_df.columns:
        method_df = adjust_column_for_inflation(method_df, 'Cost', 'data/PCU518210518210.csv', '2024-12-01')
    cost_comparison_df[f'{method.replace("-", "_")}_cost'] = method_df['Cost (inflation-adjusted)']

# Display the comparison (only models for which all three methods have a cost)
print("Cost comparison across methods:")
print(cost_comparison_df.dropna().head(10))

# Save the 3-method comparison dataset
cost_comparison_df.to_csv(results_dir + 'cost_dataset_3_estimates.csv', index=False)
print(f"\nSaved cost_dataset_3_estimates.csv with {len(cost_comparison_df)} models")

# Also keep the original detailed export for the hardware-capex-energy method.
# These are the columns written to the detailed CSV.
keep_cols = [
    'Model',
    'Domain',
    'Task',
    'Model accessibility',
    'Reference',
    'Publication date',
    'Organization',
    'Parameters',
    'Training compute (FLOP)',
    'Training dataset size (datapoints)',
    'Epochs',
    'Training time (hours)',
    'Training hardware',
    'Base model',
    'Finetune compute (FLOP)',
    'Hardware quantity',
    'Hardware utilization',
    'Training cloud compute vendor',
    'Training data center',
    'Cost',
    'Cost (inflation-adjusted)',
]
cost_df[keep_cols]

In [None]:
# Keep the detailed export for the hardware-capex-energy method
# Write the selected columns of the hardware-capex-energy results to CSV, without the index.
cost_df[keep_cols].to_csv(results_dir + 'cost_dataset_detailed.csv', index=False)

In [None]:
# Columns of component_cost_df that hold the individual cost components; each
# is converted below into a percentage of the 'Cost' column.
cost_component_names = [
    'AI accelerator chip cost',
    'Other server components cost',
    'Cluster-level interconnect cost',
    'Energy cost',
]

In [None]:
# Express every cost component as a percentage of the 'Cost' column, stored in
# a new "<component> (%)" column, then show the accelerator chip share.
total_cost = component_cost_df['Cost']
for key in cost_component_names:
    fraction_of_total = component_cost_df[key] / total_cost
    component_cost_df[f"{key} (%)"] = fraction_of_total * 100
component_cost_df['AI accelerator chip cost (%)']

In [None]:
# Names of the percentage columns, one per cost component.
cost_component_pc_names = [f'{name} (%)' for name in cost_component_names]
# Keep only rows where every percentage is available, ordered by publication date.
rows_with_all_components = component_cost_df.dropna(subset=cost_component_pc_names)
filtered_component_cost_df = rows_with_all_components.sort_values(by='Publication date')

In [None]:
# Preview the first rows of the filtered component breakdown.
filtered_component_cost_df.head()

In [None]:
# Write the filtered component breakdown to CSV, without the index.
filtered_component_cost_df.to_csv(results_dir + 'cost_components.csv', index=False)

In [None]:
# Average percentage for each component, taken over the filtered rows
# (unweighted mean across models).
filtered_component_cost_df[cost_component_pc_names].mean()

In [None]:
# List the columns available in the filtered component breakdown.
filtered_component_cost_df.columns

In [None]:
# Rows without known training hardware are dropped first, because the power
# calculation below is given each row's 'Training hardware' value.
filtered_component_cost_df = filtered_component_cost_df.dropna(subset=['Training hardware'])
power_col = 'Power capacity for final training run (kW)'
# One cluster_power_capacity call per row, using that row's hardware, hardware
# quantity and organization together with the hardware table; the results are
# stored in the new power column.
filtered_component_cost_df.loc[:, power_col] = [
    cluster_power_capacity(row['Training hardware'], row['Hardware quantity'], hardware_df, row['Organization'])
    for _, row in filtered_component_cost_df.iterrows()
]

In [None]:
# Add a numeric version of the publication date to the component breakdown,
# using the same conversion helper as for cost_df.
filtered_component_cost_df['Publication date (float)'] = datetime_to_float_year(
    pd.to_datetime(filtered_component_cost_df['Publication date'])
)