# Model Cost Calculator
# 
# This notebook imports functions from the codebase and is NOT fully standalone
# Requires: cost.py, data.py, prices.py, hardware.py, energy.py, parameters.py, inflation.py, imputation.py
# Also requires data files: All ML Systems - full view.csv, Hardware prices.csv, Chip dataset-Grid view.csv, PCU518210518210.csv, frontier_systems_by_*.json
# 
# Simplified cost estimation for frontier ML models with configurable parameters and real pricing data.

In [19]:
# Configuration

# --- Frontier model selection ---
# Selection rule; one of: 'top_n', 'window_percentile',
# 'backward_window_percentile', 'residual_from_trend'.
compute_threshold_method = "top_n"
# Threshold passed along with the rule above
# (e.g. 10 for top 10 models, 75 for top 25%).
compute_threshold = 10

# --- Imputation ---
# Master switch: set to False to disable imputation.
enable_imputation = True
# Strategy; one of: 'knn', 'most_common', 'none'.
imputation_method = "most_common"
# Number of neighbors for KNN imputation (used by the 'knn' branches).
knn_neighbors = 5

# --- Output ---
# CSV file the combined cost table is written to.
output_file = "model_costs.csv"

In [20]:
# Install required packages
import subprocess
import sys

def install_package(package):
    """Install ``package`` with pip, using the interpreter running this notebook.

    The command is executed as ``<sys.executable> -m pip install <package>``.
    ``subprocess.check_call`` raises ``subprocess.CalledProcessError`` when
    pip exits with a non-zero status; nothing is returned on success.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Install core dependencies
# Each block probes for a package by importing it and, if the import fails,
# falls back to installing it through install_package().
# NOTE(review): after a fallback install the module is not imported again in
# this cell, so the name stays unbound here until something else imports it.
try:
    import scipy
except ImportError:
    install_package("scipy")
    
# The module is imported as `sklearn`, but the name passed to pip is
# "scikit-learn".
try:
    import sklearn
except ImportError:
    install_package("scikit-learn")

import json
import numpy as np
import pandas as pd

# Import core modules - use full module imports to avoid circular dependency issues
import cost
import data
import prices
import inflation
import energy
import hardware
import parameters
import imputation  # Import imputation module for data quality checks

In [21]:
# Load data using existing pipeline, optionally impute missing values, and
# report column coverage before and after.

# Dataset columns tracked by the quality reports, mapped to the label each
# one gets in the missing-data summary.
_QUALITY_COLUMNS = {
    'Training hardware': 'Missing training hardware',
    'Hardware quantity': 'Missing hardware quantity',
    'Hardware utilization': 'Missing hardware utilization',
    'Training time (hours)': 'Missing training time',
    'Training compute (FLOP)': 'Missing training compute',
}

# Imputation methods this cell knows how to apply.
_APPLICABLE_IMPUTATION_METHODS = ('knn', 'most_common')


def _print_data_quality_report(df, stage):
    """Print how many rows of ``df`` have a non-null value in each tracked column.

    Args:
        df: DataFrame containing every column listed in ``_QUALITY_COLUMNS``.
        stage: Text shown in the report header, e.g. "Before Imputation".
    """
    print(f"\nData Quality Report ({stage}):")
    total = len(df)
    for column in _QUALITY_COLUMNS:
        print(f"Models with known {column}: {df[column].notna().sum()}/{total}")


# Fail fast on a misconfigured method. Previously an unknown value printed
# "Applying ... imputation..." and an "After Imputation" report even though
# nothing had been imputed.
imputation_requested = enable_imputation and imputation_method != 'none'
if imputation_requested and imputation_method not in _APPLICABLE_IMPUTATION_METHODS:
    raise ValueError(
        f"Unknown imputation_method {imputation_method!r}; "
        f"expected one of {_APPLICABLE_IMPUTATION_METHODS + ('none',)}"
    )

frontier_df, hardware_df, price_df = data.load_data_for_cost_estimation(
    compute_threshold_method=compute_threshold_method,
    compute_threshold=compute_threshold
)

print(f"Loaded {len(frontier_df)} frontier models for cost estimation")

# Data quality report before imputation
_print_data_quality_report(frontier_df, "Before Imputation")

# Apply imputation if enabled
if imputation_requested:
    print(f"\nApplying {imputation_method} imputation...")

    if imputation_method == 'knn':
        # Apply KNN imputation - uses the established pipeline from imputation.py
        frontier_df = imputation.knn_impute_pcd(
            frontier_df.copy(),
            num_neighbors_general=knn_neighbors,
            num_neighbors_training_hardware=knn_neighbors
        )
        print(f"Applied KNN imputation with {knn_neighbors} neighbors")
    else:
        # Only 'most_common' can reach this branch (validated above).
        # Apply most common value imputation for training hardware
        frontier_df = imputation.most_common_impute_training_hardware(frontier_df.copy())
        print("Applied most common value imputation for training hardware")

    # Data quality report after imputation
    _print_data_quality_report(frontier_df, "After Imputation")
else:
    print("\nSkipping imputation (disabled in configuration)")

# Summary of missing data: total row count followed by the null count of each
# tracked column, in the same order as the quality reports.
missing_data_summary = {'Total models': len(frontier_df)}
for column, label in _QUALITY_COLUMNS.items():
    missing_data_summary[label] = frontier_df[column].isna().sum()

print("\nMissing Data Summary:")
for key, value in missing_data_summary.items():
    print(f"{key}: {value}")

Loaded 89 frontier models for cost estimation

Data Quality Report (Before Imputation):
Models with known Training hardware: 66/89
Models with known Hardware quantity: 49/89
Models with known Hardware utilization: 23/89
Models with known Training time (hours): 44/89
Models with known Training compute (FLOP): 89/89

Applying most_common imputation...
Applied most common value imputation for training hardware

Data Quality Report (After Imputation):
Models with known Training hardware: 89/89
Models with known Hardware quantity: 49/89
Models with known Hardware utilization: 23/89
Models with known Training time (hours): 44/89
Models with known Training compute (FLOP): 89/89

Missing Data Summary:
Total models: 89
Missing training hardware: 0
Missing hardware quantity: 40
Missing hardware utilization: 66
Missing training time: 45
Missing training compute: 0


In [22]:
# Removed: Manual cost calculation functions
# Now using the integrated cost estimation pipeline with proper imputation support
# The cost.py functions handle all the complexity of cost estimation and imputation

In [23]:
# Run all three cost estimation methods
print("Running all cost estimation methods...")

# Resolve which imputation callable (plus keyword arguments) is handed to the
# cost estimators. Both keep their empty defaults when imputation is disabled
# or the configured method matches neither branch.
impute_pcd_fn = None
impute_kwargs = {}

imputation_active = enable_imputation and imputation_method != 'none'
if imputation_active and imputation_method == 'knn':
    impute_pcd_fn = imputation.knn_impute_pcd
    impute_kwargs = {
        'num_neighbors_general': knn_neighbors,
        'num_neighbors_training_hardware': knn_neighbors
    }
elif imputation_active and imputation_method == 'most_common':
    impute_pcd_fn = imputation.most_common_impute_training_hardware
    impute_kwargs = {}

# Name of the cost.py estimator behind each method label. The attribute is
# looked up inside the try block below, so a missing estimator is reported
# like any other per-method failure.
estimator_names = {
    'cloud': 'estimate_cloud_costs',
    'hardware-acquisition': 'estimate_hardware_acquisition_cost',
    'hardware-capex-energy': 'estimate_hardware_capex_energy',
}
methods = list(estimator_names)
method_results = {}

# Arguments shared by every inflation adjustment in this cell.
inflation_index_file = 'data/PCU518210518210.csv'
inflation_target_date = '2024-12-01'
cost_column = 'Cost (inflation-adjusted)'

for method in methods:
    print(f"\n{'='*50}")
    print(f"Calculating costs using {method} method...")
    print(f"{'='*50}")

    # Every method starts from its own copy so the estimators cannot
    # interfere with each other through the shared frame.
    method_df = frontier_df.copy()

    try:
        # Run the estimator for this method with imputation support
        estimate_fn = getattr(cost, estimator_names[method])
        method_df = estimate_fn(
            method_df, hardware_df, price_df,
            impute_pcd_fn=impute_pcd_fn, **impute_kwargs
        )

        # Apply inflation adjustment using existing function
        print("Applying inflation adjustment to 2024 dollars...")
        method_df = inflation.adjust_column_for_inflation(
            method_df, 'Cost', inflation_index_file, inflation_target_date
        )

        # Keep only the rows that ended up with an inflation-adjusted cost
        has_cost = method_df[cost_column].notna()
        successful_estimates = method_df[has_cost]

        print(f"Successfully calculated costs for {len(successful_estimates)} models")
        print(f"Failed to calculate costs for {len(method_df) - len(successful_estimates)} models")

        # Store the results
        method_results[method] = successful_estimates[['Model', cost_column]].copy()

    except Exception as e:
        # Best effort: a failing method is reported and recorded as an empty
        # result so the remaining methods still run.
        print(f"Error in {method} method: {str(e)}")
        method_results[method] = pd.DataFrame(columns=['Model', cost_column])

# Display summary of results
print(f"\n{'='*60}")
print("SUMMARY OF ALL METHODS")
print(f"{'='*60}")
for method, df in method_results.items():
    print(f"{method}: {len(df)} successful cost estimates")
    if len(df) > 0:
        print(f"  Min: ${df['Cost (inflation-adjusted)'].min():,.0f}")
        print(f"  Max: ${df['Cost (inflation-adjusted)'].max():,.0f}")
        print(f"  Median: ${df['Cost (inflation-adjusted)'].median():,.0f}")
    print()

# Validation: report on imputation impact, using hardware-capex-energy as the
# reference method.
if enable_imputation and imputation_method != 'none':
    print(f"Imputation Impact Assessment:")
    print(f"Imputation method used: {imputation_method}")
    if imputation_method == 'knn':
        print(f"KNN neighbors: {knn_neighbors}")

    # Reload the data through the same pipeline to obtain a frame for the
    # no-imputation baseline.
    original_frontier_df, _, _ = data.load_data_for_cost_estimation(
        compute_threshold_method=compute_threshold_method,
        compute_threshold=compute_threshold
    )

    # Baseline: the reference estimator called without any imputation arguments
    original_results = cost.estimate_hardware_capex_energy(original_frontier_df, hardware_df, price_df)
    original_results = inflation.adjust_column_for_inflation(
        original_results, 'Cost', inflation_index_file, inflation_target_date
    )
    original_success_count = original_results['Cost (inflation-adjusted)'].notna().sum()

    # Rows lacking at least one of the three critical inputs
    critical_columns = ['Training hardware', 'Hardware quantity', 'Training time (hours)']
    models_with_missing_data = original_frontier_df[critical_columns].isna().any(axis=1).sum()

    print(f"Models with missing critical data (pre-imputation): {models_with_missing_data}")
    print(f"Models with successful cost estimates WITHOUT imputation: {original_success_count}")
    print(f"Models with successful cost estimates WITH imputation (hardware-capex-energy): {len(method_results['hardware-capex-energy'])}")

    # Net change in successful estimates attributable to imputation
    imputation_enabled_count = len(method_results['hardware-capex-energy']) - original_success_count
    if imputation_enabled_count < 0:
        print(f"! Imputation resulted in {-imputation_enabled_count} fewer successful cost estimates")
    elif imputation_enabled_count == 0:
        print("= Imputation did not enable cost estimation for any additional models")
    else:
        print(f"✓ Imputation enabled cost estimation for {imputation_enabled_count} additional models")

Running all cost estimation methods...

Calculating costs using cloud method...
==== System: Llama 4 Behemoth (preview) ====
No training time found, assuming 33.0625 days

Trying NVIDIA H100 SXM5 80GB at 2024-11-02 22:30:00
Trying Amazon Web Services, Price per chip-hour (3-year CUD)
Could not find price

Trying Microsoft Azure, Price per chip-hour (3-year CUD)
Could not find price

Trying Google Cloud, Price per chip-hour (3-year CUD)
Found price: 4.86 at 2025-03-17 00:00:00
Difference between acquisition date and price date: -135 days +22:30:00 

==== System: GPT-4.5 ====
No training time found, assuming 33.0625 days

Trying NVIDIA H100 SXM5 80GB at 2024-09-26 22:30:00
Trying Microsoft Azure, Price per chip-hour (3-year CUD)
Could not find price

Trying Amazon Web Services, Price per chip-hour (3-year CUD)
Could not find price

Trying Google Cloud, Price per chip-hour (3-year CUD)
Found price: 4.86 at 2025-03-17 00:00:00
Difference between acquisition date and price date: -172 days +

In [24]:
# Combine results from all methods into a single CSV with 4 columns
# (Model plus one inflation-adjusted cost column per estimation method),
# print per-method statistics, and finish with a validation report.
print(f"\n{'='*60}")
print("COMBINING RESULTS FROM ALL METHODS")
print(f"{'='*60}")

# Get all unique models from all methods
all_models = set()
for method_df in method_results.values():
    all_models.update(method_df['Model'].tolist())

print(f"Total unique models across all methods: {len(all_models)}")

# Create the combined results DataFrame: one row per model that received at
# least one estimate, sorted by name.
combined_results = pd.DataFrame({'Model': sorted(all_models)})

# Output column for each method; every later step derives its column list
# from this mapping so the names live in one place.
method_column_names = {
    'cloud': 'Cloud_Cost',
    'hardware-acquisition': 'Hardware_Acquisition_Cost',
    'hardware-capex-energy': 'Hardware_Capex_Energy_Cost'
}
cost_columns = list(method_column_names.values())

for method, column_name in method_column_names.items():
    method_df = method_results[method]
    if len(method_df) > 0:
        # Create a mapping from model to cost. If a model name occurs more
        # than once in a method's results, the dict keeps the last row's cost.
        model_to_cost = method_df.set_index('Model')['Cost (inflation-adjusted)'].to_dict()
        combined_results[column_name] = combined_results['Model'].map(model_to_cost)
    else:
        # Use NaN rather than None so the column stays numeric even when the
        # method produced no estimates.
        combined_results[column_name] = float('nan')

# Save combined results
combined_results.to_csv(output_file, index=False)
print(f"\nCombined results saved to {output_file}")

# Display statistics for the combined dataset
print("\nCombined Dataset Statistics:")
print(f"Total models: {len(combined_results)}")

for method, column_name in method_column_names.items():
    costs = combined_results[column_name].dropna()
    count = len(costs)
    print(f"\n{method} method ({column_name}):")
    print(f"  Models with estimates: {count}")
    if count > 0:
        print(f"  Min: ${costs.min():,.0f}")
        print(f"  Max: ${costs.max():,.0f}")
        print(f"  Median: ${costs.median():,.0f}")
        print(f"  Mean: ${costs.mean():,.0f}")

# Show models with estimates from all three methods
all_methods_valid = combined_results[cost_columns].notna().all(axis=1)
models_with_all_methods = combined_results[all_methods_valid]

print(f"\nModels with estimates from all three methods: {len(models_with_all_methods)}")
if len(models_with_all_methods) > 0:
    print("Top 5 models by Hardware_Capex_Energy_Cost:")
    top_models = models_with_all_methods.nlargest(5, 'Hardware_Capex_Energy_Cost')
    for rank, (_, row) in enumerate(top_models.iterrows(), start=1):
        print(f"{rank}. {row['Model']}")
        print(f"   Cloud: ${row['Cloud_Cost']:,.0f}")
        print(f"   Hardware Acquisition: ${row['Hardware_Acquisition_Cost']:,.0f}")
        print(f"   Hardware Capex+Energy: ${row['Hardware_Capex_Energy_Cost']:,.0f}")

# Show preview of the CSV structure
print("\nCSV Preview (first 5 rows):")
print(combined_results.head().to_string(index=False, float_format='%.0f'))

# Final validation report
print(f"\n{'='*60}")
print("FINAL VALIDATION REPORT")
print(f"{'='*60}")
print("Configuration used:")
print(f"  Compute threshold method: {compute_threshold_method}")
print(f"  Compute threshold: {compute_threshold}")
print(f"  Imputation enabled: {enable_imputation}")
if enable_imputation:
    print(f"  Imputation method: {imputation_method}")
    if imputation_method == 'knn':
        print(f"  KNN neighbors: {knn_neighbors}")

print(f"\nOutput file: {output_file}")
print(f"Total models: {len(combined_results)}")
print(f"Columns: {list(combined_results.columns)}")

# Data quality warnings
# NOTE: the denominator counts only models present in combined_results, i.e.
# models that received at least one estimate from some method.
total_possible_estimates = len(combined_results) * len(cost_columns)
total_actual_estimates = int(combined_results[cost_columns].notna().sum().sum())

# Guard the empty case: 0/0 would otherwise produce NaN, which fails the
# `< 50` comparison below and would be reported as a good success rate.
if total_possible_estimates > 0:
    success_rate = total_actual_estimates / total_possible_estimates * 100
else:
    success_rate = 0.0

print(f"\nOverall success rate: {success_rate:.1f}% ({total_actual_estimates}/{total_possible_estimates} estimates)")

if success_rate < 50:
    print("⚠️  Warning: Low success rate. Consider enabling imputation or reviewing data quality")
else:
    print("✅ Good success rate for cost estimation")

print(f"\n✅ Multi-method cost estimation complete. Results saved to {output_file}")


COMBINING RESULTS FROM ALL METHODS
Total unique models across all methods: 82

Combined results saved to model_costs.csv

Combined Dataset Statistics:
Total models: 82

cloud method (Cloud_Cost):
  Models with estimates: 35
  Min: $11,679
  Max: $1,164,146,794
  Median: $1,507,793
  Mean: $51,252,409

hardware-acquisition method (Hardware_Acquisition_Cost):
  Models with estimates: 46
  Min: $36,098
  Max: $5,481,967,429
  Median: $12,590,292
  Mean: $260,801,191

hardware-capex-energy method (Hardware_Capex_Energy_Cost):
  Models with estimates: 82
  Min: $122
  Max: $300,872,432
  Median: $558,202
  Mean: $12,525,155

Models with estimates from all three methods: 35
Top 5 models by Hardware_Capex_Energy_Cost:
1. Grok-3
   Cloud: $1,164,146,794
   Hardware Acquisition: $5,481,967,429
   Hardware Capex+Energy: $300,872,432
2. Llama 3.1-405B
   Cloud: $169,828,008
   Hardware Acquisition: $896,044,949
   Hardware Capex+Energy: $51,040,524
3. GPT-4
   Cloud: $78,562,017
   Hardware Acqu