# Model Cost Calculator
# 
# This notebook imports functions from the codebase and is NOT fully standalone
# Requires: cost.py, data.py, prices.py, hardware.py, energy.py, parameters.py, inflation.py
# Also requires data files: All ML Systems - full view.csv, Hardware prices.csv, Chip dataset-Grid view.csv, PCU518210518210.csv, frontier_systems_by_*.json
# 
# Simplified cost estimation for frontier ML models with configurable parameters and real pricing data.

In [None]:
# Configuration

# Cost estimation approach. Accepted values:
#   'hardware-capex-energy', 'hardware-acquisition', 'cloud'
estimation_method = 'hardware-capex-energy'

# Rule used to select the frontier models. Accepted values:
#   'top_n', 'window_percentile', 'backward_window_percentile', 'residual_from_trend'
compute_threshold_method = 'top_n'

# Threshold passed to the selection rule,
# e.g. 10 for top 10 models, 75 for top 25%
compute_threshold = 10

# Output file (CSV) that receives the per-model cost estimates
output_file = 'model_costs.csv'

In [ ]:
# Install required packages
import subprocess
import sys

def install_package(package):
    """Install *package* with pip, using the interpreter that runs this notebook.

    The pip module is launched through ``sys.executable`` so the package lands
    in the same environment as the running kernel.

    Raises:
        subprocess.CalledProcessError: if the pip process exits with a
            non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Install core dependencies on demand: each import is attempted first, and pip
# is invoked (via install_package) only when the import raises ImportError.
# NOTE(review): scipy and sklearn are not referenced by name in the visible
# code; presumably the project modules imported afterwards need them — confirm.
# NOTE: a freshly installed package is not re-imported here, so its name stays
# unbound in this cell after the except branch has run.
try:
    import scipy
except ImportError:
    install_package("scipy")

try:
    import sklearn
except ImportError:
    # The pip distribution name differs from the import name.
    install_package("scikit-learn")

import json
import numpy as np
import pandas as pd

# Import core modules - use full module imports to avoid circular dependency issues
import cost
import data
import prices
import inflation
import energy
import hardware
import parameters

In [ ]:
# Run the shared loading pipeline from data.py with the threshold settings
# chosen in the configuration cell. The call yields three objects, kept as
# notebook-level globals because the cost functions below read them directly.
frontier_df, hardware_df, price_df = data.load_data_for_cost_estimation(
    compute_threshold=compute_threshold,
    compute_threshold_method=compute_threshold_method,
)

print(f"Loaded {len(frontier_df)} frontier models for cost estimation")

In [ ]:
def get_organization_vendor_mapping():
    """Return a new dict mapping lowercase organization keywords to vendors.

    The table is hard-coded here.
    NOTE(review): the original comment says this is the mapping "from cost.py";
    presumably it mirrors a table defined there — confirm the two stay in sync.
    """
    google_cloud = 'Google Cloud'
    azure = 'Microsoft Azure'
    aws = 'Amazon Web Services'
    meta = 'Meta'  # Meta has their own infrastructure

    # Pairs are listed in the original order so that iteration order of the
    # returned dict is unchanged.
    keyword_vendor_pairs = [
        ('google', google_cloud),
        ('deepmind', google_cloud),
        ('microsoft', azure),
        ('openai', azure),
        ('anthropic', aws),  # Common mapping
        ('amazon', aws),
        ('meta', meta),
        ('facebook', meta),
    ]
    return dict(keyword_vendor_pairs)

def estimate_chip_hours_with_real_specs(row):
    """Estimate training chip-hours for one model row.

    Sources are tried in this order, and the first usable value is returned:

    1. ``cost.estimate_chip_hours(row, hardware_df)``, when its result is
       neither None nor NaN.
    2. The row's 'Training chip-hours' entry, when present and not missing.
    3. 'Hardware quantity' multiplied by 'Training time (hours)'.
    4. 'Training compute (FLOP)' divided by the hardware's FLOP/s (looked up
       via ``hardware.get_flop_per_second``) times utilization, converted
       from seconds to hours.

    Args:
        row: one model record; read with ``.get``, ``.index`` and item access
            (presumably a pandas Series — confirm against callers).

    Returns:
        The chip-hours estimate, or None when the compute/hardware fields or
        the FLOP/s lookup needed by the last fallback are unavailable.
    """
    pipeline_estimate = cost.estimate_chip_hours(row, hardware_df)
    if not (pipeline_estimate is None or np.isnan(pipeline_estimate)):
        return pipeline_estimate

    # Directly reported chip-hours
    chip_hours_key = 'Training chip-hours'
    if chip_hours_key in row.index and not pd.isna(row[chip_hours_key]):
        return row[chip_hours_key]

    # Number of chips times wall-clock training hours
    chip_count = row.get('Hardware quantity')
    wall_clock_hours = row.get('Training time (hours)')
    if not (pd.isna(chip_count) or pd.isna(wall_clock_hours)):
        return chip_count * wall_clock_hours

    # Last resort: derive the duration from total compute and chip throughput
    total_flop = row.get('Training compute (FLOP)')
    chip_name = row.get('Training hardware')
    if pd.isna(total_flop) or pd.isna(chip_name):
        return None

    peak_flop_per_second = hardware.get_flop_per_second(chip_name, hardware_df)
    if peak_flop_per_second is None:
        return None

    # A missing or NaN utilization falls back to the shared median value
    utilization = row.get('Hardware utilization')
    if pd.isna(utilization):
        utilization = parameters.MEDIAN_UTILIZATION

    training_seconds = total_flop / (peak_flop_per_second * utilization)
    return training_seconds / parameters.SECONDS_PER_HOUR

def calculate_cloud_cost(row):
    """Estimate the cloud rental cost of training the model in *row*.

    The per-chip-hour price comes from ``prices.find_price`` using the
    'Price per chip-hour (3-year CUD)' column and the organization-to-vendor
    mapping; it is multiplied by the chip-hours estimate for the row.

    Returns:
        price * chip-hours, or None when no price is found or the chip-hours
        estimate is None or NaN.
    """
    hourly_price, _ = prices.find_price(
        row,
        price_df,
        hardware_df,
        'Training hardware',
        'Price per chip-hour (3-year CUD)',
        get_organization_vendor_mapping(),
    )
    if hourly_price is None:
        return None

    total_chip_hours = estimate_chip_hours_with_real_specs(row)
    if total_chip_hours is not None and not np.isnan(total_chip_hours):
        return hourly_price * total_chip_hours
    return None

def calculate_hardware_acquisition_cost(row):
    """Estimate the upfront hardware purchase cost for the model in *row*.

    The unit price is looked up with ``prices.get_hardware_acquisition_price``
    and multiplied by the row's 'Hardware quantity'. The result is then scaled
    by ``1 / (1 - parameters.CLUSTER_INTERCONNECT_COST_FRACTION)`` to add
    interconnect cost.

    Returns:
        The scaled total, or None when the price lookup yields None or the
        hardware quantity is missing.
    """
    unit_price, _ = prices.get_hardware_acquisition_price(
        row,
        price_df,
        hardware_df,
        'Training hardware',
        'Price (hardware purchase)',
    )
    if unit_price is None:
        return None

    chip_count = row.get('Hardware quantity')
    if pd.isna(chip_count):
        return None

    chips_only_cost = chip_count * unit_price

    # Gross up the chip cost so interconnect is included in the total
    interconnect_multiplier = 1 / (1 - parameters.CLUSTER_INTERCONNECT_COST_FRACTION)
    return chips_only_cost * interconnect_multiplier

def calculate_hardware_capex_energy_cost(row):
    """Estimate the amortized hardware plus energy cost for the model in *row*.

    Looks up the hardware purchase price for the row, then delegates to
    ``cost.estimate_hardware_capex_energy_cost`` with a one-entry mapping from
    the row's 'Model' to that price and ``separate_components=False``.

    Returns:
        Whatever the delegate returns, or None when no purchase price is found.
    """
    purchase_price, _ = prices.get_hardware_acquisition_price(
        row,
        price_df,
        hardware_df,
        'Training hardware',
        'Price (hardware purchase)',
    )
    if purchase_price is None:
        return None

    # The delegate expects prices keyed by model name; only this row's is needed
    price_by_model = {row['Model']: purchase_price}

    return cost.estimate_hardware_capex_energy_cost(
        row,
        price_by_model,
        frontier_df,
        hardware_df,
        separate_components=False,
    )

In [ ]:
# Define unified cost calculation function
def calculate_cost(row, method):
    """Route *row* to the cost estimator selected by *method*.

    Args:
        row: the model record handed through to the chosen estimator.
        method: 'hardware-capex-energy', 'hardware-acquisition' or 'cloud'.

    Returns:
        The value returned by the chosen estimator.

    Raises:
        ValueError: if *method* is not one of the three names above.
    """
    if method == 'hardware-capex-energy':
        return calculate_hardware_capex_energy_cost(row)
    if method == 'hardware-acquisition':
        return calculate_hardware_acquisition_cost(row)
    if method == 'cloud':
        return calculate_cloud_cost(row)
    raise ValueError(f"Unknown estimation method: {method}")

# Estimate a cost for every frontier model with the configured method;
# the method name is forwarded to calculate_cost as its second argument.
print(f"Calculating costs using {estimation_method} method...")
frontier_df['Cost'] = frontier_df.apply(
    calculate_cost, axis=1, args=(estimation_method,)
)

# Hand the 'Cost' column to the shared inflation helper
print("Applying inflation adjustment to 2024 dollars...")
frontier_df = inflation.adjust_column_for_inflation(
    frontier_df, 'Cost', 'data/PCU518210518210.csv', '2024-12-01'
)

# Column read below for the adjusted values
# (presumably added by adjust_column_for_inflation — confirm)
cost_column = 'Cost (inflation-adjusted)'

# Keep only models with an estimate, expose the adjusted value as 'Cost',
# and list the most expensive models first.
results_df = (
    frontier_df.loc[frontier_df[cost_column].notna(), ['Model', cost_column]]
    .copy()
    .rename(columns={cost_column: 'Cost'})
    .sort_values('Cost', ascending=False)
)

print(f"Successfully calculated costs for {len(results_df)} models")
print(f"Failed to calculate costs for {len(frontier_df) - len(results_df)} models")

In [None]:
# Write the per-model cost table to the configured CSV path (index column
# omitted), then report where it went and how many rows it holds.
results_df.to_csv(path_or_buf=output_file, index=False)
print(
    f"\nResults saved to {output_file}",
    f"Total models with cost estimates: {len(results_df)}",
    sep="\n",
)