# ML Model for All 500,000 Grids (The Oracle Builder)

* Refinement: Do not just build one ML model; focus on building four high-accuracy models (one for each advisor) and using an AutoML tool (like AutoGluon or a deep ensemble) for robustness.
* Key Action: Feature Engineering is Paramount. Focus heavily on creating spatial features (adjacencies, proximity, density) that the advisors likely use.


In [1]:
# Setup and Data Loading
import numpy as np
import warnings
import os
warnings.filterwarnings('ignore')

# Clone repository if not exists and load data
if not os.path.exists('2155-Challenge-Problem-2'):
    import subprocess
    print("Cloning repository...")
    subprocess.run(['git', 'clone', 'https://github.com/Lyleregenwetter/2155-Challenge-Problem-2'], 
                   check=True, cwd='.')
    print("Repository cloned!")

# Change to the repository directory and load data
os.chdir('2155-Challenge-Problem-2')

# Import utilities
from utils_public import load_grids, plot_n_grids

print("Loading data...")
grids = load_grids()
ratings = np.load("datasets/scores.npy")

print(f"Grids shape: {grids.shape}")
print(f"Ratings shape: {ratings.shape}")
print(f"Available ratings per advisor: {(~np.isnan(ratings)).sum(axis=0)}")

advisor_names = ["Wellness", "Tax", "Transportation", "Business"]

# Change back to parent directory for saving results
os.chdir('..')

Loading data...
Grids shape: (500000, 7, 7)
Ratings shape: (500000, 4)
Available ratings per advisor: [5000 5000 5000 5000]


In [2]:
# Check PyTorch availability for CNN models
print("🔍 Checking PyTorch environment...")
try:
    import torch
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"✅ PyTorch available - Device: {device}")
    if torch.cuda.is_available():
        print(f"   🚀 CUDA detected: {torch.cuda.get_device_name(0)}")
    else:
        print(f"   💻 Using CPU for CNN training")
except ImportError:
    print("❌ PyTorch not found - CNN models will not work")
    print("   Install with: pip install torch torchvision")

print(f"{'='*50}")

🔍 Checking PyTorch environment...
✅ PyTorch available - Device: cpu
   💻 Using CPU for CNN training


In [6]:
# Load pre-trained Oracle models from pickle files
import os
from oracle import CNNTransportationOracle, CNNWellnessOracle
from oracle import BusinessOracle, TaxOracle  # Keep traditional ones for Business and Tax

print("🔄 Loading pre-trained Oracle models from pickle files...")

# Define available oracle classes
oracle_classes = {
    'Business': BusinessOracle,
    'Wellness': CNNWellnessOracle,  # Use CNN version
    'Tax': TaxOracle,
    'Transportation': CNNTransportationOracle  # Use CNN version
}

# Initialize oracles
oracle_files = {
    'Business': 'data/models/business_oracle_model.pkl',
    'Wellness': 'data/models/wellness_oracle_model.pkl',  # Updated filename
    'Tax': 'data/models/tax_oracle_model.pkl',
    'Transportation': 'data/models/transportation_oracle_model.pkl'  # Updated filename
}

# Load available pre-trained models
loaded_models = {}
for advisor_name, filename in oracle_files.items():
    if os.path.exists(filename):
        print(f"📁 Loading {advisor_name} Oracle from {filename}...")
        try:
            # Create oracle instance using the appropriate class
            oracle_class = oracle_classes[advisor_name]
            oracle = oracle_class()
            oracle.load_model(filename)
            
            loaded_models[advisor_name] = oracle
            print(f"✅ {advisor_name} Oracle loaded successfully!")
        except Exception as e:
            print(f"❌ Failed to load {advisor_name} Oracle: {e}")
            print(f"   Error details: {type(e).__name__}")
    else:
        print(f"⚠️  {filename} not found - will need fallback for {advisor_name}")

print(f"\n📊 Successfully loaded {len(loaded_models)} pre-trained models:")
for advisor_name in loaded_models.keys():
    oracle = loaded_models[advisor_name]
    oracle_type = "CNN" if hasattr(oracle, 'model') else "Traditional"
    print(f"   ✓ {advisor_name} Oracle ({oracle_type})")

# Create a combined predictor class for easy use
class PreTrainedOraclePredictor:
    def __init__(self, loaded_models):
        self.loaded_models = loaded_models
        self.advisor_names = ["Wellness", "Tax", "Transportation", "Business"]
        
    def predict_advisor(self, grids, advisor_idx):
        """Predict using pre-trained model for specific advisor"""
        advisor_name = self.advisor_names[advisor_idx]
        
        if advisor_name in self.loaded_models:
            oracle = self.loaded_models[advisor_name]
            print(f"🔮 Predicting with {advisor_name} Oracle...")
            return oracle.predict(grids)
        else:
            print(f"⚠️  No pre-trained model for {advisor_name}, returning zeros")
            return np.zeros(len(grids))
    
    def predict_all_advisors(self, grids):
        """Predict for all advisors using available pre-trained models"""
        print(f"🚀 Generating predictions for all advisors...")
        predictions = []
        for advisor_idx in range(4):
            advisor_predictions = self.predict_advisor(grids, advisor_idx)
            predictions.append(advisor_predictions)
        return np.stack(predictions).T

# Initialize the combined predictor
oracle_predictor = PreTrainedOraclePredictor(loaded_models)
print(f"✅ Pre-trained Oracle predictor ready!")
print(f"🎯 Can predict for {len(loaded_models)} advisors without training")
print(f"🧠 Using CNN models for: Wellness, Transportation")
print(f"📊 Using traditional models for: Business, Tax")

🔄 Loading pre-trained Oracle models from pickle files...
📁 Loading Business Oracle from data/models/business_oracle_model.pkl...
✅ Business Oracle loaded successfully!
📁 Loading Wellness Oracle from data/models/wellness_oracle_model.pkl...
🔥 PyTorch available, using device: cpu
❌ Failed to load Wellness Oracle: 'model'
   Error details: KeyError
📁 Loading Tax Oracle from data/models/tax_oracle_model.pkl...
✅ Tax Oracle loaded successfully!
📁 Loading Transportation Oracle from data/models/transportation_oracle_model.pkl...
🔥 PyTorch available, using device: cpu
❌ Failed to load Transportation Oracle: 'model'
   Error details: KeyError

📊 Successfully loaded 2 pre-trained models:
   ✓ Business Oracle (CNN)
   ✓ Tax Oracle (CNN)
✅ Pre-trained Oracle predictor ready!
🎯 Can predict for 2 advisors without training
🧠 Using CNN models for: Wellness, Transportation
📊 Using traditional models for: Business, Tax


In [4]:
# Create fallback CNN models if pre-trained models don't exist
fallback_models = {}

print("\n🔧 Checking for missing models and creating fallbacks...")

for advisor_name in ["Wellness", "Transportation"]:
    if advisor_name not in loaded_models:
        print(f"🚧 Creating fallback CNN model for {advisor_name}...")
        try:
            if advisor_name == "Wellness":
                oracle = CNNWellnessOracle()
            else:  # Transportation
                oracle = CNNTransportationOracle()
            
            # Train on a small subset for demonstration
            print(f"   📚 Quick training on subset for {advisor_name}...")
            
            # Get some training data
            advisor_idx = ["Wellness", "Tax", "Transportation", "Business"].index(advisor_name)
            train_mask = ~np.isnan(ratings[:, advisor_idx])
            
            if np.sum(train_mask) > 100:  # Need some data to train
                train_grids = grids[train_mask][:1000]  # Use max 1000 for quick training
                train_scores = ratings[train_mask, advisor_idx][:1000]
                
                # Quick training
                oracle.fit_model(train_grids, train_scores, epochs=5, verbose=0)
                
                fallback_models[advisor_name] = oracle
                loaded_models[advisor_name] = oracle  # Add to loaded models
                
                print(f"   ✅ {advisor_name} fallback model trained!")
            else:
                print(f"   ⚠️  Not enough data to train {advisor_name} model")
                
        except Exception as e:
            print(f"   ❌ Failed to create {advisor_name} fallback: {e}")

if fallback_models:
    print(f"\n🔄 Created {len(fallback_models)} fallback CNN models")
    # Update the predictor
    oracle_predictor = PreTrainedOraclePredictor(loaded_models)
    print(f"🎯 Updated predictor with {len(loaded_models)} total models")
else:
    print(f"✅ All models loaded successfully, no fallbacks needed")


🔧 Checking for missing models and creating fallbacks...
🚧 Creating fallback CNN model for Wellness...
🔥 PyTorch available, using device: cpu
   📚 Quick training on subset for Wellness...

Training PyTorch CNN Transportation Oracle
Available training samples: 1000
Training parameters:
  - Epochs: 5
  - Batch size: 128
  - Test size: 0.2
  - Learning rate: 0.0003
  - Model type: standard
  - Device: cpu
🔥 Using PyTorch CNN approach for spatial pattern recognition...
🏗️ CNN Architecture:
CityCNN1(
  (block1): Sequential(
    (0): Conv2d(5, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
  )
  (block2): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padd

KeyboardInterrupt: 

In [None]:
# Generate predictions using pre-trained Oracle models (SMALL SUBSET for speed)
print("🚀 Generating Oracle predictions using pre-trained models...")
print("⚡ Using small subset of grids for faster processing...")

# Use much smaller subset for faster processing
subset_size = 500000  # Reduced from 100,000 to 10,000
print(f"📊 Processing {subset_size:,} grids (subset of {len(grids):,} total)")

# Create random subset for representative sampling
np.random.seed(42)  # For reproducibility
subset_indices = np.random.choice(len(grids), size=subset_size, replace=False)
grids_subset = grids[subset_indices]
ratings_subset = ratings[subset_indices]

print(f"✅ Created subset: {grids_subset.shape}")

# Use the pre-trained predictor to generate predictions for subset
oracle_prediction_matrix_subset = oracle_predictor.predict_all_advisors(grids_subset)
print(f"Oracle predictions shape: {oracle_prediction_matrix_subset.shape}")

# Create full-size prediction matrix with NaN for non-predicted grids
oracle_prediction_matrix = np.full((len(grids), 4), np.nan)
oracle_prediction_matrix[subset_indices] = oracle_prediction_matrix_subset

# Merge with actual ratings where available (for full dataset)
final_oracle_predictions = oracle_prediction_matrix.copy()
for advisor_idx in range(4):
    mask = ~np.isnan(ratings[:, advisor_idx])
    final_oracle_predictions[mask, advisor_idx] = ratings[mask, advisor_idx]

print("\n✅ Oracle predictions complete!")
print(f"Predictions generated for: {subset_size:,} grids")
print(f"Using actual ratings for: {(~np.isnan(ratings)).sum(axis=0)} samples per advisor")
print(f"Using Oracle predictions for: {(~np.isnan(oracle_prediction_matrix)).sum(axis=0)} samples per advisor")

# Quick quality check on predicted subset
print(f"\n📈 Prediction Quality Check (subset):")
for advisor_idx, advisor_name in enumerate(oracle_predictor.advisor_names):
    predictions = oracle_prediction_matrix_subset[:, advisor_idx]
    print(f"   • {advisor_name:15}: Range [{np.min(predictions):.3f}, {np.max(predictions):.3f}], Mean {np.mean(predictions):.3f}")

# Coverage analysis
total_with_data = (~np.isnan(final_oracle_predictions)).sum(axis=0)
print(f"\n📊 Data Coverage per Advisor:")
for advisor_idx, advisor_name in enumerate(oracle_predictor.advisor_names):
    coverage = total_with_data[advisor_idx]
    percentage = (coverage / len(grids)) * 100
    print(f"   • {advisor_name:15}: {coverage:,} grids ({percentage:.1f}% coverage)")

print(f"\n💾 Predictions ready for analysis and grid optimization!")
print(f"💡 Note: Using {subset_size:,} grid subset for faster processing")

# Save subset info for reference
np.save('oracle_subset_indices.npy', subset_indices)
print(f"📁 Saved subset indices to: oracle_subset_indices.npy")

In [None]:
# Analyze the Oracle results (working with subset data)
print("🔍 Analyzing Oracle Results...")

# Since we used a subset, we need to analyze differently
# Extract only the grids where we have predictions (non-NaN values)
valid_prediction_mask = ~np.isnan(oracle_prediction_matrix).any(axis=1)
valid_prediction_indices = np.where(valid_prediction_mask)[0]

print(f"📊 Analysis based on {len(valid_prediction_indices):,} grids with predictions")

# Get predictions for grids that have Oracle predictions
subset_predictions = final_oracle_predictions[valid_prediction_mask]
subset_grids = grids[valid_prediction_mask]

print(f"Subset predictions shape: {subset_predictions.shape}")

# Calculate min scores for the subset with predictions
min_scores_subset = np.min(subset_predictions, axis=1)

# Create full arrays with NaN for compatibility
min_scores = np.full(len(grids), np.nan)
min_scores[valid_prediction_mask] = min_scores_subset

valid_mask = np.full(len(grids), False)
threshold = 0.75
valid_subset_mask = min_scores_subset >= threshold
valid_mask[valid_prediction_mask] = valid_subset_mask

print(f"\n{'='*60}")
print("ORACLE PERFORMANCE ANALYSIS (SUBSET)")
print(f"{'='*60}")

print(f"\n📊 VALIDITY ANALYSIS:")
print(f"   • Total grids with predictions: {len(subset_predictions):,}")
print(f"   • Valid grids (min score ≥ {threshold}): {np.sum(valid_subset_mask):,}")
print(f"   • Validity rate: {np.sum(valid_subset_mask)/len(subset_predictions)*100:.2f}%")

print(f"\n📈 SCORE DISTRIBUTION (SUBSET):")
advisor_names = ["Wellness", "Tax", "Transportation", "Business"]
for i, advisor in enumerate(advisor_names):
    advisor_scores = subset_predictions[:, i]
    print(f"   • {advisor:15}: Range [{np.min(advisor_scores):.3f}, {np.max(advisor_scores):.3f}], Mean {np.mean(advisor_scores):.3f}")

print(f"\n🏆 BEST PERFORMING GRIDS (SUBSET):")
if np.sum(valid_subset_mask) > 0:
    best_indices = np.argsort(min_scores_subset[valid_subset_mask])[-5:]  # Top 5
    print(f"Top 5 minimum scores: {min_scores_subset[valid_subset_mask][best_indices]}")
else:
    print("   • No grids meet the validity threshold in this subset")

print(f"\n💡 Note: Analysis based on {len(subset_predictions):,} grid subset")
print(f"📊 For full dataset analysis, would need to run predictions on all 500K grids")

In [None]:
# Extract top grids from the subset analysis
print(f"\n🎯 EXTRACTING TOP GRIDS FROM SUBSET:")

# Work with the subset data we analyzed
if np.sum(valid_subset_mask) > 0:
    # Get indices of valid grids within the subset
    valid_subset_indices = np.where(valid_subset_mask)[0]
    
    # Sort by minimum score (descending)
    sorted_indices = np.argsort(min_scores_subset[valid_subset_mask])[::-1]
    
    # Get top grids (up to 100 or all valid grids, whichever is smaller)
    n_top = min(100, len(sorted_indices))
    top_subset_indices = valid_subset_indices[sorted_indices[:n_top]]
    
    # Extract the actual top grids and their predictions
    top_grids = subset_grids[top_subset_indices]
    top_predictions = subset_predictions[top_subset_indices]
    top_min_scores = min_scores_subset[valid_subset_mask][sorted_indices[:n_top]]
    
    print(f"   • Selected {len(top_grids)} grids from subset")
    print(f"   • Min score range: {np.min(top_min_scores):.4f} - {np.max(top_min_scores):.4f}")
    
    print(f"\n📊 Score ranges per advisor (top grids):")
    for i, advisor in enumerate(advisor_names):
        scores = top_predictions[:, i]
        print(f"     - {advisor:15}: {np.min(scores):.3f} - {np.max(scores):.3f}")
    
    print(f"   • Average minimum score: {np.mean(top_min_scores):.4f}")
    
    # Save the subset results
    np.save('oracle_subset_predictions.npy', subset_predictions)
    np.save('oracle_subset_min_scores.npy', min_scores_subset)
    np.save('oracle_subset_valid_mask.npy', valid_subset_mask)
    np.save('oracle_top_grids_subset.npy', top_grids)
    np.save('oracle_top_predictions_subset.npy', top_predictions)
    
    print(f"\n💾 Saved subset analysis results:")
    print(f"   • oracle_subset_predictions.npy")
    print(f"   • oracle_subset_min_scores.npy") 
    print(f"   • oracle_subset_valid_mask.npy")
    print(f"   • oracle_top_grids_subset.npy ({len(top_grids)} grids)")
    print(f"   • oracle_top_predictions_subset.npy")
    
    # Visualize some top grids if possible
    if len(top_grids) > 0:
        print(f"\n📊 Top grid analysis:")
        try:
            # Try to use the plotting function
            os.chdir('2155-Challenge-Problem-2')
            plot_n_grids(top_grids[-6:])  # Show top 6 grids
            os.chdir('..')
            print("✅ Displayed top 6 grids")
        except Exception as e:
            print(f"⚠️ Could not display grids: {e}")
            # Show basic stats instead
            print(f"Grid diversity example - Top grid has {len(np.unique(top_grids[-1]))} unique districts")

else:
    print("   • No valid grids found in the subset")
    print("   • Consider lowering the threshold or using a larger subset")

print(f"\n🎉 SUBSET ORACLE ANALYSIS COMPLETE!")
print(f"📊 Found {np.sum(valid_subset_mask)} valid grids out of {len(subset_predictions):,} analyzed")
print(f"💡 Validity rate: {np.sum(valid_subset_mask)/len(subset_predictions)*100:.2f}%")