# Improved ML Model for Heat Flux Prediction

The initial model placed only 30.8% of its predictions within ±5% of the target, so this notebook implements the following improvements:
1. **Larger, deeper network** - More capacity for complex relationships
2. **Better feature engineering** - Add derived physics features  
3. **Ensemble methods** - Combine multiple models
4. **Advanced regularization** - Better generalization


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import warnings
# Silence every warning category so cell output stays readable.
# NOTE: this also hides numerical warnings (e.g. log of a non-positive value).
warnings.filterwarnings('ignore')

# Pin the NumPy and TensorFlow random seeds to the same fixed value so
# repeated runs start from the same RNG state.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Report the runtime: TensorFlow version and the number of visible GPUs.
print(f"TensorFlow version: {tf.__version__}")
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"GPU available: {len(gpu_devices)}")


In [None]:
# Load the raw CFD table from disk and rebuild the cleaned working frame.
print("=== LOADING CLEAN DATA ===")

# Read the unmodified source data.
df = pd.read_csv('../Data/raw/apollo_cfd_database.csv')

# Cleaning rules: keep rows whose theta is non-negative and whose Re-theta
# is at least 1e-5. Rows where either value is NaN fail the comparison and
# are therefore excluded as well.
valid_rows = (df['theta (m)'] >= 0) & (df['Re-theta'] >= 1e-5)

# Boolean indexing yields a new frame (df itself is left untouched); the
# base-10 logs of density and velocity are appended as new columns.
df_clean = df[valid_rows].assign(
    log_density=lambda frame: np.log10(frame['density (kg/m^3)']),
    log_velocity=lambda frame: np.log10(frame['velocity (m/s)']),
)

print(f"Clean dataset: {len(df_clean):,} points")


In [None]:
# Enhanced Feature Engineering - Key Improvement #1
# Adds derived columns to df_clean and defines the list of model inputs
# (enhanced_features) and the regression target (target_variable).
print("=== ENHANCED FEATURE ENGINEERING ===")

# Base-10 logs of the dynamic pressure and wall pressure columns.
df_clean['log_dynamic_pressure'] = np.log10(df_clean['dynamic_pressure (Pa)'])
df_clean['log_pressure'] = np.log10(df_clean['pw (Pa)'])

# Velocity-based features (heat flux ∝ velocity^n).
# V^3 mirrors the velocity dependence of the Sutton-Graves correlation.
df_clean['velocity_cubed'] = df_clean['velocity (m/s)']**3
# NOTE: sqrt_velocity is computed but is not part of enhanced_features below.
df_clean['sqrt_velocity'] = np.sqrt(df_clean['velocity (m/s)'])

# Density-velocity interaction: rho * V^2.
df_clean['rho_v_squared'] = df_clean['density (kg/m^3)'] * df_clean['velocity (m/s)']**2

# Spatial features: Euclidean distance of each point from the coordinate
# origin (presumably close to the stagnation point -- confirm against the
# dataset's coordinate convention).
df_clean['distance_from_center'] = np.sqrt(df_clean['X']**2 + df_clean['Y']**2 + df_clean['Z']**2)

# Direction components (X, Y, Z divided by the distance above).
# A point exactly at the origin has zero distance, so a plain division gives
# 0/0 = NaN and the row would be discarded by any later NaN/inf filtering.
# The direction is undefined there, so use 0.0 and keep the point.
# Rows whose distance is NaN are left as NaN (NaN != 0 is True).
_radius = df_clean['distance_from_center']
for _axis in ('X', 'Y', 'Z'):
    df_clean[f'{_axis.lower()}_normalized'] = (df_clean[_axis] / _radius).where(_radius != 0, 0.0)

# Enhanced input features (14 total vs original 6)
enhanced_features = [
    'log_density', 'log_velocity', 'aoa (degrees)',
    'log_dynamic_pressure', 'log_pressure',
    'X', 'Y', 'Z', 'distance_from_center',
    'x_normalized', 'y_normalized', 'z_normalized',
    'velocity_cubed', 'rho_v_squared'
]

# Column holding the quantity to predict.
target_variable = 'qw (W/m^2)'

print(f"Enhanced features ({len(enhanced_features)}): {enhanced_features}")
print(f"Target: {target_variable}")
print(f"Feature expansion: 6 → {len(enhanced_features)} features")


In [None]:
# Build the feature matrix / target vector and split them 80/10/10.
print("=== IMPROVED DATA PREPARATION ===")

# Sort the requested features into those present in df_clean and those absent.
available_features = []
missing_features = []
for feature_name in enhanced_features:
    if feature_name in df_clean.columns:
        available_features.append(feature_name)
    else:
        missing_features.append(feature_name)

# Fall back to the columns that exist rather than failing on a missing one.
if missing_features:
    print(f"Missing features: {missing_features}")
    print("Using available features only...")
    enhanced_features = available_features

print(f"Using {len(enhanced_features)} features: {enhanced_features}")

# Treat +/-inf as missing, then drop every row that lacks a value in any
# model input or in the target.
required_columns = enhanced_features + [target_variable]
df_clean = df_clean.replace([np.inf, -np.inf], np.nan)
df_clean = df_clean.dropna(subset=required_columns)
print(f"After removing invalid values: {len(df_clean):,} points")

# Plain NumPy arrays for the model inputs and the target.
X = df_clean[enhanced_features].values
y = df_clean[target_variable].values

print(f"Feature matrix shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Two-stage split: hold out 20%, then halve the hold-out into validation
# and test, giving 80% / 10% / 10%. Both stages use random_state=42.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Report each split's size and its share of the full dataset.
print(f"\nData splits:")
for split_label, split_X in (("Training", X_train), ("Validation", X_val), ("Test", X_test)):
    n_rows = split_X.shape[0]
    print(f"{split_label}: {n_rows:,} ({n_rows/len(X)*100:.1f}%)")


In [None]:
# Advanced preprocessing - Key Improvement #2
# Scales the inputs and the log10-transformed targets. Both scalers are
# fitted on the training split only and then applied to validation/test.
print("=== ADVANCED PREPROCESSING ===")

# Use RobustScaler (less sensitive to outliers) instead of StandardScaler
scaler_X = RobustScaler()
X_train_scaled = scaler_X.fit_transform(X_train)
X_val_scaled = scaler_X.transform(X_val)
X_test_scaled = scaler_X.transform(X_test)

# log10 is only defined for strictly positive values. With warnings globally
# suppressed, a non-positive target would silently become NaN/-inf and either
# poison training or fail later with an unrelated error, so fail here with a
# clear message instead. `~(y > 0)` also catches NaN.
for _split_name, _split_y in (("train", y_train), ("validation", y_val), ("test", y_test)):
    _n_bad = int(np.sum(~(_split_y > 0)))
    if _n_bad:
        raise ValueError(
            f"{_n_bad} non-positive or NaN target value(s) in the {_split_name} split; "
            "the log10 target transform requires strictly positive targets. "
            "Filter these rows before splitting."
        )

# Log transform and robust scale targets
y_train_log = np.log10(y_train)
y_val_log = np.log10(y_val)
y_test_log = np.log10(y_test)

# The scaler expects a 2-D column, so reshape to (n, 1) and flatten back.
scaler_y = RobustScaler()
y_train_scaled = scaler_y.fit_transform(y_train_log.reshape(-1, 1)).flatten()
y_val_scaled = scaler_y.transform(y_val_log.reshape(-1, 1)).flatten()
y_test_scaled = scaler_y.transform(y_test_log.reshape(-1, 1)).flatten()

print(f"RobustScaler applied to {X_train_scaled.shape[1]} features")
print(f"Target scaling - Log range: {y_train_log.min():.3f} to {y_train_log.max():.3f}")
print(f"Target scaling - Scaled range: {y_train_scaled.min():.3f} to {y_train_scaled.max():.3f}")
print("✅ Advanced preprocessing complete")


In [None]:
# Rebuild the modelling arrays from an explicitly listed feature set.
# NOTE: this cell overwrites X_train / X_val / X_test and the matching y
# arrays; it does not recompute any scaled arrays derived from an earlier split.
print("=== ADDING MISSING CRITICAL FEATURES ===")

# (Re)compute the base-10 log density and velocity columns on df_clean.
log_sources = {
    'log_density': 'density (kg/m^3)',
    'log_velocity': 'velocity (m/s)',
}
for derived_col, source_col in log_sources.items():
    df_clean[derived_col] = np.log10(df_clean[source_col])

# Assemble the full input list group by group; order matters because it
# fixes the column order of the feature matrix.
baseline_features = ['log_density', 'log_velocity', 'aoa (degrees)']  # NASA baseline features
pressure_features = ['log_dynamic_pressure', 'log_pressure']
spatial_features = [
    'X', 'Y', 'Z', 'distance_from_center',
    'x_normalized', 'y_normalized', 'z_normalized',
]
velocity_features = ['velocity_cubed', 'rho_v_squared']
complete_enhanced_features = (
    baseline_features + pressure_features + spatial_features + velocity_features
)

print(f"Complete enhanced features ({len(complete_enhanced_features)}): {complete_enhanced_features}")

# Treat +/-inf as missing and drop rows lacking any input or the target.
needed_columns = complete_enhanced_features + [target_variable]
df_clean = df_clean.replace([np.inf, -np.inf], np.nan)
df_clean = df_clean.dropna(subset=needed_columns)
print(f"After removing invalid values: {len(df_clean):,} points")

# Plain NumPy arrays for the inputs and the target.
X_complete = df_clean[complete_enhanced_features].values
y_complete = df_clean[target_variable].values

print(f"Complete feature matrix shape: {X_complete.shape}")
print(f"Feature expansion: 6 original → {X_complete.shape[1]} enhanced features")

# Two-stage split (20% hold-out, then halved) -> 80% / 10% / 10%,
# both stages with random_state=42.
X_train, X_temp, y_train, y_temp = train_test_split(
    X_complete, y_complete, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

n_train, n_features = X_train.shape
print(f"\nFinal data splits with complete features:")
print(f"Training: {n_train:,} samples, {n_features} features")
print(f"Validation: {X_val.shape[0]:,} samples")
print(f"Test: {X_test.shape[0]:,} samples")
print("✅ Complete feature set ready for improved ML models!")
