In [2]:
%%writefile config.yaml
# Configuration for the solar panel efficiency prediction pipeline

data:
  train_path: 'C:\Users\pc\Downloads\Zelestra AWS ML Ascend Challange - Copy\train.csv'
  test_path: 'C:\Users\pc\Downloads\Zelestra AWS ML Ascend Challange - Copy\test.csv'

output:
  dir: 'C:\Users\pc\Downloads\Zelestra AWS ML Ascend Challange - Copy\Submission'

Overwriting config.yaml


In [3]:
#!/media/ayon1901/SERVER1/Zelestra/.venv/bin/python
# -*- coding: utf-8 -*-

"""
Main script that orchestrates the solar panel efficiency prediction pipeline.
"""

# Standard library imports
import logging
import time
import yaml
import os
import matplotlib.pyplot as plt

# Third-party imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression

# Module-level logger named after this module (``__name__``).
# Only the logger object is created here; no handler or level is set on it.
logger = logging.getLogger(__name__)

def load_config():
    """Parse ``config.yaml`` from the current working directory.

    Returns whatever ``yaml.safe_load`` produces for the file's contents.
    """
    with open('config.yaml', 'r') as config_file:
        settings = yaml.safe_load(config_file)
    return settings

def create_output_dir(base_dir):
    """Make a ``run_<YYYYmmdd-HHMMSS>`` folder under *base_dir* and return its path.

    Missing parent directories are created too, and an already existing
    folder is reused without raising.
    """
    run_name = 'run_' + time.strftime("%Y%m%d-%H%M%S")
    run_dir = os.path.join(base_dir, run_name)
    os.makedirs(run_dir, exist_ok=True)
    return run_dir

def add_basic_features(df):
    """Add basic domain-specific features.

    The input frame is copied, the known measurement columns are coerced to
    numeric (unparseable values become NaN), and NaNs in float64/int64
    columns are replaced by the column mean of *this* frame.  Each feature
    group below is only computed when the raw columns it needs are present,
    so the function can be applied to frames with a partial schema.

    Args:
        df: pandas DataFrame holding the raw measurements.

    Returns:
        A new DataFrame with the original columns plus the derived features.
        Any NaN still present at the end is replaced by 0.
    """
    df = df.copy()

    # Convert columns to numeric
    numeric_columns = ['humidity', 'wind_speed', 'pressure', 'temperature',
                      'irradiance', 'module_temperature', 'cloud_coverage',
                      'voltage', 'current', 'soiling_ratio', 'panel_age',
                      'maintenance_count']

    for col in numeric_columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Handle missing values for numeric columns first
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

    # 1. Enhanced irradiance and soiling features
    if 'irradiance' in df.columns and 'soiling_ratio' in df.columns:
        # Effective irradiance considering soiling
        df['effective_irradiance'] = df['irradiance'] * df['soiling_ratio']
        # Non-linear irradiance effects
        df['irradiance_squared'] = df['irradiance'] ** 2
        df['irradiance_sqrt'] = np.sqrt(df['irradiance'].clip(lower=0))
        # Soiling impact at different irradiance levels
        df['soiling_irradiance_interaction'] = df['soiling_ratio'] * df['irradiance_sqrt']

    # 2. Enhanced aging features
    if 'panel_age' in df.columns:
        # Non-linear aging effects
        df['age_squared'] = df['panel_age'] ** 2
        df['age_sqrt'] = np.sqrt(df['panel_age'].clip(lower=0))

        if 'maintenance_count' in df.columns:
            # Maintenance effectiveness
            df['maintenance_frequency'] = df['maintenance_count'] / (df['panel_age'] + 1)
            df['maintenance_effectiveness'] = df['maintenance_count'] * np.exp(-0.1 * df['panel_age'])

        if 'soiling_ratio' in df.columns:
            # Age-related degradation
            df['age_degradation'] = df['soiling_ratio'] * np.exp(-0.05 * df['panel_age'])

    # 3. Power quality features
    if all(col in df.columns for col in ['voltage', 'current']):
        df['power'] = df['voltage'] * df['current']
        mean_voltage = df['voltage'].mean()
        mean_current = df['current'].mean()
        if mean_voltage != 0 and mean_current != 0:
            df['power_quality'] = df['power'] / (mean_voltage * mean_current)
            # Add power curve characteristics
            df['power_factor'] = df['power'] / (df['voltage'] * df['current'])
            # Irradiance is optional: without this guard a frame that has
            # voltage/current but no irradiance column raised KeyError.
            if 'irradiance' in df.columns:
                df['power_efficiency'] = df['power'] / (df['irradiance'] + 1e-6)
        else:
            df['power_quality'] = 0
            df['power_factor'] = 0
            df['power_efficiency'] = 0

    # 4. Enhanced temperature features
    if 'temperature' in df.columns:
        # Standard temperature coefficient for solar panels (-0.4% per degree C above 25°C)
        temp_coef = -0.004
        df['temp_efficiency_factor'] = 1 + temp_coef * (df['temperature'] - 25)

        # Non-linear temperature effects
        df['temp_squared'] = df['temperature'] ** 2
        df['temp_stress'] = np.abs(df['temperature'] - 25)  # Deviation from optimal

        if 'module_temperature' in df.columns:
            df['temp_difference'] = df['module_temperature'] - df['temperature']
            df['temp_ratio'] = (df['module_temperature'] + 273.15) / (df['temperature'] + 273.15)
            df['temp_stress_combined'] = df['temp_stress'] * np.abs(df['temp_difference'])

    # 5. Enhanced environmental features
    weather_cols = ['humidity', 'cloud_coverage', 'wind_speed', 'pressure']
    if all(col in df.columns for col in weather_cols):
        # Normalize factors
        humidity_factor = df['humidity'].astype(float) / 100.0
        cloud_factor = df['cloud_coverage'].astype(float) / 100.0
        max_wind = df['wind_speed'].max()
        wind_factor = df['wind_speed'].astype(float) / max_wind if max_wind > 0 else 0
        # A constant pressure column (std == 0) or a single row (std is NaN)
        # used to make the z-score NaN, which wiped out every weather feature.
        # Treat pressure as carrying no information in that case.
        pressure_std = df['pressure'].std()
        if pressure_std > 0:
            pressure_factor = (df['pressure'] - df['pressure'].mean()) / pressure_std
        else:
            pressure_factor = 0.0

        # Combined weather impact
        df['weather_impact'] = (
            -0.3 * humidity_factor +  # Higher humidity reduces efficiency
            -0.5 * cloud_factor +    # Cloud coverage has major impact
            0.2 * wind_factor +      # Wind helps cool panels
            0.1 * pressure_factor    # Pressure has minor impact
        )

        # Weather stress indicators
        df['humidity_stress'] = (humidity_factor - 0.5).abs()  # Deviation from 50% humidity
        # NOTE: this is one scalar for the whole frame, broadcast to every row.
        df['weather_stability'] = 1 - (df['weather_impact'].std() / (np.abs(df['weather_impact'].mean()) + 1e-6))

        # Cooling effect
        if 'temp_difference' in df.columns:
            df['cooling_effect'] = wind_factor * df['temp_difference']

    # 6. Panel health indicators
    if 'module_temperature' in df.columns and 'temperature' in df.columns:
        mean_diff = df['temp_difference'].mean()
        df['panel_health'] = 1 / (1 + np.exp(df['temp_difference'] - mean_diff))

        if 'soiling_ratio' in df.columns and 'panel_age' in df.columns:
            # 'maintenance_frequency' only exists when 'maintenance_count'
            # was supplied; fall back to a neutral factor of 1 instead of
            # raising KeyError.
            if 'maintenance_frequency' in df.columns:
                maintenance_boost = 1 + df['maintenance_frequency']
            else:
                maintenance_boost = 1
            # Combined health score
            df['overall_health'] = (
                df['panel_health'] *
                df['soiling_ratio'] *
                np.exp(-0.03 * df['panel_age']) *  # Age degradation factor
                maintenance_boost                   # Maintenance boost
            )

    # 7. Time-based features
    if 'datetime' in df.columns:
        df['datetime'] = pd.to_datetime(df['datetime'])
        df['hour'] = df['datetime'].dt.hour
        df['month'] = df['datetime'].dt.month
        df['day_of_year'] = df['datetime'].dt.dayofyear

        # Enhanced solar position and seasonal effects
        hour_rad = 2 * np.pi * df['hour'].astype(float) / 24
        day_rad = 2 * np.pi * df['day_of_year'].astype(float) / 365

        df['solar_elevation'] = np.sin(hour_rad) * np.sin(day_rad)
        df['solar_azimuth'] = np.cos(hour_rad) * np.cos(day_rad)
        df['seasonal_factor'] = np.sin(day_rad + np.pi/6)  # Phase shift for seasonal lag

        # Time-based efficiency factors
        if 'irradiance' in df.columns:
            df['time_efficiency'] = df['solar_elevation'] * df['irradiance'] / (df['irradiance'].max() + 1e-6)

    # Fill any remaining missing values with 0
    df = df.fillna(0)
    return df

def select_features(X_train, y_train, X_val, X_test, threshold=0.01, cat_features=None):
    """Keep only the columns a quick CatBoost fit rates above *threshold*.

    A small CatBoostRegressor (200 iterations, depth 6) is fitted on the
    training split, its feature importances are read back, and every column
    whose importance is strictly greater than *threshold* is retained.

    Returns:
        ``(X_train, X_val, X_test, selected_features)`` where the three
        frames are restricted to the retained columns and
        ``selected_features`` is the list of their names.
    """
    # Cheap throw-away model used purely to rank the columns
    probe = CatBoostRegressor(
        iterations=200,
        learning_rate=0.1,
        depth=6,
        verbose=0
    )
    probe.fit(X_train, y_train, cat_features=cat_features, verbose=0)

    # Map each column name to its importance score
    scores = probe.get_feature_importance()
    all_names = X_train.columns.tolist()
    score_by_name = dict(zip(all_names, scores))

    # Columns that clear the threshold, in their original order
    selected_features = []
    for name, score in score_by_name.items():
        if score > threshold:
            selected_features.append(name)
    logger.info(f"Selected {len(selected_features)} features out of {len(all_names)}")

    # Report the ten strongest columns
    ranked = sorted(score_by_name.items(), key=lambda pair: pair[1], reverse=True)
    logger.info("Top 10 features:")
    for feature, importance in ranked[:10]:
        logger.info(f"  - {feature}: {importance:.6f}")

    train_subset = X_train[selected_features]
    val_subset = X_val[selected_features]
    test_subset = X_test[selected_features]
    return train_subset, val_subset, test_subset, selected_features

def add_advanced_features(df):
    """Derive polynomial, interaction, log and ratio features on a copy of *df*.

    Every derived column is only created when the raw columns it is built
    from exist.  NaNs in numeric columns are replaced by the column mean
    before the new frame is returned.
    """
    df = df.copy()

    def has(*names):
        # True when every named column currently exists in the frame
        return all(name in df.columns for name in names)

    base_cols = ['irradiance', 'temperature', 'module_temperature', 'voltage', 'current', 'soiling_ratio']
    available = [name for name in base_cols if has(name)]

    # Powers and square root of each available base column
    for name in available:
        values = df[name]
        df[f'{name}_squared'] = values ** 2
        df[f'{name}_cubed'] = values ** 3
        df[f'{name}_sqrt'] = np.sqrt(values.clip(lower=0))

    # Pairwise products of key measurements
    products = (
        ('irradiance_temp_interaction', 'irradiance', 'temperature'),
        ('voltage_current_interaction', 'voltage', 'current'),
        ('irradiance_soiling_interaction', 'irradiance', 'soiling_ratio'),
    )
    for target, left, right in products:
        if has(left, right):
            df[target] = df[left] * df[right]
    if has('temperature', 'module_temperature'):
        gap = df['temperature'] - df['module_temperature']
        df['temp_module_diff_squared'] = gap ** 2

    # log(1 + x) of the non-negative part, to tame skew
    for name in available:
        df[f'{name}_log'] = np.log1p(df[name].clip(lower=0))

    # Solar-specific estimates and ratios; small offsets keep denominators non-zero
    if has('voltage', 'current'):
        df['power_estimate'] = df['voltage'] * df['current']
    if has('power_estimate', 'irradiance'):
        df['efficiency_estimate'] = df['power_estimate'] / (df['irradiance'] + 0.0001)
    if has('temperature', 'module_temperature'):
        df['temp_module_ratio'] = df['module_temperature'] / (df['temperature'] + 0.0001)
    if has('irradiance'):
        peak = df['irradiance'].max()
        df['irradiance_normalized'] = df['irradiance'] / (peak + 0.0001)

    # Further temperature interactions
    if has('soiling_ratio', 'temperature'):
        df['soiling_temp_interaction'] = df['soiling_ratio'] * df['temperature']
    if has('voltage', 'temperature'):
        df['voltage_temp_interaction'] = df['voltage'] * df['temperature']

    # Mean-impute NaNs produced above, numeric columns only
    numeric = df.select_dtypes(include=[np.number]).columns
    column_means = df[numeric].mean()
    df[numeric] = df[numeric].fillna(column_means)

    return df

def add_time_series_features(df):
    """Add time-series features like lags and rolling windows.

    Rows are sorted by ``string_id`` then ``datetime`` and, for every
    weather/irradiance column that is present, lag features (1 and 2 steps)
    and rolling mean/std features (windows of 3 and 6 rows) are computed
    separately for each ``string_id``.

    Args:
        df: pandas DataFrame; must contain ``datetime`` and ``string_id``
            for any feature to be added.

    Returns:
        A new DataFrame.  When ``datetime`` or ``string_id`` is missing it is
        an unchanged copy of the input.  Otherwise it is the sorted frame
        plus the generated columns, with remaining NaNs replaced by 0.
    """
    df = df.copy()

    if 'datetime' not in df.columns or 'string_id' not in df.columns:
        logger.warning("Datetime or string_id not in columns, skipping time-series features.")
        return df

    logger.info("Adding time-series features...")

    df['datetime'] = pd.to_datetime(df['datetime'])
    df = df.sort_values(by=['string_id', 'datetime'])

    # Features to apply time-series logic on
    ts_features = ['irradiance', 'temperature', 'module_temperature', 'cloud_coverage', 'wind_speed', 'humidity', 'pressure']

    # Lag periods and rolling window sizes
    lags = [1, 2]
    window_sizes = [3, 6]

    grouped = df.groupby('string_id')
    generated = []  # names of the columns created below

    for col in ts_features:
        if col not in df.columns:
            continue

        # Lag features
        for lag in lags:
            lag_name = f'{col}_lag_{lag}'
            df[lag_name] = grouped[col].shift(lag)
            generated.append(lag_name)

        # Rolling window features.  ``groupby(...).rolling(...)`` returns a
        # result indexed by (group, row), which cannot be assigned back to
        # the frame; ``transform`` keeps the frame's own index instead.
        for window in window_sizes:
            mean_name = f'{col}_roll_mean_{window}'
            std_name = f'{col}_roll_std_{window}'
            df[mean_name] = grouped[col].transform(
                lambda s, w=window: s.rolling(window=w, min_periods=1).mean()
            )
            df[std_name] = grouped[col].transform(
                lambda s, w=window: s.rolling(window=w, min_periods=1).std()
            )
            generated.extend([mean_name, std_name])

    # Backfill the NaNs at the start of each group *within that group*, so a
    # short group never borrows values from the next string_id.  Whatever is
    # still missing in a generated column (group shorter than the lag) is 0.
    if generated:
        by_string = df.groupby('string_id')
        for name in generated:
            df[name] = by_string[name].bfill().fillna(0)

    # The pre-existing columns keep the original whole-frame treatment.
    df = df.bfill()
    df = df.fillna(0)

    logger.info("Finished adding time-series features.")
    return df

def run_pipeline(config):
    """Run the pipeline leveraging key categorical features for improved performance.

    Steps: create a timestamped output folder, load the train/test CSVs, cap
    target outliers, build basic and advanced features, pick features with a
    quick CatBoost fit, tune a CatBoost MAE model over a fixed candidate
    list, retrain with the winning parameters, apply a linear residual
    correction fitted on the validation split, and write predictions, the
    model and a feature-importance report to the output folder.

    Args:
        config: mapping with ``config['data']['train_path']``,
            ``config['data']['test_path']`` and ``config['output']['dir']``.

    Returns:
        The adjusted test-set predictions, clipped to ``[0, 1]``.
    """
    start_time = time.time()
    logger.info("Starting optimized CatBoost modeling pipeline with Categorical Features")

    # Create output directory
    output_path = create_output_dir(config['output']['dir'])

    # Load and preprocess data
    logger.info("Loading and preprocessing data...")
    train_df = pd.read_csv(config['data']['train_path'])
    test_df = pd.read_csv(config['data']['test_path'])

    # Handle outliers in training data (as in best run)
    logger.info("Handling outliers in training data...")
    train_df, outlier_info = handle_outliers(train_df, target_column='efficiency', z_threshold=3)
    logger.info(f"Outlier handling summary: {outlier_info}")

    # Basic feature engineering
    train_df = add_basic_features(train_df)
    test_df = add_basic_features(test_df)

    # Advanced feature engineering
    logger.info("Performing advanced feature engineering...")
    train_df = add_advanced_features(train_df)
    test_df = add_advanced_features(test_df)

    # Define and process categorical features.  Missing values must be filled
    # *before* the string cast: ``astype(str)`` turns NaN into the text 'nan',
    # after which ``fillna`` has nothing left to replace.
    cat_features = ['string_id', 'error_code', 'installation_type']
    for col in cat_features:
        if col in train_df.columns:
            train_df[col] = train_df[col].fillna('NA').astype(str)
        if col in test_df.columns:
            test_df[col] = test_df[col].fillna('NA').astype(str)

    # Prepare features and target
    y_train = train_df['efficiency']

    # Align columns before splitting.  Keep the training frame's column order
    # rather than going through ``set`` intersection: set iteration order of
    # strings changes from one interpreter run to the next, which made the
    # feature order (and therefore the trained model) non-reproducible.
    train_cols = [col for col in train_df.columns if col not in ['id', 'efficiency']]
    test_cols = [col for col in test_df.columns if col not in ['id']]
    test_col_set = set(test_cols)
    shared_cols = [col for col in train_cols if col in test_col_set]

    X = train_df[shared_cols]
    X_test = test_df[shared_cols]

    # Split data
    X_train, X_val, y_train_split, y_val = train_test_split(
        X, y_train, test_size=0.2, random_state=42
    )

    # Get final list of categorical features present in the training data
    final_cat_features = [col for col in cat_features if col in X_train.columns]

    # Feature selection
    logger.info("Performing feature selection...")
    X_train_selected, X_val_selected, X_test_selected, selected_features = select_features(X_train, y_train_split, X_val, X_test, cat_features=final_cat_features)
    logger.info(f"Selected features: {selected_features}")

    # Update cat features list to only include selected ones
    final_cat_features_selected = [f for f in final_cat_features if f in selected_features]

    # Refined hyperparameter tuning for a single CatBoost model
    logger.info("Tuning CatBoost model with MAE loss and categorical features...")
    param_grid = [
        # Original candidates
        {'iterations': 5000, 'learning_rate': 0.01, 'depth': 8, 'l2_leaf_reg': 3, 'subsample': 0.8},
        {'iterations': 6000, 'learning_rate': 0.008, 'depth': 9, 'l2_leaf_reg': 5, 'subsample': 0.75},
        {'iterations': 7000, 'learning_rate': 0.005, 'depth': 10, 'l2_leaf_reg': 7, 'subsample': 0.7},
        # More aggressive learning
        {'iterations': 4000, 'learning_rate': 0.02, 'depth': 7, 'l2_leaf_reg': 4, 'subsample': 0.85},
        # Deeper model with smaller learning rate
        {'iterations': 8000, 'learning_rate': 0.004, 'depth': 11, 'l2_leaf_reg': 6, 'subsample': 0.7},
        # Different subsample and regularization
        {'iterations': 5500, 'learning_rate': 0.015, 'depth': 8, 'l2_leaf_reg': 8, 'subsample': 0.9}
    ]
    best_params, best_score = tune_catboost_model(X_train_selected, y_train_split, X_val_selected, y_val, param_grid=param_grid, cat_features=final_cat_features_selected)
    logger.info(f"Best parameters for CatBoost: {best_params}")
    logger.info(f"Best validation MAE for CatBoost: {best_score:.6f}")

    # Train the final model with best parameters
    logger.info("Training final CatBoost model with best parameters...")
    final_model = CatBoostRegressor(loss_function='MAE', **best_params)
    final_model.fit(X_train_selected, y_train_split, eval_set=(X_val_selected, y_val), use_best_model=True, verbose=0, cat_features=final_cat_features_selected)

    # Final predictions
    final_val_pred = final_model.predict(X_val_selected)
    final_test_pred = final_model.predict(X_test_selected)

    # Post-processing: Adjust predictions based on validation set bias.
    # A linear model maps the raw prediction to the validation residual and
    # that estimated residual is added to the test predictions.
    logger.info("Applying linear residual correction...")
    val_residuals = y_val - final_val_pred
    residual_model = LinearRegression()
    residual_model.fit(final_val_pred.reshape(-1, 1), val_residuals)
    test_residual_pred = residual_model.predict(final_test_pred.reshape(-1, 1))
    final_test_pred_adjusted = final_test_pred + test_residual_pred
    final_test_pred_adjusted = np.clip(final_test_pred_adjusted, 0, 1)
    logger.info("Post-processing applied: added linear residuals and clipped predictions")

    # Calculate validation metrics (pre-adjustment for reference)
    val_rmse = np.sqrt(mean_squared_error(y_val, final_val_pred))
    val_mae = mean_absolute_error(y_val, final_val_pred)
    val_r2 = r2_score(y_val, final_val_pred)

    logger.info(f"Validation RMSE (pre-adjustment): {val_rmse:.6f}")
    logger.info(f"Validation MAE (pre-adjustment): {val_mae:.6f}")
    logger.info(f"Validation R2 (pre-adjustment): {val_r2:.6f}")

    # Save predictions (adjusted)
    test_df['efficiency'] = final_test_pred_adjusted
    test_df[['id', 'efficiency']].to_csv(
        os.path.join(output_path, 'predictions_adjusted_cat_features.csv'),
        index=False
    )

    # Save model and analysis
    logger.info("Saving model and analysis...")
    final_model.save_model(os.path.join(output_path, 'catboost_final_cat_features.cbm'))
    save_feature_importance(final_model, selected_features, output_path, 'catboost_final_cat_features')

    execution_time = time.time() - start_time
    logger.info(f"Total execution time: {execution_time:.2f} seconds")

    return final_test_pred_adjusted

def tune_catboost_model(X_train, y_train, X_val, y_val, param_grid=None, cat_features=None):
    """Enhanced hyperparameter tuning for CatBoost model with MAE loss.

    Each candidate in *param_grid* is fitted with MAE loss, early stopping
    after 100 rounds and ``random_seed=42``, then scored by mean absolute
    error on the validation split.  The candidate with the lowest score wins.

    Args:
        X_train, y_train: training features and target.
        X_val, y_val: validation features and target, used both as the
            CatBoost ``eval_set`` and for scoring.
        param_grid: list of keyword-argument dicts for ``CatBoostRegressor``;
            a built-in list of six candidates is used when ``None``.
        cat_features: categorical column names passed through to ``fit``.

    Returns:
        ``(best_params, best_score)``.  ``best_params`` is the winning
        model's ``get_params()`` with ``loss_function``, ``eval_metric``,
        ``early_stopping_rounds``, ``verbose`` and ``cat_features`` removed.

    Raises:
        ValueError: if *param_grid* is empty, or no candidate produced a
            finite validation score.
    """
    if param_grid is None:
        param_grid = [
            # Original candidates
            {'iterations': 5000, 'learning_rate': 0.01, 'depth': 8, 'l2_leaf_reg': 3, 'subsample': 0.8},
            {'iterations': 6000, 'learning_rate': 0.008, 'depth': 9, 'l2_leaf_reg': 5, 'subsample': 0.75},
            {'iterations': 7000, 'learning_rate': 0.005, 'depth': 10, 'l2_leaf_reg': 7, 'subsample': 0.7},
            # More aggressive learning
            {'iterations': 4000, 'learning_rate': 0.02, 'depth': 7, 'l2_leaf_reg': 4, 'subsample': 0.85},
            # Deeper model with smaller learning rate
            {'iterations': 8000, 'learning_rate': 0.004, 'depth': 11, 'l2_leaf_reg': 6, 'subsample': 0.7},
            # Different subsample and regularization
            {'iterations': 5500, 'learning_rate': 0.015, 'depth': 8, 'l2_leaf_reg': 8, 'subsample': 0.9}
        ]

    # Fail early with a clear message instead of crashing later on
    # ``None.pop`` when there is nothing to try.
    if isinstance(param_grid, (list, tuple)) and not param_grid:
        raise ValueError("param_grid must contain at least one parameter set")

    best_score = float('inf')
    best_params = None

    for params in param_grid:
        logger.info(f"Trying parameters: {params}")
        model = CatBoostRegressor(
            loss_function='MAE',
            eval_metric='MAE',
            early_stopping_rounds=100,
            verbose=0,
            random_seed=42,
            **params
        )
        model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True, cat_features=cat_features)
        val_pred = model.predict(X_val)
        score = mean_absolute_error(y_val, val_pred)
        logger.info(f"Validation MAE for this set: {score:.6f}")
        if score < best_score:
            best_score = score
            best_params = model.get_params()

    # A NaN score never compares below ``inf``, so every candidate can lose.
    if best_params is None:
        raise ValueError("No parameter set produced a valid validation score")

    # Remove parameters that are not for the constructor
    for key in ('loss_function', 'eval_metric', 'early_stopping_rounds', 'verbose', 'cat_features'):
        best_params.pop(key, None)

    return best_params, best_score

def save_feature_importance(model, feature_names, output_path, model_name):
    """Write the model's feature importances to a CSV file and a bar-chart PNG.

    Both files are placed in *output_path* and named
    ``<model_name>_feature_importance`` with the matching extension.  Rows
    and bars are ordered from most to least important.
    """
    scores = model.get_feature_importance()
    ranking = pd.DataFrame({'Feature': feature_names, 'Importance': scores})
    ranking = ranking.sort_values('Importance', ascending=False)

    # Tabular report
    csv_path = os.path.join(output_path, f'{model_name}_feature_importance.csv')
    ranking.to_csv(csv_path, index=False)

    # Bar chart with one bar per feature, labels rotated to stay readable
    positions = range(len(ranking))
    png_path = os.path.join(output_path, f'{model_name}_feature_importance.png')
    plt.figure(figsize=(12, 6))
    plt.bar(positions, ranking['Importance'])
    plt.xticks(positions, ranking['Feature'], rotation=90)
    plt.title(f'Feature Importance - {model_name}')
    plt.tight_layout()
    plt.savefig(png_path)
    plt.close()

def handle_outliers(df, target_column, z_threshold=3):
    """Clip *target_column* to ``mean ± z_threshold * std`` on a copy of *df*.

    Rows are never dropped.  Values whose absolute z-score exceeds
    *z_threshold* are counted and then capped at the matching limit.

    Returns:
        ``(clipped_df, outlier_info)`` where ``outlier_info`` reports the row
        count, the number and percentage of outliers found, and the two
        limits that were applied.
    """
    df = df.copy()
    row_count = len(df)

    # Statistics of the untouched target, shared by detection and capping
    target = df[target_column]
    center = target.mean()
    spread = target.std()

    # Count the rows lying further than z_threshold deviations from the mean
    abs_z = np.abs((target - center) / spread)
    flagged = len(df[abs_z > z_threshold])

    # Cap instead of dropping so the frame keeps its size
    upper_limit = center + z_threshold * spread
    lower_limit = center - z_threshold * spread
    df[target_column] = target.clip(lower=lower_limit, upper=upper_limit)

    if row_count > 0:
        share = (flagged / row_count) * 100
    else:
        share = 0

    outlier_info = {
        'initial_rows': row_count,
        'num_outliers_detected': flagged,
        'percentage_outliers': share,
        'upper_limit_applied': upper_limit,
        'lower_limit_applied': lower_limit
    }

    return df, outlier_info

if __name__ == '__main__':
    # The module logger has no handler or level of its own, so without this
    # call every ``logger.info`` emitted by the pipeline is silently dropped
    # (the root logger defaults to WARNING).  ``basicConfig`` does nothing
    # when the root logger already has handlers.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    )

    try:
        # Load configuration
        config = load_config()
        logger.info("Configuration loaded successfully")

        # Run the pipeline
        predictions = run_pipeline(config)

    except Exception as e:
        # Log the failure, then let the original exception propagate
        logger.error(f"Error in main: {str(e)}")
        raise