In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')
import sys
import os
sys.path.append(os.path.abspath('..'))

# TensorFlow/Keras imports for the ANNRegressor
try:
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.regularizers import l1_l2
    from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
    TENSORFLOW_AVAILABLE = True
except ImportError:
    print("Warning: TensorFlow not available. ANNRegressor will not work.")
    TENSORFLOW_AVAILABLE = False

# Include the exact ANNRegressor class from training
class ANNRegressor:
    """
    Custom ANN Regressor wrapper that mimics scikit-learn interface
    This is the exact class definition used during model training.
    """
    def __init__(self, neurons=128, layers=3, dropout_rate=0.3, 
                 learning_rate=0.001, l1_reg=0.0, l2_reg=0.01,
                 epochs=200, batch_size=32, validation_split=0.2,
                 patience=20, verbose=0):
        self.neurons = neurons
        self.layers = layers
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.l1_reg = l1_reg
        self.l2_reg = l2_reg
        self.epochs = epochs
        self.batch_size = batch_size
        self.validation_split = validation_split
        self.patience = patience
        self.verbose = verbose
        self.model_ = None
        self.history_ = None
        
    def _build_model(self, input_dim):
        """Build the neural network model"""
        if not TENSORFLOW_AVAILABLE:
            raise ImportError("TensorFlow is required for ANNRegressor but is not installed.")
            
        model = Sequential()
        
        # Input layer
        model.add(Dense(self.neurons, 
                       input_dim=input_dim,
                       activation='relu',
                       kernel_regularizer=l1_l2(l1=self.l1_reg, l2=self.l2_reg)))
        model.add(BatchNormalization())
        model.add(Dropout(self.dropout_rate))
        
        # Hidden layers
        for i in range(self.layers - 1):
            layer_neurons = max(self.neurons // (2 ** i), 32)
            model.add(Dense(layer_neurons,
                           activation='relu',
                           kernel_regularizer=l1_l2(l1=self.l1_reg, l2=self.l2_reg)))
            model.add(BatchNormalization())
            model.add(Dropout(self.dropout_rate))
        
        # Output layer
        model.add(Dense(1, activation='linear'))
        
        # Compile model
        optimizer = Adam(learning_rate=self.learning_rate)
        model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
        
        return model
    
    def fit(self, X, y, **kwargs):
        """Fit the neural network"""
        if hasattr(X, 'values'):
            X = X.values
        if hasattr(y, 'values'):
            y = y.values
            
        if len(y.shape) > 1:
            y = y.flatten()
        
        self.model_ = self._build_model(X.shape[1])
        
        callbacks = [
            EarlyStopping(patience=self.patience, restore_best_weights=True),
            ReduceLROnPlateau(patience=self.patience//2, factor=0.5, min_lr=1e-6)
        ]
        
        self.history_ = self.model_.fit(
            X, y,
            epochs=self.epochs,
            batch_size=self.batch_size,
            validation_split=self.validation_split,
            callbacks=callbacks,
            verbose=self.verbose
        )
        
        return self
    
    def predict(self, X):
        """Make predictions"""
        if self.model_ is None:
            raise ValueError("Model must be fitted before making predictions")
            
        if hasattr(X, 'values'):
            X = X.values
            
        predictions = self.model_.predict(X, verbose=0)
        return predictions.flatten()
    
    def get_params(self, deep=True):
        """Get parameters for this estimator"""
        return {
            'neurons': self.neurons,
            'layers': self.layers,
            'dropout_rate': self.dropout_rate,
            'learning_rate': self.learning_rate,
            'l1_reg': self.l1_reg,
            'l2_reg': self.l2_reg,
            'epochs': self.epochs,
            'batch_size': self.batch_size,
            'validation_split': self.validation_split,
            'patience': self.patience,
            'verbose': self.verbose
        }
    
    def set_params(self, **params):
        """Set parameters for this estimator"""
        for key, value in params.items():
            setattr(self, key, value)
        return self

class SolarPanelPredictionPipeline:
    def __init__(self, model_path='model/best_solar_model.pkl'):
        """
        Initialize prediction pipeline with trained model
        """
        self.model_path = model_path
        self.model_package = None
        self.load_trained_model()
    
    def load_trained_model(self):
        """
        Load the trained model and all preprocessing components
        """
        try:
            with open(self.model_path, 'rb') as f:
                self.model_package = pickle.load(f)
            
            # Load all components from the training pipeline
            self.best_model = self.model_package['model']
            self.preprocessor = self.model_package['preprocessor']
            self.target_transformer = self.model_package['target_transformer']
            self.imputer = self.model_package.get('imputer', None)
            self.feature_cols = self.model_package['feature_names']
            self.categorical_cols = self.model_package['categorical_cols']
            self.numerical_cols = self.model_package['numerical_cols']
            self.best_model_name = self.model_package.get('best_model_name', 'Unknown')
            self.best_score = self.model_package.get('best_score', 0)
            self.features_to_drop = self.model_package.get('features_to_drop', [])
            self.final_feature_names = self.model_package.get('final_feature_names', [])
            
            print(f"Model loaded successfully: {self.best_model_name}")
            print(f"Model score: {self.best_score:.4f}")
            print(f"Expected features: {len(self.feature_cols)}")
            print(f"Features to drop: {self.features_to_drop}")
            print(f"Categorical features: {len(self.categorical_cols)}")
            print(f"Numerical features: {len(self.numerical_cols)}")
            
        except FileNotFoundError:
            raise FileNotFoundError(f"Model file not found: {self.model_path}")
        except Exception as e:
            raise Exception(f"Error loading model: {str(e)}")
    
    def fix_data_types(self, df, dataset_name="PREDICTION"):
        """
        Fix data type inconsistencies for specific columns
        This matches the exact logic from the training pipeline
        """
        df_fixed = df.copy()
        
        # Define columns that should be numeric (same as training)
        numeric_columns_to_fix = ['humidity', 'wind_speed', 'pressure']
        
        print(f"\n=== FIXING DATA TYPES FOR {dataset_name} ===")
        
        for col in numeric_columns_to_fix:
            if col in df_fixed.columns:
                print(f"\nProcessing {col}:")
                print(f"Original dtype: {df_fixed[col].dtype}")
                
                if df_fixed[col].dtype == 'object':
                    try:
                        numeric_conversion = pd.to_numeric(df_fixed[col], errors='coerce')
                        non_numeric_mask = pd.isna(numeric_conversion) & df_fixed[col].notna()
                        
                        if non_numeric_mask.any():
                            print(f"Non-numeric values found in {col}:")
                            non_numeric_values = df_fixed.loc[non_numeric_mask, col].value_counts()
                            print(non_numeric_values.head(10))
                            
                            df_fixed[col] = df_fixed[col].astype(str)
                            df_fixed[col] = df_fixed[col].str.replace(r'[^\d.-]', '', regex=True)
                            df_fixed[col] = df_fixed[col].str.strip()
                            df_fixed[col] = df_fixed[col].replace('', np.nan)
                            df_fixed[col] = df_fixed[col].replace('nan', np.nan)
                            
                        df_fixed[col] = pd.to_numeric(df_fixed[col], errors='coerce')
                        
                        print(f"Converted dtype: {df_fixed[col].dtype}")
                        print(f"Missing values after conversion: {df_fixed[col].isnull().sum()}")
                        print(f"Valid numeric values: {df_fixed[col].notna().sum()}")
                        
                        if df_fixed[col].notna().any():
                            print(f"Min: {df_fixed[col].min():.3f}")
                            print(f"Max: {df_fixed[col].max():.3f}")
                            print(f"Mean: {df_fixed[col].mean():.3f}")
                            
                    except Exception as e:
                        print(f"Error converting {col}: {str(e)}")
                else:
                    print(f"{col} is already numeric type: {df_fixed[col].dtype}")
        
        return df_fixed
    
    def apply_imputation(self, df):
        """
        Apply the fitted imputation pipeline to the raw data
        """
        if self.imputer is None:
            print("Warning: No imputation pipeline found. Proceeding without imputation.")
            return df
        
        print(f"\n=== APPLYING IMPUTATION PIPELINE ===")
        print(f"Data shape before imputation: {df.shape}")
        print(f"Missing values before imputation:\n{df.isnull().sum()[df.isnull().sum() > 0]}")
        
        # df_imputed = self.imputer.transform(df)

        df_imputed= self.imputer.transform_prediction(df)
        
        print(f"Data shape after imputation: {df_imputed.shape}")
        remaining_missing = df_imputed.isnull().sum().sum()
        print(f"Remaining missing values after imputation: {remaining_missing}")
        
        return df_imputed
    
    def apply_feature_engineering(self, df):
        """
        Apply the same feature engineering as used during training
        """
        print(f"\n=== APPLYING FEATURE ENGINEERING ===")
        print(f"Data shape before feature engineering: {df.shape}")
        
        try:
            from utils.feature_engineering import SolarFeatureEngineering
            feature_engineer = SolarFeatureEngineering()
            df_engineered = feature_engineer.create_solar_features(df)
            
            print(f"Data shape after feature engineering: {df_engineered.shape}")
            
            # Verify no missing values in new features
            new_missing = df_engineered.isnull().sum().sum()
            if new_missing > 0:
                print(f"Warning: {new_missing} missing values found after feature engineering")
                print("Missing values by column:")
                print(df_engineered.isnull().sum()[df_engineered.isnull().sum() > 0])
            
            return df_engineered
            
        except ImportError:
            print("Warning: feature_engineering module not found. Skipping feature engineering.")
            print("This may cause prediction errors if the model expects engineered features.")
            return df
        except Exception as e:
            print(f"Error in feature engineering: {str(e)}")
            print("Proceeding without feature engineering.")
            return df
    
    def drop_features(self, df):
        """
        Drop the same features that were dropped during training
        """
        if not self.features_to_drop:
            print("No features to drop.")
            return df
            
        print(f"\n=== DROPPING FEATURES ===")
        print(f"Features to drop: {self.features_to_drop}")
        
        available_features = [col for col in self.features_to_drop if col in df.columns]
        unavailable_features = [col for col in self.features_to_drop if col not in df.columns]
        
        if available_features:
            df_dropped = df.drop(columns=available_features)
            print(f"Dropped features: {available_features}")
            print(f"Data shape after dropping features: {df_dropped.shape}")
        else:
            df_dropped = df
            print("No features were dropped (none found in dataset)")
        
        if unavailable_features:
            print(f"Warning: Features not found in dataset: {unavailable_features}")
        
        return df_dropped
    
    def preprocess_features(self, df):
        """
        Apply the same preprocessing as used during training
        """
        print(f"\n=== PREPROCESSING FEATURES ===")
        
        # Step 1: Select only the features that were used during training
        missing_features = [col for col in self.feature_cols if col not in df.columns]
        if missing_features:
            print(f"Error: Missing required features: {missing_features}")
            raise ValueError(f"Missing required features: {missing_features}")
        
        X = df[self.feature_cols].copy()
        print(f"Selected feature columns: {X.shape[1]} features")
        
        # Step 2: Apply preprocessing using fitted preprocessor
        X_processed = self.preprocessor.transform(X)
        
        # Step 3: Convert back to DataFrame using the same feature names as training
        X_processed = pd.DataFrame(X_processed, columns=self.final_feature_names, index=X.index)
        
        print(f"✓ Applied preprocessing")
        print(f"Final preprocessed data shape: {X_processed.shape}")
        
        return X_processed
    
    def inverse_transform_predictions(self, y_pred):
        """
        Apply inverse transformation to predictions to get them back to original scale
        """
        y_pred_reshaped = y_pred.reshape(-1, 1)
        y_pred_original = self.target_transformer.inverse_transform(y_pred_reshaped).flatten()
        return y_pred_original
    
    def predict(self, raw_data):
        """
        Complete prediction pipeline that mirrors the training pipeline exactly
        
        Pipeline: Raw Data → Fix Data Types → Imputation → Feature Engineering → Drop Features → Preprocessing → Prediction
        """
        print("="*80)
        print("SOLAR PANEL PREDICTION PIPELINE")
        print("="*80)
        
        print(f"Input raw data shape: {raw_data.shape}")
        
        # Step 1: Fix data types (same as training)
        print("\nStep 1: Fixing data types...")
        df_fixed = self.fix_data_types(raw_data, "PREDICTION DATA")
        
        # Step 2: Apply imputation pipeline
        print("\nStep 2: Applying imputation pipeline...")
        df_imputed = self.apply_imputation(df_fixed)
        
        # Step 3: Apply feature engineering
        print("\nStep 3: Applying feature engineering...")
        df_engineered = self.apply_feature_engineering(df_imputed)
        
        # Step 4: Drop features
        print("\nStep 4: Dropping specified features...")
        df_final = self.drop_features(df_engineered)
        
        # Step 5: Preprocess features
        print("\nStep 5: Preprocessing features...")
        X_processed = self.preprocess_features(df_final)
        
        # Step 6: Make predictions on transformed scale
        print(f"\nStep 6: Making predictions...")
        y_pred_transformed = self.best_model.predict(X_processed)
        
        # Step 7: Transform predictions back to original scale
        print(f"Step 7: Transforming predictions to original scale...")
        y_pred_original = self.inverse_transform_predictions(y_pred_transformed)
        
        print(f"✓ Generated {len(y_pred_original)} predictions")
        print(f"Prediction statistics:")
        print(f"  Min: {y_pred_original.min():.4f}")
        print(f"  Max: {y_pred_original.max():.4f}")
        print(f"  Mean: {y_pred_original.mean():.4f}")
        print(f"  Std: {y_pred_original.std():.4f}")
        
        return y_pred_original
    
    def predict_with_id(self, raw_data, id_column='id'):
        """
        Make predictions and return with original IDs
        """
        if id_column in raw_data.columns:
            ids = raw_data[id_column].copy()
            print(f"Found ID column: {id_column}")
        else:
            ids = range(len(raw_data))
            print(f"No ID column found, using sequential IDs")
        
        predictions = self.predict(raw_data)
        
        results = pd.DataFrame({
            id_column: ids,
            'efficiency': predictions
        })
        
        return results
    
    def save_predictions(self, raw_data, output_path='predictions.csv', id_column='id'):
        """
        Generate predictions and save to CSV
        """
        results = self.predict_with_id(raw_data, id_column)
        results.to_csv(output_path, index=False)
        print(f"\n✓ Predictions saved to: {output_path}")
        print(f"Output format: {list(results.columns)}")
        return results
    
    def validate_pipeline_compatibility(self):
        """
        Validate that the loaded model has all required components
        """
        required_components = [
            'model', 'preprocessor', 'target_transformer', 
            'feature_names', 'categorical_cols', 'numerical_cols',
            'final_feature_names'
        ]
        
        optional_components = ['imputer', 'features_to_drop']
        
        missing_components = []
        for component in required_components:
            if component not in self.model_package:
                missing_components.append(component)
        
        if missing_components:
            print(f"Error: Missing required components in saved model: {missing_components}")
            return False
        
        missing_optional = []
        for component in optional_components:
            if component not in self.model_package:
                missing_optional.append(component)
        
        if missing_optional:
            print(f"Warning: Missing optional components: {missing_optional}")
            print("Pipeline will continue but some features may not be available.")
        
        print("✓ All required pipeline components are available")
        return True

# Utility functions
def load_test_data(file_path):
    """Load test data from CSV"""
    try:
        df = pd.read_csv(file_path)
        print(f"Test data loaded successfully: {df.shape}")
        print(f"Columns: {list(df.columns)}")
        return df
    except Exception as e:
        raise Exception(f"Error loading test data: {str(e)}")

def main():
    """
    Main execution function
    """
    try:
        # Initialize prediction pipeline
        print("Initializing Solar Panel Prediction Pipeline...")
        pipeline = SolarPanelPredictionPipeline(model_path='')
        
        # Validate pipeline compatibility
        if not pipeline.validate_pipeline_compatibility():
            print("Warning: Pipeline compatibility issues detected. Proceeding anyway...")
        
        # Load test data
        print(f"\nLoading test data...")
        test_data = load_test_data('../dataset/test.csv')
        
        # Generate and save predictions
        print(f"\nStarting prediction process...")
        predictions = pipeline.save_predictions(
            raw_data=test_data,
            output_path='solar_efficiency_predictions.csv',
            id_column='id'
        )
        
        print("\n" + "="*80)
        print("PREDICTION PIPELINE COMPLETED SUCCESSFULLY!")
        print("="*80)
        print(f"Generated predictions for {len(predictions)} samples")
        print(f"Results saved to: solar_efficiency_predictions.csv")
        
        # Display sample predictions
        print(f"\nSample predictions:")
        print(predictions.head(10))
        
        return predictions
        
    except Exception as e:
        print(f"Error in prediction pipeline: {str(e)}")
        import traceback
        traceback.print_exc()
        raise

if __name__ == "__main__":
    # Execute the main prediction pipeline
    try:
        predictions = main()
        print(f"\nPipeline executed successfully!")
        print(f"Total predictions generated: {len(predictions)}")
    except Exception as e:
        print(f"Pipeline failed: {str(e)}")

Initializing Solar Panel Prediction Pipeline...
Error in prediction pipeline: Model file not found: 
Pipeline failed: Model file not found: 


Traceback (most recent call last):
  File "/var/folders/vy/hqlqcg292rj1msf91q8wjp_40000gn/T/ipykernel_52355/741362664.py", line 158, in load_trained_model
    with open(self.model_path, 'rb') as f:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/guptatilak/Desktop/Zelestra-ML-Ascend/venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 326, in _modified_open
    return io_open(file, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: ''

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/var/folders/vy/hqlqcg292rj1msf91q8wjp_40000gn/T/ipykernel_52355/741362664.py", line 482, in main
    pipeline = SolarPanelPredictionPipeline(model_path='')
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/vy/hqlqcg292rj1msf91q8wjp_40000gn/T/ipykernel_52355/741362664.py", line 151, in __init__
    self.load_trained_model()
  F