In [2]:
import os
import warnings
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from sklearn.neural_network import MLPRegressor

warnings.filterwarnings('ignore')

class SolarPanelMLMonitor:
    """
    Comprehensive Machine Learning Model for Solar Panel Performance Monitoring

    Features:
    - Power Output Monitoring: Real-time electricity generation prediction
    - Panel Efficiency: Solar energy to electrical energy conversion ratio
    - Performance Ratio: Actual vs theoretical energy output
    - String-level Performance: Output monitoring of connected panel groups
    """

    def __init__(self):
        self.models = {}
        self.scalers = {}
        self.trained_models = {}
        self.performance_metrics = {}

    def load_and_merge_data(self, file1_path, file2_path):
        """
        Load and merge the two CSV files containing solar panel data

        Args:
            file1_path: Path to first CSV (DATE_TIME, PLANT_ID, AMBIENT_TEMPERATURE, MODULE_TEMPERATURE, IRRADIATION)
            file2_path: Path to second CSV (DATE_TIME, PLANT_ID, DC_POWER, AC_POWER, DAILY_YIELD, TOTAL_YIELD)
        """
        try:
            # Load datasets
            df1 = pd.read_csv(r"C:\Users\Chagganiya Aggarwal\Downloads\Plant_1_Generation_Data.csv")
            df2 = pd.read_csv(r"C:\Users\Chagganiya Aggarwal\Downloads\Plant_1_Weather_Sensor_Data.csv")

            # Convert DATE_TIME to datetime
            df1['DATE_TIME'] = pd.to_datetime(df1['DATE_TIME'])
            df2['DATE_TIME'] = pd.to_datetime(df2['DATE_TIME'])

            # Merge datasets on DATE_TIME and PLANT_ID
            merged_df = pd.merge(df1, df2, on=['DATE_TIME', 'PLANT_ID'], how='inner')

            print(f"Dataset 1 shape: {df1.shape}")
            print(f"Dataset 2 shape: {df2.shape}")
            print(f"Merged dataset shape: {merged_df.shape}")

            return merged_df
        except Exception as e:
            print(f"Error loading data: {e}")
            return None

    def feature_engineering(self, df):
        """
        Create additional features for better model performance
        """
        df = df.copy()

        # Time-based features
        df['HOUR'] = df['DATE_TIME'].dt.hour
        df['DAY_OF_YEAR'] = df['DATE_TIME'].dt.dayofyear
        df['MONTH'] = df['DATE_TIME'].dt.month
        df['SEASON'] = df['MONTH'].apply(lambda x: 
            'Winter' if x in [12, 1, 2] else
            'Spring' if x in [3, 4, 5] else
            'Summer' if x in [6, 7, 8] else 'Fall')

        # Efficiency calculations
        df['PANEL_EFFICIENCY'] = np.where(df['IRRADIATION'] > 0, 
                                         (df['DC_POWER'] / (df['IRRADIATION'] * 1)), 0)

        # Performance Ratio calculation
        # PR = Actual Energy Output / Theoretical Energy Output
        theoretical_power = df['IRRADIATION'] * 0.2  # Assuming 20% panel efficiency
        df['PERFORMANCE_RATIO'] = np.where(theoretical_power > 0, 
                                          df['DC_POWER'] / theoretical_power, 0)

        # Temperature difference
        df['TEMP_DIFF'] = df['MODULE_TEMPERATURE'] - df['AMBIENT_TEMPERATURE']

        # DC to AC conversion efficiency
        df['DC_AC_EFFICIENCY'] = np.where(df['DC_POWER'] > 0, 
                                         df['AC_POWER'] / df['DC_POWER'], 0)

        # Power per unit irradiation
        df['POWER_PER_IRRADIATION'] = np.where(df['IRRADIATION'] > 0, 
                                              df['DC_POWER'] / df['IRRADIATION'], 0)

        # Encode categorical features
        le = LabelEncoder()
        df['SEASON_ENCODED'] = le.fit_transform(df['SEASON'])

        return df

    def data_preprocessing(self, df):
        """
        Clean and preprocess the data
        """
        # Handle missing values
        df = df.fillna(df.median(numeric_only=True))

        # Remove outliers using IQR method
        numeric_columns = df.select_dtypes(include=[np.number]).columns
        for col in numeric_columns:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

        print(f"Data shape after preprocessing: {df.shape}")
        return df

    def prepare_features(self, df, target_column):
        """
        Prepare features and target for model training
        """
        # Feature columns
        feature_columns = [
            'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION',
            'HOUR', 'DAY_OF_YEAR', 'MONTH', 'SEASON_ENCODED',
            'TEMP_DIFF', 'POWER_PER_IRRADIATION', 'PLANT_ID'
        ]

        # Handle missing feature columns
        available_features = [col for col in feature_columns if col in df.columns]

        X = df[available_features]
        y = df[target_column]

        return X, y

    def train_models(self, X, y, model_type):
        """
        Train multiple ML models for the specified prediction task
        """
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Store scaler
        self.scalers[model_type] = scaler

        # Define models
        models = {
            'Linear_Regression': LinearRegression(),
            'Random_Forest': RandomForestRegressor(n_estimators=100, random_state=42),
            'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42),
            'Gradient_Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
            'SVR': SVR(kernel='rbf'),
            'Neural_Network': MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
        }

        # Train and evaluate models
        results = {}
        for name, model in models.items():
            # Train model
            if name in ['Linear_Regression', 'SVR', 'Neural_Network']:
                model.fit(X_train_scaled, y_train)
                y_pred = model.predict(X_test_scaled)
            else:
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

            # Calculate metrics
            mse = mean_squared_error(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            results[name] = {
                'model': model,
                'mse': mse,
                'mae': mae,
                'r2': r2,
                'rmse': np.sqrt(mse)
            }

            print(f"{name} - MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")

        # Store best model
        best_model_name = max(results.keys(), key=lambda x: results[x]['r2'])
        self.trained_models[model_type] = {
            'best_model': results[best_model_name]['model'],
            'best_model_name': best_model_name,
            'all_results': results,
            'X_test': X_test,
            'y_test': y_test,
            'scaler': scaler
        }

        print(f"\nBest model for {model_type}: {best_model_name} with R² = {results[best_model_name]['r2']:.4f}")

        return results

    def train_power_output_monitor(self, df):
        """
        Train model for Power Output Monitoring: Real-time measurement of electricity generation
        """
        print("\n=== Training Power Output Monitoring Model ===")
        X, y = self.prepare_features(df, 'DC_POWER')
        return self.train_models(X, y, 'power_output')

    def train_efficiency_monitor(self, df):
        """
        Train model for Panel Efficiency: Ratio of solar energy converted to electrical energy
        """
        print("\n=== Training Panel Efficiency Monitoring Model ===")
        X, y = self.prepare_features(df, 'PANEL_EFFICIENCY')
        return self.train_models(X, y, 'efficiency')

    def train_performance_ratio_monitor(self, df):
        """
        Train model for Performance Ratio: Actual vs theoretical energy output
        """
        print("\n=== Training Performance Ratio Monitoring Model ===")
        X, y = self.prepare_features(df, 'PERFORMANCE_RATIO')
        return self.train_models(X, y, 'performance_ratio')

    def train_string_level_monitor(self, df):
        """
        Train model for String-level Performance: Output monitoring of connected panel groups
        """
        print("\n=== Training String-level Performance Monitoring Model ===")
        # Group by PLANT_ID to simulate string-level monitoring
        string_performance = df.groupby(['PLANT_ID', 'DATE_TIME']).agg({
            'DC_POWER': 'sum',
            'AMBIENT_TEMPERATURE': 'mean',
            'MODULE_TEMPERATURE': 'mean',
            'IRRADIATION': 'mean'
        }).reset_index()

        X, y = self.prepare_features(string_performance, 'DC_POWER')
        return self.train_models(X, y, 'string_level')

    def predict_real_time(self, input_data, prediction_type):
        """
        Make real-time predictions using trained models
        """
        if prediction_type not in self.trained_models:
            print(f"Model for {prediction_type} not trained yet!")
            return None

        model_info = self.trained_models[prediction_type]
        model = model_info['best_model']
        scaler = model_info['scaler']

        # Scale input if needed
        model_name = model_info['best_model_name']
        if model_name in ['Linear_Regression', 'SVR', 'Neural_Network']:
            input_scaled = scaler.transform(input_data)
            prediction = model.predict(input_scaled)
        else:
            prediction = model.predict(input_data)

        return prediction

    def generate_performance_report(self):
        """
        Generate comprehensive performance report
        """
        report = "\n" + "="*60
        report += "\n           SOLAR PANEL ML MONITORING REPORT"
        report += "\n" + "="*60

        for model_type, model_info in self.trained_models.items():
            report += f"\n\n{model_type.upper()} MONITORING:"
            report += f"\n{'-'*40}"
            report += f"\nBest Model: {model_info['best_model_name']}"

            best_results = model_info['all_results'][model_info['best_model_name']]
            report += f"\nR² Score: {best_results['r2']:.4f}"
            report += f"\nRMSE: {best_results['rmse']:.4f}"
            report += f"\nMAE: {best_results['mae']:.4f}"

        return report

# Usage example: print the banner and create the monitor used by the cells below.
if __name__ == "__main__":
    # Title line followed by a 60-character rule, one per output line
    print("Solar Panel ML Monitoring System - Implementation Guide", "="*60, sep="\n")

    # Shared monitoring-system instance
    monitor = SolarPanelMLMonitor()
    

Solar Panel ML Monitoring System - Implementation Guide


In [3]:
# Load your CSV files.
# The data directory defaults to the original download location and can be
# redirected without editing the notebook by setting SOLAR_DATA_DIR.
DATA_DIR = Path(os.environ.get("SOLAR_DATA_DIR", r"C:\Users\Chagganiya Aggarwal\Downloads"))
GENERATION_CSV = DATA_DIR / "Plant_1_Generation_Data.csv"
WEATHER_CSV = DATA_DIR / "Plant_1_Weather_Sensor_Data.csv"

merged_df = monitor.load_and_merge_data(GENERATION_CSV, WEATHER_CSV)

# load_and_merge_data prints the error and returns None on failure; stop here
# with a clear message instead of failing later on `None.shape`.
if merged_df is None:
    raise RuntimeError(f"Could not load the solar plant CSV files from {DATA_DIR}")

# Verify data loading
print(f"Merged dataset shape: {merged_df.shape}")
print(merged_df.head())


Dataset 1 shape: (68778, 6)
Dataset 2 shape: (3182, 5)
Merged dataset shape: (68774, 9)
Merged dataset shape: (68774, 9)
   DATE_TIME  PLANT_ID  DC_POWER  AC_POWER  DAILY_YIELD  TOTAL_YIELD  \
0 2020-05-15   4135001       0.0       0.0          0.0    6259559.0   
1 2020-05-15   4135001       0.0       0.0          0.0    6183645.0   
2 2020-05-15   4135001       0.0       0.0          0.0    6987759.0   
3 2020-05-15   4135001       0.0       0.0          0.0    7602960.0   
4 2020-05-15   4135001       0.0       0.0          0.0    7158964.0   

   AMBIENT_TEMPERATURE  MODULE_TEMPERATURE  IRRADIATION  
0            25.184316           22.857507          0.0  
1            25.184316           22.857507          0.0  
2            25.184316           22.857507          0.0  
3            25.184316           22.857507          0.0  
4            25.184316           22.857507          0.0  


In [4]:
# Create additional features for better model performance.
# feature_engineering works on a copy, so merged_df itself is left unchanged.
df_engineered = monitor.feature_engineering(merged_df)

# New features created:
# - Time-based features (HOUR, DAY_OF_YEAR, MONTH, SEASON, SEASON_ENCODED)
# - Panel efficiency calculations (PANEL_EFFICIENCY, POWER_PER_IRRADIATION)
# - Performance ratio calculation (PERFORMANCE_RATIO)
# - Temperature difference (TEMP_DIFF) and DC-to-AC efficiency (DC_AC_EFFICIENCY)


In [5]:
# Clean and preprocess the data
df_clean = monitor.data_preprocessing(df_engineered)

# Preprocessing steps:
# - Handle missing values using median imputation (numeric columns)
# - Remove outliers using the IQR method, one numeric column after another
# - The resulting shape is printed by data_preprocessing


Data shape after preprocessing: (68730, 19)


In [6]:
# Train model for real-time electricity generation prediction (target: DC_POWER)
power_results = monitor.train_power_output_monitor(df_clean)

# This trains multiple algorithms:
# - Linear Regression
# - Random Forest
# - XGBoost
# - Gradient Boosting
# - Support Vector Regression
# - Neural Network
#
# NOTE(review): the run recorded below used POWER_PER_IRRADIATION together with
# IRRADIATION as inputs. Their product is DC_POWER whenever IRRADIATION > 0, so
# the near-perfect R² values reflect target leakage rather than predictive skill.



=== Training Power Output Monitoring Model ===
Linear_Regression - MSE: 331752.9338, MAE: 279.0604, R²: 0.9794
Random_Forest - MSE: 703.7712, MAE: 6.6095, R²: 1.0000
XGBoost - MSE: 2554.4681, MAE: 19.0066, R²: 0.9998
Gradient_Boosting - MSE: 6575.5808, MAE: 38.4438, R²: 0.9996
SVR - MSE: 1595138.5014, MAE: 671.7756, R²: 0.9011
Neural_Network - MSE: 258.3382, MAE: 7.3070, R²: 1.0000

Best model for power_output: Neural_Network with R² = 1.0000


In [7]:
# Train model for solar-to-electrical energy conversion ratio (target: PANEL_EFFICIENCY)
# NOTE(review): in the run recorded below the POWER_PER_IRRADIATION input is computed
# with the same DC_POWER / IRRADIATION expression as the target, which is why
# Linear_Regression reaches MSE 0.0000 -- a leakage artefact, not a real result.
efficiency_results = monitor.train_efficiency_monitor(df_clean)



=== Training Panel Efficiency Monitoring Model ===
Linear_Regression - MSE: 0.0000, MAE: 0.0000, R²: 1.0000
Random_Forest - MSE: 19.2491, MAE: 0.2811, R²: 1.0000
XGBoost - MSE: 20906.1626, MAE: 14.3493, R²: 0.9996
Gradient_Boosting - MSE: 438.5389, MAE: 10.8206, R²: 1.0000
SVR - MSE: 7020311.6788, MAE: 1738.5117, R²: 0.8539
Neural_Network - MSE: 7.1946, MAE: 1.5397, R²: 1.0000

Best model for efficiency: Linear_Regression with R² = 1.0000


In [8]:
# Train model for actual vs theoretical energy output (target: PERFORMANCE_RATIO)
# NOTE(review): PERFORMANCE_RATIO is DC_POWER / (IRRADIATION * 0.2), i.e. a constant
# multiple of the POWER_PER_IRRADIATION input used in the run recorded below, so the
# perfect Linear_Regression score there is a leakage artefact.
pr_results = monitor.train_performance_ratio_monitor(df_clean)



=== Training Performance Ratio Monitoring Model ===
Linear_Regression - MSE: 0.0000, MAE: 0.0000, R²: 1.0000
Random_Forest - MSE: 477.4527, MAE: 1.3852, R²: 1.0000
XGBoost - MSE: 519883.0083, MAE: 70.9483, R²: 0.9996
Gradient_Boosting - MSE: 10963.4718, MAE: 54.1030, R²: 1.0000
SVR - MSE: 1045667909.2822, MAE: 27089.7752, R²: 0.1297
Neural_Network - MSE: 62.0913, MAE: 3.5260, R²: 1.0000

Best model for performance_ratio: Linear_Regression with R² = 1.0000


In [15]:
# Train model for connected panel groups monitoring.
# The method sums DC_POWER (and averages temperatures and irradiation) per
# PLANT_ID and DATE_TIME before training, so the target is a per-timestamp total.
string_results = monitor.train_string_level_monitor(df_clean)



=== Training String-level Performance Monitoring Model ===
Linear_Regression - MSE: 51192361.7276, MAE: 4183.4715, R²: 0.9931
Random_Forest - MSE: 38498750.3752, MAE: 2928.8151, R²: 0.9948
XGBoost - MSE: 40555384.4799, MAE: 2976.7813, R²: 0.9945
Gradient_Boosting - MSE: 34739311.1300, MAE: 2840.2185, R²: 0.9953
SVR - MSE: 10408179953.3139, MAE: 64329.7114, R²: -0.4045
Neural_Network - MSE: 63576224.4622, MAE: 4871.7140, R²: 0.9914

Best model for string_level: Gradient_Boosting with R² = 0.9953


In [1]:
# Prepare input data for prediction.
# Requires the import cell and the training cells above to have been run in
# this kernel session (`pd` and a trained `monitor` must exist).
input_data = pd.DataFrame({
    'AMBIENT_TEMPERATURE': [25.0],
    'MODULE_TEMPERATURE': [35.0],
    # NOTE(review): confirm this value matches the units/range of IRRADIATION in the training CSV
    'IRRADIATION': [1000.0],
    'HOUR': [12],
    'DAY_OF_YEAR': [180],
    'MONTH': [6],
    'SEASON_ENCODED': [2],  # Summer
    'TEMP_DIFF': [10.0],
    'POWER_PER_IRRADIATION': [0.75],
    # Use the plant id present in the training data (see the merged_df preview);
    # an unseen id such as 1 is far outside what the models were fit on.
    'PLANT_ID': [4135001]
})

# Make predictions
power_prediction = monitor.predict_real_time(input_data, 'power_output')
efficiency_prediction = monitor.predict_real_time(input_data, 'efficiency')
pr_prediction = monitor.predict_real_time(input_data, 'performance_ratio')

# predict_real_time returns None for a task that has not been trained; fail
# with a clear message instead of a TypeError when indexing None below.
untrained = [
    task for task, prediction in (
        ('power_output', power_prediction),
        ('efficiency', efficiency_prediction),
        ('performance_ratio', pr_prediction),
    )
    if prediction is None
]
if untrained:
    raise RuntimeError(f"Train these models before predicting: {untrained}")

print(f"Predicted Power Output: {power_prediction[0]:.2f} kW")
print(f"Predicted Efficiency: {efficiency_prediction[0]:.4f}")
print(f"Predicted Performance Ratio: {pr_prediction[0]:.4f}")


NameError: name 'pd' is not defined

In [17]:
# Get comprehensive performance report: for every trained task, the best
# model's name with its R², RMSE and MAE on the held-out split.
report = monitor.generate_performance_report()
print(report)



           SOLAR PANEL ML MONITORING REPORT

POWER_OUTPUT MONITORING:
----------------------------------------
Best Model: Neural_Network
R² Score: 1.0000
RMSE: 16.0729
MAE: 7.3070

EFFICIENCY MONITORING:
----------------------------------------
Best Model: Linear_Regression
R² Score: 1.0000
RMSE: 0.0000
MAE: 0.0000

PERFORMANCE_RATIO MONITORING:
----------------------------------------
Best Model: Linear_Regression
R² Score: 1.0000
RMSE: 0.0000
MAE: 0.0000

STRING_LEVEL MONITORING:
----------------------------------------
Best Model: Gradient_Boosting
R² Score: 0.9953
RMSE: 5894.0064
MAE: 2840.2185
