# In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# --- Configuration ---
# Path of the raw CSV that the script entry point hands to load_and_clean_data().
# NOTE: We assume 'benin-malanville.csv' is in a 'data/' directory. The path is
# relative, so it is resolved against the current working directory.
RAW_FILE_PATH = 'data/benin-malanville.csv'
# Rows are kept for modeling only when GHI is strictly greater than this value.
DAYTIME_GHI_THRESHOLD = 5  # Filter to GHI > 5 W/m^2 for relevant power generation data

# --- Data Cleaning (Replicated for self-contained script) ---

def load_and_clean_data(filepath):
    """
    Load the raw CSV and return the daytime records used for modeling.

    Steps, in order:
      1. Read the CSV, parse 'Timestamp' as datetimes and use it as the index.
      2. Verify that the modeling columns ('GHI', 'TModA', 'ModA') exist.
      3. Fill gaps in whichever of 'Tamb', 'RH', 'WS', 'TModA', 'TModB' are
         present with that column's median.
      4. Keep rows with GHI > DAYTIME_GHI_THRESHOLD and ModA > 0.
      5. Keep only the modeling columns and drop rows that still contain NaN.

    Args:
        filepath: Path of the CSV file to load.

    Returns:
        A DataFrame indexed by 'Timestamp' with the columns 'GHI', 'TModA'
        and 'ModA', or None when the file cannot be loaded or parsed, a
        modeling column is missing, or no usable rows remain. An error
        message is printed before None is returned.
    """
    try:
        df = pd.read_csv(filepath)
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        df = df.set_index('Timestamp')
    except FileNotFoundError:
        print(f"ERROR: File not found at {filepath}. Please check the path.")
        return None
    except Exception as e:
        # Deliberately broad: any load/parse problem (including a missing
        # 'Timestamp' column) is reported and signalled with None.
        print(f"ERROR: Could not load or parse CSV: {e}")
        return None

    # Validate the schema before any of these columns is indexed; otherwise a
    # missing 'GHI' or 'ModA' column would raise KeyError in the filter below
    # instead of producing the error message.
    required_cols = ['GHI', 'TModA', 'ModA']
    if not all(col in df.columns for col in required_cols):
        print(f"ERROR: Missing required columns for modeling: {required_cols}")
        return None

    # Impute weather-related columns with their median (from EDA strategy)
    impute_cols = ['Tamb', 'RH', 'WS', 'TModA', 'TModB']
    for col in impute_cols:
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median())

    # 1. Filter for valid power-generation rows: GHI must exceed the daytime
    # threshold and ModA must be > 0. Comparisons against NaN are False, so
    # rows with missing GHI/ModA are removed here as well.
    df_daytime = df[
        (df['GHI'] > DAYTIME_GHI_THRESHOLD) &
        (df['ModA'] > 0)
    ].copy()

    if df_daytime.empty:
        print("ERROR: Filtered daytime data is empty. Check GHI and ModA columns.")
        return None

    # 2. Select final features and target, dropping any remaining NaNs in
    # these specific columns (e.g. TModA when the whole column was empty, so
    # its median was NaN and nothing could be imputed).
    df_final = df_daytime[required_cols].dropna()

    if df_final.empty:
        print("ERROR: No complete records remain after dropping rows with missing values.")
        return None

    print(f"Loaded {len(df)} total records.")
    print(f"Modeling with {len(df_final)} daytime records (GHI > {DAYTIME_GHI_THRESHOLD} W/m²).")
    return df_final


def train_and_evaluate_model(df):
    """
    Fit a linear regression of ModA on GHI and TModA and print a report.

    The rows are split 80/20 into training and test sets with a fixed
    random_state, so the split is the same on every run. The model is fitted
    on the training part and scored (R², MSE) on the held-out part. The
    intercept, both coefficients with a plain-language interpretation, and
    the metrics are printed.

    Args:
        df: DataFrame containing the columns 'GHI', 'TModA' and 'ModA'.

    Returns:
        None. All results are written to standard output.
    """
    # Define features (X) and target (y)
    # X: GHI (Solar Input) and TModA (Thermal Constraint)
    # y: ModA (Power Output)
    X = df[['GHI', 'TModA']]
    y = df['ModA']

    # Split the data into training and testing sets (80/20 split)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Initialize and train the Linear Regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Make predictions and evaluate the model on the held-out rows
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    # --- Results Presentation ---
    print("\n" + "="*50)
    print("      LINEAR REGRESSION MODEL RESULTS")
    print("="*50)

    print("Model Predicts: ModA = C0 + (C1 * GHI) + (C2 * TModA)")
    print(f"Intercept (C0): {model.intercept_:.4f}")

    # Coefficients explain the change in ModA for a one-unit change in X.
    # model.coef_ is ordered like the columns of X, so pair them by name.
    print("\n--- Feature Coefficients ---")
    coefficients = dict(zip(X.columns, model.coef_))

    # GHI Coefficient (C1)
    ghi_coeff = coefficients['GHI']
    print(f"1. GHI Coefficient: {ghi_coeff:.4f}")
    print(f"   Interpretation: For every 1 W/m² increase in solar input (GHI), ModA output increases by {ghi_coeff:.4f} units.")

    # TModA Coefficient (C2) - This is the key actionable insight.
    # The wording follows the fitted sign: the report must not claim a
    # heat-related loss when the coefficient is not actually negative.
    tmod_coeff = coefficients['TModA']
    tmod_direction = "**decreases**" if tmod_coeff < 0 else "**increases**"
    print(f"2. TModA Coefficient: {tmod_coeff:.4f}")
    print(f"   Interpretation: For every 1°C increase in Module Temperature (TModA), ModA output {tmod_direction} by {abs(tmod_coeff):.4f} units.")
    if tmod_coeff < 0:
        print("\n*** This negative value quantifies the power loss due to heat. ***")
    else:
        print("\n*** This coefficient is not negative, so this model does not show a power loss due to heat. ***")

    print("\n--- Model Performance Metrics ---")
    print(f"R-squared (R²): {r2:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")

    if r2 > 0.9:
        print("\nConclusion: R² > 0.9 indicates the model is highly accurate. The coefficients provide strong evidence for engineering decisions.")
    else:
        print("\nConclusion: The R² suggests the model is a good fit, but non-linear models (like Polynomial Regression) might be explored for better accuracy.")


# --- Execution ---
# Runs only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    # load_and_clean_data() prints an error and returns None on failure.
    df_final = load_and_clean_data(RAW_FILE_PATH)
    
    # Train and report only when usable data was loaded.
    if df_final is not None:
        train_and_evaluate_model(df_final)