# In [None]:
# IMPORT LIBRARIES
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# LOAD DATA
# The path is relative to the directory the script/notebook is run from.
df = pd.read_csv('../data/ccpp_data.csv')

# BASIC DATA EXPLORATION
print("First few rows:")
print(df.head())
print("\nGeneral info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\nDescriptive statistics:")
print(df.describe())

# VISUALIZATION: CORRELATION MATRIX
# Annotated heatmap of the pairwise correlations between all columns of df.
plt.figure(figsize=(8,6))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()

# DEFINE FEATURES (X) AND TARGET (y)
# The four predictor columns form X; the 'PE' column is the regression target.
X = df[['AT', 'AP', 'RH', 'V']]
y = df['PE']

# SPLIT DATA INTO TRAIN AND TEST SETS (80% TRAIN, 20% TEST)
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# CROSS-VALIDATION SETUP: 5-FOLD CV
# Shuffled folds with a fixed seed, so every model is scored on the same splits.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

def evaluate_cv(model, X_train, y_train, cv):
    """Cross-validate ``model`` and summarise MSE, RMSE and R² across folds.

    Both metrics come from a single ``cross_validate`` run, so the model is
    fitted once per fold (instead of once per fold *per metric*) and every
    metric is scored on exactly the same train/validation splits, even when
    ``cv`` is a splitter that shuffles without a fixed seed.

    Args:
        model: scikit-learn estimator to evaluate.
        X_train: Training features.
        y_train: Training target values.
        cv: Cross-validation strategy forwarded to scikit-learn
            (for example a ``KFold`` instance).

    Returns:
        A 6-tuple ``(mse_mean, mse_std, rmse_mean, rmse_std, r2_mean,
        r2_std)`` holding the mean and standard deviation of each metric
        over the folds. RMSE is computed per fold as the square root of that
        fold's MSE.
    """
    # Imported locally so the function works without touching the file-level
    # import list.
    from sklearn.model_selection import cross_validate

    fold_results = cross_validate(
        model, X_train, y_train, cv=cv,
        scoring=('neg_mean_squared_error', 'r2'),
    )
    # scikit-learn reports MSE negated (so that higher is better); flip it back.
    mse_scores = -fold_results['test_neg_mean_squared_error']
    rmse_scores = np.sqrt(mse_scores)
    r2_scores = fold_results['test_r2']
    return (
        mse_scores.mean(), mse_scores.std(),
        rmse_scores.mean(), rmse_scores.std(),
        r2_scores.mean(), r2_scores.std(),
    )

# LINEAR REGRESSION: CROSS-VALIDATE ON THE TRAINING SPLIT AND REPORT
lr = LinearRegression()
(
    mse_lr_mean, mse_lr_std,
    rmse_lr_mean, rmse_lr_std,
    r2_lr_mean, r2_lr_std,
) = evaluate_cv(lr, X_train, y_train, kf)

print(f"Linear Regression - CV MSE: {mse_lr_mean:.3f} ± {mse_lr_std:.3f}")
print(f"Linear Regression - CV RMSE: {rmse_lr_mean:.3f} ± {rmse_lr_std:.3f}")
print(f"Linear Regression - CV R²: {r2_lr_mean:.3f} ± {r2_lr_std:.3f}")

# Final linear model: refit on the whole training split, then predict the
# held-out test rows.
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# RANDOM FOREST: CROSS-VALIDATE ON THE TRAINING SPLIT AND REPORT
rf = RandomForestRegressor(n_estimators=100, random_state=42)
(
    mse_rf_mean, mse_rf_std,
    rmse_rf_mean, rmse_rf_std,
    r2_rf_mean, r2_rf_std,
) = evaluate_cv(rf, X_train, y_train, kf)

print(f"Random Forest - CV MSE: {mse_rf_mean:.3f} ± {mse_rf_std:.3f}")
print(f"Random Forest - CV RMSE: {rmse_rf_mean:.3f} ± {rmse_rf_std:.3f}")
print(f"Random Forest - CV R²: {r2_rf_mean:.3f} ± {r2_rf_std:.3f}")

# Final forest: refit on the whole training split, then predict the held-out
# test rows.
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# EVALUATE TEST SET
def evaluate_test(y_true, y_pred):
    """Score predictions against the true target values.

    Args:
        y_true: Observed target values.
        y_pred: Model predictions for the same samples.

    Returns:
        A 3-tuple ``(mse, rmse, r2)``: the mean squared error, its square
        root, and the R² score.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return squared_error, np.sqrt(squared_error), r2_score(y_true, y_pred)

# SCORE BOTH FINAL MODELS ON THE HELD-OUT TEST SET
mse_test_lr, rmse_test_lr, r2_test_lr = evaluate_test(y_test, y_pred_lr)
mse_test_rf, rmse_test_rf, r2_test_rf = evaluate_test(y_test, y_pred_rf)

print("\nTest set performance:")
print("Linear Regression:")
print(f"  MSE: {mse_test_lr:.3f}")
print(f"  RMSE: {rmse_test_lr:.3f}")
print(f"  R2 Score: {r2_test_lr:.3f}")

print("Random Forest:")
print(f"  MSE: {mse_test_rf:.3f}")
print(f"  RMSE: {rmse_test_rf:.3f}")
print(f"  R2 Score: {r2_test_rf:.3f}")

# ACTUAL VS PREDICTED SCATTER PLOTS, ONE PANEL PER MODEL
plt.figure(figsize=(12,5))

# (subplot position, predictions, marker colour, panel title) for each model.
panels = (
    (1, y_pred_lr, 'blue', "Linear Regression: Actual vs Predicted"),
    (2, y_pred_rf, 'green', "Random Forest: Actual vs Predicted"),
)
for panel_idx, predictions, point_color, panel_title in panels:
    plt.subplot(1, 2, panel_idx)
    plt.scatter(y_test, predictions, alpha=0.6, color=point_color)
    # Dashed red diagonal marks perfect prediction; it spans the range of the
    # full target column y, not just the test split.
    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')
    plt.title(panel_title)
    plt.xlabel("Actual Power Output")
    plt.ylabel("Predicted Power Output")

plt.suptitle("Model Comparison: Predicted vs Actual Power Output")
plt.tight_layout()
plt.show()
