# Data Exploration

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
import numpy as np

# Read the raw EPB table from the Colab working directory.
file_path = '/content/EPB_data (1).csv'  # the CSV must already be uploaded to the Colab session
data = pd.read_csv(file_path)

# 1. Attributes Mathematical Representation & Data Preprocessing (Heading 3.2)
# Statistics that only make sense for numeric columns are grouped first; rows
# for non-numeric columns end up as NaN once everything is aligned by column name.
numeric_stats = {
    "Mean": data.mean(numeric_only=True),
    "Standard Deviation": data.std(numeric_only=True),
    "Minimum": data.min(numeric_only=True),
    "Maximum": data.max(numeric_only=True),
}
attributes_info = pd.DataFrame({
    "Data Type": data.dtypes,
    "Unique Values": data.nunique(),
    "Missing Values": data.isnull().sum(),
    **numeric_stats,
})

# Persist the per-attribute overview.
attributes_info_path = "/content/attributes_info.csv"
attributes_info.to_csv(attributes_info_path)
print(f"Attributes info saved at: {attributes_info_path}")

# 2. Statistical Summary (pandas' describe() table)
statistical_summary = data.describe()

# Persist the statistical summary.
statistical_summary_path = "/content/statistical_summary.csv"
statistical_summary.to_csv(statistical_summary_path)
print(f"Statistical summary saved at: {statistical_summary_path}")

# Subplot grid shared by the box and density plots.
# The grid is sized from the number of columns: a fixed 3x4 layout makes
# plt.subplot raise a ValueError as soon as the frame has more than 12 columns.
# For up to 12 columns the layout (3 rows x 4 columns, 16x10 inches) is unchanged.
grid_cols = 4
grid_rows = max(3, -(-len(data.columns) // grid_cols))  # ceiling division, never fewer than 3 rows
grid_figsize = (16, 10 * grid_rows / 3)  # keep the per-row height of the 3-row figure

# 3. Box Plots (one panel per column)
plt.figure(figsize=grid_figsize)
for i, column in enumerate(data.columns, 1):
    plt.subplot(grid_rows, grid_cols, i)
    sns.boxplot(data[column])
    plt.title(column)
plt.tight_layout()
box_plots_path = "/content/box_plots.png"
plt.savefig(box_plots_path)
print(f"Box plots saved at: {box_plots_path}")
plt.close()  # release the figure; only the saved PNG is kept

# 4. Density Plots (one KDE panel per column)
plt.figure(figsize=grid_figsize)
for i, column in enumerate(data.columns, 1):
    plt.subplot(grid_rows, grid_cols, i)
    sns.kdeplot(data[column], fill=True)
    plt.title(column)
plt.tight_layout()
density_plots_path = "/content/density_plots.png"
plt.savefig(density_plots_path)
print(f"Density plots saved at: {density_plots_path}")
plt.close()  # release the figure; only the saved PNG is kept

# 5. Correlation Metrics
# numeric_only=True matches the summary statistics computed earlier in this
# cell and keeps corr() from raising if the frame contains a non-numeric
# column. For an all-numeric frame the result is unchanged.
correlation_matrix = data.corr(numeric_only=True)

# Save Correlation Metrics
correlation_matrix_path = "/content/correlation_matrix.csv"
correlation_matrix.to_csv(correlation_matrix_path)
print(f"Correlation metrics saved at: {correlation_matrix_path}")

# Generate Heatmap (annotated with two-decimal coefficients)
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", square=True)
correlation_heatmap_path = "/content/correlation_heatmap.png"
plt.title("Correlation Matrix Heatmap")
plt.savefig(correlation_heatmap_path)
print(f"Correlation heatmap saved at: {correlation_heatmap_path}")
plt.close()  # release the figure; only the saved PNG is kept



Attributes info saved at: /content/attributes_info.csv
Statistical summary saved at: /content/statistical_summary.csv
Box plots saved at: /content/box_plots.png
Density plots saved at: /content/density_plots.png
Correlation metrics saved at: /content/correlation_matrix.csv
Correlation heatmap saved at: /content/correlation_heatmap.png


# Feature Engineering

In [None]:
# 6. Feature Engineering

# Prediction targets. They are standardized below and deliberately kept out of
# the correlation-based pruning so they can never be dropped from the dataset.
target_columns = ['Heating Load', 'Cooling Load']

# Feature Engineering: Normalization and Standardization
# Min-max scale the compactness/area columns.
minmax_columns = ['Relative Compactness', 'Surface Area', 'Wall Area', 'Roof Area']
scaler = MinMaxScaler()
data[minmax_columns] = scaler.fit_transform(data[minmax_columns])

# Standardize the two target columns.
std_scaler = StandardScaler()
data[target_columns] = std_scaler.fit_transform(data[target_columns])

# Feature Engineering: Feature Interaction
data['Surface x Glazing'] = data['Surface Area'] * data['Glazing Area']

# Feature Engineering: Categorical Encoding
# drop='first' removes one dummy column per categorical to avoid multicollinearity.
# The same encoder object is refit for the second column, so afterwards it only
# holds the 'Glazing Area Distribution' fit.
encoder = OneHotEncoder(drop='first', sparse_output=False)
orientation_encoded = encoder.fit_transform(data[['Orientation']])
glazing_distribution_encoded = encoder.fit_transform(data[['Glazing Area Distribution']])

# Add encoded features to the dataset.
# The encoded frames reuse data.index so the column-wise concat aligns row by
# row even if the frame does not carry a default RangeIndex.
orientation_columns = [f'Orientation_{i+1}' for i in range(orientation_encoded.shape[1])]
glazing_distribution_columns = [f'GlazingDist_{i+1}' for i in range(glazing_distribution_encoded.shape[1])]
orientation_df = pd.DataFrame(orientation_encoded, columns=orientation_columns, index=data.index)
glazing_dist_df = pd.DataFrame(glazing_distribution_encoded, columns=glazing_distribution_columns, index=data.index)
data = pd.concat([data, orientation_df, glazing_dist_df], axis=1)

# Drop original categorical features
data = data.drop(columns=['Orientation', 'Glazing Area Distribution'])

# Feature Engineering: Polynomial Features
data['Overall Height^2'] = data['Overall Height'] ** 2

# Feature Engineering: Domain-Specific Features
data['Volume'] = data['Surface Area'] * data['Overall Height']
# 'Surface Area' is min-max scaled above, so it contains exact zeros; dividing
# by them yields +/-inf. Zero denominators are mapped to NaN so the undefined
# ratio is stored as a missing value instead.
# NOTE(review): this feature is computed from both targets and remains in the
# feature matrix of the modelling cells (they drop only 'Heating Load' and
# 'Cooling Load'), i.e. it leaks target information. Confirm whether it
# should be excluded before training.
data['Efficiency Index'] = (data['Heating Load'] + data['Cooling Load']) / data['Surface Area'].replace(0, np.nan)

# Feature Engineering: Correlation Analysis
# Drop highly correlated features (absolute correlation coefficient > 0.95).
# Only the strict upper triangle is inspected, so each pair is considered once
# and the later column of a correlated pair is the one dropped. The previous
# filter ended with `column != column`, which is always False, so nothing was
# ever dropped.
correlation_threshold = 0.95
corr_matrix = data.drop(columns=target_columns).corr().abs()
upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
to_drop = [column for column in upper_triangle.columns if (upper_triangle[column] > correlation_threshold).any()]
data = data.drop(columns=to_drop)
print(f"Dropped highly correlated features: {to_drop}")

# Save Feature Engineered Dataset
feature_engineered_path = "/content/feature_engineered_data.csv"
data.to_csv(feature_engineered_path, index=False)
print(f"Feature engineered dataset saved at: {feature_engineered_path}")





Feature engineered dataset saved at: /content/feature_engineered_data.csv


# Previous models

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Load the feature-engineered dataset
feature_engineered_path = '/content/feature_engineered_data.csv'
data = pd.read_csv(feature_engineered_path)

# Define features (X) and targets (y)
X = data.drop(columns=['Heating Load', 'Cooling Load'])  # Exclude target columns
y_heating = data['Heating Load']
y_cooling = data['Cooling Load']

# Replace infinite values with NaN (reassignment instead of inplace mutation)
X = X.replace([np.inf, -np.inf], np.nan)

# Split data into training and testing sets.
# One call splits the features and both targets with the same row assignment;
# this yields the same partition as two separate calls with random_state=42.
X_train, X_test, y_train_h, y_test_h, y_train_c, y_test_c = train_test_split(
    X, y_heating, y_cooling, test_size=0.2, random_state=42
)

# Impute NaN values with column means computed on the training split only, so
# no test-set statistics leak into the training data. This also replaces the
# chained `X[column].fillna(..., inplace=True)` pattern, which pandas warns
# will stop modifying the original frame.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)
X = X.fillna(train_means)  # keep the full feature frame NaN-free as before

# List of models to evaluate
models = {
    "Extra Trees Regressor": ExtraTreesRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "LightGBM": LGBMRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42)
}

# Fit-and-score helper used for the baseline model comparison
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on both splits.

    Note: a later cell of this notebook defines another ``evaluate_model``
    with a different parameter list, which replaces this one once it runs.

    Parameters:
        model: estimator object exposing ``fit(X, y)`` and ``predict(X)``;
            it is fitted in place.
        X_train, X_test: feature matrices for the two splits.
        y_train, y_test: target values matching the two splits.

    Returns:
        dict with the keys "Training" and "Testing", each mapping to a dict
        holding "R²", "MAE" and "RMSE" for that split.
    """
    def _score(y_true, y_pred):
        # RMSE is the square root of the mean squared error.
        return {
            "R²": r2_score(y_true, y_pred),
            "MAE": mean_absolute_error(y_true, y_pred),
            "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        }

    model.fit(X_train, y_train)
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)
    return {
        "Training": _score(y_train, train_predictions),
        "Testing": _score(y_test, test_predictions),
    }

# Score every candidate model on both targets (heating first, then cooling).
# The same estimator object is refit for the second target.
results = {}
for name, model in models.items():
    heating_scores = evaluate_model(model, X_train, X_test, y_train_h, y_test_h)
    cooling_scores = evaluate_model(model, X_train, X_test, y_train_c, y_test_c)
    results[name] = {"Heating Load": heating_scores, "Cooling Load": cooling_scores}

# Report train/test metrics per model and per target, four decimals each.
for model_name, metrics in results.items():
    print(f"\nModel: {model_name}")
    for load_type, performance in metrics.items():
        print(f"  {load_type}:")
        print(f"    Training -> R²: {performance['Training']['R²']:.4f}, MAE: {performance['Training']['MAE']:.4f}, RMSE: {performance['Training']['RMSE']:.4f}")
        print(f"    Testing  -> R²: {performance['Testing']['R²']:.4f}, MAE: {performance['Testing']['MAE']:.4f}, RMSE: {performance['Testing']['RMSE']:.4f}")







The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[column].fillna(X[column].mean(), inplace=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 306
[LightGBM] [Info] Number of data points in the train set: 614, number of used features: 18
[LightGBM] [Info] Start training from score -0.015091
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000064 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 306
[LightGBM] [Info] Number of data points in the train set: 614, number of used features: 18
[LightGBM] [Info] Start training from score -0.018617

Model: Extra Trees Regressor
  Heating Load:
    Training -> R²: 1.0000, MAE: 0.0000, RMSE: 0.0000
    Testing  -> R²: 0.9985, MAE: 0.0263, RMSE: 0.0398
  Cooling Load:
    Training -> R²: 1.

# Genetic Algorithm

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor

# Read back the dataset written by the feature-engineering cell.
feature_engineered_path = '/content/feature_engineered_data.csv'
data = pd.read_csv(feature_engineered_path)

# Targets are the two load columns; every remaining column is a feature.
y_heating, y_cooling = data['Heating Load'], data['Cooling Load']
X = data.drop(columns=['Heating Load', 'Cooling Load'])

# 80/20 train/test split. A single call over the features and both targets
# assigns rows exactly as two separate calls with the same random_state would.
X_train, X_test, y_train_h, y_test_h, y_train_c, y_test_c = train_test_split(
    X, y_heating, y_cooling, test_size=0.2, random_state=42
)

# Fitness Function
def fitness_function(params, X, y, model_class, cv=3):
    """Score one hyperparameter set by cross-validated R² (lower is better).

    The score is the mean R² over ``cv`` folds of ``X``/``y``, negated so that
    the genetic algorithm can minimize it. Scoring on held-out folds, rather
    than on the very rows the model was fitted on, keeps the search from
    simply rewarding configurations that memorize the training data.

    Parameters:
        params: dict of keyword arguments passed to ``model_class``.
        X: feature DataFrame; it is not modified.
        y: target values aligned with ``X``.
        model_class: estimator class, instantiated as ``model_class(**params)``.
        cv: cross-validation setting forwarded to ``cross_val_score``
            (default 3).

    Returns:
        Negative mean cross-validated R² as a float, or ``float('inf')`` when
        fitting fails with the "infinity or a value too large" ValueError.

    Raises:
        ValueError: any other ValueError raised while fitting or scoring.
    """
    # Local import keeps this function usable without touching the cell's import block.
    from sklearn.model_selection import cross_val_score

    model = model_class(**params)

    # replace()/fillna() return new frames, so the caller's X stays untouched.
    X_clean = X.replace([np.inf, -np.inf], np.nan)  # Replace inf with NaN
    X_clean = X_clean.fillna(X_clean.mean())  # Impute NaN with column means

    try:
        # error_score='raise' lets fit errors propagate so the handler below
        # still sees them instead of receiving NaN scores.
        fold_scores = cross_val_score(model, X_clean, y, cv=cv, scoring='r2', error_score='raise')
    except ValueError as e:
        if "infinity or a value too large" in str(e):
            # Treat unusable data as the worst possible fitness.
            return float('inf')
        raise  # Raise other ValueErrors

    return -float(np.mean(fold_scores))  # Negative R² to minimize error

# Genetic Algorithm Implementation
def genetic_algorithm(
    X, y, model_class, param_space, pop_size=10, generations=20, mutation_prob=0.1, crossover_prob=0.5,
    random_state=None
):
    """Search ``param_space`` for the hyperparameters with the lowest fitness.

    Each generation scores every individual with ``fitness_function``, keeps
    the better half as parents, and refills the population to ``pop_size``
    with children produced by uniform crossover (or by copying a parent when
    crossover is skipped), followed by single-gene mutation.

    Parameters:
        X, y: data forwarded unchanged to ``fitness_function``.
        model_class: estimator class forwarded to ``fitness_function``.
        param_space: dict mapping a parameter name to the sequence of values
            it may take.
        pop_size: number of individuals per generation (must be >= 1).
        generations: number of generations to run.
        mutation_prob: probability that a child has one parameter re-drawn.
        crossover_prob: probability that a child is built from two parents
            instead of being a copy of one.
        random_state: optional seed for a private ``RandomState``; when None
            the global ``np.random`` state is used, as before.

    Returns:
        ``(best_params, best_fitness)``. ``best_params`` is None and
        ``best_fitness`` is ``float('inf')`` if no individual ever scored
        below infinity (including ``generations == 0``).

    Raises:
        ValueError: if ``pop_size`` is smaller than 1.
    """
    if pop_size < 1:
        raise ValueError("pop_size must be at least 1")

    # np.random and RandomState expose the same randint/rand API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    param_names = list(param_space.keys())

    def _pick(options):
        # Choose by index: np.random.choice would first convert the options to
        # an array, coercing mixed-type lists (e.g. str and float) to a single
        # dtype and returning numpy scalars instead of the original objects.
        options = list(options)
        return options[rng.randint(len(options))]

    # Initialize population with random parameters
    population = [
        {key: _pick(values) for key, values in param_space.items()} for _ in range(pop_size)
    ]
    best_params = None
    best_fitness = float('inf')

    # At least one parent survives, so pop_size == 1 no longer leaves the
    # crossover step without anything to choose from.
    n_parents = max(1, pop_size // 2)

    for generation in range(generations):
        fitness_scores = [fitness_function(ind, X, y, model_class) for ind in population]
        sorted_indices = np.argsort(fitness_scores)

        # Keep track of the best solution (stored as a copy so later changes
        # to population members cannot alter it)
        best_index = sorted_indices[0]
        if fitness_scores[best_index] < best_fitness:
            best_fitness = fitness_scores[best_index]
            best_params = dict(population[best_index])

        # Select parents: the better-scoring half of the population
        parents = [population[i] for i in sorted_indices[:n_parents]]

        # Crossover: always produce a child so the population stays at
        # pop_size; without crossover the child is a copy of one parent.
        offspring = []
        for _ in range(pop_size - len(parents)):
            parent1 = _pick(parents)
            if rng.rand() < crossover_prob:
                parent2 = _pick(parents)
                child = {key: _pick([parent1[key], parent2[key]]) for key in param_names}
            else:
                child = dict(parent1)
            offspring.append(child)

        # Mutation: re-draw one randomly chosen parameter of the child
        for child in offspring:
            if rng.rand() < mutation_prob:
                key_to_mutate = _pick(param_names)
                child[key_to_mutate] = _pick(param_space[key_to_mutate])

        # Next generation
        population = parents + offspring

    return best_params, best_fitness

# Define parameter space for Extra Trees Regressor
param_space = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Handle infinite/NaN values in X_train and X_test before model training.
# Both splits are imputed with the TRAINING column means: filling the test
# split with its own means would make its preprocessing depend on test data
# and differ from what the model saw during fitting.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Run the genetic algorithm for heating load
best_params_h, best_fitness_h = genetic_algorithm(X_train, y_train_h, ExtraTreesRegressor, param_space)

# Run the genetic algorithm for cooling load
best_params_c, best_fitness_c = genetic_algorithm(X_train, y_train_c, ExtraTreesRegressor, param_space)

# ... (rest of the code) ...



# Fit-and-score helper for the hyperparameters selected by the genetic algorithm
def evaluate_model(X_train, X_test, y_train, y_test, model_params):
    """Train an ExtraTreesRegressor with ``model_params`` and score both splits.

    Note: this redefines the ``evaluate_model`` of the earlier model-comparison
    cell with a different parameter list.

    Parameters:
        X_train, X_test: feature matrices for the two splits.
        y_train, y_test: target values matching the two splits.
        model_params: dict of keyword arguments for ``ExtraTreesRegressor``.

    Returns:
        dict with the keys "Training" and "Testing", each mapping to a dict
        holding "R²", "MAE" and "RMSE" for that split.
    """
    def _score(y_true, y_pred):
        # RMSE is the square root of the mean squared error.
        return {
            "R²": r2_score(y_true, y_pred),
            "MAE": mean_absolute_error(y_true, y_pred),
            "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        }

    regressor = ExtraTreesRegressor(**model_params)
    regressor.fit(X_train, y_train)
    train_predictions = regressor.predict(X_train)
    test_predictions = regressor.predict(X_test)
    return {
        "Training": _score(y_train, train_predictions),
        "Testing": _score(y_test, test_predictions),
    }

# Re-fit with the GA-selected hyperparameters and score each target
# (heating first, then cooling).
performance_h = evaluate_model(X_train, X_test, y_train_h, y_test_h, best_params_h)
performance_c = evaluate_model(X_train, X_test, y_train_c, y_test_c, best_params_c)

# Collect the chosen parameters and their scores per target, then print them.
results = {
    load_name: {"Best Parameters": chosen_params, "Performance": scores}
    for load_name, chosen_params, scores in (
        ("Heating Load", best_params_h, performance_h),
        ("Cooling Load", best_params_c, performance_c),
    )
}
print(results)


{'Heating Load': {'Best Parameters': {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 2, 'min_samples_leaf': 2}, 'Performance': {'Training': {'R²': 0.9996914815056265, 'MAE': 0.010600079814106085, 'RMSE': 0.017499312128429676}, 'Testing': {'R²': 0.9988601628381928, 'MAE': 0.023820461778985353, 'RMSE': 0.03418254079198293}}}, 'Cooling Load': {'Best Parameters': {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1}, 'Performance': {'Training': {'R²': 0.9998254989889235, 'MAE': 0.008449831836184626, 'RMSE': 0.013156715688322114}, 'Testing': {'R²': 0.9963074369314525, 'MAE': 0.04116806392807455, 'RMSE': 0.06152549480297585}}}}
