In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
def problem1_task1():
    """
        Predictive models on the estimation of energy performance of residential buildings.

            Parameters:

            Features: X

                X1: relative compactness
                X2: surface area
                X3: wall area
                X4: roof area
                X5: overall height
                X6: orientation
                X7: glazing area
                X8: glazing area distribution

            Targets: y

                Y1: heating load (HL)
                Y2: cooling load (CL)
    """
    # Get data from sheet
    df = pd.read_excel('data/ENB2012_data.xlsx')

    features = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']
    targets = ['Y1', 'Y2']

    X = df[features]
    y = df[targets]
    
    # Scaling and normalizing the data
    X_normalized = preprocessing.MinMaxScaler()
    alpha_values = [0.001,0.01,0.1, 1.0, 10.0]
    regr_cv = RidgeCV(alphas=alpha_values)
    model_cv = regr_cv.fit(X_normalized, y)
    optimum_alpha = model_cv.alpha_
    print(f"Optimum alpha : {optimum_alpha}")
    
    
    ridge_model = Ridge(normalize = True)

    scoring_metrics = ['neg_mean_squared_error', 'neg_mean_absolute_error']

    print("---------------------------------------------------")
    print(f"Alpha : {optimum_alpha}")
    ridge_model.alpha = optimum_alpha
    # prepare the cross-validation procedure
    ridge_cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)
    # evaluate model
    scores = cross_validate(ridge_model, X, y, scoring=scoring_metrics, cv=ridge_cv, n_jobs=-1)
    ridge_MSE_means = (np.mean(scores['test_neg_mean_squared_error']))
    ridge_MAE_means = (np.mean(scores['test_neg_mean_absolute_error']))
    ridge_MSE_stds = (np.std(scores['test_neg_mean_squared_error']))
    ridge_MAE_stds = (np.std(scores['test_neg_mean_absolute_error']))
    # report performance
    print('MSE (mean) : %.3f (std) : (%.3f)' % (ridge_MSE_means, ridge_MSE_stds))
    print('MAE (mean) : %.3f (std) : (%.3f)' % (ridge_MAE_means, ridge_MAE_stds))
    print("****************************************************")


In [32]:
def problem1_task2():
    """
        Predictive models on the estimation of energy performance of residential buildings.

            Parameters:

            Features: X

                X1: relative compactness
                X2: surface area
                X3: wall area
                X4: roof area
                X5: overall height
                X6: orientation
                X7: glazing area
                X8: glazing area distribution

            Targets: y

                Y1: heating load (HL)
                Y2: cooling load (CL)
    """
    # Get data from sheet
    df = pd.read_excel('data/ENB2012_data.xlsx')

    features = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']
    targets = ['Y1', 'Y2']

    X = df[features]
    y = df[targets]
    
    # Scaling and normalizing the data
    min_max_scaler = preprocessing.MinMaxScaler()
    X_normalized = min_max_scaler.fit_transform(X)
    
    scoring_metrics = ['neg_mean_squared_error', 'neg_mean_absolute_error']
    
        # Create the parameter grid based on the results of random search 
    param_grid = {
        'max_depth': [50, 150, 250],
        'min_samples_leaf': [1, 2, 3],
        'min_samples_split': [2, 3],
        'n_estimators': [10, 50, 100, 250, 500]
    }
    # Create a based model
    rf_model = RandomForestRegressor()
    # evaluate the model
    rfr_cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)
    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator = rf_model, param_grid = param_grid, scoring=scoring_metrics,
                              cv = rfr_cv, n_jobs = -1, verbose = 2, refit=callable)
    # Fit the grid search to the data for y1
    print("Grid result for X normalized and Y1")
    grid_results_y1 = grid_search.fit(X_normalized, y['Y1'])
    print(grid_results_y1.best_params_)
    results_df_y1 = pd.DataFrame(grid_results_y1.cv_results_)
    mse_mean_y1 = np.mean(results_df_y1['mean_test_neg_mean_squared_error'].to_numpy())
    mse_std_y1 = np.std(results_df_y1['mean_test_neg_mean_squared_error'].to_numpy())
    mae_mean_y1 = np.mean(results_df_y1['mean_test_neg_mean_absolute_error'].to_numpy())
    mae_std_y1 = np.std(results_df_y1['mean_test_neg_mean_absolute_error'].to_numpy())
    score_results_y1 = pd.DataFrame(np.array([mse_mean_y1,mse_std_y1, mae_mean_y1, mae_std_y1]).reshape(2,2),['MSE','MAE'], ['Mean', 'Std'])
    print(score_results_y1)
    print("---------------------------------------------------")
    # Fit the grid search to the data for y2
    print("Grid result for X normalized and Y2")
    grid_results_y2 = grid_search.fit(X_normalized, y['Y2'])
    print(grid_results_y2.best_params_)
    results_df_y2 = pd.DataFrame(grid_results_y2.cv_results_)
    mse_mean_y2 = np.mean(results_df_y2['mean_test_neg_mean_squared_error'].to_numpy())
    mse_std_y2 = np.std(results_df_y2['mean_test_neg_mean_squared_error'].to_numpy())
    mae_mean_y2 = np.mean(results_df_y2['mean_test_neg_mean_absolute_error'].to_numpy())
    mae_std_y2 = np.std(results_df_y2['mean_test_neg_mean_absolute_error'].to_numpy())
    score_results_y2 = pd.DataFrame(np.array([mse_mean_y2,mse_std_y2, mae_mean_y2, mae_std_y2]).reshape(2,2),['MSE','MAE'], ['Mean', 'Std'])
    print(print(score_results_y2))
    print("****************************************************")

In [3]:
problem1_task1()

Optimum alpha : 0.001
---------------------------------------------------
Alpha : 0.001
MSE (mean) : -9.536 (std) : (1.888)
MAE (mean) : -2.179 (std) : (0.232)
****************************************************


In [33]:
problem1_task2()

Grid result for X normalized and Y1
Fitting 100 folds for each of 90 candidates, totalling 9000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done 925 tasks      | elapsed:   38.8s
[Parallel(n_jobs=-1)]: Done 1496 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 2437 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 3458 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 4505 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 5920 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 7417 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 9000 out of 9000 | elapsed:  7.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'max_depth': 50, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}
         Mean       Std
MSE -0.292673  0.059189
MAE -0.352472  0.026939
---------------------------------------------------
Grid result for X normalized and Y2
Fitting 100 folds for each of 90 candidates, totalling 9000 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done 893 tasks      | elapsed:   46.0s
[Parallel(n_jobs=-1)]: Done 1432 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 2296 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 3152 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 4401 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 5563 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 7017 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 8906 tasks      | elapsed:  7.8min


{'max_depth': 50, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}
         Mean       Std
MSE -3.138312  0.291896
MAE -1.105891  0.065733
None
****************************************************


[Parallel(n_jobs=-1)]: Done 9000 out of 9000 | elapsed:  8.0min finished
