In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from numpy.random import randn

# PROBLEM 1

In [None]:
def problem1_task1():
    """
        Predictive models on the estimation of energy performance of residential buildings.

            Parameters:

            Features: X

                X1: relative compactness
                X2: surface area
                X3: wall area
                X4: roof area
                X5: overall height
                X6: orientation
                X7: glazing area
                X8: glazing area distribution

            Targets: y

                Y1: heating load (HL)
                Y2: cooling load (CL)
    """
    # Get data from sheet
    df = pd.read_excel('data/ENB2012_data.xlsx')

    features = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']
    targets = ['Y1', 'Y2']

    X = df[features]
    y = df[targets]
    
    # Scaling and normalizing the data
    X_normalized = preprocessing.normalize(X, norm='l2')
    alpha_values = [0.001,0.01,0.1, 1.0, 10.0]
    regr_cv = RidgeCV(alphas=alpha_values)
    model_cv = regr_cv.fit(X_normalized, y)
    optimum_alpha = model_cv.alpha_
    print(f"Optimum alpha : {optimum_alpha}")
    
    
    ridge_model = Ridge(normalize = True)

    scoring_metrics = ['neg_mean_squared_error', 'neg_mean_absolute_error']

    print("---------------------------------------------------")
    print(f"Alpha : {optimum_alpha}")
    ridge_model.alpha = optimum_alpha
    # prepare the cross-validation procedure
    ridge_cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)
    # evaluate model
    scores = cross_validate(ridge_model, X, y, scoring=scoring_metrics, cv=ridge_cv, n_jobs=-1)
    ridge_MSE_means = (np.mean(scores['test_neg_mean_squared_error']))
    ridge_MAE_means = (np.mean(scores['test_neg_mean_absolute_error']))
    ridge_MSE_stds = (np.std(scores['test_neg_mean_squared_error']))
    ridge_MAE_stds = (np.std(scores['test_neg_mean_absolute_error']))
    # report performance
    print('MSE (mean) : %.3f (std) : (%.3f)' % (ridge_MSE_means, ridge_MSE_stds))
    print('MAE (mean) : %.3f (std) : (%.3f)' % (ridge_MAE_means, ridge_MAE_stds))
    print("****************************************************")


In [2]:
def problem1_task2():
    """
        Predictive models on the estimation of energy performance of residential buildings.

            Parameters:

            Features: X

                X1: relative compactness
                X2: surface area
                X3: wall area
                X4: roof area
                X5: overall height
                X6: orientation
                X7: glazing area
                X8: glazing area distribution

            Targets: y

                Y1: heating load (HL)
                Y2: cooling load (CL)
    """
    # Get data from sheet
    df = pd.read_excel('data/ENB2012_data.xlsx')

    features = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']
    targets = ['Y1', 'Y2']

    X = df[features]
    y = df[targets]
    
    # Scaling and normalizing the data
    X_normalized = preprocessing.normalize(X, norm='l2')
    
    scoring_metrics = ['neg_mean_squared_error', 'neg_mean_absolute_error']
    
        # Create the parameter grid based on the results of random search 
    param_grid = {
        'max_depth': [50, 150, 250],
        'min_samples_leaf': [1, 2, 3],
        'min_samples_split': [2, 3],
        'n_estimators': [10, 50, 100, 250, 500]
    }
    # Create a based model
    rf_model = RandomForestRegressor()
    # evaluate the model
    rfr_cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)
    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator = rf_model, param_grid = param_grid, scoring=scoring_metrics,
                              cv = rfr_cv, n_jobs = -1, verbose = 2, refit=False)
    # Fit the grid search to the data
    grid_search.fit(X_normalized, y)
    best_grid = grid_search.best_estimator_
    print(best_grid)

In [None]:
problem1_task1()

In [3]:
problem1_task2()

Fitting 100 folds for each of 90 candidates, totalling 9000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 378 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done 684 tasks      | elapsed:   38.7s
[Parallel(n_jobs=-1)]: Done 1160 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1853 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 2506 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 3425 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 4424 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 5529 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 6874 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 8196 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 9000 out of 9000 | elapsed: 10.8min finished


AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [35]:
# Get data from sheet
df = pd.read_excel('data/ENB2012_data.xlsx')

features = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']
targets = ['Y1', 'Y2']

X = df[features]
y = df[targets]
    
# Scaling and normalizing the data
min_max_scaler = preprocessing.MinMaxScaler()
X_normalized = min_max_scaler.fit_transform(X)

scoring_metrics = ['neg_mean_squared_error', 'neg_mean_absolute_error']

# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth': [50, 150, 250],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 3],
    'n_estimators': [10, 50, 100, 250, 500]
}
# Create a based model
rf_model = RandomForestRegressor()
# evaluate the model
rfr_cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf_model, param_grid = param_grid, scoring=scoring_metrics,
                          cv = rfr_cv, n_jobs = -1, verbose = 2, refit=callable)
# Fit the grid search to the data for y1
print("Grid result for X normalized and Y1")
grid_results_y1 = grid_search.fit(X_normalized, y['Y1'])
print(grid_results_y1.best_params_)

Grid result for X normalized and Y1
Fitting 100 folds for each of 90 candidates, totalling 9000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 925 tasks      | elapsed:   36.6s
[Parallel(n_jobs=-1)]: Done 1577 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2493 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 3514 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 4724 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 5992 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 7545 tasks      | elapsed:  5.9min


{'max_depth': 50, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}


[Parallel(n_jobs=-1)]: Done 9000 out of 9000 | elapsed:  7.0min finished


In [37]:
rf_model_y1 = RandomForestRegressor(**grid_results_y1.best_params_)
for score in scoring_metrics:
        print(score),
        print (" : "),
        score_result = cross_val_score(rf_model_y1, X_normalized, y['Y1'], scoring=score, cv=rfr_cv, n_jobs=-1)
        rf_score_means = (-1 * np.mean(score_result))
        rf_score_stds = (np.std(score_result))
        # report performance
        print('(mean) : %.3f (std) : (%.3f)' % (rf_score_means, rf_score_stds))
        print("****************************************************")

neg_mean_squared_error
 : 
(mean) : 0.245 (std) : (0.090)
****************************************************
neg_mean_absolute_error
 : 
(mean) : 0.329 (std) : (0.040)
****************************************************


In [39]:
grid_results_y2 = grid_search.fit(X_normalized, y['Y2'])
print(grid_results_y2.best_params_)
rf_model_y2 = RandomForestRegressor(**grid_results_y2.best_params_)
for score in scoring_metrics:
        print(score),
        print (" : "),
        score_result = cross_val_score(rf_model_y1, X_normalized, y['Y2'], scoring=score, cv=rfr_cv, n_jobs=-1)
        rf_score_means = (-1 * np.mean(score_result))
        rf_score_stds = (np.std(score_result))
        # report performance
        print('(mean) : %.3f (std) : (%.3f)' % (rf_score_means, rf_score_stds))
        print("****************************************************")

Fitting 100 folds for each of 90 candidates, totalling 9000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   19.3s
[Parallel(n_jobs=-1)]: Done 925 tasks      | elapsed:   41.1s
[Parallel(n_jobs=-1)]: Done 1496 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 2437 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 3402 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 4505 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 5912 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 7385 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 8954 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 9000 out of 9000 | elapsed:  7.7min finished


{'max_depth': 50, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}
neg_mean_squared_error
 : 
(mean) : 2.930 (std) : (0.894)
****************************************************
neg_mean_absolute_error
 : 
(mean) : 1.017 (std) : (0.158)
****************************************************


In [28]:
scores_y1

{'fit_time': array([0.05013728, 0.048558  , 0.04555106, 0.04502916, 0.04495811,
        0.04700971, 0.03856325, 0.03595996, 0.04227686, 0.03843403,
        0.02865791, 0.037009  , 0.0273118 , 0.03255105, 0.02692914,
        0.02030206, 0.02359605, 0.01995015, 0.04779506, 0.02727294,
        0.02008319, 0.04472899, 0.02416897, 0.04951382, 0.04948211,
        0.02031612, 0.03141022, 0.028193  , 0.01983905, 0.01968598,
        0.02675462, 0.02009988, 0.03210115, 0.03772593, 0.02807188,
        0.02462387, 0.03203273, 0.02054405, 0.02702618, 0.0226419 ,
        0.03418016, 0.02234197, 0.0266068 , 0.02740479, 0.03299999,
        0.02712822, 0.03705502, 0.02641106, 0.02905416, 0.02567196,
        0.02620792, 0.02989101, 0.03166389, 0.03300786, 0.02634001,
        0.03580809, 0.03298903, 0.02642417, 0.02883625, 0.02843714,
        0.03652096, 0.02712226, 0.03298807, 0.02611518, 0.0277009 ,
        0.02990317, 0.02641511, 0.03254294, 0.03265977, 0.02562094,
        0.03127313, 0.02658415, 0.03

In [8]:
grid_search.fit(X_normalized, y)
grid_search.b

Fitting 100 folds for each of 90 candidates, totalling 9000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 354 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:   36.4s
[Parallel(n_jobs=-1)]: Done 1024 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1652 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 3329 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 4304 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 5385 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 6522 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 7925 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 9000 out of 9000 | elapsed: 11.7min finished


AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [18]:
results_df = pd.DataFrame(grid_search.cv_results_)
results_df[['mean_test_neg_mean_absolute_error','mean_test_neg_mean_squared_error']]

Unnamed: 0,mean_test_neg_mean_absolute_error,mean_test_neg_mean_squared_error
0,-0.790681,-2.304826
1,-0.777258,-2.198255
2,-0.771690,-2.159435
3,-0.770573,-2.154504
4,-0.769297,-2.148807
...,...,...
85,-0.842941,-2.254338
86,-0.831923,-2.196605
87,-0.830324,-2.184033
88,-0.828448,-2.171968


In [None]:
# Get data from sheet
df = pd.read_excel('data/ENB2012_data.xlsx')

In [None]:
# to evaluate whether any value is missing
df.isnull().values.any()

In [None]:
# Observe the features and targets. Later they are going to be split.
df.columns

In [None]:
"""
Predictive models on the estimation of energy performance of
residential buildings.

Parameters:

    Features: X

        X1: relative compactness
        X2: surface area
        X3: wall area
        X4: roof area
        X5: overall height
        X6: orientation
        X7: glazing area
        X8: glazing area distribution
        
    Targets: y
    
        Y1: heating load (HL)
        Y2: cooling load (CL)
"""

features = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8']
targets = ['Y1', 'Y2']

X = df[features]
y = df[targets]

In [None]:
# Show size of the Feature Matrix
X.shape

In [None]:
# Show size of the Target Matrix
y.shape

In [None]:
# For each numerical feature compute number of unique entries
unique_values = X.select_dtypes(
  include="number").nunique().sort_values()

# Plot information with y-axis
unique_values.plot.bar(logy=False, figsize=(15, 4),
                       title="Unique values per feature");

In [None]:
# Scaling and normalizing the data
X_normalized = preprocessing.normalize(X, norm='l2')

In [None]:
alpha_values = [0.001,0.01,0.1, 1.0, 10.0]
regr_cv = RidgeCV(alphas=alpha_values)
model_cv = regr_cv.fit(X_normalized, y)
optimum_alpha = model_cv.alpha_

In [None]:
optimum_alpha

In [None]:
alpha_values = [0.001,0.01,0.1, 1.0, 10.0]

ridge_model = Ridge(normalize = True)

scoring_metrics = ['neg_mean_squared_error', 'neg_mean_absolute_error']

print("---------------------------------------------------")
print(f"Alpha : {optimum_alpha}")
ridge_model.alpha = optimum_alpha
# prepare the cross-validation procedure
ridge_cv = RepeatedKFold(n_splits=10, n_repeats=10, random_state=1)
# evaluate model
scores = cross_validate(ridge_model, X, y, scoring=scoring_metrics, cv=ridge_cv, n_jobs=-1)
ridge_MSE_means = (np.mean(scores['test_neg_mean_squared_error']))
ridge_MAE_means = (np.mean(scores['test_neg_mean_absolute_error']))
ridge_MSE_stds = (np.std(scores['test_neg_mean_squared_error']))
ridge_MAE_stds = (np.std(scores['test_neg_mean_absolute_error']))
# report performance
print('MSE (mean) : %.3f (std) : (%.3f)' % (ridge_MSE_means, ridge_MSE_stds))
print('MAE (mean) : %.3f (std) : (%.3f)' % (ridge_MAE_means, ridge_MAE_stds))
print("****************************************************")

In [None]:
# Scaling the data
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [None]:
regr_cv = RidgeCV(alphas=[0.001, 0.01, 0.1, 1.0, 10.0])

In [None]:
model_cv = regr_cv.fit(X_std, y)

In [None]:
model_cv.alpha_

In [None]:
# After scaling the data
df_X_sc = pd.DataFrame(X_std, columns = features)
df_X_sc.head(15)

In [17]:
scores = ['MSE', 'MSE']
models = ['RandomForest', 'RidgeRegression']
hier_index = list(zip(scores, models))
hier_index_2 = pd.MultiIndex.from_tuples(hier_index)
df = pd.DataFrame(randn(2,2), hier_index_2, ['Y1', 'Y2'] )
df

Unnamed: 0,Unnamed: 1,Y1,Y2
MSE,RandomForest,-0.860788,-1.894437
MSE,RidgeRegression,-0.343683,1.276502


In [14]:
a = 1
b = 2
c = 3
d = 4
e = np.array([a,b,c,d]).reshape(2,2)
e

array([[1, 2],
       [3, 4]])

In [19]:
df = pd.DataFrame(e, hier_index_2, ['Y1', 'Y2'] )
df

Unnamed: 0,Unnamed: 1,Y1,Y2
MSE,RandomForest,1,2
MSE,RidgeRegression,3,4
