In [153]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [154]:
df = pd.read_csv('./EF_comp.csv')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 83 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Project number                                198 non-null    object 
 1   The date                                      198 non-null    object 
 2   Region                                        198 non-null    object 
 3   The town/village                              191 non-null    object 
 4   Home address                                  198 non-null    object 
 5   County/City                                   198 non-null    object 
 6   Initial year of exploitation                  198 non-null    int64  
 7   Building Total Area                           198 non-null    float64
 8   Room volume                                   198 non-null    float64
 9   Average floor height                          198 non-null    flo

In [155]:
feature_cols = ['Building Total Area','Reference area','Above-ground floors',
                'Underground floor','Energy consumption before',
                'Initial energy class ','Energy class after']
target_cols = ['Carrying out construction works ','Reconstruction of engineering systems',
                'Heat installation','Water heating system']

categorical_cols = ['Above-ground floors','Underground floor',
                    'Carrying out construction works',
                    'Reconstruction of engineering systems',
                    'Heat installation','Water heating system']

df = df[df.columns[df.columns.isin(feature_cols+target_cols)]]
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 11 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Building Total Area                    198 non-null    float64
 1   Reference area                         198 non-null    float64
 2   Above-ground floors                    198 non-null    int64  
 3   Underground floor                      198 non-null    int64  
 4   Initial energy class                   198 non-null    object 
 5   Energy consumption before              198 non-null    float64
 6   Carrying out construction works        198 non-null    int64  
 7   Reconstruction of engineering systems  198 non-null    int64  
 8   Water heating system                   198 non-null    int64  
 9   Heat installation                      198 non-null    int64  
 10  Energy class after                     198 non-null    object 
dtypes: flo

In [156]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

for col in df.columns:
    if is_numeric_dtype(df[col]):
        df[[col]] = MinMaxScaler().fit_transform(df[[col]])

for col in df.columns:
    if is_string_dtype(df[col]):
        df[col] = LabelEncoder().fit_transform(df[col])


Xfeatures = df[feature_cols]
ylabels = df[target_cols]

train_X,test_X,train_Y,test_Y = train_test_split(Xfeatures,ylabels,test_size=0.2,random_state=7)
train_X, validation_X, train_Y, validation_Y = train_test_split(train_X, train_Y, test_size=0.25, random_state=1, shuffle=True) # 0.25 x 0.8 = 0.2


In [157]:
from sklearn.neural_network import MLPClassifier

classifier  = MLPClassifier(hidden_layer_sizes=(150,100,50), max_iter=500, activation = 'relu', 
                            solver='adam', shuffle=True, random_state=1, #,verbose=True, early_stopping=True
                            learning_rate='adaptive').fit(train_X, train_Y)
classifier.fit(train_X, train_Y)
pred_Y = classifier.predict(validation_X)




In [158]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, hamming_loss, jaccard_score, average_precision_score
from sklearn.metrics import roc_auc_score, roc_curve, auc, precision_recall_curve, confusion_matrix, classification_report

val = pd.DataFrame(pred_Y,columns=['Carrying out construction works', 'Reconstruction of engineering systems', 'Heat installation','Water heating system'])

accuracy = accuracy_score(test_Y.values, pred_Y)

# Precision, Recall, F1-Score
precision = precision_score(test_Y.values, pred_Y, average='micro')
recall = recall_score(test_Y.values, pred_Y, average='micro')
f1 = f1_score(test_Y.values, pred_Y, average='micro')

# Hamming Loss
hamming_loss_value = hamming_loss(test_Y.values, pred_Y)

# Jaccard Index
jaccard = jaccard_score(test_Y.values, pred_Y, average='micro')

# Average Precision Score
average_precision = average_precision_score(test_Y.values, pred_Y, average='micro')

# Confusion Matrix
confusion = confusion_matrix(test_Y.values.argmax(axis=1), pred_Y.argmax(axis=1))

# Classification Report
class_report = classification_report(test_Y.values, pred_Y)

# Print the computed metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("Hamming Loss:", hamming_loss_value)
print("Jaccard Index:", jaccard)
print("Average Precision:", average_precision)
print("Confusion Matrix:\n", confusion)
print("Classification Report:\n", class_report)

Accuracy: 0.225
Precision: 0.6909090909090909
Recall: 0.6129032258064516
F1-Score: 0.6495726495726496
Hamming Loss: 0.25625
Jaccard Index: 0.4810126582278481
Average Precision: 0.5734604105571848
Confusion Matrix:
 [[31  7]
 [ 2  0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.82      0.87        38
           1       0.33      0.35      0.34        20
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         1

   micro avg       0.69      0.61      0.65        62
   macro avg       0.32      0.29      0.30        62
weighted avg       0.68      0.61      0.65        62
 samples avg       0.71      0.65      0.63        62



  _warn_prf(average, modifier, msg_start, len(result))


In [328]:
len(train_Y.values.flatten())

472

In [340]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

#Set the parameters of each model by cross-validation gridsearch
#from custom-perceptron import my_perceptron
models = {'KNN': KNeighborsClassifier(),
          'OVR': OneVsRestClassifier(SVC()),
        #   'GNB': OneVsRestClassifier(GaussianNB()),
          'DT': DecisionTreeClassifier(),
          'RF': RandomForestClassifier(n_jobs=-1), 
          'MLP': MLPClassifier(learning_rate='adaptive',shuffle=True, max_iter=500),
          'XGB': XGBClassifier()}

param_grid = [{'KNN__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15], 'KNN__weights': ['uniform', 'distance']},
              {'OVR__estimator__kernel': ['rbf', 'linear'], 'OVR__estimator__gamma': ['scale', 'auto'],
                'OVR__estimator__C': [10, 100, 1000]},
            #   {'GNB__priors':[None, [0.5,0.5], [0.1, 0.9], [0.000001,0.99999], [0.000000001,0.99999999]]},
              {'DT__criterion': ['entropy'], 'DT__max_depth': [6], 'DT__min_samples_leaf': [1], 'DT__min_samples_split': [4]},
              {'RF__n_estimators': [200, 600], 'RF__max_depth': [4, 10, None], 'RF__min_samples_leaf': [1, 2, 5]},
              {'MLP__hidden_layer_sizes': [150,100,50], 'MLP__activation':['relu','logistic','tanh'], 
               'MLP__solver': ['adam','lbfgs','sgd']}, #,verbose=True, early_stopping=True
              {'XGB__learning_rate': [.1,.2,.3], 'XGB__max_depth': [1, 2, 3, 4, 5, 6],
               'XGB__min_child_weight': [1,2],'XGB__subsample': [1.0, 0.5, 0.1],
               'XGB__n_estimators': [200, 600]}]

best_scores=[]
params=[]

for (classifier, model_params, name) in list(zip(models.values(), param_grid, models.keys())):
    print(classifier, model_params, name)
    print(f"\n\nTuning hyper-parameters, based on accuracy for {name} with parameter grid:\n {model_params}\n")

    pipe = Pipeline([(name, models[name])])
    clf = GridSearchCV(estimator=pipe, param_grid=model_params, cv=5, scoring='accuracy', n_jobs=-1)    
    clf.fit(train_X, train_Y) 

    print(f"Mean performance of each parameter combination based on Cross Validation")
    performance = pd.DataFrame(clf.cv_results_['params'])
    performance["Score"] = clf.cv_results_['mean_test_score']
    print(performance)

    print("\nBest parameters set found on training set:")
    print(clf.best_params_)
    params.append(clf.best_params_)

    print("\nThe scores are computed on the full evaluation set:")
    #evaluate and store scores of estimators of each category on validation set
    score = clf.score(test_X, test_Y)
    print("Accuracy:", score)
    best_scores.append(score)

    pred_Y = clf.predict(test_X)
    print(metrics.classification_report(test_Y, pred_Y, digits=5))
    # cm = confusion_matrix(test_Y, pred_Y); plt.figure()
    # plot_confusion_matrix(cm, classes=[0,1], title=name, cmap=plt.cm.Greens)
    # print("True Positives: {}, False Positives: {}, True Negatives: {}, False Negatives: {} \n\n".format(cm[0,0], cm[0,1], cm[1,1], cm[1,0]))

final_scores = dict(zip(list(models.keys()), best_scores))


KNeighborsClassifier() {'KNN__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15], 'KNN__weights': ['uniform', 'distance']} KNN


Tuning hyper-parameters, based on accuracy for KNN with parameter grid:
 {'KNN__n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15], 'KNN__weights': ['uniform', 'distance']}

Mean performance of each parameter combination based on Cross Validation
    KNN__n_neighbors KNN__weights     Score
0                  1      uniform  0.356522
1                  1     distance  0.356522
2                  3      uniform  0.331522
3                  3     distance  0.348551
4                  5      uniform  0.356159
5                  5     distance  0.381522
6                  7      uniform  0.407246
7                  7     distance  0.381159
8                  9      uniform  0.406522
9                  9     distance  0.389855
10                11      uniform  0.389130
11                11     distance  0.397826
12                13      uniform  0.346377
13                13     d

  _warn_prf(average, modifier, msg_start, len(result))


Mean performance of each parameter combination based on Cross Validation
    OVR__estimator__C OVR__estimator__gamma OVR__estimator__kernel     Score
0                  10                 scale                    rbf  0.422464
1                  10                 scale                 linear  0.378986
2                  10                  auto                    rbf  0.405797
3                  10                  auto                 linear  0.378986
4                 100                 scale                    rbf  0.381159
5                 100                 scale                 linear  0.405797
6                 100                  auto                    rbf  0.372826
7                 100                  auto                 linear  0.405797
8                1000                 scale                    rbf  0.355072
9                1000                 scale                 linear  0.380435
10               1000                  auto                    rbf  0.355072
11 

  _warn_prf(average, modifier, msg_start, len(result))


Mean performance of each parameter combination based on Cross Validation
    RF__max_depth  RF__min_samples_leaf  RF__n_estimators     Score
0             4.0                     1               200  0.474275
1             4.0                     1               600  0.465580
2             4.0                     2               200  0.465942
3             4.0                     2               600  0.448913
4             4.0                     5               200  0.431522
5             4.0                     5               600  0.439855
6            10.0                     1               200  0.414855
7            10.0                     1               600  0.422826
8            10.0                     2               200  0.448913
9            10.0                     2               600  0.439855
10           10.0                     5               200  0.431522
11           10.0                     5               600  0.431522
12            NaN                     1    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Mean performance of each parameter combination based on Cross Validation
   MLP__activation  MLP__hidden_layer_sizes MLP__solver     Score
0             relu                      150        adam  0.389130
1             relu                      150       lbfgs  0.321739
2             relu                      150         sgd  0.380072
3             relu                      100        adam  0.422464
4             relu                      100       lbfgs  0.356884
5             relu                      100         sgd  0.397464
6             relu                       50        adam  0.413768
7             relu                       50       lbfgs  0.381884
8             relu                       50         sgd  0.363043
9         logistic                      150        adam  0.396739
10        logistic                      150       lbfgs  0.356522
11        logistic                      150         sgd  0.329348
12        logistic                      100        adam  0.414493
13 

  _warn_prf(average, modifier, msg_start, len(result))


In [341]:
print(final_scores)

{'KNN': 0.425, 'OVR': 0.55, 'DT': 0.4, 'RF': 0.45, 'MLP': 0.45, 'XGB': 0.475}
