#Decision Tree Regressor



## Importación de librerías y datos

Por medio de nuestra libería ESIOS_contoller.py importamos nuestro último dataset de datos y lo parseamos para su uso. Sirve tanto como para Drive como jupiter.

In [0]:
import json, urllib, datetime, pickle, time
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.metrics import *
from keras.models import *
from keras.layers import *
from keras.optimizers import *
from scipy.stats import *
from importlib.machinery import SourceFileLoader


try:
  from google.colab import drive
  drive.mount('/content/drive')
  path = '/content/drive/My Drive/TFM/01.Utils/ESIOS_contoller.py'
  in_colab = True
except:
  path = '../utils/ESIOS_contoller.py'
  in_colab = False
  

esios_assembler = SourceFileLoader('esios', path).load_module()

esios_controller = esios_assembler.ESIOS(in_colab)
data_consumo = esios_controller.get_data('non-secuencial')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Mostrando los datos de data_total_for_non_serial.csv
(30555, 34)
________________________________________________________________________________


## Preparación de los datos

In [0]:
x_data = esios_controller.get_data_real_time()
y_data = esios_controller.get_target_data()

# Split the data
x_train, x_valid, y_train, y_valid = train_test_split(x_data, y_data, test_size=0.33, shuffle=False)

#Reshape for the LSTM
x_train = x_train.to_numpy()
x_valid = x_valid.to_numpy()
y_train = y_train.to_numpy()
y_valid = y_valid.to_numpy()

print('Xtrain_dim:', x_train.shape)
print('Ytrain_dim:', y_train.shape)

Xtrain_dim: (20471, 30)
Ytrain_dim: (20471, 1)


##Modelo

In [0]:
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(x_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=0, splitter='best')

In [0]:
print('MSE en entrenamiento es: ', mean_squared_error(regressor.predict(x_train), y_train))
print('MSE en validación es: ', mean_squared_error(regressor.predict(x_valid), y_valid))

x_predict = regressor.predict(x_valid)
print('MAE: ', mean_absolute_error(x_predict, y_valid))
print('MedianAE: ', median_absolute_error(x_predict, y_valid))
print('RMSE: ', mean_squared_log_error(x_predict, y_valid))
print('Variance: ', explained_variance_score(x_predict, y_valid))
print('Max Error: ', max_error(x_predict, y_valid))

MSE en entrenamiento es:  0.0
MSE en validación es:  197.48469588456962
MAE:  10.107732050773501
MedianAE:  7.640000000000001
RMSE:  0.06409100805384184
Variance:  -0.13850035353867396
Max Error:  121.31


## Normalizados

In [0]:
scaler = StandardScaler()
x_train_est = scaler.fit_transform(x_train)
y_train_est = scaler.fit_transform(y_train)
x_valid_est = scaler.fit_transform(x_valid)
y_valid_est = scaler.fit_transform(y_valid)

In [0]:
regressor_standarized = DecisionTreeRegressor(random_state=0)
regressor_standarized.fit(x_train_est, y_train_est)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=0, splitter='best')

In [0]:
print('MSE en entrenamiento es: ', mean_squared_error(regressor.predict(x_train), y_train))
print('MSE en validación es: ', mean_squared_error(regressor.predict(x_valid), y_valid))

x_predict = regressor.predict(x_valid)
print('MAE: ', mean_absolute_error(x_predict, y_valid))
print('MedianAE: ', median_absolute_error(x_predict, y_valid))
print('RMSE: ', mean_squared_log_error(x_predict, y_valid))
print('Variance: ', explained_variance_score(x_predict, y_valid))
print('Max Error: ', max_error(x_predict, y_valid))

MSE en entrenamiento es:  0.0
MSE en validación es:  197.48469588456962
MAE:  10.107732050773501
MedianAE:  7.640000000000001
RMSE:  0.06409100805384184
Variance:  -0.13850035353867396
Max Error:  121.31


## Optimización de modelos

In [0]:
param_grid = {"criterion": ["mse", "mae"],
              "min_samples_split": [10, 20, 40],
              "max_depth": [2, 6, 8],
              "min_samples_leaf": [20, 40, 100],
              "max_leaf_nodes": [5, 20, 100],
              }

clf = GridSearchCV(regressor, param_grid, cv=5)
clf.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=0,
                                             splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ['mse', 'mae'], 'max_depth': [2, 6, 8],
                         'max_leaf_nodes': [5, 20, 100],
                         'min_samples_leaf': [20, 40, 100],
                    

In [0]:
print(clf.best_params_)


{'criterion': 'mse', 'max_depth': 8, 'max_leaf_nodes': 100, 'min_samples_leaf': 40, 'min_samples_split': 10}


## Try best model

In [0]:
regressor = DecisionTreeRegressor(criterion='mse', max_depth= 8, max_leaf_nodes= 200, min_samples_leaf= 40, min_samples_split= 2)
model_fit = regressor.fit(x_train, y_train)

In [0]:
from sklearn.model_selection import cross_val_score

cross_val_score(regressor, x_valid, y_valid, cv=10)

array([ 0.03627348, -0.49693891, -0.48191267, -0.18630077, -0.34506269,
        0.18587934, -0.02767264, -0.32820391, -0.49538299,  0.00825829])

In [0]:
print('MSE en entrenamiento es: ', mean_squared_error(regressor.predict(x_train), y_train))
print('MSE en validación es: ', mean_squared_error(regressor.predict(x_valid), y_valid))

x_predict = regressor.predict(x_valid)
print('MAE: ', mean_absolute_error(x_predict, y_valid))
print('MedianAE: ', median_absolute_error(x_predict, y_valid))
print('RMSE: ', mean_squared_log_error(x_predict, y_valid))
print('Variance: ', explained_variance_score(x_predict, y_valid))
print('Max Error: ', max_error(x_predict, y_valid))

MSE en entrenamiento es:  82.41332068610687
MSE en validación es:  100.83844585427774
MAE:  6.634299708310123
MedianAE:  5.088858695652199
RMSE:  0.058220072665257756
Variance:  -0.2333943027858738
Max Error:  116.48024999999998
