# Árboles de decisión

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import sys
import os
sys.path.append("../")

from src import funciones_problemas_regresion as fpr

In [None]:
# Read the encoded dataset, using the first CSV column as the index and then
# replacing it with a fresh 0..n-1 index.
df = pd.read_csv(
    "../datos/df_autorenew_nonulls_estand_sinout_encoded.csv",
    index_col=0,
)
df = df.reset_index(drop=True)
variable_respuesta = "price"

# Initialise the helper class, separate the response variable from the
# predictors, and split the data into train and test sets.
clase_arbol = fpr.ArbolesDecision(df, variable_respuesta)
clase_arbol.separar_variables()
clase_arbol.separar_train_test()

# Search for the best parameters, train the model and look at the metrics.
params_arbol = dict(
    max_depth=[5, 7, 9, 10, 11],
    min_samples_split=[10, 50, 100, 200],
    min_samples_leaf=[10, 50, 100, 200],
    max_leaf_nodes=[4, 6, 8, 10, 20, 30],
)

clase_arbol.grid_fit_metrics(params_arbol)

Las mejores métricas para el modelo de DecisionTreeRegressor son:
DecisionTreeRegressor(max_depth=9, max_leaf_nodes=30, min_samples_leaf=10,
                      min_samples_split=10)


Unnamed: 0,r2_scores,MAE,MSE,RMSE
train,0.636115,2501.775422,16354520.0,4044.071661
test,0.630525,2499.676654,16200910.0,4025.035573


In [3]:
# Readjust the parameter grid and run the search again.
params_arbol_2 = dict(
    max_depth=[7, 8, 9, 10, 11],
    max_leaf_nodes=[10, 20, 30, 40, 50],
    min_samples_leaf=[5, 10, 15, 20],
    min_samples_split=[5, 10, 15, 20],
)

clase_arbol.grid_fit_metrics(params_arbol_2)

Las mejores métricas para el modelo de DecisionTreeRegressor son:
DecisionTreeRegressor(max_depth=8, max_leaf_nodes=50, min_samples_leaf=5,
                      min_samples_split=5)


Unnamed: 0,r2_scores,MAE,MSE,RMSE
train,0.674299,2354.912462,14638360.0,3826.010901
test,0.668836,2351.586655,14521040.0,3810.647925


In [4]:
# Readjust the parameter grid and run the search again.
params_arbol_3 = dict(
    max_depth=[6, 7, 8, 9, 10],
    max_leaf_nodes=[40, 50, 60, 70],
    min_samples_leaf=[5, 10, 15, 20],
    min_samples_split=[5, 10, 15, 20],
)

clase_arbol.grid_fit_metrics(params_arbol_3)

Las mejores métricas para el modelo de DecisionTreeRegressor son:
DecisionTreeRegressor(max_depth=9, max_leaf_nodes=70, min_samples_leaf=20,
                      min_samples_split=5)


Unnamed: 0,r2_scores,MAE,MSE,RMSE
train,0.694544,2271.904893,13728490.0,3705.198311
test,0.690484,2269.818898,13571790.0,3683.991462


In [5]:
# Readjust the parameter grid and run the search again.
params_arbol_4 = dict(
    max_depth=[7, 8, 9, 10, 11],
    max_leaf_nodes=[60, 70, 80, 90],
    min_samples_leaf=[10, 15, 20, 30],
    min_samples_split=[5, 10, 15, 20],
)

clase_arbol.grid_fit_metrics(params_arbol_4)

Las mejores métricas para el modelo de DecisionTreeRegressor son:
DecisionTreeRegressor(max_depth=9, max_leaf_nodes=90, min_samples_leaf=30,
                      min_samples_split=5)


Unnamed: 0,r2_scores,MAE,MSE,RMSE
train,0.708208,2208.030859,13114360.0,3621.376087
test,0.703019,2209.494536,13022140.0,3608.620408


In [6]:
# Readjust the parameter grid and run the search again.
params_arbol_5 = dict(
    max_depth=[7, 8, 9, 10, 11],
    max_leaf_nodes=[80, 90, 100, 110],
    min_samples_leaf=[20, 30, 40, 50],
    min_samples_split=[5, 10, 15, 20],
)

clase_arbol.grid_fit_metrics(params_arbol_5)

Las mejores métricas para el modelo de DecisionTreeRegressor son:
DecisionTreeRegressor(max_depth=9, max_leaf_nodes=110, min_samples_leaf=40,
                      min_samples_split=5)


Unnamed: 0,r2_scores,MAE,MSE,RMSE
train,0.718125,2159.851177,12668620.0,3559.301071
test,0.711433,2164.497721,12653220.0,3557.137097


In [7]:
# Readjust the parameter grid and run the search again.
params_arbol_6 = dict(
    max_depth=[7, 8, 9, 10, 11],
    max_leaf_nodes=[100, 110, 120, 130],
    min_samples_leaf=[30, 40, 50, 60],
    min_samples_split=[5, 10, 15, 20],
)

clase_arbol.grid_fit_metrics(params_arbol_6)

Las mejores métricas para el modelo de DecisionTreeRegressor son:
DecisionTreeRegressor(max_depth=11, max_leaf_nodes=130, min_samples_leaf=40,
                      min_samples_split=5)


Unnamed: 0,r2_scores,MAE,MSE,RMSE
train,0.726343,2131.598574,12299290.0,3507.034831
test,0.71884,2139.49086,12328410.0,3511.183975


In [8]:
# Readjust the parameter grid and run the search again.
params_arbol_7 = dict(
    max_depth=[10, 11, 12, 13],
    max_leaf_nodes=[120, 130, 140, 150],
    min_samples_leaf=[30, 40, 50, 60],
    min_samples_split=[5, 10, 15, 20],
)

clase_arbol.grid_fit_metrics(params_arbol_7)

Las mejores métricas para el modelo de DecisionTreeRegressor son:
DecisionTreeRegressor(max_depth=11, max_leaf_nodes=150, min_samples_leaf=30,
                      min_samples_split=5)


Unnamed: 0,r2_scores,MAE,MSE,RMSE
train,0.733135,2096.323025,11994040.0,3463.241765
test,0.724884,2105.998219,12063400.0,3473.240773


In [None]:
# Readjust the parameter grid and run the search again.
# This grid has its own name so that it does not overwrite `params_arbol_7`,
# which the previous search cell defines with different candidate values.
params_arbol_7b = {
    "max_depth" : [10, 11, 12, 13],
    "max_leaf_nodes" : [140, 150, 160, 170, 180],
    "min_samples_leaf" : [10, 20, 30, 40, 50, 60],
    "min_samples_split" : [5, 10, 15, 20]
}

clase_arbol.grid_fit_metrics(params_arbol_7b)

Las mejores métricas para el modelo de DecisionTreeRegressor son:
DecisionTreeRegressor(max_depth=11, max_leaf_nodes=170, min_samples_leaf=20,
                      min_samples_split=15)


Unnamed: 0,r2_scores,MAE,MSE,RMSE
train,0.739462,2075.949673,11709690.0,3421.94178
test,0.73024,2088.451304,11828530.0,3439.263001


In [None]:
# Readjust the parameter grid and run the search again.
# NOTE(review): these candidate values are the same as the grid searched in
# the preceding cell, yet the two recorded best estimators differ; one of the
# outputs may be stale -- TODO confirm by re-running from a fresh kernel.
params_arbol_8 = dict(
    max_depth=[10, 11, 12, 13],
    max_leaf_nodes=[140, 150, 160, 170, 180],
    min_samples_leaf=[10, 20, 30, 40, 50, 60],
    min_samples_split=[5, 10, 15, 20],
)

clase_arbol.grid_fit_metrics(params_arbol_8)

Las mejores métricas para el modelo de DecisionTreeRegressor son:
DecisionTreeRegressor(max_depth=11, max_leaf_nodes=180, min_samples_leaf=20,
                      min_samples_split=5)


Unnamed: 0,r2_scores,MAE,MSE,RMSE
train,0.741945,2065.935816,11598080.0,3405.595737
test,0.732821,2077.019594,11715380.0,3422.773125


In [11]:
# Readjust the parameter grid and run the search again.
params_arbol_9 = dict(
    max_depth=[10, 11, 12, 13],
    max_leaf_nodes=[170, 180, 190, 200],
    min_samples_leaf=[10, 20, 30, 40],
    min_samples_split=[5, 10, 15, 20],
)

clase_arbol.grid_fit_metrics(params_arbol_9)

Las mejores métricas para el modelo de DecisionTreeRegressor son:
DecisionTreeRegressor(max_depth=11, max_leaf_nodes=200, min_samples_leaf=20,
                      min_samples_split=15)


Unnamed: 0,r2_scores,MAE,MSE,RMSE
train,0.746447,2049.242492,11395710.0,3375.753626
test,0.736245,2064.149772,11565250.0,3400.771444


In [12]:
# Readjust the parameter grid and run the search again.
params_arbol_10 = dict(
    max_depth=[10, 11, 12, 13],
    max_leaf_nodes=[200, 220, 240, 260],
    min_samples_leaf=[10, 20, 30, 40],
    min_samples_split=[5, 10, 15, 20],
)

clase_arbol.grid_fit_metrics(params_arbol_10)

Las mejores métricas para el modelo de DecisionTreeRegressor son:
DecisionTreeRegressor(max_depth=12, max_leaf_nodes=260, min_samples_leaf=10,
                      min_samples_split=5)


Unnamed: 0,r2_scores,MAE,MSE,RMSE
train,0.758958,1995.459882,10833440.0,3291.419637
test,0.747587,2012.691727,11067920.0,3326.848189


In [13]:
# Readjust the parameter grid and run the search again.
params_arbol_11 = dict(
    max_depth=[10, 11, 12, 13],
    max_leaf_nodes=[260, 300, 350, 400],
    min_samples_leaf=[5, 10, 20, 30, 40],
    min_samples_split=[5, 10, 15, 20],
)

clase_arbol.grid_fit_metrics(params_arbol_11)

Las mejores métricas para el modelo de DecisionTreeRegressor son:
DecisionTreeRegressor(max_depth=13, max_leaf_nodes=400, min_samples_leaf=10,
                      min_samples_split=10)


Unnamed: 0,r2_scores,MAE,MSE,RMSE
train,0.776205,1915.78824,10058300.0,3171.481854
test,0.763896,1939.394076,10352780.0,3217.573917


In [14]:
# Readjust the parameter grid and run the search again.
params_arbol_12 = dict(
    max_depth=[11, 12, 13],
    max_leaf_nodes=[400, 450, 500, 550],
    min_samples_leaf=[5, 10, 20, 30, 40],
    min_samples_split=[5, 10, 15, 20],
)

clase_arbol.grid_fit_metrics(params_arbol_12)

Las mejores métricas para el modelo de DecisionTreeRegressor son:


  _data = np.array(data, dtype=dtype, copy=copy,


DecisionTreeRegressor(max_depth=13, max_leaf_nodes=550, min_samples_leaf=10,
                      min_samples_split=10)


Unnamed: 0,r2_scores,MAE,MSE,RMSE
train,0.787898,1857.930271,9532746.0,3087.514526
test,0.773138,1887.339344,9947535.0,3153.971308


In [15]:
# Readjust the parameter grid and run the search again.
params_arbol_13 = dict(
    max_depth=[12, 13, 14, 15],
    max_leaf_nodes=[550, 600, 700, 800],
    min_samples_leaf=[5, 10, 20, 30, 40],
    min_samples_split=[5, 10, 15, 20],
)

clase_arbol.grid_fit_metrics(params_arbol_13)

Las mejores métricas para el modelo de DecisionTreeRegressor son:
DecisionTreeRegressor(max_depth=15, max_leaf_nodes=800, min_samples_leaf=10,
                      min_samples_split=20)


Unnamed: 0,r2_scores,MAE,MSE,RMSE
train,0.802706,1787.706416,8867231.0,2977.78957
test,0.781624,1831.809429,9575443.0,3094.421324


In [16]:
# Readjust the parameter grid and run the search again.
params_arbol_14 = dict(
    max_depth=[14, 15, 16, 17],
    max_leaf_nodes=[800, 900, 1000, 1200],
    min_samples_leaf=[5, 10, 20, 30],
    min_samples_split=[5, 10, 20, 30],
)

clase_arbol.grid_fit_metrics(params_arbol_14)

Las mejores métricas para el modelo de DecisionTreeRegressor son:
DecisionTreeRegressor(max_depth=17, max_leaf_nodes=1200, min_samples_leaf=10,
                      min_samples_split=30)


Unnamed: 0,r2_scores,MAE,MSE,RMSE
train,0.816483,1720.140263,8248020.0,2871.936549
test,0.788387,1784.23813,9278892.0,3046.127365


Cada vez hay más diferencia entre el RMSE del train y el del test, lo que me indica que puede estar comenzando a haber un poco de overfitting, por lo que la siguiente estrategia sería volver al preprocesamiento y volver a intentarlo.