In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn import metrics

In [2]:
df = pd.read_csv('../data/train_data_encoded.csv')
print('shape:', df.shape)
df.head(5)

shape: (36944, 8)


Unnamed: 0,quality,color,clarity,price,weight_ES,depth_percent_ES,table_percent_ES,volume_ES
0,3,6,1,6.353,-0.625,0.357143,0.333333,-0.591767
1,4,5,5,9.183,0.484375,0.571429,-0.333333,0.550637
2,4,4,3,7.983,0.03125,-0.071429,0.666667,0.052269
3,2,3,1,8.371,0.59375,0.928571,0.0,0.624043
4,3,3,4,6.588,-0.53125,0.285714,0.666667,-0.511469


In [3]:
X = df.drop(["price"], axis = 1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size   = 0.8, random_state = 42)

print("Datos de entrenamiento")
print("-----------------------")
print(y_train.describe())
print('\n')
print("Datos de testeo")
print("-----------------------")
print(y_test.describe())

Datos de entrenamiento
-----------------------
count    29555.000000
mean         7.702645
std          0.976815
min          5.814000
25%          6.809000
50%          7.689000
75%          8.505000
max          9.842000
Name: price, dtype: float64


Datos de testeo
-----------------------
count    7389.000000
mean        7.699702
std         0.982903
min         5.787000
25%         6.802000
50%         7.695000
75%         8.511000
max         9.841000
Name: price, dtype: float64


In [4]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    resultados = {'MAE': [metrics.mean_absolute_error(y_test, y_test_pred), metrics.mean_absolute_error(y_train, y_train_pred)],
                'MSE': [metrics.mean_squared_error(y_test, y_test_pred), metrics.mean_squared_error(y_train, y_train_pred)],
                'RMSE': [np.sqrt(metrics.mean_squared_error(y_test, y_test_pred)), np.sqrt(metrics.mean_squared_error(y_train, y_train_pred))],
                'R2':  [metrics.r2_score(y_test, y_test_pred), metrics.r2_score(y_train, y_train_pred)],
                 "set": ["test", "train"]}
    df = pd.DataFrame(resultados)
    df["modelo"] = tipo_modelo
    return df

In [7]:
param = {"max_depth": [26, 32, 40],
        "max_features": [7, 9],
        "min_samples_split": [75, 100]}

decision_tree = GridSearchCV(
            estimator=DecisionTreeRegressor(),
            param_grid= param,
            cv=10,
            n_jobs=-1,
            verbose=3,
            return_train_score = True,
            scoring="neg_mean_squared_error")

%time
decision_tree.fit(X_train, y_train)
best_dt = decision_tree.best_estimator_
print('best tree:', best_dt)

y_pred_test_dt = best_dt.predict(X_test)
y_pred_train_dt = best_dt.predict(X_train)
dt_results_1 = metricas(y_test, y_train, y_pred_test_dt, y_pred_train_dt, "Decision tree 1")
dt_results_1.style.background_gradient(cmap='coolwarm')

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 6.91 µs
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 2/10] END max_depth=26, max_features=7, min_samples_split=75;, score=(train=-0.011, test=-0.013) total time=   0.5s
[CV 3/10] END max_depth=26, max_features=7, min_samples_split=75;, score=(train=-0.011, test=-0.014) total time=   0.5s
[CV 1/10] END max_depth=26, max_features=7, min_samples_split=75;, score=(train=-0.011, test=-0.013) total time=   0.5s
[CV 5/10] END max_depth=26, max_features=7, min_samples_split=75;, score=(train=-0.011, test=-0.013) total time=   0.5s
[CV 4/10] END max_depth=26, max_features=7, min_samples_split=75;, score=(train=-0.011, test=-0.014) total time=   0.6s
[CV 7/10] END max_depth=26, max_features=7, min_samples_split=75;, score=(train=-0.011, test=-0.014) total time=   0.5s
[CV 6/10] END max_depth=26, max_features=7, min_samples_split=75;, score=(train=-0.011, test=-0.013) total time=   0.6s
[CV 8/10] END max_depth=26, max_f

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.087276,0.012754,0.112932,0.986797,test,Decision tree 1
1,0.079237,0.010517,0.102552,0.988978,train,Decision tree 1


In [8]:
param = {"max_depth": [8, 10, 12],
        "max_features": [4, 8],
        "min_samples_split": [10, 50, 100]}

decision_tree = GridSearchCV(
            estimator=DecisionTreeRegressor(),
            param_grid= param,
            cv=10,
            n_jobs=-1,
            verbose=1,
            return_train_score = True,
            scoring="neg_mean_squared_error")

%time
decision_tree.fit(X_train, y_train)
best_dt = decision_tree.best_estimator_
print('best tree:', best_dt)
y_pred_test_dt = best_dt.predict(X_test)
y_pred_train_dt = best_dt.predict(X_train)
dt_results_2 = metricas(y_test, y_train, y_pred_test_dt, y_pred_train_dt, "Decision tree 2")
dt_results_2.style.background_gradient(cmap='coolwarm')

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 8.82 µs
Fitting 10 folds for each of 18 candidates, totalling 180 fits
best tree: DecisionTreeRegressor(max_depth=12, max_features=8, min_samples_split=50)


Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.085305,0.012221,0.110549,0.987348,test,Decision tree 2
1,0.076904,0.009883,0.099416,0.989641,train,Decision tree 2


In [9]:
param = {"max_depth": [12, 18],
        "max_features": [8, 12],
        "min_samples_split": [10, 50, 100]}

decision_tree = GridSearchCV(
            estimator=DecisionTreeRegressor(),
            param_grid= param,
            cv=10,
            n_jobs=-1,
            verbose=1,
            return_train_score = True,
            scoring="neg_mean_squared_error")

%time
decision_tree.fit(X_train, y_train)
best_dt = decision_tree.best_estimator_
print('best tree:', best_dt)
y_pred_test_dt = best_dt.predict(X_test)
y_pred_train_dt = best_dt.predict(X_train)
dt_results_3 = metricas(y_test, y_train, y_pred_test_dt, y_pred_train_dt, "Decision tree 3")
dt_results_3.style.background_gradient(cmap='coolwarm')

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 7.87 µs
Fitting 10 folds for each of 12 candidates, totalling 120 fits
best tree: DecisionTreeRegressor(max_depth=12, max_features=12, min_samples_split=50)


Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.085348,0.012238,0.110627,0.987331,test,Decision tree 3
1,0.076904,0.009883,0.099416,0.989641,train,Decision tree 3


In [10]:
df_results = pd.concat([dt_results_1, dt_results_2, dt_results_3], axis = 0)
df_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.087276,0.012754,0.112932,0.986797,test,Decision tree 1
1,0.079237,0.010517,0.102552,0.988978,train,Decision tree 1
0,0.085305,0.012221,0.110549,0.987348,test,Decision tree 2
1,0.076904,0.009883,0.099416,0.989641,train,Decision tree 2
0,0.085348,0.012238,0.110627,0.987331,test,Decision tree 3
1,0.076904,0.009883,0.099416,0.989641,train,Decision tree 3
