In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics

In [24]:
# Read the CSV at the path below into `df`, report its (rows, columns)
# shape, and show the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='../data/train_data_encoded_full_1.csv')
print('shape:', df.shape)
df.head(n=5)

shape: (36960, 8)


Unnamed: 0,quality,color,clarity,price,weight_ES,depth_percent_ES,table_percent_ES,volume_ES
0,3,6,1,1.848927,-0.625,0.357143,0.333333,-0.828708
1,4,5,5,2.217354,0.484375,0.571429,-0.333333,0.425581
2,4,4,3,2.077314,0.03125,-0.071429,0.666667,0.047946
3,2,3,1,2.124773,0.59375,0.928571,0.0,0.471104
4,3,3,4,1.88525,-0.53125,0.285714,0.666667,-0.669188


In [26]:
# Split the data into train and test sets.
# The target is the 'price' column; every other column is used as a feature.
y = df['price']
X = df.drop(columns=['price'])

# 80 % of the rows go to training; the fixed random_state keeps the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)

# Print summary statistics of the target for each partition so the two
# distributions can be compared by eye.
print("Datos de entrenamiento", "-----------------------", y_train.describe(), sep="\n")
print('\n')
print("Datos de testeo", "-----------------------", y_test.describe(), sep="\n")

Datos de entrenamiento
-----------------------
count    29568.000000
mean         2.033940
std          0.127191
min          1.755614
25%          1.918833
50%          2.041090
75%          2.141007
max          2.286659
Name: price, dtype: float64


Datos de testeo
-----------------------
count    7392.000000
mean        2.031700
std         0.128554
min         1.760785
25%         1.914272
50%         2.037056
75%         2.140654
max         2.286659
Name: price, dtype: float64


In [27]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    """Build a two-row table of regression metrics for one model.

    Parameters
    ----------
    y_test, y_train
        True target values of the test and train partitions.
    y_test_pred, y_train_pred
        Predictions for the test and train partitions.
    tipo_modelo
        Value written to the ``modelo`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``MAE``, ``MSE``, ``RMSE``, ``R2``, ``set`` and ``modelo``;
        row 0 holds the test metrics and row 1 the train metrics.
    """
    # Test first, then train: this fixes the row order of the result.
    splits = (("test", y_test, y_test_pred), ("train", y_train, y_train_pred))

    columns = {"MAE": [], "MSE": [], "RMSE": [], "R2": [], "set": []}
    for label, actual, predicted in splits:
        mse = metrics.mean_squared_error(actual, predicted)
        columns["MAE"].append(metrics.mean_absolute_error(actual, predicted))
        columns["MSE"].append(mse)
        columns["RMSE"].append(np.sqrt(mse))  # RMSE is the square root of the MSE
        columns["R2"].append(metrics.r2_score(actual, predicted))
        columns["set"].append(label)

    table = pd.DataFrame(columns)
    table["modelo"] = tipo_modelo
    return table

In [28]:
# Configuration 1: 100 trees of depth 7, up to 7 candidate features per split.
gb = GradientBoostingRegressor(
    loss='squared_error',
    criterion='friedman_mse',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=7,
    max_features=7,
    random_state=0,
)
gb.fit(X_train, y_train)

# Predict on both partitions so train and test errors can be compared.
y_pred_gb_train = gb.predict(X_train)
y_pred_gb_test = gb.predict(X_test)

# Collect the metrics for this configuration and display them colour-coded.
results_gradient_1 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_gb_test,
    y_train_pred=y_pred_gb_train,
    tipo_modelo="Gradient 1",
)
results_gradient_1.style.background_gradient(cmap='coolwarm')

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.009615,0.000167,0.012906,0.98992,test,Gradient 1
1,0.008479,0.000127,0.01129,0.99212,train,Gradient 1


In [29]:
# Configuration 2: deeper trees (depth 9) with fewer candidate features per
# split (5) and the 'squared_error' split criterion. `loss` is not passed,
# so the estimator's default loss is used.
gb = GradientBoostingRegressor(
    criterion='squared_error',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=9,
    max_features=5,
    random_state=0,
)
gb.fit(X_train, y_train)

# Predict on both partitions so train and test errors can be compared.
y_pred_gb_train = gb.predict(X_train)
y_pred_gb_test = gb.predict(X_test)

# Collect the metrics for this configuration and display them colour-coded.
results_gradient_2 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_gb_test,
    y_train_pred=y_pred_gb_train,
    tipo_modelo="Gradient 2",
)
results_gradient_2.style.background_gradient(cmap='coolwarm')

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.009756,0.000173,0.013154,0.989529,test,Gradient 2
1,0.007176,9.5e-05,0.009738,0.994138,train,Gradient 2


In [30]:
# Configuration 3: more trees (125), a higher learning rate (0.15) and
# shallower trees (depth 6).
#
# The original cell asked for max_features=8, but the feature matrix has
# fewer columns than that (the loaded table has 8 columns and 'price' is
# removed as the target). Some scikit-learn releases reject an integer
# max_features larger than the number of features with a ValueError, so the
# requested value is capped at the number of available columns. When the cap
# applies, every feature is considered at each split.
gb = GradientBoostingRegressor(
    loss='squared_error',
    criterion='squared_error',
    n_estimators=125,
    learning_rate=0.15,
    max_depth=6,
    max_features=min(8, X_train.shape[1]),
    random_state=0,
)
gb.fit(X_train, y_train)

# Predict on both partitions so train and test errors can be compared.
y_pred_gb_test = gb.predict(X_test)
y_pred_gb_train = gb.predict(X_train)

# Collect the metrics for this configuration and display them colour-coded.
results_gradient_3 = metricas(y_test, y_train, y_pred_gb_test, y_pred_gb_train, "Gradient 3")
results_gradient_3.style.background_gradient(cmap='coolwarm')

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.009713,0.000169,0.012985,0.989796,test,Gradient 3
1,0.008621,0.000131,0.011432,0.991921,train,Gradient 3


In [31]:
# Stack the per-model metric tables into a single comparison table.
# ignore_index=True renumbers the rows 0..n-1. Without it every input keeps
# its own 0/1 labels, so the result has duplicate index values, which makes
# label-based lookups ambiguous and prevents Styler functions such as
# background_gradient from being applied to the combined table.
df_results = pd.concat(
    [results_gradient_1, results_gradient_2, results_gradient_3],
    axis=0,
    ignore_index=True,
)
df_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.009615,0.000167,0.012906,0.98992,test,Gradient 1
1,0.008479,0.000127,0.01129,0.99212,train,Gradient 1
0,0.009756,0.000173,0.013154,0.989529,test,Gradient 2
1,0.007176,9.5e-05,0.009738,0.994138,train,Gradient 2
0,0.009713,0.000169,0.012985,0.989796,test,Gradient 3
1,0.008621,0.000131,0.011432,0.991921,train,Gradient 3
