In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics

In [2]:
# Load the encoded training data (path is relative to the notebook's folder),
# report its dimensions and preview the first rows.
csv_path = '../data/train_data_encoded_full_1.csv'
df = pd.read_csv(csv_path)
print('shape:', df.shape)
df.head()

shape: (36944, 8)


Unnamed: 0,quality,color,clarity,price,weight_ES,depth_percent_ES,table_percent_ES,volume_ES
0,3,6,1,6.353,-0.625,0.357143,0.333333,-0.591767
1,4,5,5,9.183,0.484375,0.571429,-0.333333,0.550637
2,4,4,3,7.983,0.03125,-0.071429,0.666667,0.052269
3,2,3,1,8.371,0.59375,0.928571,0.0,0.624043
4,3,3,4,6.588,-0.53125,0.285714,0.666667,-0.511469


In [3]:
# Split the data into train and test sets.
# The target is the "price" column; every other column is used as a feature.
y = df['price']
X = df.drop(columns=["price"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,  # fixed seed so the same rows land in each split on re-run
)

# Summary statistics of the target in each split, to check that both
# splits have a similar distribution.
print("Datos de entrenamiento", "-----------------------", sep="\n")
print(y_train.describe())
print('\n')
print("Datos de testeo", "-----------------------", sep="\n")
print(y_test.describe())

Datos de entrenamiento
-----------------------
count    29555.000000
mean         7.702645
std          0.976815
min          5.814000
25%          6.809000
50%          7.689000
75%          8.505000
max          9.842000
Name: price, dtype: float64


Datos de testeo
-----------------------
count    7389.000000
mean        7.699702
std         0.982903
min         5.787000
25%         6.802000
50%         7.695000
75%         8.511000
max         9.841000
Name: price, dtype: float64


In [4]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    """Build a two-row table of regression metrics for one model.

    Parameters
    ----------
    y_test, y_train : array-like
        True target values of the test and train splits.
    y_test_pred, y_train_pred : array-like
        Predictions for the matching split.
    tipo_modelo : object
        Label stored in the ``modelo`` column of both rows.

    Returns
    -------
    pandas.DataFrame
        Row 0 describes the test split and row 1 the train split. Columns,
        in order: ``MAE``, ``MSE``, ``RMSE``, ``R2``, ``set``, ``modelo``.
    """
    filas = []
    for nombre_set, y_real, y_pred in (
        ("test", y_test, y_test_pred),
        ("train", y_train, y_train_pred),
    ):
        # Compute the MSE once per split and derive the RMSE from it.
        mse = metrics.mean_squared_error(y_real, y_pred)
        filas.append({
            "MAE": metrics.mean_absolute_error(y_real, y_pred),
            "MSE": mse,
            "RMSE": np.sqrt(mse),
            "R2": metrics.r2_score(y_real, y_pred),
            "set": nombre_set,
        })

    # Named `resultados` rather than `df` so it does not shadow the
    # dataset frame the notebook keeps in the global `df`.
    resultados = pd.DataFrame(filas, columns=["MAE", "MSE", "RMSE", "R2", "set"])
    resultados["modelo"] = tipo_modelo
    return resultados

In [5]:
# Experiment "Gradient 1": n_estimators=100, learning_rate=0.1, max_depth=8,
# split criterion 'friedman_mse'.
gb = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=8,
    loss='squared_error',
    criterion='friedman_mse',
    random_state=0,
)
gb.fit(X_train, y_train)

# Predict on both splits so train and test errors can be compared side by side.
y_pred_gb_train = gb.predict(X_train)
y_pred_gb_test = gb.predict(X_test)

results_gradient_1 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_gb_test,
    y_train_pred=y_pred_gb_train,
    tipo_modelo="Gradient 1",
)
results_gradient_1.style.background_gradient(cmap='coolwarm')

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.070603,0.008793,0.093771,0.990897,test,Gradient 1
1,0.058625,0.006028,0.077638,0.993683,train,Gradient 1


In [6]:
# Experiment "Gradient 2": n_estimators=120, learning_rate=0.1, max_depth=7,
# split criterion 'friedman_mse'.
gb = GradientBoostingRegressor(
    n_estimators=120,
    learning_rate=0.1,
    max_depth=7,
    loss='squared_error',
    criterion='friedman_mse',
    random_state=0,
)
gb.fit(X_train, y_train)

# Predict on both splits so train and test errors can be compared side by side.
y_pred_gb_train = gb.predict(X_train)
y_pred_gb_test = gb.predict(X_test)

results_gradient_2 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_gb_test,
    y_train_pred=y_pred_gb_train,
    tipo_modelo="Gradient 2",
)
results_gradient_2.style.background_gradient(cmap='coolwarm')

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.070665,0.008708,0.093315,0.990985,test,Gradient 2
1,0.062614,0.006789,0.082393,0.992885,train,Gradient 2


In [7]:
# Experiment "Gradient 3": n_estimators=125, learning_rate=0.15, max_depth=6,
# split criterion 'friedman_mse'.
gb = GradientBoostingRegressor(
    n_estimators=125,
    learning_rate=0.15,
    max_depth=6,
    loss='squared_error',
    criterion='friedman_mse',
    random_state=0,
)
gb.fit(X_train, y_train)

# Predict on both splits so train and test errors can be compared side by side.
y_pred_gb_train = gb.predict(X_train)
y_pred_gb_test = gb.predict(X_test)

results_gradient_3 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_gb_test,
    y_train_pred=y_pred_gb_train,
    tipo_modelo="Gradient 3",
)
results_gradient_3.style.background_gradient(cmap='coolwarm')

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.071491,0.008885,0.094258,0.990802,test,Gradient 3
1,0.064578,0.007181,0.084738,0.992474,train,Gradient 3


In [8]:
# Experiment "Gradient 4": n_estimators=100, learning_rate=0.15, max_depth=7,
# split criterion 'squared_error'.
gb = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.15,
    max_depth=7,
    loss='squared_error',
    criterion='squared_error',
    random_state=0,
)
gb.fit(X_train, y_train)

# Predict on both splits so train and test errors can be compared side by side.
y_pred_gb_train = gb.predict(X_train)
y_pred_gb_test = gb.predict(X_test)

results_gradient_4 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_gb_test,
    y_train_pred=y_pred_gb_train,
    tipo_modelo="Gradient 4",
)
results_gradient_4.style.background_gradient(cmap='coolwarm')

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.070944,0.008883,0.09425,0.990804,test,Gradient 4
1,0.061619,0.006607,0.081282,0.993076,train,Gradient 4


In [9]:
# Experiment "Gradient 5": n_estimators=110, learning_rate=0.12, max_depth=7,
# split criterion 'squared_error'.
gb = GradientBoostingRegressor(
    n_estimators=110,
    learning_rate=0.12,
    max_depth=7,
    loss='squared_error',
    criterion='squared_error',
    random_state=0,
)
gb.fit(X_train, y_train)

# Predict on both splits so train and test errors can be compared side by side.
y_pred_gb_train = gb.predict(X_train)
y_pred_gb_test = gb.predict(X_test)

results_gradient_5 = metricas(
    y_test=y_test,
    y_train=y_train,
    y_test_pred=y_pred_gb_test,
    y_train_pred=y_pred_gb_train,
    tipo_modelo="Gradient 5",
)
results_gradient_5.style.background_gradient(cmap='coolwarm')

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.07099,0.008789,0.09375,0.990901,test,Gradient 5
1,0.062201,0.006696,0.081827,0.992983,train,Gradient 5


In [10]:
# Stack the per-experiment metric tables into a single comparison frame.
# ignore_index=True gives every row a unique label; without it each table
# keeps its own 0/1 index, so the combined frame has repeated labels and
# a lookup such as df_results.loc[0] returns one row per experiment.
df_results = pd.concat(
    [
        results_gradient_1,
        results_gradient_2,
        results_gradient_3,
        results_gradient_4,
        results_gradient_5,
    ],
    axis=0,
    ignore_index=True,
)
df_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.070603,0.008793,0.093771,0.990897,test,Gradient 1
1,0.058625,0.006028,0.077638,0.993683,train,Gradient 1
0,0.070665,0.008708,0.093315,0.990985,test,Gradient 2
1,0.062614,0.006789,0.082393,0.992885,train,Gradient 2
0,0.071491,0.008885,0.094258,0.990802,test,Gradient 3
1,0.064578,0.007181,0.084738,0.992474,train,Gradient 3
0,0.070944,0.008883,0.09425,0.990804,test,Gradient 4
1,0.061619,0.006607,0.081282,0.993076,train,Gradient 4
0,0.07099,0.008789,0.09375,0.990901,test,Gradient 5
1,0.062201,0.006696,0.081827,0.992983,train,Gradient 5
