In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [2]:
# Load the encoded training data; the path is relative to the notebook's working directory.
ruta_datos = '../data/train_data_encoded.csv'
df = pd.read_csv(ruta_datos)
print('shape:', df.shape)
df.head()

shape: (28942, 7)


Unnamed: 0,weight,quality,color,clarity,table_percent,price,depth_percent
0,0.3,3,6,1,0.58,6.353,0.624
1,1.01,4,5,5,0.56,9.183,0.627
2,0.72,4,4,3,0.59,7.983,0.618
3,1.08,2,3,1,0.57,8.371,0.632
4,0.36,3,3,4,0.59,6.588,0.623


In [3]:
# Narrow the frame to the four predictors kept for modelling plus the 'price' column.
columnas_modelo = ['weight', 'quality', 'color', 'clarity', 'price']
df = df.loc[:, columnas_modelo]
df

Unnamed: 0,weight,quality,color,clarity,price
0,0.30,3,6,1,6.353
1,1.01,4,5,5,9.183
2,0.72,4,4,3,7.983
3,1.08,2,3,1,8.371
4,0.36,3,3,4,6.588
...,...,...,...,...,...
28937,1.52,3,6,1,9.093
28938,0.37,3,4,3,6.545
28939,1.01,3,4,4,8.854
28940,0.80,1,3,1,7.768


In [4]:
# Split the data into train and test sets: 'price' is the target, every other column a predictor.
y = df['price']
X = df.drop(columns=["price"])

# 80 % of the rows go to training; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)

# Print summary statistics of the target in each split, one item per line.
separador = "-----------------------"
print("Datos de entrenamiento", separador, y_train.describe(), sep="\n")
print('\n')
print("Datos de testeo", separador, y_test.describe(), sep="\n")

Datos de entrenamiento
-----------------------
count    23153.000000
mean         7.992375
std          0.993806
min          5.787000
25%          7.156000
50%          8.082000
75%          8.755000
max          9.842000
Name: price, dtype: float64


Datos de testeo
-----------------------
count    5789.000000
mean        8.006556
std         0.986775
min         5.866000
25%         7.189000
50%         8.089000
75%         8.755000
max         9.841000
Name: price, dtype: float64


In [5]:
def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    """Summarise regression errors on the test and train splits in one table.

    Parameters
    ----------
    y_test, y_train
        True target values of the test and train splits.
    y_test_pred, y_train_pred
        Predictions matching ``y_test`` and ``y_train`` respectively.
    tipo_modelo
        Label written to the ``modelo`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Two rows (test first, then train) with the columns
        ``MAE``, ``MSE``, ``RMSE``, ``R2``, ``set`` and ``modelo``.
    """
    particiones = (
        ("test", y_test, y_test_pred),
        ("train", y_train, y_train_pred),
    )

    filas = []
    for nombre, reales, predichos in particiones:
        # RMSE is the square root of the MSE, so the MSE is computed once and reused.
        mse = metrics.mean_squared_error(reales, predichos)
        filas.append({
            'MAE': metrics.mean_absolute_error(reales, predichos),
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'R2': metrics.r2_score(reales, predichos),
            "set": nombre,
        })

    tabla = pd.DataFrame(filas)
    tabla["modelo"] = tipo_modelo
    return tabla

In [6]:
# "Gradient 1": boosted trees with max_depth=9; all hyperparameters are spelled out explicitly.
gb = GradientBoostingRegressor(
    criterion='squared_error',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=9,
    random_state=0,
)
gb.fit(X_train, y_train)

# Predict on both splits so the train and test errors can be compared side by side.
y_pred_gb_train = gb.predict(X_train)
y_pred_gb_test = gb.predict(X_test)

# metricas takes (y_test, y_train, test predictions, train predictions, model label).
results_gradient_1 = metricas(
    y_test, y_train, y_pred_gb_test, y_pred_gb_train, "Gradient 1"
)
results_gradient_1.style.background_gradient(cmap='coolwarm')

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.079641,0.011105,0.105382,0.988593,test,Gradient 1
1,0.063971,0.007366,0.085823,0.992542,train,Gradient 1


In [8]:
# "Gradient 2": same configuration as the other manual runs but with shallower trees (max_depth=4).
gb = GradientBoostingRegressor(
    criterion='squared_error',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    random_state=0,
)
gb.fit(X_train, y_train)

# Predict on both splits so the train and test errors can be compared side by side.
y_pred_gb_train = gb.predict(X_train)
y_pred_gb_test = gb.predict(X_test)

# metricas takes (y_test, y_train, test predictions, train predictions, model label).
results_gradient_2 = metricas(
    y_test, y_train, y_pred_gb_test, y_pred_gb_train, "Gradient 2"
)
results_gradient_2.style.background_gradient(cmap='coolwarm')

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.080288,0.011072,0.105226,0.988627,test,Gradient 2
1,0.078663,0.010716,0.103516,0.98915,train,Gradient 2


In [9]:
# "Gradient 3": same configuration as the other manual runs with an intermediate depth (max_depth=7).
gb = GradientBoostingRegressor(
    criterion='squared_error',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=7,
    random_state=0,
)
gb.fit(X_train, y_train)

# Predict on both splits so the train and test errors can be compared side by side.
y_pred_gb_train = gb.predict(X_train)
y_pred_gb_test = gb.predict(X_test)

# metricas takes (y_test, y_train, test predictions, train predictions, model label).
results_gradient_3 = metricas(
    y_test, y_train, y_pred_gb_test, y_pred_gb_train, "Gradient 3"
)
results_gradient_3.style.background_gradient(cmap='coolwarm')

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.077512,0.010482,0.102381,0.989233,test,Gradient 3
1,0.070554,0.008739,0.093483,0.991151,train,Gradient 3


In [12]:
# Hyperparameter grid: 5 x 3 x 3 = 45 combinations (criterion and random_state are held fixed).
param = {
    "criterion": ['squared_error'],
    "n_estimators": [100, 200, 300, 400, 500],
    "learning_rate": [0.1, 0.15, 0.05],
    "random_state": [0],
    "max_depth": [6, 8, 12],
}

# Exhaustive 10-fold cross-validated search over the grid, scored by negated MSE
# and run on every available core (n_jobs=-1). fit() returns the fitted search object.
gradien_boost = GridSearchCV(
    estimator=GradientBoostingRegressor(),
    param_grid=param,
    scoring="neg_mean_squared_error",
    cv=10,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
).fit(X_train, y_train)

# Show the winning configuration.
best_gb = gradien_boost.best_estimator_
print(best_gb)

# Evaluate the fitted search object on both splits with the same metrics table as the manual runs.
y_pred_train_gb = gradien_boost.predict(X_train)
y_pred_test_gb = gradien_boost.predict(X_test)
results_gradient_4 = metricas(
    y_test, y_train, y_pred_test_gb, y_pred_train_gb, "Gradient 4"
)
results_gradient_4.style.background_gradient(cmap='coolwarm')

Fitting 10 folds for each of 45 candidates, totalling 450 fits
GradientBoostingRegressor(criterion='squared_error', learning_rate=0.05,
                          max_depth=6, n_estimators=200, random_state=0)


Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.077533,0.01045,0.102225,0.989266,test,Gradient 4
1,0.073029,0.009316,0.096517,0.990568,train,Gradient 4


In [13]:
# Stack the per-model metric tables into a single comparison table.
# ignore_index=True gives every row a unique label; without it each input keeps its
# own 0/1 index, so the labels repeat and label-based lookups such as .loc[0] would
# return one row per model instead of a single row.
df_results = pd.concat(
    [results_gradient_1, results_gradient_2, results_gradient_3, results_gradient_4],
    axis=0,
    ignore_index=True,
)
df_results

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.079641,0.011105,0.105382,0.988593,test,Gradient 1
1,0.063971,0.007366,0.085823,0.992542,train,Gradient 1
0,0.080288,0.011072,0.105226,0.988627,test,Gradient 2
1,0.078663,0.010716,0.103516,0.98915,train,Gradient 2
0,0.077512,0.010482,0.102381,0.989233,test,Gradient 3
1,0.070554,0.008739,0.093483,0.991151,train,Gradient 3
0,0.077533,0.01045,0.102225,0.989266,test,Gradient 4
1,0.073029,0.009316,0.096517,0.990568,train,Gradient 4
