In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import xgboost as xgb
import matplotlib.pyplot as plt
plt.style.use('dark_background')

In [11]:
# Read the encoded training set and replace `price` with its natural log
# in a single expression, so re-running the cell always starts from the CSV.
# NOTE(review): the post-log prices displayed below are ~1.8-2.2, i.e. the
# values stored in the CSV are only ~6-9. If the file is already log-scaled
# this is a second log -- confirm that is intended.
df = pd.read_csv('../data/train_data_encoded_full_1.csv').assign(
    price=lambda frame: np.log(frame['price'])
)
print('shape:', df.shape)
df.head()

shape: (36944, 8)


Unnamed: 0,quality,color,clarity,price,weight_ES,depth_percent_ES,table_percent_ES,volume_ES
0,3,6,1,1.848927,-0.625,0.357143,0.333333,-0.591767
1,4,5,5,2.217354,0.484375,0.571429,-0.333333,0.550637
2,4,4,3,2.077314,0.03125,-0.071429,0.666667,0.052269
3,2,3,1,2.124773,0.59375,0.928571,0.0,0.624043
4,3,3,4,1.88525,-0.53125,0.285714,0.666667,-0.511469


In [12]:
# Target is the (log) price column; every other column is a predictor.
y = df['price']
X = df.drop(columns=["price"])

# 80% of the rows go to training; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

def metricas(y_test, y_train, y_test_pred, y_train_pred, tipo_modelo):
    """Build a two-row table of regression metrics for the test and train splits.

    Parameters
    ----------
    y_test, y_train : array-like
        True target values of the test and train splits.
    y_test_pred, y_train_pred : array-like
        Predictions aligned with ``y_test`` and ``y_train`` respectively.
    tipo_modelo : scalar
        Label (e.g. the model name) written to the ``modelo`` column of
        every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``MAE``, ``MSE``, ``RMSE``, ``R2``, ``set``, ``modelo``;
        row 0 holds the test split, row 1 the train split.
    """
    # (label, truth, prediction) in the row order callers rely on: test first.
    particiones = (
        ("test", y_test, y_test_pred),
        ("train", y_train, y_train_pred),
    )

    filas = []
    for nombre, y_real, y_estimado in particiones:
        # Compute MSE once and derive RMSE from it instead of calling
        # mean_squared_error a second time.
        mse = metrics.mean_squared_error(y_real, y_estimado)
        filas.append({
            'MAE': metrics.mean_absolute_error(y_real, y_estimado),
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'R2': metrics.r2_score(y_real, y_estimado),
            "set": nombre,
        })

    # Named `tabla` (not `df`) so the notebook-level `df` is not shadowed.
    tabla = pd.DataFrame(filas, columns=['MAE', 'MSE', 'RMSE', 'R2', 'set'])
    tabla["modelo"] = tipo_modelo
    return tabla

In [13]:
# Hyper-parameter grid: only `colsample_bytree` varies (2 candidates);
# the other settings are pinned to a single value each.
params = {
    'max_depth': [7],
    'learning_rate': [0.02],
    'n_estimators': [750],
    'colsample_bytree': [0.9, 1],
}

# `random_state` is the documented scikit-learn-wrapper parameter for the RNG
# seed; it replaces the `seed=20` spelling while keeping the same seed value.
xgbr = xgb.XGBRegressor(random_state=20, objective='reg:squarederror')

# NOTE(review): the search runs with n_jobs=-1 while the regressor's own
# n_jobs is left at its default; if XGBoost also uses every core the two
# levels of parallelism may contend for threads -- consider pinning one.
clf = GridSearchCV(
    estimator=xgbr,
    param_grid=params,
    scoring='neg_mean_squared_error',
    verbose=3,
    n_jobs=-1,
)
clf.fit(X_train, y_train)

print("Best parameters:", clf.best_params_)
# best_score_ is the mean negated MSE over the CV folds, so the figure printed
# here is the square root of the mean fold MSE.
print("Lowest RMSE: ", np.sqrt(-clf.best_score_))

# Predictions come from the best estimator refit by GridSearchCV.
y_pred_gb_test = clf.predict(X_test)
y_pred_gb_train = clf.predict(X_train)

results_xg_boost = metricas(y_test, y_train, y_pred_gb_test, y_pred_gb_train, "XG Boost GridSearch")
results_xg_boost.style.background_gradient(cmap='coolwarm')

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END colsample_bytree=0.9, learning_rate=0.02, max_depth=7, n_estimators=750;, score=-0.000 total time=  17.0s
[CV 4/5] END colsample_bytree=0.9, learning_rate=0.02, max_depth=7, n_estimators=750;, score=-0.000 total time=  16.9s
[CV 3/5] END colsample_bytree=0.9, learning_rate=0.02, max_depth=7, n_estimators=750;, score=-0.000 total time=  16.9s
[CV 5/5] END colsample_bytree=0.9, learning_rate=0.02, max_depth=7, n_estimators=750;, score=-0.000 total time=  16.9s
[CV 2/5] END colsample_bytree=0.9, learning_rate=0.02, max_depth=7, n_estimators=750;, score=-0.000 total time=  17.4s
[CV 3/5] END colsample_bytree=1, learning_rate=0.02, max_depth=7, n_estimators=750;, score=-0.000 total time=  18.1s
[CV 1/5] END colsample_bytree=1, learning_rate=0.02, max_depth=7, n_estimators=750;, score=-0.000 total time=  18.6s
[CV 2/5] END colsample_bytree=1, learning_rate=0.02, max_depth=7, n_estimators=750;, score=-0.000 total time=  

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.009495,0.000162,0.01274,0.990107,test,XG Boost GridSearch
1,0.00856,0.000131,0.011434,0.991925,train,XG Boost GridSearch
