In [2]:
import os
import pickle

import joblib
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from xgboost import XGBRegressor

# Load the dataset into a DataFrame. The file is semicolon-delimited and the
# path is relative, so it is resolved against the current working directory.
WINE_CSV = 'data/winequality.csv'

data = pd.read_csv(WINE_CSV, sep=";")

In [3]:
# Feature matrix (three columns) and the regression target.
feature_cols = ['alcohol', 'pH', 'sulphates']
X = data[feature_cols]
y = data['quality']

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Three candidate regressors, all capped at depth 10 and seeded identically.
rf_reg = RandomForestRegressor(max_depth=10, random_state=42)
lgb_reg = LGBMRegressor(max_depth=10, random_state=42, verbose=-100)
xgb_reg = XGBRegressor(max_depth=10, random_state=42)

# Compare the candidates with 5-fold CV on the training split. The scorer
# returns negated MAPE, so the mean is sign-flipped before printing.
for label, estimator in (("RandomForest", rf_reg), ("LGB", lgb_reg), ("XGB", xgb_reg)):
    fold_scores = cross_val_score(
        estimator,
        X_train,
        y_train,
        cv=5,
        scoring='neg_mean_absolute_percentage_error',
    )
    print(f"{label}: {-np.mean(fold_scores)}")

RandomForest: 0.11338373478738006
LGB: 0.11616817574748495
XGB: 0.10519744008779526


In [4]:
# Hyperparameter grid for XGBoost: 3 x 4 x 3 = 36 combinations, each one
# evaluated with 5-fold cross-validation on the training split.
param_grid = {
    "n_estimators": [100, 200, 1000],
    "max_depth": [1, 5, 10, 20],
    "learning_rate": [0.01, 0.1, 0.2],
}

# Seed the base estimator the same way as every other model in this notebook
# so the search is reproducible across runs.
xgb_reg = XGBRegressor(random_state=42)

# Scoring is negated MAPE (higher is better); n_jobs=-1 uses all CPU cores.
xgb_grid = GridSearchCV(
    xgb_reg,
    param_grid=param_grid,
    cv=5,
    scoring="neg_mean_absolute_percentage_error",
    n_jobs=-1,
)

xgb_grid.fit(X_train, y_train)

0,1,2
,estimator,"XGBRegressor(...ree=None, ...)"
,param_grid,"{'learning_rate': [0.01, 0.1, ...], 'max_depth': [1, 5, ...], 'n_estimators': [100, 200, ...]}"
,scoring,'neg_mean_absolute_percentage_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [5]:
# Report the winning hyperparameter combination and its mean cross-validated
# score. The score is negated MAPE, so values closer to zero are better.
best_params, best_score = xgb_grid.best_params_, xgb_grid.best_score_
print(best_params)
print(best_score)

{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 1000}
-0.10409436523914337


In [8]:

# Evaluate the grid search's refit best estimator on the held-out test split.
# Predict once and reuse the result for both metrics instead of running
# inference twice.
y_pred = xgb_grid.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"Evaluation metric RMSE: {rmse}, MAPE: {mape}")

Evaluation metric RMSE: 0.7845779110894523, MAPE: 0.09065722674131393


In [None]:
# Final model for export. Take the hyperparameters straight from the grid
# search instead of hand-copying them, so this cell cannot drift out of sync
# with the search result when the grid or the data changes.
final_params = xgb_grid.best_params_
model = XGBRegressor(**final_params, random_state=42)
model.fit(X, y)  # train on the full dataset (train + test rows)

# Persist the fitted model. NOTE: pickle files are tied to the library
# versions that wrote them and must only be loaded from trusted sources.
with open('ad_model.pkl', 'wb') as f:
    pickle.dump(model, f)

NotFittedError: need to call fit or load_model beforehand