In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score,train_test_split, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error,r2_score,roc_curve,auc,precision_recall_curve, accuracy_score, \
recall_score, precision_score, confusion_matrix
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor,GradientBoostingClassifier, BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier,AdaBoostRegressor,AdaBoostClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
import itertools as it
import time as time
import xgboost as xgb
from pyearth import Earth

# Load and Prepare Data

In [65]:
red_train_x = pd.read_csv('red_train_x.csv')
white_train_x = pd.read_csv('white_train_x.csv')
red_train_y = pd.read_csv('red_train_y.csv')
white_train_y = pd.read_csv('white_train_y.csv')
red_test_x = pd.read_csv('red_test_x.csv')
white_test_x = pd.read_csv('white_test_x.csv')
red_test_y = pd.read_csv('red_test_y.csv')
white_test_y = pd.read_csv('white_test_y.csv')
combined_train_x = pd.read_csv("combined_train_x")
combined_train_y = pd.read_csv("combined_train_y")
combined_test_x = pd.read_csv("combined_test_x")
combined_test_y = pd.read_csv("combined_test_y")

# Create Base XGBoost Models

## Red Model

In [66]:
red_model = xgb.XGBRegressor().fit(red_train_x, red_train_y)
mae = mean_absolute_error(red_test_y, np.around(red_model.predict(red_test_x)))
print('MAE for red base model: ', mae)

MAE for red base model:  0.4225


In [67]:
np.sqrt(mean_squared_error(red_test_y, np.around(red_model.predict(red_test_x))))

0.7053367989832943

## White Model

In [68]:
white_model = xgb.XGBRegressor().fit(white_train_x, white_train_y)
mae = mean_absolute_error(white_test_y, np.around(white_model.predict(white_test_x)))
print('MAE for white base model: ', mae)

MAE for white base model:  0.37551020408163266


In [69]:
np.sqrt(mean_squared_error(white_test_y, np.around(white_model.predict(white_test_x))))

0.6737043498600912

## Combined Model

In [70]:
combined_model = xgb.XGBRegressor().fit(combined_train_x, combined_train_y)
mae = mean_absolute_error(combined_test_y, np.around(combined_model.predict(combined_test_x)))
print('MAE for combined base model: ', mae)

MAE for combined base model:  0.3944615384615385


In [71]:
np.sqrt(mean_squared_error(combined_test_y, np.around(combined_model.predict(combined_test_x))))

0.689704507519204

# Tuning Models

## Red Model

### MAE OPTIMAL

In [72]:
start_time = time.time()
param_grid = {'max_depth': [2, 8, 14],
              'n_estimators': [50, 100, 500, 1000],
              'learning_rate': [0.001, 0.01, 0.1],
              'subsample': [0.5, 0.75, 1],
              'colsample_bytree': [0.5, 0.75, 1.0],
              'reg_lambda':[0, 1, 10],
              'gamma': [0, 10]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
grid_search = RandomizedSearchCV(estimator=xgb.XGBRegressor(random_state=1),                                                       
                             param_distributions = param_grid,
                                 n_iter = 200,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_absolute_error')
grid_result = grid_search.fit(red_train_x, red_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits




Best: 0.630872 using {'subsample': 0.75, 'reg_lambda': 0, 'n_estimators': 100, 'max_depth': 14, 'learning_rate': 0.1, 'gamma': 0, 'colsample_bytree': 1.0}
Time taken =  1.3804953535397848  minutes


In [97]:
start_time = time.time()
param_grid = {'max_depth': [12, 14, 16],
              'n_estimators': [100, 200, 300, 400],
              'learning_rate': [0.05, 0.1, 0.5],
              'colsample_bytree': [0.8, 0.9, 1],
              'reg_lambda':[0, 0.1, 0.2],
              'gamma': [0, 0.5, 1]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(random_state=1, subsample = 0.75),                                                       
                             param_grid = param_grid,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_absolute_error')
grid_result = grid_search.fit(red_train_x, red_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Best: 0.624138 using {'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 12, 'n_estimators': 400, 'reg_lambda': 0.2}
Time taken =  9.039521582921346  minutes


In [99]:
start_time = time.time()
param_grid = {'max_depth': [8,9,10,11,12],
              'n_estimators': [400],
              'learning_rate': [0.05],
              'colsample_bytree': [1],
              'reg_lambda':[0.2, 0.3, 0.4, 0.5],
              'gamma': [0]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(random_state=1, subsample = 0.75),                                                       
                             param_grid = param_grid,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_absolute_error')
grid_result = grid_search.fit(red_train_x, red_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best: 0.624138 using {'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 12, 'n_estimators': 400, 'reg_lambda': 0.2}
Time taken =  0.28086400032043457  minutes


In [102]:
red_model_tuned = xgb.XGBRegressor(random_state = 1, gamma = 0, learning_rate = 0.05, max_depth = 9, n_estimators = 400,
                                  reg_lambda = 0.1, subsample = 0.75, colsample_bytree = 0.8).fit(red_train_x, red_train_y)
mae = mean_absolute_error(red_test_y, np.around(red_model_tuned.predict(red_test_x)))
print('MAE for red tuned model: ', mae)

MAE for red tuned model:  0.3725


In [87]:
np.sqrt(mean_squared_error(red_test_y, np.around(red_model_tuned.predict(red_test_x))))

0.6652067347825036

### RMSE OPTIMAL

In [88]:
start_time = time.time()
param_grid = {'max_depth': [2, 8, 14],
              'n_estimators': [50, 100, 500, 1000],
              'learning_rate': [0.001, 0.01, 0.1],
              'subsample': [0.5, 0.75, 1],
              'colsample_bytree': [0.5, 0.75, 1.0],
              'reg_lambda':[0, 1, 10],
              'gamma': [0, 10]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
grid_search = RandomizedSearchCV(estimator=xgb.XGBRegressor(random_state=1),                                                       
                             param_distributions = param_grid,
                                 n_iter = 200,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_squared_error')
grid_result = grid_search.fit(red_train_x, red_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits




Best: 0.577629 using {'subsample': 0.75, 'reg_lambda': 1, 'n_estimators': 1000, 'max_depth': 8, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.75}
Time taken =  1.7339248339335123  minutes


In [90]:
start_time = time.time()
param_grid = {'max_depth': [7, 8, 9, 10],
              'n_estimators': [1000, 2000, 3000],
              'learning_rate': [0.01, 0.025, 0.05],
              'subsample': [0.75],
              'colsample_bytree': [0.75],
              'reg_lambda':[0.5, 1, 1.5],
              'gamma': [0, 0.1, 0.2]}

cv = KFold(n_splits=2,shuffle=True,random_state=1)
grid_search = RandomizedSearchCV(estimator=xgb.XGBRegressor(random_state=1),                                                       
                             param_distributions = param_grid,
                                 n_iter = 200,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_squared_error')
grid_result = grid_search.fit(red_train_x, red_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 2 folds for each of 200 candidates, totalling 400 fits
Best: 0.626779 using {'subsample': 0.75, 'reg_lambda': 0.5, 'n_estimators': 2000, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.75}
Time taken =  2.272305949529012  minutes


In [91]:
start_time = time.time()
param_grid = {'max_depth': [10,11,12,13],
              'n_estimators': [1500, 2000, 2500],
              'learning_rate': [0.01],
              'subsample': [0.75],
              'colsample_bytree': [0.75],
              'reg_lambda':[0.1,0.2,0.3,0.4,0.5],
              'gamma': [0, 0.01]}

cv = KFold(n_splits=2,shuffle=True,random_state=1)
grid_search = RandomizedSearchCV(estimator=xgb.XGBRegressor(random_state=1),                                                       
                             param_distributions = param_grid,
                                 n_iter = 200,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_squared_error')
grid_result = grid_search.fit(red_train_x, red_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")



Fitting 2 folds for each of 120 candidates, totalling 240 fits
Best: 0.624200 using {'subsample': 0.75, 'reg_lambda': 0.2, 'n_estimators': 1500, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.75}
Time taken =  1.9386982878049215  minutes


In [93]:
red_model_tuned_rmse = xgb.XGBRegressor(random_state = 1, gamma = 0, learning_rate = 0.01, max_depth = 10, n_estimators = 1500,
                                  reg_lambda = 0.2, subsample = 0.75, colsample_bytree = 0.75).fit(red_train_x, red_train_y)
mae = mean_absolute_error(red_test_y, np.around(red_model_tuned_rmse.predict(red_test_x)))
print('MAE for red tuned model: ', mae)

MAE for red tuned model:  0.3675


In [94]:
np.sqrt(mean_squared_error(red_test_y, np.around(red_model_tuned_rmse.predict(red_test_x))))

0.6576473218982952

## White Model

In [105]:
start_time = time.time()
param_grid = {'max_depth': [4,6,8],
              'n_estimators': [100, 500, 1000],
              'learning_rate': [0.01, 0.05, 0.1],
              'subsample': [0.5, 0.75, 1],
              'colsample_bytree': [0.5, 0.75, 1],
              'reg_lambda':[0, 1, 10],
              'gamma': [0, 10, 100]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
grid_search = RandomizedSearchCV(estimator=xgb.XGBRegressor(random_state=1),                                                       
                             param_distributions = param_grid,
                                 n_iter = 200,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_squared_error')
grid_result = grid_search.fit(white_train_x, white_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best: 0.635257 using {'subsample': 0.75, 'reg_lambda': 0, 'n_estimators': 1000, 'max_depth': 8, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.5}
Time taken =  3.8692580978075664  minutes


In [106]:
start_time = time.time()
param_grid = {'max_depth': [8, 10, 12, 14],
              'n_estimators': [1000, 2000, 3000],
              'learning_rate': [0.001, 0.01],
              'subsample': [0.75],
              'colsample_bytree': [0.5],
              'reg_lambda':[0, 0.2, 0.4, 0.6, 0.8],
              'gamma': [0]}

cv = KFold(n_splits=2,shuffle=True,random_state=1)
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(random_state=1),                                                       
                             param_grid = param_grid,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_squared_error')
grid_result = grid_search.fit(white_train_x, white_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 2 folds for each of 120 candidates, totalling 240 fits




Best: 0.651685 using {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 14, 'n_estimators': 1000, 'reg_lambda': 0.6, 'subsample': 0.75}
Time taken =  3.6731221675872803  minutes


In [107]:
start_time = time.time()
param_grid = {'max_depth': [13, 14, 15, 16],
              'n_estimators': [500, 1000, 1500],
              'learning_rate': [0.01],
              'subsample': [0.75],
              'colsample_bytree': [0.5],
              'reg_lambda':[0.6],
              'gamma': [0]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(random_state=1),                                                       
                             param_grid = param_grid,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_squared_error')
grid_result = grid_search.fit(white_train_x, white_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best: 0.625868 using {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 16, 'n_estimators': 1000, 'reg_lambda': 0.6, 'subsample': 0.75}
Time taken =  1.2273271163304647  minutes


In [113]:
white_model_tuned = xgb.XGBRegressor(random_state = 1, gamma = 0, learning_rate = 0.01, max_depth = 9, n_estimators = 1000,
                                  reg_lambda = 0.6, subsample = 0.75, colsample_bytree = 0.5).fit(white_train_x, white_train_y)
mae = mean_absolute_error(white_test_y, np.around(white_model_tuned.predict(white_test_x)))
rmse = np.sqrt(mean_squared_error(white_test_y, np.around(white_model_tuned.predict(white_test_x))))

print('RMSE for white tuned model: ', rmse)
print('MAE for white tuned model: ', mae)

RMSE for white tuned model:  0.6375975315548971
MAE for white tuned model:  0.34775510204081633


## Combined Model

In [None]:
start_time = time.time()
param_grid = {'max_depth': [4,6,8],
              'n_estimators': [100, 500, 1000],
              'learning_rate': [0.01, 0.05, 0.1],
              'subsample': [0.5, 0.75, 1],
              'colsample_bytree': [0.5, 0.75, 1],
              'reg_lambda':[0, 1, 10],
              'gamma': [0, 10, 100]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
grid_search = RandomizedSearchCV(estimator=xgb.XGBRegressor(random_state=1),                                                       
                             param_distributions = param_grid,
                                 n_iter = 200,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_squared_error')
grid_result = grid_search.fit(combined_train_x, combined_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [46]:
start_time = time.time()
param_grid = {'max_depth': [12, 14, 16],
              'n_estimators': [1000, 2000, 3000],
              'learning_rate': [0.01, 0.025, 0.05],
              'subsample': [0.75],
              'reg_lambda':[0, 0.1, 0.2],
              'gamma': [0, 1]}

cv = KFold(n_splits=2,shuffle=True,random_state=1)
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(random_state=1, subsample = 0.5),                                                       
                             param_grid = param_grid,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_absolute_error')
grid_result = grid_search.fit(combined_train_x, combined_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 2 folds for each of 162 candidates, totalling 324 fits




Best: 0.677211 using {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 16, 'n_estimators': 3000, 'reg_lambda': 0.2, 'subsample': 0.75}
Time taken =  11.891242869695027  minutes


In [47]:
start_time = time.time()
param_grid = {'max_depth': [16, 17, 18],
              'n_estimators': [3000, 4000],
              'learning_rate': [0.01],
              'subsample': [0.75],
              'reg_lambda':[0.2, 0.3, 0.4],
              'gamma': [0]}

cv = KFold(n_splits=2,shuffle=True,random_state=1)
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(random_state=1, subsample = 0.5),                                                       
                             param_grid = param_grid,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_absolute_error')
grid_result = grid_search.fit(combined_train_x, combined_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 2 folds for each of 18 candidates, totalling 36 fits
Best: 0.676535 using {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 16, 'n_estimators': 4000, 'reg_lambda': 0.3, 'subsample': 0.75}
Time taken =  2.294661017258962  minutes


In [59]:
combined_model_tuned = xgb.XGBRegressor(random_state = 1, gamma = 0, learning_rate = 0.01, max_depth = 16, n_estimators = 4000,
                                  reg_lambda = 0.3, subsample = 0.75).fit(combined_train_x, combined_train_y)
mae = mean_absolute_error(combined_test_y, np.around(combined_model_tuned.predict(combined_test_x)))
print('MAE for combined tuned model: ', mae)

MAE for combined tuned model:  0.35507692307692307


In [53]:
np.sqrt(mean_squared_error(combined_test_y, np.around(combined_model_tuned.predict(combined_test_x))))

0.6596036105699566

# Summary

## Red Wine

Base model MAE: 0.3855

Tuned model MAE: 0.3782

Improvement: 0.0073 (0.02%)

## White Wine

Base model MAE:  0.4716

Tuned model MAE: 0.4023

Improvement: 0.0693 (14.69%)

## Combined Wines

Base model MAE: 0.4752

Tuned model MAE: 0.3941

Improvement: 0.0711 (14.96%)