In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score,train_test_split, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error,r2_score,roc_curve,auc,precision_recall_curve, accuracy_score, \
recall_score, precision_score, confusion_matrix
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor,GradientBoostingClassifier, BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier,AdaBoostRegressor,AdaBoostClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
import itertools as it
import time as time
import xgboost as xgb
from pyearth import Earth

# Load and Prepare Data

In [65]:
# Read the train/test feature (x) and target (y) tables for each wine dataset.
# Each x/y pair is loaded on one line so the matching files stay side by side.

# Red wine
red_train_x, red_train_y = pd.read_csv('red_train_x.csv'), pd.read_csv('red_train_y.csv')
red_test_x, red_test_y = pd.read_csv('red_test_x.csv'), pd.read_csv('red_test_y.csv')

# White wine
white_train_x, white_train_y = pd.read_csv('white_train_x.csv'), pd.read_csv('white_train_y.csv')
white_test_x, white_test_y = pd.read_csv('white_test_x.csv'), pd.read_csv('white_test_y.csv')

# Combined (red + white)
# NOTE(review): these four paths have no '.csv' extension, unlike the red/white
# files above -- confirm they match the actual file names on disk.
combined_train_x, combined_train_y = pd.read_csv("combined_train_x"), pd.read_csv("combined_train_y")
combined_test_x, combined_test_y = pd.read_csv("combined_test_x"), pd.read_csv("combined_test_y")

# Create Base XGBoost Models

## Red Model

In [183]:
# Baseline: XGBoost regressor with library-default hyperparameters, fit on the red training data.
red_model = xgb.XGBRegressor().fit(red_train_x, red_train_y)

# Predict on the test set once and reuse the rounded predictions for both
# metrics instead of running inference twice.
red_base_pred = np.around(red_model.predict(red_test_x))
mae = mean_absolute_error(red_test_y, red_base_pred)
rmse = np.sqrt(mean_squared_error(red_test_y, red_base_pred))
print('RMSE for red base model: ', rmse)
print('MAE for red base model: ', mae)

RMSE for red base model:  0.7053367989832943
MAE for red base model:  0.4225


## White Model

In [184]:
# Baseline: XGBoost regressor with library-default hyperparameters, fit on the white training data.
white_model = xgb.XGBRegressor().fit(white_train_x, white_train_y)

# Predict on the test set once and reuse the rounded predictions for both
# metrics instead of running inference twice.
white_base_pred = np.around(white_model.predict(white_test_x))
mae = mean_absolute_error(white_test_y, white_base_pred)
rmse = np.sqrt(mean_squared_error(white_test_y, white_base_pred))
print('RMSE for white base model: ', rmse)
print('MAE for white base model: ', mae)

RMSE for white base model:  0.6737043498600912
MAE for white base model:  0.37551020408163266


## Combined Model

In [185]:
# Baseline: XGBoost regressor with library-default hyperparameters, fit on the combined training data.
combined_model = xgb.XGBRegressor().fit(combined_train_x, combined_train_y)

# Predict on the test set once and reuse the rounded predictions for both
# metrics instead of running inference twice.
combined_base_pred = np.around(combined_model.predict(combined_test_x))
mae = mean_absolute_error(combined_test_y, combined_base_pred)
rmse = np.sqrt(mean_squared_error(combined_test_y, combined_base_pred))
print('RMSE for combined base model: ', rmse)
print('MAE for combined base model: ', mae)

RMSE for combined base model:  0.689704507519204
MAE for combined base model:  0.3944615384615385


# Tuning Models

## Red Model

In [149]:
start_time = time.time()

# Coarse search space for the red model: 3 values for each of 7 hyperparameters.
# Only n_iter=200 of the possible combinations are sampled below.
param_grid = {'max_depth': [4, 6, 8],
              'n_estimators': [100, 500, 1000],
              'learning_rate': [0.01, 0.05, 0.1],
              'subsample': [0.5, 0.75, 1],
              'colsample_bytree': [0.5, 0.75, 1.0],
              'reg_lambda':[0, 1, 10],
              'gamma': [0, 10, 100]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
# Note: despite the name, `grid_search` holds a randomized search here.
grid_search = RandomizedSearchCV(estimator=xgb.XGBRegressor(random_state=1),
                                 param_distributions=param_grid,
                                 n_iter=200,
                                 # Seed the candidate sampler so the same 200
                                 # combinations are drawn on every run; without
                                 # it the reported best parameters can change
                                 # between executions.
                                 random_state=1,
                                 verbose=1,
                                 n_jobs=-1,
                                 cv=cv,
                                 scoring='neg_mean_squared_error')
grid_result = grid_search.fit(red_train_x, red_train_y)

# best_score_ is a negated MSE, so flip the sign and take the root to report it on the RMSE scale.
print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best: 0.578070 using {'subsample': 1, 'reg_lambda': 0, 'n_estimators': 1000, 'max_depth': 8, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.5}
Time taken =  1.7860136349995932  minutes


In [150]:
start_time = time.time()

# Exhaustive grid for the red model; learning_rate and gamma are held fixed.
param_grid = dict(
    max_depth=[8, 9, 10, 11, 12],
    n_estimators=[1000, 1500, 2000],
    learning_rate=[0.05],
    subsample=[0.8, 0.9, 1],
    colsample_bytree=[0.3, 0.4, 0.5],
    reg_lambda=[0, 0.25, 0.5],
    gamma=[0],
)

# Shuffled 5-fold split with a fixed seed.
cv = KFold(random_state=1, shuffle=True, n_splits=5)
grid_search = GridSearchCV(
    xgb.XGBRegressor(random_state=1),
    param_grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_result = grid_search.fit(red_train_x, red_train_y)

# best_score_ is a negated MSE; negate it and take the root to print on the RMSE scale.
print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ", (time.time() - start_time) / 60, " minutes")

Fitting 5 folds for each of 405 candidates, totalling 2025 fits
Best: 0.574486 using {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 12, 'n_estimators': 1500, 'reg_lambda': 0.5, 'subsample': 0.9}
Time taken =  4.21326855023702  minutes


In [176]:
# Final red model: max_depth, subsample, and colsample_bytree were tuned manually.
# NOTE(review): if those manual values were picked by looking at the test-set
# scores printed below, the reported test metrics are optimistically biased -- confirm.
red_model_tuned_rmse = xgb.XGBRegressor(random_state = 1, gamma = 0, learning_rate = 0.05, max_depth = 11, n_estimators = 1500,
                                  reg_lambda = 0.5, subsample = 1, colsample_bytree = 0.6).fit(red_train_x, red_train_y)

# Predict on the test set once and reuse the rounded predictions for both
# metrics instead of running inference twice.
red_tuned_pred = np.around(red_model_tuned_rmse.predict(red_test_x))
mae = mean_absolute_error(red_test_y, red_tuned_pred)
rmse = np.sqrt(mean_squared_error(red_test_y, red_tuned_pred))
print("RMSE for red tuned model: ", rmse)
print('MAE for red tuned model: ', mae)

RMSE for red tuned model:  0.6442049363362563
MAE for red tuned model:  0.345


## White Model

In [105]:
start_time = time.time()

# Coarse search space for the white model: 3 values for each of 7 hyperparameters.
# Only n_iter=200 of the possible combinations are sampled below.
param_grid = {'max_depth': [4,6,8],
              'n_estimators': [100, 500, 1000],
              'learning_rate': [0.01, 0.05, 0.1],
              'subsample': [0.5, 0.75, 1],
              'colsample_bytree': [0.5, 0.75, 1],
              'reg_lambda':[0, 1, 10],
              'gamma': [0, 10, 100]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
# Note: despite the name, `grid_search` holds a randomized search here.
grid_search = RandomizedSearchCV(estimator=xgb.XGBRegressor(random_state=1),
                                 param_distributions=param_grid,
                                 n_iter=200,
                                 # Seed the candidate sampler so the same 200
                                 # combinations are drawn on every run; without
                                 # it the reported best parameters can change
                                 # between executions.
                                 random_state=1,
                                 verbose=1,
                                 n_jobs=-1,
                                 cv=cv,
                                 scoring='neg_mean_squared_error')
grid_result = grid_search.fit(white_train_x, white_train_y)

# best_score_ is a negated MSE, so flip the sign and take the root to report it on the RMSE scale.
print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best: 0.635257 using {'subsample': 0.75, 'reg_lambda': 0, 'n_estimators': 1000, 'max_depth': 8, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.5}
Time taken =  3.8692580978075664  minutes


In [106]:
start_time = time.time()

# Exhaustive grid for the white model; subsample, colsample_bytree and gamma are held fixed.
param_grid = dict(
    max_depth=[8, 10, 12, 14],
    n_estimators=[1000, 2000, 3000],
    learning_rate=[0.001, 0.01],
    subsample=[0.75],
    colsample_bytree=[0.5],
    reg_lambda=[0, 0.2, 0.4, 0.6, 0.8],
    gamma=[0],
)

# NOTE(review): this search uses 2 folds while the other searches in this
# notebook use 5, so its CV score is not directly comparable with theirs --
# presumably chosen to cut runtime; confirm.
cv = KFold(random_state=1, shuffle=True, n_splits=2)
grid_search = GridSearchCV(
    xgb.XGBRegressor(random_state=1),
    param_grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_result = grid_search.fit(white_train_x, white_train_y)

# best_score_ is a negated MSE; negate it and take the root to print on the RMSE scale.
print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ", (time.time() - start_time) / 60, " minutes")

Fitting 2 folds for each of 120 candidates, totalling 240 fits




Best: 0.651685 using {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 14, 'n_estimators': 1000, 'reg_lambda': 0.6, 'subsample': 0.75}
Time taken =  3.6731221675872803  minutes


In [107]:
start_time = time.time()

# Narrow grid for the white model: only max_depth and n_estimators vary,
# every other hyperparameter is pinned to a single value.
param_grid = dict(
    max_depth=[13, 14, 15, 16],
    n_estimators=[500, 1000, 1500],
    learning_rate=[0.01],
    subsample=[0.75],
    colsample_bytree=[0.5],
    reg_lambda=[0.6],
    gamma=[0],
)

# Shuffled 5-fold split with a fixed seed.
cv = KFold(random_state=1, shuffle=True, n_splits=5)
grid_search = GridSearchCV(
    xgb.XGBRegressor(random_state=1),
    param_grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_result = grid_search.fit(white_train_x, white_train_y)

# best_score_ is a negated MSE; negate it and take the root to print on the RMSE scale.
print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ", (time.time() - start_time) / 60, " minutes")

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best: 0.625868 using {'colsample_bytree': 0.5, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 16, 'n_estimators': 1000, 'reg_lambda': 0.6, 'subsample': 0.75}
Time taken =  1.2273271163304647  minutes


In [180]:
# Final white model: max_depth, colsample_bytree, and learning_rate were tuned manually.
# NOTE(review): if those manual values were picked by looking at the test-set
# scores printed below, the reported test metrics are optimistically biased -- confirm.
white_model_tuned = xgb.XGBRegressor(random_state = 1, gamma = 0, learning_rate = 0.02, max_depth = 9, n_estimators = 1000,
                                  reg_lambda = 0.6, subsample = 0.75, colsample_bytree = 0.6).fit(white_train_x, white_train_y)

# Predict on the test set once and reuse the rounded predictions for both
# metrics instead of running inference twice.
white_tuned_pred = np.around(white_model_tuned.predict(white_test_x))
mae = mean_absolute_error(white_test_y, white_tuned_pred)
rmse = np.sqrt(mean_squared_error(white_test_y, white_tuned_pred))

print('RMSE for white tuned model: ', rmse)
print('MAE for white tuned model: ', mae)

RMSE for white tuned model:  0.6279217421667403
MAE for white tuned model:  0.33877551020408164


## Combined Model

In [114]:
start_time = time.time()

# Coarse search space for the combined model: 3 values for each of 7 hyperparameters.
# Only n_iter=200 of the possible combinations are sampled below.
param_grid = {'max_depth': [4,6,8],
              'n_estimators': [100, 500, 1000],
              'learning_rate': [0.01, 0.05, 0.1],
              'subsample': [0.5, 0.75, 1],
              'colsample_bytree': [0.5, 0.75, 1],
              'reg_lambda':[0, 1, 10],
              'gamma': [0, 10, 100]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
# Note: despite the name, `grid_search` holds a randomized search here.
grid_search = RandomizedSearchCV(estimator=xgb.XGBRegressor(random_state=1),
                                 param_distributions=param_grid,
                                 n_iter=200,
                                 # Seed the candidate sampler so the same 200
                                 # combinations are drawn on every run; without
                                 # it the reported best parameters can change
                                 # between executions.
                                 random_state=1,
                                 verbose=1,
                                 n_jobs=-1,
                                 cv=cv,
                                 scoring='neg_mean_squared_error')
grid_result = grid_search.fit(combined_train_x, combined_train_y)

# best_score_ is a negated MSE, so flip the sign and take the root to report it on the RMSE scale.
print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Best: 0.625016 using {'subsample': 0.75, 'reg_lambda': 10, 'n_estimators': 500, 'max_depth': 8, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.75}
Time taken =  5.302819335460663  minutes


In [122]:
start_time = time.time()

# Exhaustive grid for the combined model: max_depth, n_estimators and
# reg_lambda vary; the remaining hyperparameters are pinned to one value each.
param_grid = dict(
    max_depth=[8, 9, 10, 11, 12],
    n_estimators=[300, 500, 700],
    learning_rate=[0.05],
    subsample=[0.75],
    colsample_bytree=[0.75],
    reg_lambda=[2, 4, 6, 8, 10],
    gamma=[0],
)

# Shuffled 5-fold split with a fixed seed.
cv = KFold(random_state=1, shuffle=True, n_splits=5)
grid_search = GridSearchCV(
    xgb.XGBRegressor(random_state=1),
    param_grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_result = grid_search.fit(combined_train_x, combined_train_y)

# best_score_ is a negated MSE; negate it and take the root to print on the RMSE scale.
print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ", (time.time() - start_time) / 60, " minutes")

Fitting 5 folds for each of 75 candidates, totalling 375 fits




Best: 0.619580 using {'colsample_bytree': 0.75, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 11, 'n_estimators': 300, 'reg_lambda': 2, 'subsample': 0.75}
Time taken =  3.602659785747528  minutes


In [144]:
# Final combined model: learning_rate was tuned manually.
# NOTE(review): if that manual value was picked by looking at the test-set
# scores printed below, the reported test metrics are optimistically biased -- confirm.
combined_model_tuned = xgb.XGBRegressor(random_state = 1, gamma = 0, learning_rate = 0.07, max_depth = 11, n_estimators = 300,
                                  reg_lambda = 2, subsample = 0.75, colsample_bytree = 0.75).fit(combined_train_x, combined_train_y)

# Predict on the test set once and reuse the rounded predictions for both
# metrics instead of running inference twice.
combined_tuned_pred = np.around(combined_model_tuned.predict(combined_test_x))
mae = mean_absolute_error(combined_test_y, combined_tuned_pred)
rmse = np.sqrt(mean_squared_error(combined_test_y, combined_tuned_pred))

print('RMSE for combined tuned model: ', rmse)
print('MAE for combined tuned model: ', mae)

RMSE for combined tuned model:  0.6401922788086044
MAE for combined tuned model:  0.34584615384615386


# Summary

## Red Wine

Base model MAE: 0.423; Base model RMSE: 0.705

Tuned model MAE: 0.345; Tuned model RMSE: 0.644

MAE Improvement: 0.078 (18.44%); RMSE Improvement: 0.061 (8.65%)

## White Wine

Base model MAE:  0.376; Base model RMSE: 0.674

Tuned model MAE: 0.339; Tuned model RMSE: 0.628

MAE Improvement: 0.037 (9.84%); RMSE Improvement: 0.046 (6.82%)

## Combined Wines

Base model MAE: 0.394; Base model RMSE: 0.690

Tuned model MAE: 0.346; Tuned model RMSE: 0.640

MAE Improvement: 0.048 (12.18%); RMSE Improvement: 0.050 (7.25%)