In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score,train_test_split, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error,r2_score,roc_curve,auc,precision_recall_curve, accuracy_score, \
recall_score, precision_score, confusion_matrix
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid, StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor,GradientBoostingClassifier, BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier,AdaBoostRegressor,AdaBoostClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
import itertools as it
import time as time
import xgboost as xgb
from pyearth import Earth

# Load and Prepare Data

In [2]:
red_train = pd.read_csv('red_train.csv')
red_test = pd.read_csv('red_test.csv')
white_train = pd.read_csv('white_train.csv')
white_test = pd.read_csv('white_test.csv')

In [17]:
red_train.shape

(1199, 12)

In [18]:
white_train.shape

(3673, 12)

In [7]:
red_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,9.8,0.44,0.47,2.5,0.063,9.0,28.0,0.9981,3.24,0.65,10.8,6
1,6.3,0.51,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
2,8.5,0.18,0.51,1.75,0.071,45.0,88.0,0.99524,3.33,0.76,11.8,7
3,7.7,0.18,0.34,2.7,0.066,15.0,58.0,0.9947,3.37,0.78,11.8,6
4,6.2,0.65,0.06,1.6,0.05,6.0,18.0,0.99348,3.57,0.54,11.95,5


In [3]:
red_train_copy = red_train.copy()
red_test_copy = red_test.copy()
white_train_copy = white_train.copy()
white_test_copy = white_test.copy()
red_train_copy['type'] = 'red'
red_test_copy['type'] = 'red'
white_train_copy['type'] = 'white'
white_test_copy['type'] = 'white'

In [8]:
red_train_copy.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,9.8,0.44,0.47,2.5,0.063,9.0,28.0,0.9981,3.24,0.65,10.8,6,red
1,6.3,0.51,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,red
2,8.5,0.18,0.51,1.75,0.071,45.0,88.0,0.99524,3.33,0.76,11.8,7,red
3,7.7,0.18,0.34,2.7,0.066,15.0,58.0,0.9947,3.37,0.78,11.8,6,red
4,6.2,0.65,0.06,1.6,0.05,6.0,18.0,0.99348,3.57,0.54,11.95,5,red


In [4]:
combined_train = pd.concat([red_train_copy, white_train_copy], axis = 0)
combined_test = pd.concat([red_test_copy, white_test_copy], axis = 0)
combined_train = pd.get_dummies(combined_train)
combined_test = pd.get_dummies(combined_test)

In [9]:
combined_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type_red,type_white
0,9.8,0.44,0.47,2.5,0.063,9.0,28.0,0.9981,3.24,0.65,10.8,6,1,0
1,6.3,0.51,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,1,0
2,8.5,0.18,0.51,1.75,0.071,45.0,88.0,0.99524,3.33,0.76,11.8,7,1,0
3,7.7,0.18,0.34,2.7,0.066,15.0,58.0,0.9947,3.37,0.78,11.8,6,1,0
4,6.2,0.65,0.06,1.6,0.05,6.0,18.0,0.99348,3.57,0.54,11.95,5,1,0


In [10]:
red_train_x = red_train.drop('quality', axis = 1)
red_train_y = red_train['quality']
red_test_x = red_test.drop('quality', axis = 1)
red_test_y = red_test['quality']
white_train_x = white_train.drop('quality', axis = 1)
white_train_y = white_train['quality']
white_test_x = white_test.drop('quality', axis = 1)
white_test_y = white_test['quality']
combined_train_x = combined_train.drop('quality', axis = 1)
combined_train_y = combined_train['quality']
combined_test_x = combined_test.drop('quality', axis = 1)
combined_test_y = combined_test['quality']

# Create Base XGBoost Models

## Red Model

In [62]:
red_model = xgb.XGBRegressor().fit(red_train_x, red_train_y)
mae = mean_absolute_error(red_test_y, np.around(red_model.predict(red_test_x)))
print('MAE for red base model: ', mae)

MAE for red base model:  0.35


In [56]:
np.sqrt(mean_squared_error(red_test_y, np.around(red_model.predict(red_test_x))))

0.6324555320336759

## White Model

In [63]:
white_model = xgb.XGBRegressor().fit(white_train_x, white_train_y)
mae = mean_absolute_error(white_test_y, np.around(white_model.predict(white_test_x)))
print('MAE for white base model: ', mae)

MAE for white base model:  0.39346938775510204


In [57]:
np.sqrt(mean_squared_error(white_test_y, np.around(white_model.predict(white_test_x))))

0.7021831844254004

## Combined Model

In [64]:
combined_model = xgb.XGBRegressor().fit(combined_train_x, combined_train_y)
mae = mean_absolute_error(combined_test_y, np.around(combined_model.predict(combined_test_x)))
print('MAE for combined base model: ', mae)

MAE for combined base model:  0.40184615384615385


In [58]:
np.sqrt(mean_squared_error(combined_test_y, np.around(combined_model.predict(combined_test_x))))

0.7003295927358335

# Tuning Models

## Red Model

In [19]:
start_time = time.time()
param_grid = {'max_depth': [2, 8, 14],
              'n_estimators': [50, 100, 500, 1000],
              'learning_rate': [0.001, 0.01, 0.1],
              'subsample': [0.5, 0.75, 1],
              'reg_lambda':[0, 1, 10],
              'gamma': [0, 10]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
grid_search = RandomizedSearchCV(estimator=xgb.XGBRegressor(random_state=1, subsample = 0.5),                                                       
                             param_distributions = param_grid,
                                 n_iter = 200,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_absolute_error')
grid_result = grid_search.fit(red_train_x, red_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits




Best: 0.649099 using {'subsample': 0.5, 'reg_lambda': 0, 'n_estimators': 1000, 'max_depth': 14, 'learning_rate': 0.01, 'gamma': 0}
Time taken =  1.4288663029670716  minutes


In [21]:
start_time = time.time()
param_grid = {'max_depth': [12, 14, 16],
              'n_estimators': [1000, 2000, 3000],
              'learning_rate': [0.01, 0.025, 0.05],
              'subsample': [0.2, 0.35, 0.5],
              'reg_lambda':[0, 0.1, 0.2],
              'gamma': [0, 1]}

cv = KFold(n_splits=2,shuffle=True,random_state=1)
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(random_state=1, subsample = 0.5),                                                       
                             param_grid = param_grid,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_absolute_error')
grid_result = grid_search.fit(red_train_x, red_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 2 folds for each of 486 candidates, totalling 972 fits
Best: 0.673798 using {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 14, 'n_estimators': 3000, 'reg_lambda': 0.1, 'subsample': 0.5}
Time taken =  7.955299270153046  minutes


In [22]:
start_time = time.time()
param_grid = {'max_depth': [14],
              'n_estimators': [3000, 3500, 4000],
              'learning_rate': [0.01],
              'subsample': [0.5],
              'reg_lambda':[0.1],
              'gamma': [0, 0.1, 0.5]}

cv = KFold(n_splits=2,shuffle=True,random_state=1)
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(random_state=1, subsample = 0.5),                                                       
                             param_grid = param_grid,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_absolute_error')
grid_result = grid_search.fit(red_train_x, red_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 2 folds for each of 9 candidates, totalling 18 fits
Best: 0.673797 using {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 14, 'n_estimators': 4000, 'reg_lambda': 0.1, 'subsample': 0.5}
Time taken =  0.6000357151031495  minutes


In [61]:
red_model_tuned = xgb.XGBRegressor(random_state = 1, gamma = 0, learning_rate = 0.01, max_depth = 14, n_estimators = 4000,
                                  reg_lambda = 0.1, subsample = 0.5).fit(red_train_x, red_train_y)
mae = mean_absolute_error(red_test_y, np.around(red_model_tuned.predict(red_test_x)))
print('MAE for red tuned model: ', mae)

MAE for red tuned model:  0.3175


In [55]:
np.sqrt(mean_squared_error(red_test_y, np.around(red_model_tuned.predict(red_test_x))))

0.5894913061275798

## White Model

In [33]:
start_time = time.time()
param_grid = {'max_depth': [2, 8, 14],
              'n_estimators': [50, 100, 500, 1000],
              'learning_rate': [0.001, 0.01, 0.1],
              'subsample': [0.5, 0.75, 1],
              'reg_lambda':[0, 1, 10],
              'gamma': [0, 10]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
grid_search = RandomizedSearchCV(estimator=xgb.XGBRegressor(random_state=1, subsample = 0.5),                                                       
                             param_distributions = param_grid,
                                 n_iter = 200,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_absolute_error')
grid_result = grid_search.fit(white_train_x, white_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits




Best: 0.644747 using {'subsample': 0.75, 'reg_lambda': 0, 'n_estimators': 1000, 'max_depth': 14, 'learning_rate': 0.01, 'gamma': 0}
Time taken =  5.387266755104065  minutes


In [34]:
start_time = time.time()
param_grid = {'max_depth': [14, 16, 18],
              'n_estimators': [1000, 2000, 3000],
              'learning_rate': [0.01, 0.025, 0.05],
              'subsample': [0.75],
              'reg_lambda':[0, 0.1, 0.2],
              'gamma': [0, 1]}

cv = KFold(n_splits=2,shuffle=True,random_state=1)
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(random_state=1, subsample = 0.5),                                                       
                             param_grid = param_grid,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_absolute_error')
grid_result = grid_search.fit(white_train_x, white_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 2 folds for each of 162 candidates, totalling 324 fits
Best: 0.683244 using {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 16, 'n_estimators': 3000, 'reg_lambda': 0.1, 'subsample': 0.75}
Time taken =  9.353732399145763  minutes


In [60]:
white_model_tuned = xgb.XGBRegressor(random_state = 1, gamma = 0, learning_rate = 0.01, max_depth = 16, n_estimators = 3000,
                                  reg_lambda = 0.1, subsample = 0.75).fit(white_train_x, white_train_y)
mae = mean_absolute_error(white_test_y, np.around(white_model_tuned.predict(white_test_x)))
print('MAE for white tuned model: ', mae)

MAE for white tuned model:  0.3551020408163265


In [54]:
np.sqrt(mean_squared_error(white_test_y, np.around(white_model_tuned.predict(white_test_x))))

0.6571428571428571

## Combined Model

In [43]:
start_time = time.time()
param_grid = {'max_depth': [2, 8, 14],
              'n_estimators': [50, 100, 500, 1000],
              'learning_rate': [0.001, 0.01, 0.1],
              'subsample': [0.5, 0.75, 1],
              'reg_lambda':[0, 1, 10],
              'gamma': [0, 10]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
grid_search = RandomizedSearchCV(estimator=xgb.XGBRegressor(random_state=1, subsample = 0.5),                                                       
                             param_distributions = param_grid,
                                 n_iter = 200,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_absolute_error')
grid_result = grid_search.fit(combined_train_x, combined_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits




Best: 0.646128 using {'subsample': 0.75, 'reg_lambda': 0, 'n_estimators': 1000, 'max_depth': 14, 'learning_rate': 0.01, 'gamma': 0}
Time taken =  7.204989167054494  minutes


In [46]:
start_time = time.time()
param_grid = {'max_depth': [12, 14, 16],
              'n_estimators': [1000, 2000, 3000],
              'learning_rate': [0.01, 0.025, 0.05],
              'subsample': [0.75],
              'reg_lambda':[0, 0.1, 0.2],
              'gamma': [0, 1]}

cv = KFold(n_splits=2,shuffle=True,random_state=1)
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(random_state=1, subsample = 0.5),                                                       
                             param_grid = param_grid,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_absolute_error')
grid_result = grid_search.fit(combined_train_x, combined_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 2 folds for each of 162 candidates, totalling 324 fits




Best: 0.677211 using {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 16, 'n_estimators': 3000, 'reg_lambda': 0.2, 'subsample': 0.75}
Time taken =  11.891242869695027  minutes


In [47]:
start_time = time.time()
param_grid = {'max_depth': [16, 17, 18],
              'n_estimators': [3000, 4000],
              'learning_rate': [0.01],
              'subsample': [0.75],
              'reg_lambda':[0.2, 0.3, 0.4],
              'gamma': [0]}

cv = KFold(n_splits=2,shuffle=True,random_state=1)
grid_search = GridSearchCV(estimator=xgb.XGBRegressor(random_state=1, subsample = 0.5),                                                       
                             param_grid = param_grid,
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv,
                          scoring = 'neg_mean_absolute_error')
grid_result = grid_search.fit(combined_train_x, combined_train_y)

print("Best: %f using %s" % (np.sqrt(-grid_result.best_score_), grid_result.best_params_))
print("Time taken = ",(time.time()-start_time)/60," minutes")

Fitting 2 folds for each of 18 candidates, totalling 36 fits
Best: 0.676535 using {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 16, 'n_estimators': 4000, 'reg_lambda': 0.3, 'subsample': 0.75}
Time taken =  2.294661017258962  minutes


In [59]:
combined_model_tuned = xgb.XGBRegressor(random_state = 1, gamma = 0, learning_rate = 0.01, max_depth = 16, n_estimators = 4000,
                                  reg_lambda = 0.3, subsample = 0.75).fit(combined_train_x, combined_train_y)
mae = mean_absolute_error(combined_test_y, np.around(combined_model_tuned.predict(combined_test_x)))
print('MAE for combined tuned model: ', mae)

MAE for combined tuned model:  0.35507692307692307


In [53]:
np.sqrt(mean_squared_error(combined_test_y, np.around(combined_model_tuned.predict(combined_test_x))))

0.6596036105699566

# Summary

## Red Wine

Base model MAE: 0.3855

Tuned model MAE: 0.3782

Improvement: 0.0073 (0.02%)

## White Wine

Base model MAE:  0.4716

Tuned model MAE: 0.4023

Improvement: 0.0693 (14.69%)

## Combined Wines

Base model MAE: 0.4752

Tuned model MAE: 0.3941

Improvement: 0.0711 (14.96%)