In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, precision_recall_curve, accuracy_score, recall_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid, cross_val_predict
from sklearn.tree import export_graphviz 
from six import StringIO
from IPython.display import Image  
import time as tm
from sklearn.ensemble import BaggingRegressor, BaggingClassifier, RandomForestRegressor, RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression, LogisticRegression
import itertools as it

The below code was used to split the data into train and test for both red and white wine.

In [5]:
red = pd.read_csv('winequality-red.csv', delimiter = ';')
white = pd.read_csv('winequality-white.csv', delimiter = ';')

In [9]:
red_train, red_test = train_test_split(red, test_size = 0.25)

In [12]:
white_train, white_test = train_test_split(white, test_size = 0.25)

In [15]:
red_train.to_csv('red_train.csv', index = False)
red_test.to_csv('red_test.csv', index = False)
white_train.to_csv('white_train.csv', index = False)
white_test.to_csv('white_test.csv', index = False)

The below code starts after splitting the train and test for red and white wine.

In [30]:
red_train = pd.read_csv('red_train.csv')
red_test = pd.read_csv('red_test.csv')
white_train = pd.read_csv('white_train.csv')
white_test = pd.read_csv('white_test.csv')

Creating combined train and test data for both red and white wine, along with a new column for dummy variables of whether they are white or red wine.

In [31]:
red_train_copy = red_train.copy()
red_test_copy = red_test.copy()
white_train_copy = white_train.copy()
white_test_copy = white_test.copy()
red_train_copy['type'] = 'red'
red_test_copy['type'] = 'red'
white_train_copy['type'] = 'white'
white_test_copy['type'] = 'white'

In [34]:
combined_train = pd.concat([red_train_copy, white_train_copy], axis = 0)
combined_test = pd.concat([red_test_copy, white_test_copy], axis = 0)
combined_train = pd.get_dummies(combined_train)
combined_test = pd.get_dummies(combined_test)

Creating separate dataframes for the predictors and response variable in X and y for both red, white, and combined data on the wines.

In [40]:
red_train_x = red_train.drop('quality', axis = 1)
red_train_y = red_train['quality']
red_test_x = red_test.drop('quality', axis = 1)
red_test_y = red_test['quality']
white_train_x = white_train.drop('quality', axis = 1)
white_train_y = white_train['quality']
white_test_x = white_test.drop('quality', axis = 1)
white_test_y = white_test['quality']
combined_train_x = combined_train.drop('quality', axis = 1)
combined_train_y = combined_train['quality']
combined_test_x = combined_test.drop('quality', axis = 1)
combined_test_y = combined_test['quality']

Begin by fitting a simple DecisionTreeRegressor() model to see for comparison purposes.

In [52]:
model = DecisionTreeRegressor(random_state = 1) 
model.fit(red_train_x, red_train_y)
pred = model.predict(red_test_x)
print(f'The test MAE for red wine data is: {mean_absolute_error(red_test_y, pred)}')
print(f'The train MAE for red wine data is: {mean_absolute_error(red_train_y, model.predict(red_train_x))}')

The test MAE for red wine data is: 0.425
The train MAE for red wine data is: 0.0


In [53]:
model = DecisionTreeRegressor(random_state = 1) 
model.fit(white_train_x, white_train_y)
pred = model.predict(white_test_x)
print(f'The test MAE for white wine data is: {mean_absolute_error(white_test_y, pred)}')
print(f'The train MAE for red wine data is: {mean_absolute_error(white_train_y, model.predict(white_train_x))}')

The test MAE for white wine data is: 0.4963265306122449
The train MAE for red wine data is: 0.0


In [54]:
model = DecisionTreeRegressor(random_state = 1) 
model.fit(combined_train_x, combined_train_y)
pred = model.predict(combined_test_x)
print(f'The test MAE for white wine data is: {mean_absolute_error(combined_test_y, pred)}')
print(f'The train MAE for red wine data is: {mean_absolute_error(combined_train_y, model.predict(combined_train_x))}')

The test MAE for white wine data is: 0.49353846153846154
The train MAE for red wine data is: 0.0


Begin by tuning a simple DecisionTreeRegressor() model for the red wine using a coarse grid search and then a finer grid search to see for comparison purposes.

In [48]:
parameters = {'max_depth': range(10, 32, 3), 
              'max_leaf_nodes': range(2, 4100, 100),
              'min_samples_leaf': [1, 2, 4, 8, 16]}
cv = KFold(n_splits = 5, shuffle = True, random_state = 1)
model = GridSearchCV(DecisionTreeRegressor(random_state = 1), parameters, n_jobs = -1, verbose = 1, cv = cv, 
                     scoring = ['neg_mean_absolute_error', 'r2'], refit = 'neg_mean_absolute_error')
model.fit(red_train_x, red_train_y)
print(-model.best_score_, model.best_params_) 

Fitting 5 folds for each of 1640 candidates, totalling 8200 fits
0.517918410041841 {'max_depth': 22, 'max_leaf_nodes': 302, 'min_samples_leaf': 1}


In [50]:
pred = model.predict(red_test_x)
print(f'The test MAE for red wine data is: {mean_absolute_error(red_test_y, pred)}')
print(f'The train MAE for red wine data is: {mean_absolute_error(red_train_y, model.predict(red_train_x))}')

The test MAE for red wine data is: 0.40881592214974566
The train MAE for red wine data is: 0.06695985070665209


In [49]:
parameters = {'max_depth': range(19, 26), 
              'max_leaf_nodes': range(202, 403),
              'min_samples_leaf': [1]}
cv = KFold(n_splits = 5, shuffle = True, random_state = 1)
model = GridSearchCV(DecisionTreeRegressor(random_state = 1), parameters, n_jobs = -1, verbose = 1, cv = cv, 
                     scoring = ['neg_mean_absolute_error', 'r2'], refit = 'neg_mean_absolute_error')
model.fit(red_train_x, red_train_y)
print(-model.best_score_, model.best_params_) 

Fitting 5 folds for each of 1407 candidates, totalling 7035 fits
0.5142499439686203 {'max_depth': 20, 'max_leaf_nodes': 256, 'min_samples_leaf': 1}


In [51]:
pred = model.predict(red_test_x)
print(f'The test MAE for red wine data is: {mean_absolute_error(red_test_y, pred)}')
print(f'The train MAE for red wine data is: {mean_absolute_error(red_train_y, model.predict(red_train_x))}')

The test MAE for red wine data is: 0.40881592214974566
The train MAE for red wine data is: 0.06695985070665209


The test MAE for red wine data improved from 0.425 to 0.409.

Next, proceed by tuning a simple DecisionTreeRegressor() model for the white wine using a coarse grid search and then a finer grid search to see for comparison purposes.

In [59]:
parameters = {'max_depth': range(10, 32, 3), 
              'max_leaf_nodes': range(2, 4100, 100),
              'min_samples_leaf': [1, 2, 4, 8, 16]}
cv = KFold(n_splits = 5, shuffle = True, random_state = 1)
model = GridSearchCV(DecisionTreeRegressor(random_state = 1), parameters, n_jobs = -1, verbose = 1, cv = cv, 
                     scoring = ['neg_mean_absolute_error', 'r2'], refit = 'neg_mean_absolute_error')
model.fit(white_train_x, white_train_y)
print(-model.best_score_, model.best_params_) 

Fitting 5 folds for each of 1640 candidates, totalling 8200 fits
0.5138325567673547 {'max_depth': 25, 'max_leaf_nodes': 902, 'min_samples_leaf': 1}


In [60]:
pred = model.predict(white_test_x)
print(f'The test MAE for white wine data is: {mean_absolute_error(white_test_y, pred)}')
print(f'The train MAE for white wine data is: {mean_absolute_error(white_train_y, model.predict(white_train_x))}')

The test MAE for white wine data is: 0.5172277376355146
The train MAE for white wine data is: 0.04316875561391792


In [61]:
parameters = {'max_depth': range(22, 29), 
              'max_leaf_nodes': range(802, 1003),
              'min_samples_leaf': [1]}
cv = KFold(n_splits = 5, shuffle = True, random_state = 1)
model = GridSearchCV(DecisionTreeRegressor(random_state = 1), parameters, n_jobs = -1, verbose = 1, cv = cv, 
                     scoring = ['neg_mean_absolute_error', 'r2'], refit = 'neg_mean_absolute_error')
model.fit(white_train_x, white_train_y)
print(-model.best_score_, model.best_params_) 

Fitting 5 folds for each of 1407 candidates, totalling 7035 fits
0.5133149714552114 {'max_depth': 25, 'max_leaf_nodes': 853, 'min_samples_leaf': 1}


In [62]:
pred = model.predict(white_test_x)
print(f'The test MAE for white wine data is: {mean_absolute_error(white_test_y, pred)}')
print(f'The train MAE for white wine data is: {mean_absolute_error(white_train_y, model.predict(white_train_x))}')

The test MAE for white wine data is: 0.5202131225779824
The train MAE for white wine data is: 0.05770620935698176


The test MAE for white wine data did not improve and went from 0.496 to 0.520.

Next, proceed by tuning a simple DecisionTreeRegressor() model for the combined wine data using a coarse grid search and then a finer grid search to see for comparison purposes.

In [63]:
parameters = {'max_depth': range(10, 32, 3), 
              'max_leaf_nodes': range(2, 4100, 100),
              'min_samples_leaf': [1, 2, 4, 8, 16]}
cv = KFold(n_splits = 5, shuffle = True, random_state = 1)
model = GridSearchCV(DecisionTreeRegressor(random_state = 1), parameters, n_jobs = -1, verbose = 1, cv = cv, 
                     scoring = ['neg_mean_absolute_error', 'r2'], refit = 'neg_mean_absolute_error')
model.fit(combined_train_x, combined_train_y)
print(-model.best_score_, model.best_params_) 

Fitting 5 folds for each of 1640 candidates, totalling 8200 fits
0.5264543077274084 {'max_depth': 25, 'max_leaf_nodes': 1202, 'min_samples_leaf': 1}


In [65]:
pred = model.predict(combined_test_x)
print(f'The test MAE for combined wine data is: {mean_absolute_error(combined_test_y, pred)}')
print(f'The train MAE for combined wine data is: {mean_absolute_error(combined_train_y, model.predict(combined_train_x))}')

The test MAE for combined wine data is: 0.4918411203349246
The train MAE for combined wine data is: 0.026721088442903447


In [66]:
parameters = {'max_depth': range(22, 29), 
              'max_leaf_nodes': range(1102, 1303),
              'min_samples_leaf': [1]}
cv = KFold(n_splits = 5, shuffle = True, random_state = 1)
model = GridSearchCV(DecisionTreeRegressor(random_state = 1), parameters, n_jobs = -1, verbose = 1, cv = cv, 
                     scoring = ['neg_mean_absolute_error', 'r2'], refit = 'neg_mean_absolute_error')
model.fit(combined_train_x, combined_train_y)
print(-model.best_score_, model.best_params_) 

Fitting 5 folds for each of 1407 candidates, totalling 7035 fits
0.5260067713668947 {'max_depth': 24, 'max_leaf_nodes': 1118, 'min_samples_leaf': 1}


In [67]:
pred = model.predict(combined_test_x)
print(f'The test MAE for combined wine data is: {mean_absolute_error(combined_test_y, pred)}')
print(f'The train MAE for combined wine data is: {mean_absolute_error(combined_train_y, model.predict(combined_train_x))}')

The test MAE for combined wine data is: 0.5045065538096555
The train MAE for combined wine data is: 0.04582898145486125


The test MAE for combined wine data did not improve and went from 0.494 to 0.505.