In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV, ParameterGrid, KFold, StratifiedKFold
from sklearn.ensemble import BaggingRegressor,BaggingClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt 

In [2]:
X_test = pd.read_csv('X_test_stratified.csv')
X_train = pd.read_csv('X_train_stratified.csv')
y_test = pd.read_csv('y_test_stratified.csv')
y_train = pd.read_csv('y_train_stratified.csv')
y_train = y_train['quality']

initial bagging model

In [3]:
initialmodel = BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=1)).fit(X_train, y_train)

using the initial model to predict for X_test

In [6]:
y_pred = initialmodel.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.6214285714285714
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.71      0.30      0.43        33
           5       0.63      0.65      0.64       291
           6       0.61      0.70      0.66       440
           7       0.61      0.47      0.53       176
           8       0.75      0.51      0.61        35
           9       0.00      0.00      0.00         1

    accuracy                           0.62       980
   macro avg       0.47      0.38      0.41       980
weighted avg       0.62      0.62      0.61       980



optimizing bagging hyperparameters for precision/recall

In [9]:
from sklearn.exceptions import FitFailedWarning

warnings.filterwarnings('ignore', category=FitFailedWarning)

n_samples = X_train.shape[0]
n_features = X_train.shape[1]

params = {'base_estimator': [DecisionTreeClassifier(random_state = 1)],
          'n_estimators': [150,200,250],
          'max_samples': [0.5,1.0],
          'max_features': [0.5,1.0],
          'bootstrap': [True, False],
          'bootstrap_features': [True, False]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
bagging_classifier_grid = GridSearchCV(BaggingClassifier(random_state=1, n_jobs=-1), 
                                      param_grid =params, cv=cv, n_jobs=-1, verbose=1,
                                      scoring = {'precision': 
                                                 make_scorer(precision_score, average='weighted', zero_division = 0),
           'recall': make_scorer(recall_score, average='weighted', zero_division = 0)}, refit='precision')
bagging_classifier_grid.fit(X_train, y_train)

print('Train accuracy : %.3f'%bagging_classifier_grid.best_estimator_.score(X_train, y_train))
print('Test accuracy : %.3f'%bagging_classifier_grid.best_estimator_.score(X_test, y_test))
print('Best accuracy Through Grid Search : %.3f'%bagging_classifier_grid.best_score_)
print('Best Parameters : ',bagging_classifier_grid.best_params_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Train accuracy : 0.997
Test accuracy : 0.650
Best accuracy Through Grid Search : 0.701
Best Parameters :  {'base_estimator': DecisionTreeClassifier(random_state=1), 'bootstrap': True, 'bootstrap_features': True, 'max_features': 0.5, 'max_samples': 0.5, 'n_estimators': 250}


testing the model with optimal hyperparameters for precision/recall on X_test

In [10]:
model = BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=1), n_estimators=250, 
                          random_state=1,max_features=0.5,
                        max_samples=0.5, bootstrap=True,bootstrap_features=True).fit(X_train, y_train)


In [11]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.65
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.71      0.15      0.25        33
           5       0.71      0.61      0.66       291
           6       0.60      0.83      0.70       440
           7       0.74      0.40      0.52       176
           8       1.00      0.49      0.65        35
           9       0.00      0.00      0.00         1

    accuracy                           0.65       980
   macro avg       0.54      0.35      0.40       980
weighted avg       0.67      0.65      0.63       980



optimizing bagging hyperparameters for accuracy

In [12]:
n_samples = X_train.shape[0]
n_features = X_train.shape[1]

params = {'base_estimator': [DecisionTreeClassifier(random_state = 1)],
          'n_estimators': [150,200,250],
          'max_samples': [0.5,1.0],
          'max_features': [0.5,1.0],
          'bootstrap': [True, False],
          'bootstrap_features': [True, False]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
bagging_classifier_grid = GridSearchCV(BaggingClassifier(random_state=1, n_jobs=-1), 
                                      param_grid =params, cv=cv, n_jobs=-1, verbose=1,
                                      scoring = ['accuracy'], refit='accuracy')
bagging_classifier_grid.fit(X_train, y_train)

print('Train accuracy : %.3f'%bagging_classifier_grid.best_estimator_.score(X_train, y_train))
print('Test accuracy : %.3f'%bagging_classifier_grid.best_estimator_.score(X_test, y_test))
print('Best accuracy Through Grid Search : %.3f'%bagging_classifier_grid.best_score_)
print('Best Parameters : ',bagging_classifier_grid.best_params_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Train accuracy : 1.000
Test accuracy : 0.650
Best accuracy Through Grid Search : 0.668
Best Parameters :  {'base_estimator': DecisionTreeClassifier(random_state=1), 'bootstrap': True, 'bootstrap_features': False, 'max_features': 0.5, 'max_samples': 1.0, 'n_estimators': 150}


testing the model with optimal hyperparameters for accuracy on X_test

In [13]:
model = BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=1), n_estimators=150, 
                          random_state=1,max_features=0.5,
                        max_samples=1.0, bootstrap=True,bootstrap_features=False).fit(X_train, y_train)

In [14]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.65
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.60      0.18      0.28        33
           5       0.70      0.59      0.64       291
           6       0.61      0.82      0.70       440
           7       0.71      0.44      0.54       176
           8       1.00      0.51      0.68        35
           9       0.00      0.00      0.00         1

    accuracy                           0.65       980
   macro avg       0.52      0.36      0.41       980
weighted avg       0.66      0.65      0.64       980



optimizing bagging hyperparameters for accuracy for f1 score

In [15]:
n_samples = X_train.shape[0]
n_features = X_train.shape[1]

params = {'base_estimator': [DecisionTreeClassifier(random_state = 1)],
          'n_estimators': [150,200,250],
          'max_samples': [0.5,1.0],
          'max_features': [0.5,1.0],
          'bootstrap': [True, False],
          'bootstrap_features': [True, False]}

cv = KFold(n_splits=5,shuffle=True,random_state=1)
bagging_classifier_grid = GridSearchCV(BaggingClassifier(random_state=1, n_jobs=-1), 
                                      param_grid =params, cv=cv, n_jobs=-1, verbose=1,
                                      scoring = ['f1_weighted'], refit='f1_weighted')
bagging_classifier_grid.fit(X_train, y_train)

print('Train accuracy : %.3f'%bagging_classifier_grid.best_estimator_.score(X_train, y_train))
print('Test accuracy : %.3f'%bagging_classifier_grid.best_estimator_.score(X_test, y_test))
print('Best accuracy Through Grid Search : %.3f'%bagging_classifier_grid.best_score_)
print('Best Parameters : ',bagging_classifier_grid.best_params_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Train accuracy : 1.000
Test accuracy : 0.660
Best accuracy Through Grid Search : 0.655
Best Parameters :  {'base_estimator': DecisionTreeClassifier(random_state=1), 'bootstrap': True, 'bootstrap_features': False, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 250}


In [16]:
model = BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=1), n_estimators=250, 
                          random_state=1,max_features=1.0,
                        max_samples=1.0, bootstrap=True,bootstrap_features=False).fit(X_train, y_train)

In [18]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.6602040816326531
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         4
           4       0.71      0.30      0.43        33
           5       0.69      0.66      0.67       291
           6       0.63      0.78      0.70       440
           7       0.67      0.49      0.57       176
           8       1.00      0.54      0.70        35
           9       0.00      0.00      0.00         1

    accuracy                           0.66       980
   macro avg       0.53      0.40      0.44       980
weighted avg       0.67      0.66      0.65       980

