In [29]:
import numpy as np
import warnings
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, precision_recall_curve, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier


# Seed NumPy's global random number generator so that anything drawing from it
# yields the same sequence on every fresh top-to-bottom run of the notebook.
# NOTE(review): presumably this is what makes the estimators below repeatable,
# since none of them passes random_state — confirm they use the global RNG.
np.random.seed(42)

In [2]:
# Load the white-wine quality table; the file uses ';' as its field separator.
wine = pd.read_csv("~/ucare-summer2020/datasets/winequality-white.csv", sep=";")

# Features: every column except the quality score.
X = wine.drop(columns=["quality"])

# Binary target: 1 when the quality score is above 5, otherwise 0.
y = (wine["quality"] > 5).astype(int)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split itself repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [5]:
def report_clf(clf):
    """Print train/test accuracy and test-set diagnostics for a fitted classifier.

    Predictions are made on the notebook-level ``X_train`` and ``X_test`` and
    compared against ``y_train`` and ``y_test``. For the test split the
    confusion matrix, precision, recall, F1 score and the full classification
    report are printed as well.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    train_pred = clf.predict(X_train)
    test_pred = clf.predict(X_test)

    # Accuracy here is the fraction of predictions equal to the true labels.
    print("\nTrain Accuracy: ", np.mean(train_pred == y_train))
    print("-----------------------------------------")
    print("\nTest Accuracy: ", np.mean(test_pred == y_test))

    print("\nTest Confusion Matrix:")
    print(confusion_matrix(y_test, test_pred))

    # Each entry pairs a printf-style template with the metric that fills it;
    # the metrics are computed and printed one after another, in this order.
    scored_lines = (
        ("\nTest Precision = %f", precision_score),
        ("Test Recall = %f", recall_score),
        ("Test F1 Score = %f", f1_score),
    )
    for template, metric in scored_lines:
        print(template % metric(y_test, test_pred))

    print("\nClassification Report:")
    print(classification_report(y_test, test_pred))

In [28]:
# AdaBoost over depth-5 decision trees: 500 boosting rounds, learning rate 0.5.
#
# `algorithm` is intentionally left at the library default. "SAMME.R" was the
# default wherever that option was accepted, so behaviour is unchanged there;
# it was deprecated in scikit-learn 1.4 and removed in 1.6, where passing it
# explicitly raises. On >= 1.6 the default is the discrete SAMME variant, so
# metrics may differ from results that were produced with SAMME.R.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=5),
    n_estimators=500,
    learning_rate=0.5,
)
ada_clf.fit(X_train, y_train)

report_clf(ada_clf)


Train Accuracy:  1.0
-----------------------------------------

Test Accuracy:  0.8275510204081633

Test Confusion Matrix:
[[217 104]
 [ 65 594]]

Test Precision = 0.851003
Test Recall = 0.901366
Test F1 Score = 0.875461

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.68      0.72       321
           1       0.85      0.90      0.88       659

    accuracy                           0.83       980
   macro avg       0.81      0.79      0.80       980
weighted avg       0.82      0.83      0.82       980



In [27]:
# Base learner for boosting: an extra-trees ensemble with bootstrap sampling,
# out-of-bag scoring enabled and balanced class weights.
# max_features="sqrt" is what "auto" meant for classifiers; "auto" was
# deprecated in scikit-learn 1.1 and removed in 1.3, so it is spelled out here.
extra_trees_clf = ExtraTreesClassifier(criterion="gini", max_features="sqrt",
                                       class_weight="balanced", oob_score=True,
                                       bootstrap=True, n_jobs=-1)

# `algorithm` is left at the library default: "SAMME.R" was the default
# wherever it was accepted, and it was removed in scikit-learn 1.6.
ada_clf = AdaBoostClassifier(extra_trees_clf)

# AdaBoost's wrapped-estimator parameter was renamed from `base_estimator` to
# `estimator` in scikit-learn 1.2 (the old name was removed in 1.4). Use
# whichever name this installation exposes so the nested grid key is valid.
if "estimator" in ada_clf.get_params(deep=False):
    nested_prefix = "estimator"
else:
    nested_prefix = "base_estimator"

# Un-prefixed keys tune AdaBoost itself; the prefixed key reaches into the
# wrapped extra-trees model.
param_grid = {nested_prefix + "__max_depth": [1],
              "learning_rate": [0.5],
              "n_estimators": [200, 500]
             }

# 3-fold cross-validated search scored on accuracy.
ada_cv = GridSearchCV(ada_clf, param_grid=param_grid, scoring='accuracy', verbose=3, cv=3)
ada_cv.fit(X_train, y_train)

report_clf(ada_cv)

In [48]:
# Gradient-boosted trees: 300 estimators of depth 10 with learning rate 0.5.
# fit() hands back the fitted estimator, so construction and training are chained.
gbrt = GradientBoostingClassifier(
    n_estimators=300,
    max_depth=10,
    learning_rate=0.5,
).fit(X_train, y_train)

report_clf(gbrt)


Train Accuracy:  1.0
-----------------------------------------

Test Accuracy:  0.8408163265306122

Test Confusion Matrix:
[[241  80]
 [ 76 583]]

Test Precision = 0.879336
Test Recall = 0.884674
Test F1 Score = 0.881997

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.75      0.76       321
           1       0.88      0.88      0.88       659

    accuracy                           0.84       980
   macro avg       0.82      0.82      0.82       980
weighted avg       0.84      0.84      0.84       980

