In [11]:
import numpy as np
import warnings
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, precision_recall_curve, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler, scale
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
import time
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.decomposition import PCA


np.random.seed(42)

In [69]:
wine = pd.read_csv("~/ucare-summer2020/datasets/winequality-white.csv", sep=";")

In [3]:
X = wine.drop(columns=['quality'])
y = wine['quality']
y = y > 5
y = y.astype(int)

In [73]:
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,1
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,1
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,1
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,1
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,0
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,1
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,1


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [9]:
%%time

t0 = time.time()
forest_clf = RandomForestClassifier(n_estimators=1000, criterion="gini", max_features="auto", 
                                    max_depth=32, class_weight="balanced", oob_score=True, verbose=1, n_jobs=-1)

forest_clf.fit(X_train, y_train)

print("Random Forest Training took {:.2f}s".format(training_forest_clf))

y_test_predicted = forest_clf.predict(X_test)
accuracy_forest_clf = accuracy_score(y_test, y_test_predicted)
print("\nTest Accuracy: ", accuracy_forest_clf)

print("\nTest Confusion Matrix:")
print(confusion_matrix(y_test, y_test_predicted))

print("\nClassification Report:")
print(classification_report(y_test, y_test_predicted))

print("\nScore of the training dataset obtained using an out-of-bag estimate: ", forest_clf.oob_score_)
print("\n")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    3.3s finished


Random Forest Training took 4.43s


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.2s



Test Accuracy:  0.8418367346938775

Test Confusion Matrix:
[[228  93]
 [ 62 597]]

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.71      0.75       321
           1       0.87      0.91      0.89       659

    accuracy                           0.84       980
   macro avg       0.83      0.81      0.82       980
weighted avg       0.84      0.84      0.84       980


Score of the training dataset obtained using an out-of-bag estimate:  0.8333333333333334


CPU times: user 10.2 s, sys: 584 ms, total: 10.8 s
Wall time: 4.68 s


[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.2s finished


In [12]:
pca = PCA(n_components=0.95)
pca.fit(X_train)

X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

print("Number of Principle Components (Extracted Features): ", pca.n_components_)

Number of Principle Components (Extracted Features):  2


In [13]:
%%time

forest_clf_pca = RandomForestClassifier(n_estimators=1000, criterion="gini", max_features="auto", 
                                    max_depth=32, class_weight="balanced", oob_score=True, verbose=1, n_jobs=-1)

forest_clf_pca.fit(X_train_pca, y_train)

y_test_predicted = forest_clf_pca.predict(X_test_pca)
print("Test Accuracy (PCA): ", accuracy_score(y_test, y_test_predicted))

print("\nTest Confusion Matrix (PCA):")
print(confusion_matrix(y_test, y_test_predicted))

print("\nClassification Report (PCA):")
print(classification_report(y_test, y_test_predicted))

print("\nScore of the training dataset obtained using an out-of-bag estimate (PCA): ", forest_clf.oob_score_)
print("\n")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.9s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.2s


Test Accuracy (PCA):  0.736734693877551

Test Confusion Matrix (PCA):
[[170 151]
 [107 552]]

Classification Report (PCA):
              precision    recall  f1-score   support

           0       0.61      0.53      0.57       321
           1       0.79      0.84      0.81       659

    accuracy                           0.74       980
   macro avg       0.70      0.68      0.69       980
weighted avg       0.73      0.74      0.73       980


Score of the training dataset obtained using an out-of-bag estimate (PCA):  0.8333333333333334


CPU times: user 8.16 s, sys: 556 ms, total: 8.71 s
Wall time: 4.22 s


[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.2s finished


In [32]:
importances = forest_clf.feature_importances_
features = pd.DataFrame(X.columns).T
weights = pd.DataFrame(importances).T
pd.concat([features, weights])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.0644937,0.118244,0.0739964,0.0786552,0.0801979,0.0945172,0.0838503,0.115566,0.0721882,0.0640409,0.15425


In [33]:
extra_trees_clf = ExtraTreesClassifier(n_estimators=1000, criterion="gini", max_features="auto", 
                                       max_depth=32, class_weight="balanced", oob_score=True, 
                                       bootstrap=True, verbose=1, n_jobs=-1)
extra_trees_clf.fit(X_train, y_train)

y_test_predicted = extra_trees_clf.predict(X_test)
accuracy_extra_trees = accuracy_score(y_test, y_test_predicted)
print("\nTest Accuracy: ", accuracy_extra_trees)

print("\nTest Confusion Matrix:")
print(confusion_matrix(y_test, y_test_predicted))

print("\nClassification Report:")
print(classification_report(y_test, y_test_predicted))

print("\nScore of the training dataset obtained using an out-of-bag estimate: ", extra_trees_clf.oob_score_)
print("\n")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.2s



Test Accuracy:  0.8469387755102041

Test Confusion Matrix:
[[227  94]
 [ 56 603]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.71      0.75       321
           1       0.87      0.92      0.89       659

    accuracy                           0.85       980
   macro avg       0.83      0.81      0.82       980
weighted avg       0.84      0.85      0.84       980


Score of the training dataset obtained using an out-of-bag estimate:  0.8348647269014804




[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.3s finished


In [34]:
data = [["Random Forest (1000 trees)", accuracy_forest_clf], 
        ["Extra-Trees (1000 trees)", accuracy_extra_trees]]

pd.DataFrame(data, columns=["Classifier", "Accuracy"])

Unnamed: 0,Classifier,Accuracy
0,Random Forest (1000 trees),0.841837
1,Extra-Trees (1000 trees),0.846939


In [35]:
def report_clf(clf):
    y_train_predicted = clf.predict(X_train)
    y_test_predicted = clf.predict(X_test)
    print("\nTrain Accuracy: ", np.mean(y_train_predicted == y_train))
    print("-----------------------------------------")
    print("\nTest Accuracy: ", np.mean(y_test_predicted == y_test))
    print("\nTest Confusion Matrix:")
    print(confusion_matrix(y_test, y_test_predicted))


    precision_test = precision_score(y_test, y_test_predicted) 
    print("\nTest Precision = %f" % precision_test)

    recall_test = recall_score(y_test, y_test_predicted)
    print("Test Recall = %f" % recall_test)


    f1_test = f1_score(y_test, y_test_predicted)
    print("Test F1 Score = %f" % f1_test)


    print("\nClassification Report:")
    print(classification_report(y_test, y_test_predicted))

In [46]:
def forest_grid_search():
    param_grid = {'n_estimators': [1000], 'max_depth': [1, 5, 10, 20, 30, 40, 50]}
    extra_trees_clf = ExtraTreesClassifier(criterion="gini", max_features="auto", 
                                       class_weight="balanced", oob_score=True, 
                                       bootstrap=True, verbose=1, n_jobs=-1)
    trees_cv = GridSearchCV(extra_trees_clf, param_grid, scoring='f1', cv=5, verbose=1, n_jobs=-1)
    trees_cv.fit(X_train, y_train)

    params_optimal_trees = trees_cv.best_params_

    print("Best Score: %f" % trees_cv.best_score_)
    print("Optimal Hyperparameter Values: ", params_optimal_trees)
    print("\n")
    
    report_clf(trees_cv)

In [47]:
forest_grid_search()

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:  1.1min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.6s finished


Best Score: 0.879692
Optimal Hyperparameter Values:  {'max_depth': 30, 'n_estimators': 1000}




[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s



Train Accuracy:  1.0
-----------------------------------------

Test Accuracy:  0.8469387755102041

Test Confusion Matrix:
[[228  93]
 [ 57 602]]

Test Precision = 0.866187
Test Recall = 0.913505
Test F1 Score = 0.889217

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.71      0.75       321
           1       0.87      0.91      0.89       659

    accuracy                           0.85       980
   macro avg       0.83      0.81      0.82       980
weighted avg       0.84      0.85      0.84       980



[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    0.3s finished
