In [24]:
import numpy as np
import warnings
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, precision_recall_curve, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from imblearn.under_sampling import RandomUnderSampler 
from imblearn.over_sampling import ADASYN 
from sklearn.neighbors import KNeighborsClassifier
from imblearn.combine import SMOTEENN 

In [2]:
# Load the white-wine quality table; the CSV uses ';' as its field delimiter.
wine = pd.read_csv(
    "~/ucare-summer2020/datasets/winequality-white.csv",
    delimiter=";",
)

In [4]:
# Features: every column except the quality score.
X = wine.drop('quality', axis=1)
# Binary target: 1 when quality is above 5, otherwise 0.
y = (wine['quality'] > 5).astype(int)

In [5]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# y_train holds 0/1 labels, so its mean is the fraction of label-1 ("good") rows.
good_total_ratio = y_train.mean()
bad_total_ratio = 1.0 - good_total_ratio
print("Ratio of good wines to total wines: %.2f" % good_total_ratio)
print("Ratio of bad wines to total wines: %.2f" % bad_total_ratio)

Ratio of good wines to total wines: 0.66
Ratio of bad wines to total wines: 0.34


In [7]:
def report_clf(clf, X_train, X_test, y_train, y_test):
    """Print accuracy and test-set classification metrics for a fitted classifier.

    Parameters
    ----------
    clf : object exposing ``predict``
        Already-fitted estimator (a fitted ``GridSearchCV`` works as well).
    X_train, X_test : feature tables passed straight to ``clf.predict``.
    y_train, y_test : true labels matching ``X_train`` / ``X_test``.

    Prints train accuracy, test accuracy, the test confusion matrix, test
    precision / recall / F1 (scikit-learn default settings) and the full
    classification report. Returns ``None``.
    """
    train_pred = clf.predict(X_train)
    test_pred = clf.predict(X_test)

    # Accuracy is the fraction of predictions that equal the true label.
    print("\nTrain Accuracy: ", np.mean(train_pred == y_train))
    print("-----------------------------------------")
    print("\nTest Accuracy: ", np.mean(test_pred == y_test))

    print("\nTest Confusion Matrix:")
    print(confusion_matrix(y_test, test_pred))

    # Each metric is computed and printed in turn, in the order shown.
    metric_lines = (
        ("\nTest Precision = %f", precision_score),
        ("Test Recall = %f", recall_score),
        ("Test F1 Score = %f", f1_score),
    )
    for template, metric in metric_lines:
        print(template % metric(y_test, test_pred))

    print("\nClassification Report:")
    print(classification_report(y_test, test_pred))

In [2]:
def _fit_knn_grid_and_report(X_fit, y_fit):
    """Grid-search a KNN classifier on (X_fit, y_fit), then print its evaluation.

    The search is scored with F1 over 5 CV folds. After fitting, the best CV
    score and hyperparameters are printed and ``report_clf`` is called with
    the notebook-level ``X_train`` / ``X_test`` / ``y_train`` / ``y_test``
    split, i.e. the *original* (un-resampled) data, whatever was used to fit.
    """
    param_grid = {
        'n_neighbors': [5, 10, 20, 40, 70, 200, 500],
        'p': [1, 2, 10],
        'weights': ["uniform", "distance"],
    }
    knn_cv = GridSearchCV(KNeighborsClassifier(), param_grid,
                          scoring='f1', cv=5, verbose=3, n_jobs=-1)
    knn_cv.fit(X_fit, y_fit)

    print("Best Score: %f" % knn_cv.best_score_)
    print("Optimal Hyperparameter Values: ", knn_cv.best_params_)
    print("\n")

    report_clf(knn_cv, X_train, X_test, y_train, y_test)


def sampling_report(random_state=42):
    """Compare KNN grid-search results under four class-balancing strategies.

    For each of: no resampling, random under-sampling, ADASYN over-sampling
    and SMOTEENN, the notebook-level training split is (optionally) resampled,
    a KNN grid search is fitted on the result, and the evaluation is printed.

    Parameters
    ----------
    random_state : int or None, default 42
        Seed handed to every sampler so repeated runs resample identically.
        Pass ``None`` to get the samplers' unseeded behaviour.

    Notes
    -----
    Reads the globals ``X_train``, ``y_train``, ``X_test``, ``y_test`` and
    calls ``report_clf``; returns ``None``.

    NOTE(review): resampling happens *before* cross-validation, so the printed
    "Best Score" for the resampled strategies is measured on resampled folds
    and is not directly comparable with the test metrics. Putting the sampler
    in an ``imblearn.pipeline.Pipeline`` inside the grid search would resample
    per fold instead -- confirm whether that is wanted before changing it.
    """
    strategies = [
        ("NO SAMPLING:", None),
        ("UNDERSAMPLING:", RandomUnderSampler(random_state=random_state)),
        ("OVERSAMPLING:", ADASYN(random_state=random_state)),
        ("SMOTEENN:", SMOTEENN(random_state=random_state)),
    ]

    for heading, sampler in strategies:
        print(heading)
        if sampler is None:
            X_fit, y_fit = X_train, y_train
        else:
            # fit_resample is the supported API; fit_sample was a deprecated
            # alias that newer imbalanced-learn releases no longer provide.
            X_fit, y_fit = sampler.fit_resample(X_train, y_train)
        _fit_knn_grid_and_report(X_fit, y_fit)

In [27]:
# Run the KNN grid search under each sampling strategy and print the reports.
sampling_report()

NO SAMPLING:
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed:    6.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best Score: 0.843366
Optimal Hyperparameter Values:  {'n_neighbors': 70, 'p': 1, 'weights': 'distance'}



Train Accuracy:  1.0
-----------------------------------------

Test Accuracy:  0.7836734693877551

Test Confusion Matrix:
[[164 157]
 [ 55 604]]

Test Precision = 0.793693
Test Recall = 0.916540
Test F1 Score = 0.850704

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.51      0.61       321
           1       0.79      0.92      0.85       659

    accuracy                           0.78       980
   macro avg       0.77      0.71      0.73       980
weighted avg       0.78      0.78      0.77       980

UNDERSAMPLING:
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed:    6.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best Score: 0.725045
Optimal Hyperparameter Values:  {'n_neighbors': 10, 'p': 1, 'weights': 'distance'}



Train Accuracy:  0.9050535987748851
-----------------------------------------

Test Accuracy:  0.7357142857142858

Test Confusion Matrix:
[[246  75]
 [184 475]]

Test Precision = 0.863636
Test Recall = 0.720789
Test F1 Score = 0.785773

Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.77      0.66       321
           1       0.86      0.72      0.79       659

    accuracy                           0.74       980
   macro avg       0.72      0.74      0.72       980
weighted avg       0.77      0.74      0.74       980

OVERSAMPLING:
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed:   10.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best Score: 0.792220
Optimal Hyperparameter Values:  {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}



Train Accuracy:  1.0
-----------------------------------------

Test Accuracy:  0.7459183673469387

Test Confusion Matrix:
[[229  92]
 [157 502]]

Test Precision = 0.845118
Test Recall = 0.761760
Test F1 Score = 0.801277

Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.71      0.65       321
           1       0.85      0.76      0.80       659

    accuracy                           0.75       980
   macro avg       0.72      0.74      0.72       980
weighted avg       0.76      0.75      0.75       980

SMOTEENN:
Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    0.9s


Best Score: 0.960089
Optimal Hyperparameter Values:  {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}



Train Accuracy:  0.7378764675855028
-----------------------------------------

Test Accuracy:  0.6479591836734694

Test Confusion Matrix:
[[225  96]
 [249 410]]

Test Precision = 0.810277
Test Recall = 0.622155
Test F1 Score = 0.703863

Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.70      0.57       321
           1       0.81      0.62      0.70       659

    accuracy                           0.65       980
   macro avg       0.64      0.66      0.63       980
weighted avg       0.70      0.65      0.66       980



[Parallel(n_jobs=-1)]: Done 210 out of 210 | elapsed:    5.8s finished
