In [14]:
import numpy as np
import warnings
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, precision_recall_curve, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from imblearn.under_sampling import RandomUnderSampler 
from imblearn.over_sampling import ADASYN 
from sklearn.neighbors import KNeighborsClassifier
from imblearn.combine import SMOTEENN 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [2]:
wine = pd.read_csv("~/ucare-summer2020/datasets/winequality-white.csv", sep=";")

In [3]:
X = wine.drop(columns=['quality'])
y = wine['quality']
y = y > 5
y = y.astype(int)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [5]:
good_total_ratio = np.mean(y_train)
bad_total_ratio = 1 - good_total_ratio
print("Ratio of good wines to total wines: %.2f" % good_total_ratio)
print("Ratio of bad wines to total wines: %.2f" % bad_total_ratio)

Ratio of good wines to total wines: 0.66
Ratio of bad wines to total wines: 0.34


In [6]:
def report_clf(clf, X_train, X_test, y_train, y_test):
    y_train_predicted = clf.predict(X_train)
    y_test_predicted = clf.predict(X_test)
    print("\nTrain Accuracy: ", np.mean(y_train_predicted == y_train))
    print("-----------------------------------------")
    print("\nTest Accuracy: ", np.mean(y_test_predicted == y_test))
    print("\nTest Confusion Matrix:")
    print(confusion_matrix(y_test, y_test_predicted))


    precision_test = precision_score(y_test, y_test_predicted) 
    print("\nTest Precision = %f" % precision_test)

    recall_test = recall_score(y_test, y_test_predicted)
    print("Test Recall = %f" % recall_test)


    f1_test = f1_score(y_test, y_test_predicted)
    print("Test F1 Score = %f" % f1_test)


    print("\nClassification Report:")
    print(classification_report(y_test, y_test_predicted))

In [11]:
def sampling_report(gamma, C):
    scaler = StandardScaler()
    print("NO SAMPLING:")
    param_grid = {'svc__kernel': ['rbf'], 'svc__max_iter': [20000],
                 'svc__C': C, 'svc__gamma': gamma}
    svm_clf = SVC()
    scaled_svm_clf = Pipeline([
        ("scaler", scaler),
        ("svc", svm_clf),
    ])
    svm_clf_cv = GridSearchCV(scaled_svm_clf, param_grid, scoring='accuracy',
                            cv=5, verbose=1, n_jobs=-1)
    svm_clf_cv.fit(X_train, y_train)
    print("Optimal Hyperparameter Values: ", svm_clf_cv.best_params_)
    print("\n")
    
    report_clf(svm_clf_cv, X_train, X_test, y_train, y_test)
    
    
    
    print("UNDERSAMPLING:")
    rus = RandomUnderSampler() 
    X_resampled, y_resampled = rus.fit_sample(X_train, y_train)
    param_grid = {'svc__kernel': ['rbf'], 'svc__max_iter': [20000],
                 'svc__C': C, 'svc__gamma': gamma}
    svm_clf = SVC()
    scaled_svm_clf = Pipeline([
        ("scaler", scaler),
        ("svc", svm_clf),
    ])
    svm_clf_cv = GridSearchCV(scaled_svm_clf, param_grid, scoring='accuracy',
                            cv=5, verbose=1, n_jobs=-1)
    svm_clf_cv.fit(X_train, y_train)
    print("Optimal Hyperparameter Values: ", svm_clf_cv.best_params_)
    print("\n")
    
    report_clf(svm_clf_cv, X_train, X_test, y_train, y_test)
    
    
    print("OVERSAMPLING:")
    ada = ADASYN()
    X_resampled, y_resampled = ada.fit_sample(X_train, y_train)
    param_grid = {'svc__kernel': ['rbf'], 'svc__max_iter': [20000],
                 'svc__C': C, 'svc__gamma': gamma}
    svm_clf = SVC()
    scaled_svm_clf = Pipeline([
        ("scaler", scaler),
        ("svc", svm_clf),
    ])
    svm_clf_cv = GridSearchCV(scaled_svm_clf, param_grid, scoring='accuracy',
                            cv=5, verbose=1, n_jobs=-1)
    svm_clf_cv.fit(X_train, y_train)
    print("Optimal Hyperparameter Values: ", svm_clf_cv.best_params_)
    print("\n")
    
    report_clf(svm_clf_cv, X_train, X_test, y_train, y_test)
    
    
    print("SMOTEENN:")
    smo = SMOTEENN() 
    X_resampled, y_resampled = smo.fit_sample(X_train, y_train) 
    param_grid = {'svc__kernel': ['rbf'], 'svc__max_iter': [20000],
                 'svc__C': C, 'svc__gamma': gamma}
    svm_clf = SVC()
    scaled_svm_clf = Pipeline([
        ("scaler", scaler),
        ("svc", svm_clf),
    ])
    svm_clf_cv = GridSearchCV(scaled_svm_clf, param_grid, scoring='accuracy',
                            cv=5, verbose=1, n_jobs=-1)
    svm_clf_cv.fit(X_train, y_train)
    print("Optimal Hyperparameter Values: ", svm_clf_cv.best_params_)
    print("\n")
    
    report_clf(svm_clf_cv, X_train, X_test, y_train, y_test)

In [15]:
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 1000000]
gamma = [0.000001, 0.0001, 0.01, 0.1, 1, 10, 100, 1000]
sampling_report(gamma, C)

NO SAMPLING:
Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   28.5s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  1.1min finished


Optimal Hyperparameter Values:  {'svc__C': 10, 'svc__gamma': 0.1, 'svc__kernel': 'rbf', 'svc__max_iter': 20000}



Train Accuracy:  0.8700867789688617
-----------------------------------------

Test Accuracy:  0.7938775510204081

Test Confusion Matrix:
[[201 120]
 [ 82 577]]

Test Precision = 0.827834
Test Recall = 0.875569
Test F1 Score = 0.851032

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.63      0.67       321
           1       0.83      0.88      0.85       659

    accuracy                           0.79       980
   macro avg       0.77      0.75      0.76       980
weighted avg       0.79      0.79      0.79       980

UNDERSAMPLING:
Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   35.3s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  1.4min finished


Optimal Hyperparameter Values:  {'svc__C': 10, 'svc__gamma': 0.1, 'svc__kernel': 'rbf', 'svc__max_iter': 20000}



Train Accuracy:  0.8700867789688617
-----------------------------------------

Test Accuracy:  0.7938775510204081

Test Confusion Matrix:
[[201 120]
 [ 82 577]]

Test Precision = 0.827834
Test Recall = 0.875569
Test F1 Score = 0.851032

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.63      0.67       321
           1       0.83      0.88      0.85       659

    accuracy                           0.79       980
   macro avg       0.77      0.75      0.76       980
weighted avg       0.79      0.79      0.79       980

OVERSAMPLING:
Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   37.0s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  1.4min finished


Optimal Hyperparameter Values:  {'svc__C': 10, 'svc__gamma': 0.1, 'svc__kernel': 'rbf', 'svc__max_iter': 20000}



Train Accuracy:  0.8700867789688617
-----------------------------------------

Test Accuracy:  0.7938775510204081

Test Confusion Matrix:
[[201 120]
 [ 82 577]]

Test Precision = 0.827834
Test Recall = 0.875569
Test F1 Score = 0.851032

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.63      0.67       321
           1       0.83      0.88      0.85       659

    accuracy                           0.79       980
   macro avg       0.77      0.75      0.76       980
weighted avg       0.79      0.79      0.79       980

SMOTEENN:
Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   33.0s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  1.3min finished


Optimal Hyperparameter Values:  {'svc__C': 10, 'svc__gamma': 0.1, 'svc__kernel': 'rbf', 'svc__max_iter': 20000}



Train Accuracy:  0.8700867789688617
-----------------------------------------

Test Accuracy:  0.7938775510204081

Test Confusion Matrix:
[[201 120]
 [ 82 577]]

Test Precision = 0.827834
Test Recall = 0.875569
Test F1 Score = 0.851032

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.63      0.67       321
           1       0.83      0.88      0.85       659

    accuracy                           0.79       980
   macro avg       0.77      0.75      0.76       980
weighted avg       0.79      0.79      0.79       980

