In [1]:
import numpy as np
import warnings
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, precision_recall_curve, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from imblearn.under_sampling import RandomUnderSampler 
from imblearn.over_sampling import ADASYN
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the white-wine quality table; the file is semicolon-delimited.
wine = pd.read_csv(
    "~/ucare-summer2020/datasets/winequality-white.csv",
    sep=";",
)

In [3]:
# Binarize the target: quality above 5 becomes class 1, everything else class 0.
y = (wine['quality'] > 5).astype(int)
# Every column except the target is used as a feature.
X = wine.drop(columns=['quality'])

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [5]:
# y_train holds 0/1 labels, so its mean is the fraction of class-1 ("good") wines
# in the training split; the remainder is the class-0 ("bad") fraction.
good_total_ratio = y_train.mean()
bad_total_ratio = 1.0 - good_total_ratio
print("Ratio of good wines to total wines: %.2f" % (good_total_ratio,))
print("Ratio of bad wines to total wines: %.2f" % (bad_total_ratio,))

Ratio of good wines to total wines: 0.66
Ratio of bad wines to total wines: 0.34


In [6]:
def report_clf(clf, X_train, X_test, y_train, y_test):
    """Print accuracy on both splits plus detailed test-set metrics for `clf`.

    Parameters
    ----------
    clf : object
        An already-fitted model exposing ``predict``.
    X_train, X_test : feature sets passed straight to ``clf.predict``.
    y_train, y_test : true labels compared element-wise with the predictions.

    Returns
    -------
    None. Everything is written to stdout: train accuracy, test accuracy,
    the test confusion matrix, test precision / recall / F1, and the full
    classification report.
    """
    train_pred = clf.predict(X_train)
    test_pred = clf.predict(X_test)

    # Accuracy is the fraction of predictions that equal the true label.
    print("\nTrain Accuracy: ", np.mean(train_pred == y_train))
    print("-----------------------------------------")
    print("\nTest Accuracy: ", np.mean(test_pred == y_test))

    print("\nTest Confusion Matrix:")
    print(confusion_matrix(y_test, test_pred))

    # Scalar test metrics, computed and printed one after another.
    # NOTE(review): the scorers are called with their defaults, which
    # presumably score the label-1 class only -- confirm if labels change.
    scalar_metrics = (
        ("\nTest Precision = %f", precision_score),
        ("Test Recall = %f", recall_score),
        ("Test F1 Score = %f", f1_score),
    )
    for template, scorer in scalar_metrics:
        print(template % scorer(y_test, test_pred))

    print("\nClassification Report:")
    print(classification_report(y_test, test_pred))

In [7]:
def _grid_search_logreg_and_report(title, class_weight,
                                   X_train, X_test, y_train, y_test):
    """Tune a LogisticRegression by grid search, then print its best params and metrics.

    Parameters
    ----------
    title : str
        Header line printed before the search starts.
    class_weight : dict or None
        Forwarded to ``LogisticRegression(class_weight=...)``; ``None`` is the
        estimator's default (no re-weighting).
    X_train, y_train : data the grid search is fitted on.
    X_test, y_test : held-out data, used only by ``report_clf``.
    """
    print(title)
    param_grid = {'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
                  'max_iter': [20000],
                  'C': [0.0001, 0.001, 0.1, 0.5, 1, 10]}
    lg_reg = LogisticRegression(class_weight=class_weight)
    lg_reg_cv = GridSearchCV(lg_reg, param_grid, scoring='accuracy',
                             cv=5, verbose=1, n_jobs=-1)
    lg_reg_cv.fit(X_train, y_train)
    print("Optimal Hyperparameter Values: ", lg_reg_cv.best_params_)
    print("\n")

    # The fitted search object is passed directly; report_clf only calls predict on it.
    report_clf(lg_reg_cv, X_train, X_test, y_train, y_test)


def class_weight_report():
    """Compare logistic regression with and without class weights.

    Runs the same 5-fold, accuracy-scored grid search twice on the
    notebook-level ``X_train`` / ``y_train`` split -- first with no class
    weights, then with per-class weights -- and prints a ``report_clf``
    summary for each against ``X_test`` / ``y_test``.

    Reads the notebook globals ``X_train``, ``X_test``, ``y_train``,
    ``y_test``, ``good_total_ratio`` and ``bad_total_ratio``. Returns None.
    """
    _grid_search_logreg_and_report("NO CLASS WEIGHTS:", None,
                                   X_train, X_test, y_train, y_test)

    # Each class is weighted by the *other* class's share of the training set
    # (class 0 gets the class-1 fraction and vice versa), so the rarer class
    # receives the larger weight.
    class_weights = {0: good_total_ratio, 1: bad_total_ratio}
    _grid_search_logreg_and_report("WITH CLASS WEIGHTS:", class_weights,
                                   X_train, X_test, y_train, y_test)

In [8]:
# Runs two full grid searches (150 fits each in the recorded run, roughly
# 1.4 and 1.7 minutes on 4 workers), so expect this cell to take a few minutes.
class_weight_report()

NO CLASS WEIGHTS:
Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Optimal Hyperparameter Values:  {'C': 10, 'max_iter': 20000, 'solver': 'liblinear'}



Train Accuracy:  0.7575293517100562
-----------------------------------------

Test Accuracy:  0.7418367346938776

Test Confusion Matrix:
[[153 168]
 [ 85 574]]

Test Precision = 0.773585
Test Recall = 0.871017
Test F1 Score = 0.819415

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.48      0.55       321
           1       0.77      0.87      0.82       659

    accuracy                           0.74       980
   macro avg       0.71      0.67      0.68       980
weighted avg       0.73      0.74      0.73       980

WITH CLASS WEIGHTS:
Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  75 tasks      | elapsed:   21.0s


Optimal Hyperparameter Values:  {'C': 10, 'max_iter': 20000, 'solver': 'liblinear'}



Train Accuracy:  0.7164369576314447
-----------------------------------------

Test Accuracy:  0.7153061224489796

Test Confusion Matrix:
[[232  89]
 [190 469]]

Test Precision = 0.840502
Test Recall = 0.711684
Test F1 Score = 0.770748

Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.72      0.62       321
           1       0.84      0.71      0.77       659

    accuracy                           0.72       980
   macro avg       0.70      0.72      0.70       980
weighted avg       0.75      0.72      0.72       980



[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.7min finished


In [13]:
# Conclusion for the class-weight experiment, based on the label-1 test metrics
# recorded above: precision 0.77 -> 0.84, recall 0.87 -> 0.71.
print("Class weights increased precision but reduced recall.")

Class weights increased precision but reduced recall.


In [10]:
def under_over_sampler(random_state=42):
    """Compare logistic regression trained on under- and over-sampled data.

    For each resampling strategy (random undersampling, then ADASYN
    oversampling) the notebook-level training split is resampled, a 5-fold
    accuracy-scored grid search over LogisticRegression is fitted on the
    resampled data, and ``report_clf`` prints the results.

    Parameters
    ----------
    random_state : int or None, default 42
        Seed handed to both samplers so repeated runs resample identically.
        Pass ``None`` to get a different resample on every call.

    Reads the notebook globals ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``. Returns None.
    """
    param_grid = {'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
                  'max_iter': [20000],
                  'C': [0.0001, 0.001, 0.1, 1, 10]}

    samplers = (
        ("UNDERSAMPLED:", RandomUnderSampler(random_state=random_state)),
        ("OVERSAMPLED:", ADASYN(random_state=random_state)),
    )

    for title, sampler in samplers:
        print(title)
        # fit_resample is the supported sampler API; fit_sample was a
        # deprecated alias that newer imbalanced-learn releases removed.
        X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

        lg_reg_cv = GridSearchCV(LogisticRegression(), param_grid,
                                 scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
        lg_reg_cv.fit(X_resampled, y_resampled)
        print("Optimal Hyperparameter Values: ", lg_reg_cv.best_params_)
        print("\n")

        # Metrics are reported on the original (not resampled) train/test
        # split, so the numbers are comparable across strategies.
        report_clf(lg_reg_cv, X_train, X_test, y_train, y_test)

In [11]:
# Runs two full grid searches (125 fits each in the recorded run, roughly
# 51 seconds and 1.5 minutes on 4 workers).
under_over_sampler()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


UNDERSAMPLED:
Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:   51.1s finished


Optimal Hyperparameter Values:  {'C': 10, 'max_iter': 20000, 'solver': 'lbfgs'}



Train Accuracy:  0.7179683511995917
-----------------------------------------

Test Accuracy:  0.7193877551020408

Test Confusion Matrix:
[[233  88]
 [187 472]]

Test Precision = 0.842857
Test Recall = 0.716237
Test F1 Score = 0.774405

Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.73      0.63       321
           1       0.84      0.72      0.77       659

    accuracy                           0.72       980
   macro avg       0.70      0.72      0.70       980
weighted avg       0.75      0.72      0.73       980

OVERSAMPLED:
Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:   52.4s
[Parallel(n_jobs=-1)]: Done 125 out of 125 | elapsed:  1.5min finished


Optimal Hyperparameter Values:  {'C': 10, 'max_iter': 20000, 'solver': 'newton-cg'}



Train Accuracy:  0.7240939254721797
-----------------------------------------

Test Accuracy:  0.7214285714285714

Test Confusion Matrix:
[[231  90]
 [183 476]]

Test Precision = 0.840989
Test Recall = 0.722307
Test F1 Score = 0.777143

Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.72      0.63       321
           1       0.84      0.72      0.78       659

    accuracy                           0.72       980
   macro avg       0.70      0.72      0.70       980
weighted avg       0.75      0.72      0.73       980



In [12]:
# Conclusion for the resampling experiment, based on the label-1 test metrics of
# the recorded run. Unweighted baseline: precision 0.77, recall 0.87, F1 0.82,
# accuracy 0.74. Undersampling: 0.84 / 0.72 / 0.77 / 0.72. ADASYN: 0.84 / 0.72 /
# 0.78 / 0.72. Note that the macro-averaged F1 in the classification reports
# moved the other way (0.68 -> 0.70) because label-0 recall improved.
print("Both over and undersampling have increased precision but greatly reduced recall. All methods so far have reduced accuracy and F1 score (or had the same F1 score).")

Both over and undersampling have increased precision but greatly reduced recall. All methods so far have reduced accuracy and F1 score (or had the same F1 score).
