In [49]:
import numpy as np
import warnings
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_curve, precision_recall_curve, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [20]:
# Read the white-wine quality table; fields in this CSV are separated by semicolons.
wine = pd.read_csv("~/datasets/winequality-white.csv", delimiter=";")
# Bare trailing expression so the notebook renders the loaded frame.
wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7


In [11]:
# Feature matrix: every column of the table except the 'quality' target.
X = wine.drop('quality', axis=1)

In [36]:
# Binary target: True where the quality score is strictly greater than 5.
y = wine['quality'].gt(5)

In [38]:
# Cast the boolean target to integers (True -> 1, False -> 0), then display it.
y = y.astype(dtype=int)
y

0       1
1       1
2       1
3       1
4       1
       ..
4893    1
4894    0
4895    1
4896    1
4897    1
Name: quality, Length: 4898, dtype: int64

In [39]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [42]:
# Exhaustive 3-fold grid search over LogisticRegression hyperparameters; the
# combination with the highest mean cross-validated accuracy is kept.
param_grid = {'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
             'tol': [1e-3, 1e-4, 1e-5], 'max_iter': [3000, 5000, 10000, 20000],
             'C': [0.0001, 0.001, 0.1, 0.5, 1, 10]}
# Seed the estimator: the 'liblinear', 'sag' and 'saga' solvers shuffle the
# data, so without random_state their CV scores change from run to run.
# 42 matches the seed already used for the train/test split.
# NOTE(review): X_train is passed as loaded; no feature scaling is visible in
# this notebook, which may slow convergence of 'sag'/'saga' -- confirm.
lg_reg = LogisticRegression(random_state=42)
lg_reg_cv = GridSearchCV(lg_reg, param_grid, scoring='accuracy',
                        cv=3, verbose=1, n_jobs=-1)
lg_reg_cv.fit(X_train, y_train)
# Keep the winning parameter combination for the final fit.
params_optimal = lg_reg_cv.best_params_
print("Best Score (accuracy): %f" % lg_reg_cv.best_score_)
print("Optimal Hyperparameter Values: ", params_optimal)
print("\n")

Fitting 3 folds for each of 360 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  51 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 212 tasks      | elapsed:   51.9s
[Parallel(n_jobs=-1)]: Done 462 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 812 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed: 10.2min finished


Best Score (accuracy): 0.752425
Optimal Hyperparameter Values:  {'C': 10, 'max_iter': 3000, 'solver': 'lbfgs', 'tol': 0.001}




In [43]:
# Refit logistic regression on the training set with the best hyperparameters
# found by the grid search. They are read from lg_reg_cv directly rather than
# from `params_optimal`: that name is reassigned by the SGD grid search later in
# the notebook, so re-running this cell afterwards would otherwise pass
# SGD-only parameters to LogisticRegression.
lg_reg_clf = LogisticRegression(**lg_reg_cv.best_params_)
# Trailing expression: the fitted estimator's repr is shown as the cell output.
lg_reg_clf.fit(X_train, y_train)

LogisticRegression(C=10, max_iter=3000, tol=0.001)

In [44]:
# Report the fitted model's iteration count, intercept and coefficients,
# one labelled line per attribute.
for label, value in (
    ("No. of Iterations:", lg_reg_clf.n_iter_),
    ("\nWeight Intercept:", lg_reg_clf.intercept_),
    ("Weight Coefficients:", lg_reg_clf.coef_),
):
    print(label, value)

No. of Iterations: [521]

Weight Intercept: [-4.36260889]
Weight Coefficients: [[-1.89575431e-01 -6.79975761e+00 -5.10352048e-02  6.70094980e-02
  -3.73619264e-01  1.38572845e-02 -2.23070047e-03 -4.11684663e+00
   6.51117247e-02  1.39814849e+00  1.05612563e+00]]


In [61]:
# Evaluate the tuned logistic-regression model on the held-out test set.
y_test_predicted_GD = lg_reg_clf.predict(X_test)

# Accuracy = fraction of test rows whose prediction equals the true label.
# Must compare against y_test_predicted_GD (the predictions made just above);
# the bare name `y_test_predicted` is not defined by any cell and raises a
# NameError on a fresh kernel.
accuracy_score_test_GD = np.mean(y_test_predicted_GD == y_test)
print("\nTest Accuracy: ", accuracy_score_test_GD)


print("\nTest Confusion Matrix:")
print(confusion_matrix(y_test, y_test_predicted_GD))


# Precision, recall and F1 are computed with scikit-learn's defaults for the
# positive label (1).
precision_test_GD = precision_score(y_test, y_test_predicted_GD)
print("\nTest Precision = %f" % precision_test_GD)

recall_test_GD = recall_score(y_test, y_test_predicted_GD)
print("Test Recall = %f" % recall_test_GD)


f1_test_GD = f1_score(y_test, y_test_predicted_GD)
print("Test F1 Score = %f" % f1_test_GD)


print("\nClassification Report:")
print(classification_report(y_test, y_test_predicted_GD))


Test Accuracy:  0.7408163265306122

Test Confusion Matrix:
[[154 167]
 [ 87 572]]

Test Precision = 0.774019
Test Recall = 0.867982
Test F1 Score = 0.818312

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.48      0.55       321
           1       0.77      0.87      0.82       659

    accuracy                           0.74       980
   macro avg       0.71      0.67      0.68       980
weighted avg       0.73      0.74      0.73       980



In [55]:
%%time
warnings.filterwarnings('ignore')
param_grid = {'alpha': [0.05, 0.01, 0.001],
              'penalty' : ["l2"],
              'learning_rate': ["constant", "optimal", "invscaling", "adaptive"], 
              'max_iter':[500, 1000, 3000, 7000],
              'eta0': [0.1, 0.01, 0.001],
              'tol': [1e-3, 1e-5, 1e-8],
              'loss': ['hinge', 'log', 'modified_huber']}
sgd_clf = SGDClassifier()
sgd_clf_cv = GridSearchCV(sgd_clf, param_grid, scoring='accuracy', cv=3, verbose=1, n_jobs=-1)
sgd_clf_cv.fit(X_train, y_train)
params_optimal = sgd_clf_cv.best_params_
print("Best Score (accuracy): %f" % sgd_clf_cv.best_score_)
print("Optimal Hyperparameter Values: ", params_optimal)
print("\n")

Fitting 3 folds for each of 1296 candidates, totalling 3888 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 1952 tasks      | elapsed:   20.3s


Best Score (accuracy): 0.699081
Optimal Hyperparameter Values:  {'alpha': 0.001, 'eta0': 0.1, 'learning_rate': 'adaptive', 'loss': 'log', 'max_iter': 3000, 'penalty': 'l2', 'tol': 1e-08}


CPU times: user 3.59 s, sys: 151 ms, total: 3.74 s
Wall time: 44 s


[Parallel(n_jobs=-1)]: Done 3888 out of 3888 | elapsed:   43.8s finished


In [57]:
# Refit the SGD classifier on the training set with the best hyperparameters
# from the SGD grid search. The parameters are read from sgd_clf_cv directly
# (not the shared `params_optimal` name, which the logistic-regression search
# also assigns), and the estimator is seeded so the fitted model and its test
# metrics are the same on every run.
sgd = SGDClassifier(**sgd_clf_cv.best_params_, random_state=42)
# Trailing expression: the fitted estimator's repr is shown as the cell output.
sgd.fit(X_train, y_train)

SGDClassifier(alpha=0.001, eta0=0.1, learning_rate='adaptive', loss='log',
              max_iter=3000, tol=1e-08)

In [62]:
# Evaluate the tuned SGD classifier on the held-out test set.
y_test_predicted_SGD = sgd.predict(X_test)

# Compute every scalar metric up front, then print them in a fixed order.
accuracy_score_test_SGD = (y_test == y_test_predicted_SGD).mean()
precision_test_SGD = precision_score(y_test, y_test_predicted_SGD)
recall_test_SGD = recall_score(y_test, y_test_predicted_SGD)
f1_test_SGD = f1_score(y_test, y_test_predicted_SGD)

print("\nTest Accuracy: ", accuracy_score_test_SGD)

print("\nTest Confusion Matrix:")
print(confusion_matrix(y_test, y_test_predicted_SGD))

print("\nTest Precision = %f" % precision_test_SGD)
print("Test Recall = %f" % recall_test_SGD)
print("Test F1 Score = %f" % f1_test_SGD)

print("\nClassification Report:")
print(classification_report(y_test, y_test_predicted_SGD))


Test Accuracy:  0.6887755102040817

Test Confusion Matrix:
[[169 152]
 [153 506]]

Test Precision = 0.768997
Test Recall = 0.767830
Test F1 Score = 0.768413

Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.53      0.53       321
           1       0.77      0.77      0.77       659

    accuracy                           0.69       980
   macro avg       0.65      0.65      0.65       980
weighted avg       0.69      0.69      0.69       980



In [67]:
# Side-by-side summary of the test metrics computed earlier for both models.
# Each entry: (heading, accuracy, precision, recall, F1).
model_reports = (
    ("\nGRADIENT DESCENT REPORT:",
     accuracy_score_test_GD, precision_test_GD, recall_test_GD, f1_test_GD),
    ("\nSTOCHASTIC GRADIENT DESCENT REPORT:",
     accuracy_score_test_SGD, precision_test_SGD, recall_test_SGD, f1_test_SGD),
)

for position, (heading, accuracy, precision, recall, f1) in enumerate(model_reports):
    # A dashed rule separates consecutive reports (nothing before the first).
    if position > 0:
        print("--------------------------------------")
    print(heading)
    print("\nTest Accuracy: ", accuracy)
    print("\nTest Precision = %f" % precision)
    print("Test Recall = %f" % recall)
    print("Test F1 Score = %f" % f1)


GRADIENT DESCENT REPORT:

Test Accuracy:  0.7408163265306122

Test Precision = 0.774019
Test Recall = 0.867982
Test F1 Score = 0.818312
--------------------------------------

STOCHASTIC GRADIENT DESCENT REPORT:

Test Accuracy:  0.6887755102040817

Test Precision = 0.768997
Test Recall = 0.767830
Test F1 Score = 0.768413


In [73]:
# Summary statistics of the test labels. The labels were cast to 0/1 integers
# earlier, so the reported mean is the fraction of test rows labelled 1.
y_test.describe()


count    980.000000
mean       0.672449
std        0.469560
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: quality, dtype: float64

In [78]:
# Baseline for comparison: y_test holds 0/1 labels, so its mean is the accuracy
# obtained by predicting 'good' (1) for every test row. It is derived from the
# data instead of being typed in by hand so the note cannot go stale if the
# split or the labelling threshold changes.
baseline_accuracy = y_test.mean()
print("     Note that guessing 'good' wine every time would yield an accuracy of %.3f, which is almost the same as the stochastic gradient descent model's accuracy and quite close to the normal gradient descent model's accuracy." % baseline_accuracy)
print("     This seems to show that the two models being used are probably biased and are not fitting properly.")


     Note that guessing 'good' wine every time would yield an accuracy of 0.672, which is almost the same as the stochastic gradient descent model's accuracy and quite close to the normal gradient descent model's accuracy.
     This seems to show that the two models being used are probably biased and are not fitting properly.
