In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Gather Data

In [2]:
df = pd.read_csv("data/winequality-red.csv")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [6]:
df.shape

(1599, 12)

In [3]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [4]:
from collections import Counter
Counter(df["quality"])

Counter({3: 10, 4: 53, 5: 681, 6: 638, 7: 199, 8: 18})

In [13]:
df["quality_bin"] = np.zeros(df.shape[0])

In [15]:
df["quality_bin"] = df["quality_bin"].where(df["quality"]>=6, 1)
#1 means good quality and 0 means bad quality

In [16]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,quality_bin
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1.0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1.0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1.0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0.0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1.0


In [17]:
Counter(df["quality_bin"])

Counter({0.0: 855, 1.0: 744})

In [18]:
#No missing data

In [19]:
#feature scaling

In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
X_data = df.iloc[:,:11].values
y_data = df.iloc[:,12].values

In [22]:
scaler = StandardScaler()

In [28]:
X_data = scaler.fit_transform(X_data)

In [29]:
#train test splitting

In [36]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=42)

In [38]:
print("y_train: ",Counter(y_train),"\n",
      "y_test: ",Counter(y_test))

y_train:  Counter({0.0: 588, 1.0: 531}) 
 y_test:  Counter({0.0: 267, 1.0: 213})


In [39]:
#balancing the data

In [44]:
from imblearn.over_sampling import SMOTE

In [47]:
#resampling need to be done on training dataset only
X_train_res, y_train_res = SMOTE().fit_sample(X_train, y_train)

In [48]:
Counter(y_train_res)

Counter({0.0: 588, 1.0: 588})

In [117]:
#model selection 
from sklearn.linear_model import SGDClassifier
sg = SGDClassifier(random_state=42)

In [118]:
sg.fit(X_train_res,y_train_res)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [119]:
pred = sg.predict(X_test)

In [120]:
from sklearn.metrics import classification_report,accuracy_score

In [121]:
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

        0.0       0.76      0.59      0.67       267
        1.0       0.60      0.77      0.67       213

avg / total       0.69      0.67      0.67       480



In [122]:
accuracy_score(y_test, pred)

0.66874999999999996

In [123]:
#parameter tuning 
from sklearn.model_selection import GridSearchCV

In [139]:
model = SGDClassifier(random_state=42)
params = {'loss': ["hinge", "log", "perceptron"],
          'alpha':[0.001, 0.0001, 0.00001]}

In [140]:
clf = GridSearchCV(model, params)

In [141]:
clf.fit(X_train_res, y_train_res)



GridSearchCV(cv=None, error_score='raise',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'loss': ['hinge', 'log', 'perceptron'], 'alpha': [0.001, 0.0001, 1e-05]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [142]:
clf.best_score_

0.70493197278911568

In [143]:
clf.best_estimator_

SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [144]:
clf.best_estimator_.loss

'hinge'

In [145]:
clf.best_estimator_.alpha

0.001

In [77]:
#final model by taking suitable parameters

In [146]:
clf = SGDClassifier(random_state=42, loss="hinge", alpha=0.001)

In [147]:
clf.fit(X_train_res, y_train_res)



SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [148]:
pred = clf.predict(X_test)

In [149]:
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

        0.0       0.80      0.58      0.68       267
        1.0       0.61      0.82      0.70       213

avg / total       0.72      0.69      0.69       480



In [150]:
accuracy_score(y_test, pred)

0.68958333333333333

In [151]:
#saving model into a pickle file for later use

In [153]:
from sklearn.externals import joblib
joblib.dump(clf, "data/wine_quality_clf.pkl")

['wine_quality_clf.pkl']

In [154]:
###
clf1 = joblib.load("data/wine_quality_clf.pkl")

In [158]:
X_test[0]

array([-0.35600048,  0.17976995, -0.98066858, -0.02753165,  0.56393475,
       -0.17930021, -0.01422508,  0.18726488, -0.46076217,  0.01092425,
       -0.77251161])

In [160]:
clf1.predict([X_test[0]])

array([ 1.])