In [1]:
import requests
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
http_request = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv')

In [3]:
http_text = http_request.text.split("\n")

In [6]:
http_text[0:11]

['"fixed acidity";"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"',
 '7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6',
 '6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9.5;6',
 '8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;10.1;6',
 '7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4;9.9;6',
 '7.2;0.23;0.32;8.5;0.058;47;186;0.9956;3.19;0.4;9.9;6',
 '8.1;0.28;0.4;6.9;0.05;30;97;0.9951;3.26;0.44;10.1;6',
 '6.2;0.32;0.16;7;0.045;30;136;0.9949;3.18;0.47;9.6;6',
 '7;0.27;0.36;20.7;0.045;45;170;1.001;3;0.45;8.8;6',
 '6.3;0.3;0.34;1.6;0.049;14;132;0.994;3.3;0.49;9.5;6',
 '8.1;0.22;0.43;1.5;0.044;28;129;0.9938;3.22;0.45;11;6']

In [7]:
data_array = []
for lines in http_text:
    data_array.append(lines.split(";"))
col_names = []
for col in data_array[0]:
    col_names.append(col.strip('"'))

data_frame = pd.DataFrame(data=data_array[1:],columns=col_names).apply(pd.to_numeric)

In [9]:
data_frame.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6.0
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6.0
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6.0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6.0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6.0


In [10]:
numrecs = data_frame.shape[0]
numrecs

4899

In [11]:
data_frame = data_frame.iloc[0:numrecs-2,:]
data_frame.shape

(4897, 12)

In [15]:
data_frame["quality"].value_counts()

6.0    2197
5.0    1457
7.0     880
8.0     175
4.0     163
3.0      20
9.0       5
Name: quality, dtype: int64

In [17]:
# Let us do a little pre-processing
label = data_frame["quality"]
predictors = data_frame[col_names[:-1]]
predictors.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [19]:
scaler_model = MinMaxScaler().fit_transform(predictors.values)
bins = np.linspace(0, 1, 10)
digitized = np.digitize(scaler_model, bins)

In [22]:
?np.digitize

In [21]:
digitized[0:5]

array([[3, 2, 2, 3, 1, 2, 4, 3, 3, 3, 2],
       [3, 2, 2, 1, 2, 1, 3, 2, 5, 3, 3],
       [4, 2, 3, 1, 2, 1, 2, 2, 5, 3, 4],
       [3, 2, 2, 2, 2, 2, 4, 2, 4, 2, 3],
       [3, 2, 2, 2, 2, 2, 4, 2, 4, 2, 3]], dtype=int64)

In [23]:
clf = svm.SVC()
clf.fit(X=digitized,y=label.values)  

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
?clf

In [25]:
score_knn = cross_val_score(clf, digitized, label.values, cv=4)
print("Cross Validation score : " + str(score_knn))
print("Cross Validation Mean score : " + str(score_knn.mean()))

Cross Validation score : [0.48655257 0.50816993 0.53676471 0.52864157]
Cross Validation Mean score : 0.5150321947387007


In [26]:
from sklearn.metrics import confusion_matrix
confusion_matrix(label.values, clf.predict(X=digitized))

array([[   2,    0,    6,   12,    0,    0,    0],
       [   0,   11,   89,   61,    2,    0,    0],
       [   0,    0,  869,  582,    6,    0,    0],
       [   0,    1,  316, 1784,   96,    0,    0],
       [   0,    0,   36,  575,  269,    0,    0],
       [   0,    0,    7,  119,   49,    0,    0],
       [   0,    0,    0,    2,    3,    0,    0]], dtype=int64)

In [27]:
# Let's try different kernel
clf = svm.SVC(kernel='poly')
clf.fit(X=digitized,y=label.values)  

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [28]:
score_knn = cross_val_score(clf, digitized, label.values, cv=5)
print("Cross Validation score : " + str(score_knn))
print("Cross Validation Mean score : " + str(score_knn.mean()))

Cross Validation score : [0.47604485 0.48216106 0.53013279 0.48773006 0.54601227]
Cross Validation Mean score : 0.5044162064364903


In [13]:
from sklearn.metrics import confusion_matrix
confusion_matrix(label.values, clf.predict(X=digitized))

array([[  10,    0,    3,    7,    0,    0,    0],
       [   1,   31,   66,   63,    2,    0,    0],
       [   0,    2,  802,  648,    5,    0,    0],
       [   4,    1,  325, 1766,  102,    0,    0],
       [   0,    0,   35,  606,  238,    1,    0],
       [   0,    0,    1,  104,   43,   27,    0],
       [   0,    0,    0,    2,    3,    0,    0]])

In [None]:
# Please vary and check for Accuracy and Confusion Matrix by varying (C, kernel, gamma)