### Import Library

In [None]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn import datasets
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
%matplotlib inline

### Load Data

In [None]:
# Read the white-wine quality table; fields in this file are separated by
# semicolons rather than commas.
csv_path = "./data/winequality-white.csv"
df = pd.read_csv(csv_path, sep=';')
# Last expression of the cell: shows the first rows as the cell output.
df.head()

Separate the dataset into the **feature matrix X** and the **response vector Y**.

In [None]:
# Keep every column except the last one as the feature frame
# (presumably the last column is the `quality` target -- confirm against the CSV).
n_feature_cols = df.shape[1] - 1
X_df = df.iloc[:, :n_feature_cols]
# Last expression of the cell: preview the feature columns.
X_df.head()

In [None]:
# Convert the pandas objects to plain NumPy arrays for scikit-learn.
# `.values` is used for both because `DataFrame.as_matrix()` was deprecated
# and later removed from pandas, while `.values` works on old and new versions.
X = X_df.values
y_df = df["quality"].values

### Data Preview

In [None]:
# Histogram of the raw quality ratings over the interval 1..10, written to
# 'data_hist.png' and then displayed.
fig, ax = plt.subplots()
ax.hist(y_df, range=(1, 10))
ax.set_xlabel('Ratings of wines')
ax.set_ylabel('Amount')
ax.set_title('Distribution of wine ratings')
fig.savefig('data_hist.png', format='png', dpi=1000)
plt.show()

In [None]:
# Binary target as a NumPy array (what scikit-learn expects):
# 1 = "good" wine (rating of 7 or more), 0 = "bad" wine (rating below 7).
Y = (y_df >= 7).astype(int)

# Share of each class in the whole dataset.
good_ratio = Y.sum() / len(Y)
bad_ratio = 1 - good_ratio
print('Ratio of good wine: ', good_ratio)
print('Ratio of bad wine: ', bad_ratio)

### SVM Classifier Construction

Choose SVM method to do the classification.

#### Oversampling the training data using SMOTE

In [None]:
#Split set into test and train
X_training, X_test, Y_training, Y_test = train_test_split(X, Y, test_size=0.2, random_state=20)
X_train, X_val, Y_train, Y_val = train_test_split(X_training, Y_training, test_size=0.2, random_state=20)

In [None]:
# Oversample only the training split with SMOTE; the validation and test
# splits are left untouched.
# `sampling_strategy=1.0` requests a minority/majority ratio of 1 after
# resampling (valid for a binary target). `sampling_strategy` and
# `fit_resample` are the current names of the deprecated `ratio` argument
# and `fit_sample` method; they require imbalanced-learn >= 0.4.
sm = SMOTE(random_state=20, sampling_strategy=1.0)
x_train_res, y_train_res = sm.fit_resample(X_train, Y_train)

#### Using different kernels

In [None]:
# Compare several SVC configurations (linear, RBF with three gamma values,
# polynomial of degree 2 and 3) by their mean 5-fold cross-validated
# accuracy on the SMOTE-resampled training data.
cv_scores = []
C = 1.0

models = (
    svm.SVC(kernel='linear', C=C),
    svm.SVC(kernel='rbf', gamma=0.7, C=C),
    svm.SVC(kernel='rbf', gamma=1, C=C),
    svm.SVC(kernel='rbf', gamma=10, C=C),
    svm.SVC(kernel='poly', degree=2, C=C),
    svm.SVC(kernel='poly', degree=3, C=C),
)

for m in models:
    scores = cross_val_score(m, x_train_res, y_train_res, cv=5, scoring='accuracy')
    cv_scores.append(scores.mean())

# Only accuracy is computed in this cell, so the header names exactly the
# two columns that are printed: the model's position in `models` and its score.
print("index          Score")
for i, mean_score in enumerate(cv_scores):
    print('%d          %.5f' % (i, mean_score))

#### Double-Resampling (5-fold + Leave-one-out)

In [None]:
# Nested ("double") resampling over the SMOTE-resampled training data for a
# grid of RBF gamma values:
#   * outer loop - 5-fold split; every candidate is fitted on the outer
#     training fold and its squared error (equal to the misclassification
#     rate for 0/1 labels) is measured on the held-out outer fold;
#   * inner loop - leave-one-out splits of the outer training fold; on each
#     split every candidate is scored by 5-fold cross-validated accuracy.
# One line is printed per (outer fold, leave-one-out split, candidate).
#
# NOTE(review): the inner loop performs len(x_trainset) * len(models)
# cross-validations per outer fold, which is very expensive for SVMs.
# NOTE(review): KFold is used without shuffling; if the resampler appends
# its synthetic samples at the end of the arrays, the last folds may be
# dominated by one class -- confirm whether shuffle=True is wanted.
C = 1
models = (
    svm.SVC(kernel='rbf', gamma=0.2, C=C),
    svm.SVC(kernel='rbf', gamma=0.4, C=C),
    svm.SVC(kernel='rbf', gamma=0.6, C=C),
    svm.SVC(kernel='rbf', gamma=0.8, C=C),
    svm.SVC(kernel='rbf', gamma=1, C=C),
    svm.SVC(kernel='rbf', gamma=2, C=C),
)
kf = KFold(n_splits=5)
loo = LeaveOneOut()

for fold, (outer_train_idx, outer_test_idx) in enumerate(kf.split(x_train_res), start=1):
    x_trainset, x_testset = x_train_res[outer_train_idx], x_train_res[outer_test_idx]
    y_trainset, y_testset = y_train_res[outer_train_idx], y_train_res[outer_test_idx]

    # The outer-fold fit and test error do not depend on the inner split, so
    # they are computed once per candidate here instead of once per inner split.
    # Predicting the whole held-out fold in one call also avoids hard-coding
    # the number of features.
    test_errors = []
    for m in models:
        m.fit(x_trainset, y_trainset)
        squared_errors = (m.predict(x_testset) - y_testset) ** 2
        test_errors.append(squared_errors.mean())

    # Inner resampling: leave one sample of the outer training fold out.
    # Distinct index names keep the outer split indices intact.
    for inner_train_idx, inner_test_idx in loo.split(x_trainset):
        x_train = x_trainset[inner_train_idx]
        y_train = y_trainset[inner_train_idx]
        for m, test_error in zip(models, test_errors):
            # cross_val_score works on clones, so the fitted `m` is unaffected.
            scores = cross_val_score(m, x_train, y_train, cv=5, scoring='accuracy')
            # gamma is read from the estimator itself so the printed value
            # always matches the model that produced the score.
            print(('fold=%d, #gamma=%.1f, score=%.4f, test_error=%.4f') % (fold, m.gamma, scores.mean(), test_error))

In [None]:
# Fit the chosen RBF SVM and report the confusion matrix, accuracy, recall,
# precision and F1 on the validation and test splits.
# NOTE(review): the model is fitted on X_train / Y_train, not on the
# SMOTE-resampled x_train_res / y_train_res -- confirm this is intended.
clf_rf = svm.SVC(kernel='rbf', gamma=0.7, C=1)
clf_rf.fit(X_train, Y_train)

evaluation_sets = (
    ('Validation Results', X_val, Y_val),
    ('\nTest Results', X_test, Y_test),
)
for title, features, labels in evaluation_sets:
    # Predict once per split and reuse the result for every metric.
    predicted = clf_rf.predict(features)
    print(title)
    print('confusion matrix')
    print(confusion_matrix(labels, predicted))
    print("accuracy         recall        precision      f1_score")
    print(accuracy_score(labels, predicted),
          recall_score(labels, predicted),
          precision_score(labels, predicted),
          f1_score(labels, predicted))