In [23]:
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import download_plotlyjs, init_notebook_mode
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Enable plotly's offline notebook mode so figures render inside the notebook.
init_notebook_mode(connected=True)

# Input CSV locations and the folder that receives exported predictions
# (all relative to the working directory).
red_path = "./res/winequality-red.csv"
white_path = "./res/winequality-white.csv"
result_folder = "./results/"

# Fraction of rows held out as a test set; passed as `test_size` to train_test_split.
test_set_ratio = 0.1

### Preprocessing

In [145]:
# Load both semicolon-separated wine files and tag every row with its origin.
# `sep` is passed by keyword: recent pandas releases reject it as a positional argument.
df_red = pd.read_csv(red_path, sep=";")
df_red['type'] = 0    # 0 marks rows that came from the red-wine file
df_white = pd.read_csv(white_path, sep=";")
df_white['type'] = 1  # 1 marks rows that came from the white-wine file

# Stack red on top of white (concatenate only one of the frames here to
# experiment with a single wine type) and renumber the rows from 0.
df_ = pd.concat([df_red, df_white]).reset_index(drop=True)

## add ID column: the row position in the combined frame
df_['ID'] = df_.index

# Collapse grade 3 into 4 and grade 9 into 8, then shift the labels down by 3
# so the remaining grades 4..8 become classes 1..5.
df_['quality'] = df_['quality'].replace({3: 4, 9: 8}) - 3

## create the train/test split used for the final prediction (the test part is never trained on)
df_train, df_test = train_test_split(df_, shuffle=True, test_size=test_set_ratio, random_state=0)
# Put each split back into ID order and renumber its index from 0.
df_train = df_train.sort_values(by=['ID']).reset_index(drop=True)
df_test = df_test.sort_values(by=['ID']).reset_index(drop=True)

In [146]:
# Count the rows per quality class, reported for every column of the combined frame.
bip = df_.groupby('quality').agg(['count'])
bip

Unnamed: 0_level_0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type,ID
Unnamed: 0_level_1,count,count,count,count,count,count,count,count,count,count,count,count,count
quality,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
1,246,246,246,246,246,246,246,246,246,246,246,246,246
2,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138,2138
3,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836,2836
4,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079,1079
5,198,198,198,198,198,198,198,198,198,198,198,198,198


In [147]:
# Preview the first ten rows of the training split.
df_train.head(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type,ID
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,2,0,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,2,0,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,2,0,2
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,3,0,3
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,2,0,4
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,2,0,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,2,0,6
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,4,0,7
8,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,2,0,9
9,6.7,0.58,0.08,1.8,0.097,15.0,65.0,0.9959,3.28,0.54,9.2,2,0,10


### Initialize Training data

In [148]:
# Target is the 'quality' column; features are every other column of the training split.
# NOTE(review): 'ID' and 'type' therefore remain in the feature matrix, and the
# prediction cell feeds the same columns to the model - confirm this is intended.
df_class = df_train['quality']
df_vars = df_train.drop(columns='quality')

# Carve a further hold-out out of the training split for a quick sanity check.
X_train, X_test, y_train, y_test = train_test_split(
    df_vars,
    df_class,
    test_size=test_set_ratio,
    random_state=42,
)

# Classifier shared by the cross-validation and fitting cells below.
clf = SVC(gamma='auto')

### Simple cross validation for well-founded results

In [149]:
# 5-fold cross-validation of the shared classifier on the whole training split;
# the cell displays the per-fold test scores.
scores = cross_validate(estimator=clf, X=df_vars, y=df_class, cv=5)
scores['test_score']

array([0.43003413, 0.42820513, 0.43418803, 0.43065068, 0.4353042 ])

In [None]:
# Sweep the decision tree's `min_samples_split` and record the mean 5-fold
# cross-validated accuracy for each setting.
#
# The estimator built inside the loop must actually consume the swept value;
# an SVC ignores it, which makes every iteration identical. A dedicated
# variable name is used so the shared `clf` (fitted in a later cell) and the
# `scores` of the previous cell are left untouched.
min_samples_splits = [2,3,4,5,7,10,15,20,25]
results = []
for min_samples_split in min_samples_splits:
    # Fixed random_state keeps the curve reproducible across re-runs.
    tree_clf = DecisionTreeClassifier(min_samples_split=min_samples_split, random_state=0)
    tree_scores = cross_validate(tree_clf, df_vars, df_class, cv=5)
    results.append(tree_scores['test_score'].mean())

plt.plot(min_samples_splits, results, "r", label="Test Accuracy")
plt.ylabel("accuracy")
plt.xlabel("min_samples_split")
plt.title("Mean 5-fold cross-validated accuracy")
plt.legend()  # the line label is only shown when a legend is drawn
plt.show()

### Model fitting and sample testing

In [150]:
# Fit the shared classifier on the inner training part, then display its
# score on the inner hold-out part.
model = clf.fit(X=X_train, y=y_train)
model.score(X=X_test, y=y_test)

0.48205128205128206

### Final prediction

In [151]:
# Predict the class of every row in the held-out test split (all columns except
# the target are fed to the model) and pair each prediction with the row's ID.
test_features = df_test.drop(columns='quality')
df_res = pd.DataFrame({
    'ID': df_test['ID'],
    'quality': pd.Series(model.predict(test_features)),
})
df_res.head(10)
#np.count_nonzero(df_res['quality']==df_test['quality'])

Unnamed: 0,ID,quality
0,8,3
1,12,3
2,14,3
3,15,3
4,34,3
5,39,2
6,42,3
7,44,3
8,48,3
9,90,3


### Export

In [108]:
# Write the predictions to CSV without the index column.
export_file_name = result_folder + "simple_svm.csv"
# Create the target directory first: to_csv does not create missing folders.
os.makedirs(os.path.dirname(export_file_name) or ".", exist_ok=True)
df_res.to_csv(export_file_name, index=False)

### Easy comparison

In [152]:
import traceback
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
# Import from the public package: the private module path
# `sklearn.neighbors.nearest_centroid` is not importable on current scikit-learn.
from sklearn.neighbors import NearestCentroid

# Features are every column of the training split except the target 'quality'.
df_vars = df_train.loc[:, df_train.columns != 'quality']
df_class = df_train['quality']

# Adjust size here - Terminates in reasonable time for 10.000 elements and k<=40 attributes
# Won't terminate for 100.000
#df_vars=df_vars.head(5000)
#df_class=df_class.head(5000)
#df_class[df_class<0] = 0
#df_vars = np.where(df_vars < 0, 2, df_vars)

#X, y = df_vars,df_class
##Adjust k attributes here
#X_new = SelectKBest(chi2, k=1).fit_transform(X, y)
#df_vars=X_new
#X_train, X_test, y_train, y_test = train_test_split(X_new, df_class, test_size=test_set_ratio, random_state=0)

# Display names, index-aligned with `classifiers` below.
# NOTE(review): the entry labelled "Linear SVM" is built as SVC(gamma='auto')
# with no `kernel` argument, so it uses SVC's default kernel rather than an
# explicitly linear one. The label is kept because it is also a key of
# `cl_scores`; rename it (or pass kernel='linear') once all users are checked.
names = ["Nearest Centroid", "Linear SVM",
         "Decision Tree", "Naive Bayes"]

classifiers = [
    NearestCentroid(),
    SVC(gamma='auto'),
    DecisionTreeClassifier(),
    GaussianNB()]

# Maps classifier display name -> cross_validate result dict.
# A classifier whose evaluation fails gets no entry.
cl_scores = {}

for name, clf in zip(names, classifiers):
    try:
        # Re-seed before each run so every classifier starts from the same global RNG state.
        np.random.seed(32143421)
        scores = cross_validate(clf, df_vars, df_class, cv=10, scoring='accuracy', return_train_score=True)
        cl_scores[name] = scores
        print(name,"-- Training Set --", "Mean", scores['train_score'].mean(), "-- Min", scores['train_score'].min(), "-- Max", scores['train_score'].max())
        print(name,"-- Test Set --", "Mean", scores['test_score'].mean(), "-- Min", scores['test_score'].min(), "-- Max", scores['test_score'].max())
    except Exception:
        # Best effort: report the failure and continue with the next classifier.
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit
        # still stop a long-running cell.
        print("Classification failed for", name)
        traceback.print_exc()

Nearest Centroid -- Training Set -- Mean 0.20661914967505562 -- Min 0.17841535246057383 -- Max 0.22249667490024702
Nearest Centroid -- Test Set -- Mean 0.20155526286554099 -- Min 0.029159519725557463 -- Max 0.4606164383561644
Linear SVM -- Training Set -- Mean 0.993520148023919 -- Min 0.9923997719931598 -- Max 0.9946788293424553
Linear SVM -- Test Set -- Mean 0.42929823994181804 -- Min 0.39419795221843 -- Max 0.44082332761578047
Decision Tree -- Training Set -- Mean 1.0 -- Min 1.0 -- Max 1.0
Decision Tree -- Test Set -- Mean 0.28836208023772913 -- Min 0.17435897435897435 -- Max 0.42538593481989706
Naive Bayes -- Training Set -- Mean 0.4757375384984509 -- Min 0.4040692146795969 -- Max 0.49667363619083826
Naive Bayes -- Test Set -- Mean 0.4312115713725492 -- Min 0.33843537414965985 -- Max 0.5247863247863248


In [None]:
# One box per classifier showing the spread of its per-fold test accuracy.
traces = []
for clf_name in names:
    if clf_name not in cl_scores:
        # The comparison cell tolerates failing classifiers and records no
        # scores for them; skip those instead of raising KeyError here.
        continue
    traces.append(go.Box(
        y=cl_scores[clf_name]['test_score'],
        boxpoints='all',  # draw every fold's score next to the box
        name=clf_name
    ))

layout = go.Layout(title="Accuracy on Test Set (Wine Quality)")

py.iplot(go.Figure(data=traces, layout=layout))