In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

In [3]:
df1 = pd.read_csv('data/vif-clean-winequality-red.csv') # Data without multicollinear factors
df2 = pd.read_csv('data/clean-winequality-red.csv') # Data without outliers and multicollinear factors

In [4]:
df1['quality'] = df1['quality'].apply(lambda x: 0 if x in[0,1,2,3,4,5] else 1)
df1.head()

Unnamed: 0.1,Unnamed: 0,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,pH,sulphates,alcohol,quality,outlier_check
0,0,0.7,0.0,1.9,0.076,11.0,34.0,3.51,0.56,9.4,0,0
1,1,0.88,0.0,2.6,0.098,25.0,67.0,3.2,0.68,9.8,0,0
2,2,0.76,0.04,2.3,0.092,15.0,54.0,3.26,0.65,9.8,0,0
3,3,0.28,0.56,1.9,0.075,17.0,60.0,3.16,0.58,9.8,1,0
4,4,0.7,0.0,1.9,0.076,11.0,34.0,3.51,0.56,9.4,0,0


In [5]:
df2['quality'] = df2['quality'].apply(lambda x: 0 if x in[0,1,2,3,4,5] else 1)
df2.head()

Unnamed: 0.1,Unnamed: 0,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,pH,sulphates,alcohol,quality,outlier_check
0,0,0.7,0.0,1.9,0.076,11.0,34.0,3.51,0.56,9.4,0,0
1,1,0.88,0.0,2.6,0.098,25.0,67.0,3.2,0.68,9.8,0,0
2,2,0.76,0.04,2.3,0.092,15.0,54.0,3.26,0.65,9.8,0,0
3,3,0.28,0.56,1.9,0.075,17.0,60.0,3.16,0.58,9.8,1,0
4,4,0.7,0.0,1.9,0.076,11.0,34.0,3.51,0.56,9.4,0,0


In [6]:
features = list(set(df1.columns) - set(['Unnamed: 0','quality','outlier_check']))

In [7]:
min_max_one = MinMaxScaler()

In [8]:
X_one = min_max_one.fit_transform(df1[features].values)

In [9]:
y_one = df1['quality'].values

In [10]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_one, y_one
                                                    , test_size=0.30, random_state=101)

SVM using a Linear Kernel

In [11]:
lin_svm = SVC(kernel='linear')

In [12]:
lin_svm_scores_5 = cross_val_score(lin_svm,X_train_1,y_train_1,scoring='f1_macro',cv=5)
lin_svm_scores_10 = cross_val_score(lin_svm,X_train_1,y_train_1,scoring='f1_macro',cv=10)

In [13]:
print("F1 Score (CV = 5): {}".format(round(np.array(lin_svm_scores_5).mean(),2)))
print("F1 Score (CV = 10): {}".format(round(np.array(lin_svm_scores_10).mean(),2)))

F1 Score (CV = 5): 0.74
F1 Score (CV = 10): 0.74


SVM using a RBF kernel

In [16]:
rbf_svm = SVC(kernel='rbf',gamma='auto')

In [17]:
rbf_svm_scores_5 = cross_val_score(rbf_svm,X_train_1,y_train_1,scoring='f1_macro',cv=5)
rbf_svm_scores_10 = cross_val_score(rbf_svm,X_train_1,y_train_1,scoring='f1_macro',cv=10)

In [18]:
print("F1 Score (CV = 5): {}".format(round(np.array(rbf_svm_scores_5).mean(),2)))
print("F1 Score (CV = 10): {}".format(round(np.array(rbf_svm_scores_10).mean(),2)))

F1 Score (CV = 5): 0.73
F1 Score (CV = 10): 0.73


In [20]:
from sklearn.linear_model import LogisticRegression

In [23]:
log_reg = LogisticRegression(solver='lbfgs')

In [25]:
lg_scores_5 = cross_val_score(log_reg,X_train_1,y_train_1,scoring='f1_macro',cv=5)
lg_scores_10 = cross_val_score(log_reg,X_train_1,y_train_1,scoring='f1_macro',cv=10)

In [26]:
print("F1 Score (CV = 5): {}".format(round(np.array(lg_scores_5).mean(),2)))
print("F1 Score (CV = 10): {}".format(round(np.array(lg_scores_10).mean(),2)))

F1 Score (CV = 5): 0.75
F1 Score (CV = 10): 0.74


Conclusion: Logistic Regression performs slightly better than an SVM model (linear and rbf kernel)
when multicollinear predictors are removed but
outliers are present in the data.

In [27]:
X_two = min_max_one.fit_transform(df2[features].values)
y_two = df2['quality'].values

In [28]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_two, y_two
                                                    , test_size=0.30, random_state=101)

SVM using a linear kernel

In [29]:
lin_svm_scores2_5 = cross_val_score(lin_svm,X_train_2,y_train_2,scoring='f1_macro',cv=5)
lin_svm_scores2_10 = cross_val_score(lin_svm,X_train_2,y_train_2,scoring='f1_macro',cv=10)

In [30]:
print("F1 Score (CV = 5): {}".format(round(np.array(lin_svm_scores2_5).mean(),2)))
print("F1 Score (CV = 10): {}".format(round(np.array(lin_svm_scores2_10).mean(),2)))

F1 Score (CV = 5): 0.74
F1 Score (CV = 10): 0.73


SVM using an RBF kernel

In [31]:
rbf_svm_scores2_5 = cross_val_score(rbf_svm,X_train_2,y_train_2,scoring='f1_macro',cv=5)
rbf_svm_scores2_10 = cross_val_score(rbf_svm,X_train_2,y_train_2,scoring='f1_macro',cv=10)

In [32]:
print("F1 Score (CV = 5): {}".format(round(np.array(rbf_svm_scores2_5).mean(),2)))
print("F1 Score (CV = 10): {}".format(round(np.array(rbf_svm_scores2_10).mean(),2)))

F1 Score (CV = 5): 0.74
F1 Score (CV = 10): 0.74


Using Logistic Regression

In [33]:
lg_scores2_5 = cross_val_score(log_reg,X_train_2,y_train_2,scoring='f1_macro',cv=5)
lg_scores2_10 = cross_val_score(log_reg,X_train_2,y_train_2,scoring='f1_macro',cv=10)

In [34]:
print("F1 Score (CV = 5): {}".format(round(np.array(lg_scores2_5).mean(),2)))
print("F1 Score (CV = 10): {}".format(round(np.array(lg_scores2_10).mean(),2)))

F1 Score (CV = 5): 0.74
F1 Score (CV = 10): 0.74


Conclusion: Logistic Regression and SVM with RBF kernel have the same performance when multicollinearity and outlier issues are addressed.