In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

In [2]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [29]:
from sklearn.linear_model import LogisticRegression

In [3]:
df1 = pd.read_csv('data/winequality-red.csv',sep=';') # full red wine data with multicollinear predictors and outliers present
df1.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
features = df1.columns[0:11].tolist()
print(features)

['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']


In [5]:
X = df1[features].values
y= df1['quality'].values

In [6]:
var_thresh = VarianceThreshold(threshold=0.8*(1-0.8))

In [7]:
var_thresh.fit(X)

VarianceThreshold(threshold=0.15999999999999998)

In [8]:
var_dict = {x[0]:x[1] for x in zip(features,var_thresh.variances_)}
var_dict
#variances = var_thresh.variances_

{'fixed acidity': 3.0295205688671114,
 'volatile acidity': 0.0320423261333205,
 'citric acid': 0.03792375112494089,
 'residual sugar': 1.9866539202698996,
 'chlorides': 0.002213757323311435,
 'free sulfur dioxide': 109.34645676374501,
 'total sulfur dioxide': 1081.42563558916,
 'density': 3.559801792630712e-06,
 'pH': 0.023820274241131787,
 'sulphates': 0.02871464701398349,
 'alcohol': 1.1349371714889036}

In [10]:
selected_feats =[]

for key in var_dict:
    if var_dict[key] > var_thresh.threshold:
        selected_feats.append(key)


In [12]:
selected_feats

['fixed acidity',
 'residual sugar',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'alcohol']

In [13]:
from project_functions import *

In [14]:
mini_df1 = df1[selected_feats]
mini_df1.head()

Unnamed: 0,fixed acidity,residual sugar,free sulfur dioxide,total sulfur dioxide,alcohol
0,7.4,1.9,11.0,34.0,9.4
1,7.8,2.6,25.0,67.0,9.8
2,7.8,2.3,15.0,54.0,9.8
3,11.2,1.9,17.0,60.0,9.8
4,7.4,1.9,11.0,34.0,9.4


In [15]:
vif_results,cols_2_keep = calculate_vif_per_factor(mini_df1,selected_feats)

In [16]:
vif_results

Unnamed: 0,Features,VIF
0,fixed acidity,1.06
1,residual sugar,1.08
2,free sulfur dioxide,1.85
3,total sulfur dioxide,1.92
4,alcohol,1.07


In [20]:
mini_df1 = mini_df1.assign(quality=y)
mini_df1.head()

Unnamed: 0,fixed acidity,residual sugar,free sulfur dioxide,total sulfur dioxide,alcohol,quality
0,7.4,1.9,11.0,34.0,9.4,5
1,7.8,2.6,25.0,67.0,9.8,5
2,7.8,2.3,15.0,54.0,9.8,5
3,11.2,1.9,17.0,60.0,9.8,6
4,7.4,1.9,11.0,34.0,9.4,5


In [22]:
mini_df1['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [23]:
mini_df1=mini_df1.assign(qual_1 = mini_df1['quality'].apply(lambda x: 0 if x in[0,1,2,3,4,5] else 1))
mini_df1.head()

Unnamed: 0,fixed acidity,residual sugar,free sulfur dioxide,total sulfur dioxide,alcohol,quality,qual_1
0,7.4,1.9,11.0,34.0,9.4,5,0
1,7.8,2.6,25.0,67.0,9.8,5,0
2,7.8,2.3,15.0,54.0,9.8,5,0
3,11.2,1.9,17.0,60.0,9.8,6,1
4,7.4,1.9,11.0,34.0,9.4,5,0


In [24]:
mini_df1['qual_1'].value_counts()

1    855
0    744
Name: qual_1, dtype: int64

In [26]:
min_max_one = MinMaxScaler()

In [27]:
new_X = min_max_one.fit_transform(mini_df1[selected_feats].values)
new_y = mini_df1['qual_1']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(new_X, new_y,test_size=0.30, random_state=101)

In [33]:
lin_svm = SVC(kernel='linear')
rbf_svm = SVC(kernel='rbf',gamma='auto')
log_reg = LogisticRegression(solver='lbfgs')

Linear SVM Results

In [34]:
lin_svm_scores_5 = cross_val_score(lin_svm,X_train,y_train,scoring='f1_macro',cv=5)
lin_svm_scores_10 = cross_val_score(lin_svm,X_train,y_train,scoring='f1_macro',cv=10)

print("F1 Score (CV = 5): {}".format(round(np.array(lin_svm_scores_5).mean(),2)))
print("F1 Score (CV = 10): {}".format(round(np.array(lin_svm_scores_10).mean(),2)))

F1 Score (CV = 5): 0.72
F1 Score (CV = 10): 0.72


RBF SVM Results

In [36]:
rbf_svm_scores_5 = cross_val_score(rbf_svm,X_train,y_train,scoring='f1_macro',cv=5)
rbf_svm_scores_10 = cross_val_score(rbf_svm,X_train,y_train,scoring='f1_macro',cv=10)

print("F1 Score (CV = 5): {}".format(round(np.array(rbf_svm_scores_5).mean(),2)))
print("F1 Score (CV = 10): {}".format(round(np.array(rbf_svm_scores_10).mean(),2)))

F1 Score (CV = 5): 0.72
F1 Score (CV = 10): 0.72


Logistic Regression Results

In [38]:
lg_scores_5 = cross_val_score(log_reg,X_train,y_train,scoring='f1_macro',cv=5)
lg_scores_10 = cross_val_score(log_reg,X_train,y_train,scoring='f1_macro',cv=10)

print("F1 Score (CV = 5): {}".format(round(np.array(lg_scores_5).mean(),2)))
print("F1 Score (CV = 10): {}".format(round(np.array(lg_scores_10).mean(),2)))

F1 Score (CV = 5): 0.71
F1 Score (CV = 10): 0.72


Removing Outliers and running the 3 models again

In [39]:
outlier_flags = outlier_checker(mini_df1,features_iqr(mini_df1))
mini_df1 = mini_df1.assign(is_outlier = outlier_flags)
mini_df1.head()

Unnamed: 0,fixed acidity,residual sugar,free sulfur dioxide,total sulfur dioxide,alcohol,quality,qual_1,is_outlier
0,7.4,1.9,11.0,34.0,9.4,5,0,0
1,7.8,2.6,25.0,67.0,9.8,5,0,0
2,7.8,2.3,15.0,54.0,9.8,5,0,0
3,11.2,1.9,17.0,60.0,9.8,6,1,0
4,7.4,1.9,11.0,34.0,9.4,5,0,0


In [40]:
mini_df2 = mini_df1[mini_df1['is_outlier'] == 0]
mini_df2.head()

Unnamed: 0,fixed acidity,residual sugar,free sulfur dioxide,total sulfur dioxide,alcohol,quality,qual_1,is_outlier
0,7.4,1.9,11.0,34.0,9.4,5,0,0
1,7.8,2.6,25.0,67.0,9.8,5,0,0
2,7.8,2.3,15.0,54.0,9.8,5,0,0
3,11.2,1.9,17.0,60.0,9.8,6,1,0
4,7.4,1.9,11.0,34.0,9.4,5,0,0


In [41]:
new_X_2 = mini_df2[selected_feats].values
new_y_2 = mini_df2['qual_1'].values

In [42]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(new_X_2, new_y_2,test_size=0.30, random_state=101)

Linear Kernel SVM results without Outliers in data

In [43]:
lin_svm_scores_5 = cross_val_score(lin_svm,X_train_2,y_train_2,scoring='f1_macro',cv=5)
lin_svm_scores_10 = cross_val_score(lin_svm,X_train_2,y_train_2,scoring='f1_macro',cv=10)

print("F1 Score (CV = 5): {}".format(round(np.array(lin_svm_scores_5).mean(),2)))
print("F1 Score (CV = 10): {}".format(round(np.array(lin_svm_scores_10).mean(),2)))

F1 Score (CV = 5): 0.71
F1 Score (CV = 10): 0.71


RBF Kernel SVM results without Outliers in data

In [44]:
rbf_svm_scores_5 = cross_val_score(rbf_svm,X_train_2,y_train_2,scoring='f1_macro',cv=5)
rbf_svm_scores_10 = cross_val_score(rbf_svm,X_train_2,y_train_2,scoring='f1_macro',cv=10)

print("F1 Score (CV = 5): {}".format(round(np.array(rbf_svm_scores_5).mean(),2)))
print("F1 Score (CV = 10): {}".format(round(np.array(rbf_svm_scores_10).mean(),2)))

F1 Score (CV = 5): 0.67
F1 Score (CV = 10): 0.67


Logistic Regression results without Outliers in data

In [45]:
lg_scores_5 = cross_val_score(log_reg,X_train_2,y_train_2,scoring='f1_macro',cv=5)
lg_scores_10 = cross_val_score(log_reg,X_train_2,y_train_2,scoring='f1_macro',cv=10)

print("F1 Score (CV = 5): {}".format(round(np.array(lg_scores_5).mean(),2)))
print("F1 Score (CV = 10): {}".format(round(np.array(lg_scores_10).mean(),2)))

F1 Score (CV = 5): 0.71
F1 Score (CV = 10): 0.71
