In [1]:
import pandas as pd
from plotly.offline import download_plotlyjs,init_notebook_mode,iplot
import plotly.graph_objs as go

In [2]:
from project_functions import *

In [3]:
# Switch Plotly into offline notebook mode so that iplot() can render figures
# inline. Per the Plotly docs, connected=True loads plotly.js from a CDN
# instead of embedding the library in the notebook.
init_notebook_mode(connected=True)

In [4]:
# Load the red-wine quality data set; the file is semicolon-delimited.
df = pd.read_csv(
    'data/winequality-red.csv',
    sep=';',
)

In [5]:
# Feature names are every column except the last one; the remaining column
# ('quality') is used as the target further down.
features = df.columns[:-1].tolist()
print(features)

['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']


In [6]:
# Build the long-form correlation table that feeds the Plotly heat map:
# one row per feature pair with its correlation value.
# NOTE(review): transform_corr_matrix comes from project_functions (not shown);
# it presumably flattens the square matrix into (feature, feature, value)
# rows - confirm there.
df_corr = pd.DataFrame(
    transform_corr_matrix(df[features].corr()),
    columns=['Feature_1', 'Feature_2', 'P_Corr'],
)
df_corr.head()

Unnamed: 0,Feature_1,Feature_2,P_Corr
0,fixed acidity,fixed acidity,1.0
1,fixed acidity,volatile acidity,-0.3
2,fixed acidity,citric acid,0.7
3,fixed acidity,residual sugar,0.1
4,fixed acidity,chlorides,0.1


In [7]:
# Heat map trace visualising the degree of correlation between features.
# x and y carry the two feature names of each pair, z the correlation value.
data = go.Heatmap(
    x=df_corr['Feature_1'].tolist(),
    y=df_corr['Feature_2'].tolist(),
    z=df_corr['P_Corr'].tolist(),
    colorscale='Jet',
)

In [8]:
# Layout for the full figure; only the figure title is set here.
layout = go.Layout(title='Red Wine Matrix Correlation')

In [9]:
# Combine the heat map trace (wrapped in a list, as Figure expects a sequence
# of traces) with the layout into a single figure object.
fig = go.Figure(data = [data], layout = layout)

In [10]:
# Render the full figure inline in the notebook (offline Plotly mode).
iplot(fig)

Calculate Variance Inflation Factor (VIF) for each feature.
Any feature with a VIF score greater than 5 is affected by multicollinearity
and should be removed from the dataset.

In [11]:
# Compute the per-feature VIF table (df_vif) and the list of columns to retain
# (cols_2_keep) for the rest of the analysis.
# NOTE(review): calculate_vif_per_factor comes from project_functions (not shown);
# presumably cols_2_keep holds the features whose VIF does not exceed the
# threshold of 5 - confirm in the helper.
df_vif,cols_2_keep = calculate_vif_per_factor(df,features)
df_vif.head()

Unnamed: 0,Features,VIF
0,fixed acidity,7.77
1,volatile acidity,1.79
2,citric acid,3.13
3,residual sugar,1.7
4,chlorides,1.48


In [12]:
# New frame restricted to the retained (non-multicollinear) feature columns,
# with the 'quality' target appended as the final column.
y = df['quality']
df2 = df[cols_2_keep].assign(quality=y)
df2.head()

Unnamed: 0,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,pH,sulphates,alcohol,quality
0,0.7,0.0,1.9,0.076,11.0,34.0,3.51,0.56,9.4,5
1,0.88,0.0,2.6,0.098,25.0,67.0,3.2,0.68,9.8,5
2,0.76,0.04,2.3,0.092,15.0,54.0,3.26,0.65,9.8,5
3,0.28,0.56,1.9,0.075,17.0,60.0,3.16,0.58,9.8,6
4,0.7,0.0,1.9,0.076,11.0,34.0,3.51,0.56,9.4,5


In [13]:
# Flag outliers and attach the per-row flag as a new 'outlier_check' column.
# NOTE(review): outlier_checker and features_iqr come from project_functions
# (not shown); the flag is presumably IQR-based with 0 meaning "not an
# outlier" - confirm in the helpers.
outlier_results = outlier_checker(df2, features_iqr(df2))
df3 = df2.assign(outlier_check=outlier_results)
df3.head()

Unnamed: 0,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,pH,sulphates,alcohol,quality,outlier_check
0,0.7,0.0,1.9,0.076,11.0,34.0,3.51,0.56,9.4,5,0
1,0.88,0.0,2.6,0.098,25.0,67.0,3.2,0.68,9.8,5,0
2,0.76,0.04,2.3,0.092,15.0,54.0,3.26,0.65,9.8,5,0
3,0.28,0.56,1.9,0.075,17.0,60.0,3.16,0.58,9.8,6,0
4,0.7,0.0,1.9,0.076,11.0,34.0,3.51,0.56,9.4,5,0


In [14]:
# Keep only the rows whose outlier flag is 0; copy so that df4 is independent
# of df3.
df4 = df3.loc[df3['outlier_check'].eq(0)].copy()
df4.head()

Unnamed: 0,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,pH,sulphates,alcohol,quality,outlier_check
0,0.7,0.0,1.9,0.076,11.0,34.0,3.51,0.56,9.4,5,0
1,0.88,0.0,2.6,0.098,25.0,67.0,3.2,0.68,9.8,5,0
2,0.76,0.04,2.3,0.092,15.0,54.0,3.26,0.65,9.8,5,0
3,0.28,0.56,1.9,0.075,17.0,60.0,3.16,0.58,9.8,6,0
4,0.7,0.0,1.9,0.076,11.0,34.0,3.51,0.56,9.4,5,0


In [15]:
#Run logistic regression without Cross Validation

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [16]:
# 70/30 train/test split of the cleaned data; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df4[cols_2_keep].values,
    df4['quality'].values,
    test_size=0.30,
    random_state=101,
)

In [23]:
# Fit a multinomial logistic regression (no cross-validation) on the training
# split. max_iter is raised to 10000 for the lbfgs solver.
# NOTE(review): recent scikit-learn releases deprecate the multi_class
# argument - confirm against the pinned version.
log_reg = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=10000,
).fit(X_train, y_train)

In [24]:
# Predicted quality labels for the held-out test split.
y_preds = log_reg.predict(X_test)

In [25]:
#Calculate R^2
from sklearn.metrics import r2_score

In [27]:
# R^2 between the true and the predicted quality labels on the test split.
# NOTE(review): r2_score is a regression metric; here it is applied to the
# class labels predicted by a classifier, i.e. the labels are treated as
# numeric values. Consider also reporting a classification metric.
r2 = r2_score(y_test,y_preds)

In [28]:
# Show the R^2 of the model fitted without cross-validation.
print(r2)

0.15694691655324045


In [33]:
# Write the cleaned data (multicollinear features dropped, rows flagged as
# outliers removed) to CSV. The 'outlier_check' column is still part of df4,
# and to_csv also writes the DataFrame index by default.
df4.to_csv('data/clean-winequality-red.csv')

In [49]:
# Write the data with multicollinear features dropped to CSV. Outlier rows are
# kept: df3 only carries the 'outlier_check' flag column. to_csv also writes
# the DataFrame index by default.
df3.to_csv('data/vif-clean-winequality-red.csv')

In [50]:
from sklearn.linear_model import LogisticRegressionCV

In [52]:
# Multinomial logistic regression with built-in 5-fold cross-validation,
# fitted on the same training split as the plain model.
log_reg_cv_5 = LogisticRegressionCV(
    cv=5,
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=10000,
).fit(X_train, y_train)

In [53]:
# Predicted quality labels from the 5-fold cross-validated model.
y_preds_5 = log_reg_cv_5.predict(X_test)

In [54]:
# R^2 of the 5-fold cross-validated model on the test split.
# NOTE(review): r2_score is a regression metric applied here to predicted
# class labels.
r2_cv5 = r2_score(y_test,y_preds_5)
print(r2_cv5)

0.1405238045380438


In [55]:
# Same model as the 5-fold variant, but with 10-fold cross-validation.
log_reg_cv_10 = LogisticRegressionCV(
    cv=10,
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=10000,
).fit(X_train, y_train)

In [56]:
# Predict with the 10-fold cross-validated model and report its R^2 on the
# test split.
# NOTE(review): r2_score is a regression metric applied here to predicted
# class labels.
y_preds_10 = log_reg_cv_10.predict(X_test)
r2_cv10 = r2_score(y_test,y_preds_10)
print(r2_cv10)

0.13504943386631163


Based on the R^2 computed on the predicted quality labels, it's evident that Logistic Regression is not a suitable approach, as it can only explain ~14-16%
of the variability in the data after accounting for multicollinearity and outliers.