# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from matplotlib import style
import math
import pickle
%matplotlib inline
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
from math import pi
from pandas.plotting import parallel_coordinates
import networkx as nx

# Algorithms
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_curve,roc_auc_score
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV

# Load the dataset

In [None]:
# Read the wine-quality CSV (path is relative to the notebook's working directory)
# into the `wine` DataFrame and preview its first rows.
wine = pd.read_csv(
    "winequality.csv"
)
wine.head()

# Understanding Data

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple
print("Rows, columns: {}".format(wine.shape))

# Preview the first five rows of the dataset
wine.head()

In [None]:
# Rename columns: swap the spaces in multi-word column names for underscores
# so the columns can be accessed as attributes (e.g. wine.citric_acid).
spaced_names = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'free sulfur dioxide',
    'total sulfur dioxide',
]
wine.rename(columns={name: name.replace(' ', '_') for name in spaced_names}, inplace=True)
wine.head()

In [None]:
# Print each column's dtype and non-null count.
# Author's note: the data has no categorical variables, only numerical ones
# (11 of the features are floats, 1 is an integer).
wine.info()

In [None]:
# Parallel-coordinates plot: one line per wine across all columns, coloured by 'quality'.
# The figure is also written to Data_class_visualization.png in the working directory.
class_fig, class_ax = plt.subplots(figsize=(15, 10))
parallel_coordinates(wine, 'quality', colormap=plt.get_cmap("Set1"), ax=class_ax)
class_ax.set_title("Red Wine data class visualization according to quality (3,4,5,6,7,8)")
class_ax.set_xlabel("Ingredients of data set")
class_ax.set_ylabel("level")
# Tilt the column names so they do not overlap
plt.setp(class_ax.get_xticklabels(), rotation=45)
class_fig.savefig('Data_class_visualization.png')
plt.show()

In [None]:
# Number of distinct values in each column of the dataset
distinct_counts = wine.nunique()
for column_name, distinct_count in distinct_counts.items():
    print("Number of unique values of {}:{}".format(column_name, distinct_count))

In [None]:
# Describe the dataset to get a better idea of what's happening:
# summary statistics for every column (include='all' requests all columns, whatever their dtype).
wine.describe(include='all')

Observations:

Chlorides - Minimum of 0.012, maximum of 0.611, meaning some wines are really salty

Residual sugar - Minimum of 0.9, maximum of 15.5. Some wines are really sweet

Fixed acidity - the 25th percentile is 7.1 and the 50th percentile (median) is 7.9. This could explain the huge number of outliers

pH - some wines are much more acidic than others

The mean is more than median (50th percentile) in all columns

There is a large difference between the 75th percentile and the maximum in residual sugar, free sulfur dioxide and total sulfur dioxide

In [None]:
# Distribution of outliers within the data: one vertical box plot per column,
# laid out left to right on the first row of an nrows x ncol subplot grid.
collist = wine.columns.values
ncol = 12
nrows = 10
plt.figure(figsize=(ncol, 5 * ncol))
for i, column_name in enumerate(collist):
    plt.subplot(nrows, ncol, i + 1)
    # Pass the series explicitly as `y`: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn versions.
    sns.boxplot(y=wine[column_name], color='green', orient='v')
# Lay the figure out once, after all subplots exist, instead of on every iteration
plt.tight_layout()

In [None]:
# Check missing values: count the NaN entries in every column
missing_per_column = wine.isna().sum()
print(missing_per_column)

The data looks very clean by looking at the first five rows

# Exploring Variables

In [None]:
# Interactive histogram of the 'quality' variable, to check that the dataset
# contains enough good-quality wines.
fig = px.histogram(wine,x='quality')
fig.show()

Observation:
    
Quality has most values concentrated in the categories of 5,6,7

Fewer values are concentrated in the categories of 3, 4 and 8

In [None]:
# Plot the pH histogram on the main axes with an inset scatter plot of pH by row index.
data = wine.loc[:, ["pH", "citric_acid", "residual_sugar", "density"]]
data_1 = wine.quality
# NOTE(review): `x` and `data_1` are not referenced again in this cell.
x = dict(zip(data_1.unique(), "rgb"))

# trace1: scatter plot, drawn on the secondary axes pair (x2 / y2)
scatter_marker = {"color": 'rgba(0, 112, 20, 0.8)'}
trace1 = go.Scatter(
    x=data.index,
    y=data.pH,
    mode="markers",
    xaxis='x2',
    yaxis='y2',
    name="pH",
    marker=scatter_marker,
)

# trace2: histogram, drawn on the default axes
histogram_marker = {"color": 'rgba(10, 200, 250, 0.6)'}
trace2 = go.Histogram(
    x=data.pH,
    opacity=0.75,
    name="pH",
    marker=histogram_marker,
)

# Combine both traces; the layout places the secondary axes in the upper-right corner
trace_data = [trace1, trace2]
inset_x_axis = {"domain": [0.7, 1], "anchor": 'y2'}
inset_y_axis = {"domain": [0.6, 0.95], "anchor": 'x2'}
layout = go.Layout(
    xaxis2=inset_x_axis,
    yaxis2=inset_y_axis,
    title=' pH Histogram and Scatter Plot',
)
fig = go.Figure(data=trace_data, layout=layout)
iplot(fig)

In [None]:
# Calculate the pairwise correlation between the feature columns.
# Select features by name rather than by position: the previous `iloc[:, 0:10]`
# slice silently left out the 11th feature column. `errors='ignore'` keeps the
# cell re-runnable whether or not the derived 'rating' column exists yet.
corr = wine.drop(columns=['quality', 'rating'], errors='ignore').corr()
corr

In [None]:
# Reshape the square correlation matrix into long form:
# one row per (var1, var2) pair with its correlation in 'value'.
links = (
    corr.stack()
    .rename_axis(['var1', 'var2'])
    .reset_index(name='value')
)
links.head(10)

In [None]:
# Correlation network: nodes are variables, edges are the retained correlation links.
# NOTE(review): correlations lie in [-1, 1], so a threshold of -1 keeps every pair;
# only the self-correlations are removed and the graph is fully connected.
threshold = -1

# Keep links at or above the threshold and drop self correlation (cor(A,A)=1)
meets_threshold = links['value'] >= threshold
is_distinct_pair = links['var1'] != links['var2']
links_filtered = links.loc[meets_threshold & is_distinct_pair]

# Build the graph from the (var1, var2) edge list
G = nx.from_pandas_edgelist(links_filtered, 'var1', 'var2')

# Draw the nodes on a circle
draw_options = dict(
    with_labels=True,
    node_color='orange',
    node_size=200,
    edge_color='red',
    linewidths=0,
    font_size=15,
)
nx.draw_circular(G, **draw_options)

In [None]:
# Spider (radar) chart comparing the feature values of the first two wines (rows 0 and 1).
# (row label, legend label, fill colour) for each polygon drawn on the chart.
spider_rows = [(0, "wine 0", "b"), (1, "wine 1", "orange")]

# Take the axis labels from the same Series the values are read from, so every
# spoke is labelled with the feature it actually shows. (The previous
# `list(wine)[1:]` shifted every label by one feature and labelled the last
# spoke 'quality'.)
categories = list(wine.loc[0].drop("quality").index)
N = len(categories)
angles = [n / float(N) * 2 * pi for n in range(N)]
angles = angles + angles[:1]  # repeat the first angle to close the polygon

plt.figure(figsize=(10, 10))
ax = plt.subplot(111, polar=True)
ax.set_theta_offset(pi / 2)   # first spoke at the top
ax.set_theta_direction(-1)    # spokes proceed clockwise
plt.xticks(angles[:-1], categories)
ax.set_rlabel_position(0)
plt.yticks([0, 2, 4, 6], ["0", "2", "4", "6"], color="red", size=7)
# NOTE(review): values above 6 fall outside this radial limit.
plt.ylim(0, 6)

for row_label, legend_label, fill_color in spider_rows:
    values = wine.loc[row_label].drop("quality").values.flatten().tolist()
    values = values + values[:1]  # close the polygon
    ax.plot(angles, values, linewidth=1, linestyle="solid", label=legend_label)
    ax.fill(angles, values, fill_color, alpha=0.1)

plt.legend(loc="upper left", bbox_to_anchor=(0.1, 0.1))
plt.show()

In [None]:
# Annotated heatmap of the correlation matrix of all columns in `wine`
corr = wine.corr()
heat_fig, heat_ax = plt.subplots(figsize=(15, 10))
diverging_cmap = sns.diverging_palette(220, 20, as_cmap=True)
column_labels = corr.columns
sns.heatmap(
    corr,
    xticklabels=column_labels,
    yticklabels=column_labels,
    annot=True,
    cmap=diverging_cmap,
    ax=heat_ax,
)

In [None]:
# Correlation of every column with 'quality', sorted from largest to smallest
corr['quality'].sort_values(ascending=False)

Observation:
    
Quality is highly correlated with alcohol

Alcohol is negatively correlated with density -0.5

Density is highly positively correlated with fixed acidity

Volatile acidity is negatively correlated with quality and citric acid

Free sulfur dioxide is highly correlated with total sulfur dioxide

Conclusion can be made that the attributes alcohol, sulphates, citric acid, fixed acidity have maximum correlation with 'quality'


In [None]:
# 3D scatter plot of the good-quality wines (quality 6, 7 and 8)
# over pH (x), density (y) and sulphates (z).
quality6 = wine[wine.quality == 6]
quality7 = wine[wine.quality == 7]
quality8 = wine[wine.quality == 8]

# One entry per trace: (subset, legend name, marker fill colour, marker outline colour).
# The three traces differ only in these values, so they are built from one template.
trace_specs = [
    (quality6, "quality-6", 'rgb(100, 101, 101)', 'rgb(100, 100, 100)'),
    (quality7, "quality-7", 'rgb(217, 100, 100)', 'rgb(255, 255, 255)'),
    (quality8, "quality-8", 'rgb(54, 170, 127)', 'rgb(204, 204, 204)'),
]

data2 = [
    go.Scatter3d(
        x=subset.pH,
        y=subset.density,
        z=subset.sulphates,
        mode='markers',
        name=trace_name,
        marker=dict(
            color=fill_color,
            size=12,
            line=dict(color=outline_color, width=0.1),
        ),
    )
    for subset, trace_name, fill_color, outline_color in trace_specs
]
# Keep the individual trace names available, as before
trace1, trace2, trace3 = data2

layout = go.Layout(
    # The title now names all three plotted quality levels (quality-6 was missing)
    title=' 3D quality-6, quality-7 and quality-8',
    margin=dict(l=0, r=0, b=0, t=0),
)
fig = go.Figure(data=data2, layout=layout)
iplot(fig)

In [None]:
# Create a classification version of the target variable:
# good quality (quality >= 6) becomes 1, bad quality becomes 0.
is_good_quality = wine['quality'] >= 6
wine['rating'] = is_good_quality.astype('int64')

# Separate the feature variables (everything except both targets) from the target variable
target_columns = ['quality', 'rating']
X = wine.drop(columns=target_columns)
y = wine['rating']

In [None]:
# See the balance of good vs bad wines: number of rows for each 'rating' value
# (raw counts; value_counts() is called without normalisation)
wine['rating'].value_counts()

In [None]:
# Mean of every column, computed separately for each 'rating' group
wine.groupby('rating').mean()

In [None]:
# Analysis of alcohol percentage against wine quality: one box per quality score
alcohol_labels = {
    'xlabel': 'Wine Quality',
    'ylabel': 'Alcohol Percent',
    'title': 'Alcohol percent in different wine quality types',
}
bx = sns.boxplot(data=wine, x="quality", y='alcohol')
bx.set(**alcohol_labels)

Observation

Alcohol content increases as the quality of wine increases

In [None]:
# Analysis of pH against wine rating: one swarm of points per rating
ph_labels = {
    'xlabel': 'Wine Ratings',
    'ylabel': 'pH',
    'title': 'pH in different types of Wine ratings',
}
bx = sns.swarmplot(data=wine, x="rating", y="pH")
bx.set(**ph_labels)

In [None]:
# Analysis of sulphates against wine rating: one box per rating
sulphates_labels = {
    'xlabel': 'Wine Ratings',
    'ylabel': 'Sulphates',
    'title': 'Sulphates in different types of Wine ratings',
}
bx = sns.boxplot(data=wine, x="rating", y='sulphates')
bx.set(**sulphates_labels)



In [None]:
# Analysis of citric acid against wine quality: one violin per quality score.
# NOTE(review): the title mentions ratings, but the x axis is the 'quality' column.
citric_acid_labels = {
    'xlabel': 'Quality',
    'ylabel': 'Citric Acid',
    'title': 'Citric_acid in different types of Wine ratings',
}
bx = sns.violinplot(data=wine, x="quality", y='citric_acid')
bx.set(**citric_acid_labels)



In [None]:
# Analysis of fixed acidity against wine rating: one box per rating
fixed_acidity_labels = {
    'xlabel': 'Wine Ratings',
    'ylabel': 'Fixed Acidity',
    'title': 'Fixed Acidity in different types of Wine ratings',
}
bx = sns.boxplot(data=wine, x="rating", y='fixed_acidity')
bx.set(**fixed_acidity_labels)

In [None]:
# Distribution plots: alcohol distribution drawn once per rating (one hue each)
alcohol_grid = sns.FacetGrid(wine, hue='rating', height=6)
alcohol_grid.map(sns.distplot, 'alcohol')
alcohol_grid.add_legend()

Observation



There is a higher probability of good quality wine, if alcohol content is >= 12

The probability of good quality wine decreases as alcohol content decreases

Sulphates level increases with the quality of wine

Citric acid increases as quality of the wine increases

In [None]:
# Linear Regression
# Draw a linear regression fit of residual sugar (y axis) against alcohol content (x axis),
# with one panel per value of 'rating' (bad, good).
sns.lmplot(x = 'alcohol', y = 'residual_sugar', col = 'rating', data = wine)

 
Observation

An observation can be made that in both types of wine the residual sugar content remains almost the same irrespective of change in alcohol content value.

# Building a Machine Learning Model

In [None]:
# Normalize the feature variables with standard scaling (zero mean, unit variance).
# The unscaled features are always rebuilt from `wine`, so re-running this cell
# gives the same result. Previously `X = sc.fit_transform(X)` consumed its own
# output on a re-run and `X_features` was overwritten with the scaled array.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics influence the scaling; consider fitting on X_train only.
sc = StandardScaler()
X_features = wine.drop(['quality', 'rating'], axis=1)
X = sc.fit_transform(X_features)

In [None]:
# Splitting the data: hold out 25% of the rows for testing, with a fixed seed
# so the split is reproducible.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Applying Machine Learning Algorithms:
# fit each candidate classifier on the training split and report its test-set
# accuracy, 10-fold cross-validated accuracy, ROC AUC, classification report
# and confusion matrix, followed by a confusion-matrix heatmap and a ROC plot.
lg = LogisticRegression()
gnb = GaussianNB()

# Prepare models as (display name, estimator) pairs
models = [
    ('LogisticRegression', lg),
    ('Naive Bayes', gnb),
]

# Per-model results, in the same order as `models`; scores are stored as percentages
Model = []
cvs = []
score = []
rocscore = []
for name, model in models:
    print('**************', name, '***********')
    print('\n')
    Model.append(name)
    model.fit(X_train, y_train)
    print(model)
    predictions = model.predict(X_test)
    print('\n')

    acc = accuracy_score(y_test, predictions)
    print('accuracy score', acc)
    score.append(acc * 100)

    # Cross-validation refits a clone of the model on the full (X, y) data
    cv = model_selection.cross_val_score(model, X, y, cv=10, scoring='accuracy').mean()
    print('Cross-val-score=', cv)
    cvs.append(cv * 100)
    print('\n')

    # NOTE(review): the ROC curve and AUC are computed from hard 0/1 predictions,
    # which yields a single operating point; model.predict_proba(X_test)[:, 1]
    # would give the full curve. Left unchanged to keep the reported numbers.
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predictions)
    print('roc_auc_score', roc_auc)
    rocscore.append(roc_auc * 100)
    print('\n')

    print(classification_report(y_test, predictions))
    print('\n')
    cm = confusion_matrix(y_test, predictions)
    print(cm)
    print('\n')

    plt.figure(figsize=(10, 15))
    plt.subplot(911)
    plt.title(name)
    # fmt='d' shows the counts as integers (the default format switches to
    # scientific notation for values >= 100); the Axes repr is no longer printed.
    sns.heatmap(cm, annot=True, fmt='d')
    plt.subplot(912)
    plt.title(name)
    # The label previously used 'AUC' % roc_auc, which raises TypeError because
    # the format string has no placeholder.
    plt.plot(false_positive_rate, true_positive_rate, label='AUC = %0.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    plt.legend(loc='lower right')
    plt.xlabel('false_positive_rate')
    plt.ylabel('true_positive_rate')
    plt.show()

Selecting Best Model:

The precision of predicting good quality wine (1) is 77% whereas prediction of bad quality wine is 74%.

From the results above we can observe that the accuracy of Logistic Regression is 76%, compared to 74% for Naive Bayes. This looks like a good score.

Logistic Regression model has a higher Cross-val-score of 73%.

Overall, the performance of the Logistic Regression algorithm is good.

Receiver Operating Characteristic(ROC) curve is a plot of the true positive rate against the false positive rate. It shows the tradeoff between sensitivity and specificity.

AUC(Area Under Curve) score for the case is 0.75. AUC score 1 represents perfect classifier, and 0.5 represents a worthless classifier.

In [None]:
# Show Mean Absolute Error, Mean Squared Error & Root Mean Squared Error for the
# Logistic Regression model on the test split.
# The predictions are computed here explicitly: the cell previously reused
# `predictions`, which after the evaluation loop holds the output of whichever
# model was fitted last rather than of `lg`.
from sklearn import metrics
lg_predictions = lg.predict(X_test)
lg_mse = metrics.mean_squared_error(y_test, lg_predictions)
print('MAE:', metrics.mean_absolute_error(y_test, lg_predictions))
print('MSE:', lg_mse)
print('RMSE:', np.sqrt(lg_mse))

In [None]:
# Intercept term of the fitted Logistic Regression model
print(lg.intercept_)

In [None]:
# Show the training-set accuracy of the Logistic Regression model.
# For a classifier, `score` returns the mean accuracy, not an R squared value,
# so the printed label now says what the number is.
print('Training accuracy: ', lg.score(X_train, y_train))

In [None]:
# Correlation between the predicted and the actual training labels (Pearson r).
# The previous value, sqrt(lg.score(...)), was the square root of an accuracy,
# which is not a correlation coefficient.
train_predictions = lg.predict(X_train)
print('Correlation: ', np.corrcoef(y_train, train_predictions)[0, 1])

In [None]:
# Confusion matrix of the Logistic Regression predictions on the test split
y_predicted = lg.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_predicted)
cm

In [None]:
# True positive is 135 (34%), false negative is 50 (12%)
# False positive is 48 (12%), true negative is 167 (42%)

# true positives: These are cases in which we predicted yes and are actually yes.
# true negatives: Cases predicted no, and no in actual.
# false positives: Cases predicted yes, but actual is no. 
# false negatives: Cases predicted no, yes in actual. 

In [None]:
# Predicting values with Logistic Regression:
# refit on the training split (fit returns the estimator itself), then predict the test split
predictions = lg.fit(X_train, y_train).predict(X_test)
print('predicted :', predictions)
print('actual', y_test)

In [None]:
# Convert the predicted labels (NumPy array) to a plain Python list
x = np.array(y_predicted).tolist()

# Print the first 5 predictions, one per line
print("\nThe prediction:\n")
first_five_predictions = [x[position] for position in range(5)]
for predicted_label in first_five_predictions:
    print(predicted_label)

# Print the first five expectations (actual labels)
print("\nThe expectation:\n")
print(y_test.head())

Observation

Almost all of the values in the prediction are similar to the expectations. 

Within the first 5 predictions our model wrongly predicted a 0 as 1; the other predictions match the expectations, suggesting that our model performs well



In [None]:
# More detailed prediction vs actual: predicted and true labels side by side,
# indexed like y_test
predictions = lg.predict(X_test)
comparison_columns = {"Prediction": predictions, "Actual": y_test}
pd.DataFrame(comparison_columns)

# Saving Model

In [None]:
# Save the selected model (Logistic Regression) to disk.
# `lg` is named explicitly: the cell previously pickled `model`, the loop variable
# left over from the evaluation cell, i.e. the last model in the list rather than
# the chosen one. `with` blocks make sure both file handles are closed.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(lg, model_file)

# Load the model back from disk.
# NOTE: only unpickle files you created yourself; pickle.load can execute arbitrary code.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Use the loaded model to make predictions and score them on the test split
result = loaded_model.predict(X_test)
result_accuracy = loaded_model.score(X_test, y_test)

print(result)
print(result_accuracy)

In [None]:
# The accuracy printed above is an estimate of the model's accuracy on unseen data (74% was reported in the original run)

In [None]:
# Save the model using joblib.
# `sklearn.externals.joblib` no longer exists in current scikit-learn releases;
# import the standalone `joblib` package (installed as a scikit-learn dependency)
# and fall back to the old location only for very old scikit-learn versions.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save the Logistic Regression model to a file
joblib.dump(lg, 'finalized_model_joblib.obj')

# Load the model back from the file
lg_from_joblib = joblib.load('finalized_model_joblib.obj')

# Use the loaded model to make predictions and score them on the test split
result = lg_from_joblib.predict(X_test)
result_accuracy = lg_from_joblib.score(X_test, y_test)

print(result)
print(result_accuracy)


In [None]:
# Keep only the good-quality wines (rating == 1) and summarise their columns
is_good = wine['rating'] == 1
wine_good = wine.loc[is_good]
wine_good.describe()

We can see that good quality wines have:

higher levels of alcohol on average
lower volatile acidity on average 
higher levels of sulphates on average
higher levels of residual sugar on average.

In [None]:
# Keep only the bad-quality wines (rating == 0) and summarise their columns
is_bad = wine['rating'] == 0
wine_bad = wine.loc[is_bad]
wine_bad.describe()

Conclusion 

Observations were made about the key factors that determine and affect the quality of red wine. Wine quality is ultimately a subjective measure.

To make the predictions of wine quality we trained two models: Logistic Regression and Naive Bayes. The Logistic Regression model performed marginally better, so we decided to stick with this model.

It is possible to increase the quality of the wine, if variables such as residual sugars and alcohol are controlled.