# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from matplotlib import style
import math
import pickle
%matplotlib inline

# Algorithms

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_curve,roc_auc_score
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV

# Load the dataset

In [None]:
# Read the wine-quality CSV into a DataFrame and preview its first rows
wine = pd.read_csv(
    "winequality.csv",
)
wine.head()

# Understanding Data

In [None]:
# Report the dataset dimensions as (rows, columns)
print(f"Rows, columns: {wine.shape}")

# Preview the first five records
wine.head()

In [None]:
# Replace the spaces in the multi-word column names with underscores so the
# columns are easier to reference later on.
spaced_names = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'free sulfur dioxide',
    'total sulfur dioxide',
]
underscored = {name: name.replace(' ', '_') for name in spaced_names}
wine.rename(columns=underscored, inplace=True)
wine.head()

In [None]:
# Column overview: names, non-null counts and dtypes.
# Author's note: there are no categorical variables to study, only numerical
# ones -- 11 of the features are floats and 1 is an integer.
wine.info()

In [None]:
# Count the distinct values held by each column of the dataset
for column_name in wine.columns:
    distinct_count = wine[column_name].nunique()
    print(f"Number of unique values of {column_name}:{distinct_count}")

In [None]:
# Summary statistics for every column, to get a feel for ranges and spread
wine.describe(
    include='all',
)

Observations:

Chlorides - Minimum of 0.012, maximum of 0.611, meaning some wines are really salty

Residual sugar - Minimum of 0.9, maximum of 15.5. Some wines are really sweet

Fixed acidity - the 25th percentile is 7.1 and the 50th percentile (median) is 7.9. This could explain the huge number of outliers

pH - some wines are much more acidic than others

The mean is greater than the median (50th percentile) in all columns

There is a large difference between the 75th percentile and the maximum in residual sugar, free sulfur dioxide and total sulfur dioxide

In [None]:
# Distribution of outliers within the data: one vertical boxplot per column,
# drawn into consecutive cells of an nrows x ncol subplot grid.
collist = wine.columns.values
ncol = 12
nrows = 10
plt.figure(figsize=(ncol, 5 * ncol))
for i, column in enumerate(collist):
    plt.subplot(nrows, ncol, i + 1)
    # Pass the series explicitly as `y=`: a bare positional argument is
    # interpreted differently across seaborn versions (x vs. data), whereas
    # the keyword form always yields the intended vertical box.
    sns.boxplot(y=wine[column], color='green', orient='v')
# Lay the figure out once, after every subplot exists, instead of redoing the
# layout on each loop iteration.
plt.tight_layout()

In [None]:
# #Distribution of Skewness
# plt.figure(figsize=(20,20))
# for i in range(0,len(collist)):
#  plt.subplot(nrows,ncol,i+1)
#  sns.distplot(wine[collist[i]])

In [None]:
# Count the missing (NaN) entries in each column
missing_counts = wine.isna().sum()
print(missing_counts)

The data looks very clean by looking at the first five rows

# Exploring Variables

In [None]:
# Distribution of the 'quality' scores, to make sure there are enough
# good-quality wines in the data
fig = px.histogram(data_frame=wine, x='quality')
fig.show()

Observation:
    
Quality has most of its values concentrated in the categories 5, 6 and 7

Fewer values fall in the categories 3, 4 and 8

In [None]:
# Pairwise correlation matrix of all columns, rendered as an annotated heatmap
corr = wine.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(15, 10))
diverging_cmap = sns.diverging_palette(220, 20, as_cmap=True)
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,
    cmap=diverging_cmap,
    ax=heatmap_ax,
)

In [None]:
# Correlation of every column with 'quality', largest value first
quality_corr = corr['quality']
quality_corr.sort_values(ascending=False)

Observation:
    
Quality is highly correlated with alcohol

Alcohol is negatively correlated with density (about -0.5)

Density is highly positively correlated with fixed acidity

Volatile acidity is negatively correlated with quality and citric acid

Free sulfur dioxide is highly correlated with total sulfur dioxide

We can conclude that the attributes alcohol, sulphates, citric acid and fixed acidity have the highest correlation with 'quality'


In [None]:
# Derive a binary classification target from the 'quality' score:
# wines scoring 6 or more are "good" (1), everything else is "bad" (0).
is_good = wine['quality'] >= 6
wine['rating'] = is_good.astype('int64')

# Feature matrix: every column except the original score and the derived label
#X = wine.iloc[:,:11]
X = wine.drop(columns=['quality', 'rating'])
# Target vector: the binary rating
y = wine['rating']

In [None]:
# How many wines fall into each rating class (1 = good, 0 = bad)
rating_counts = wine['rating'].value_counts()
rating_counts

In [None]:
# Per-column averages within each rating class
wine.groupby(by='rating').mean()

In [None]:
# Alcohol percentage broken down by wine quality score
bx = sns.boxplot(data=wine, x="quality", y='alcohol')
alcohol_labels = {
    'xlabel': 'Wine Quality',
    'ylabel': 'Alcohol Percent',
    'title': 'Alcohol percent in different wine quality types',
}
bx.set(**alcohol_labels)

Observation

Alcohol content increases as the quality of wine increases

In [None]:
# pH of each wine, grouped by binary rating
bx = sns.swarmplot(data=wine, x="rating", y="pH")
ph_labels = {
    'xlabel': 'Wine Ratings',
    'ylabel': 'pH',
    'title': 'pH in different types of Wine ratings',
}
bx.set(**ph_labels)

In [None]:
# Sulphates level, grouped by binary rating
bx = sns.boxplot(data=wine, x="rating", y='sulphates')
sulphates_labels = {
    'xlabel': 'Wine Ratings',
    'ylabel': 'Sulphates',
    'title': 'Sulphates in different types of Wine ratings',
}
bx.set(**sulphates_labels)

# Author's observation: sulphates level increases with the quality of wine

In [None]:
# Citric acid distribution for each wine quality score
bx = sns.violinplot(data=wine, x="quality", y='citric_acid')
citric_labels = {
    'xlabel': 'Quality',
    'ylabel': 'Citric Acid',
    'title': 'Citric_acid in different types of Wine ratings',
}
bx.set(**citric_labels)

# Author's observation: citric acid increases as quality of the wine increases

In [None]:
# Fixed acidity, grouped by binary rating
bx = sns.boxplot(data=wine, x="rating", y='fixed_acidity')
acidity_labels = {
    'xlabel': 'Wine Ratings',
    'ylabel': 'Fixed Acidity',
    'title': 'Fixed Acidity in different types of Wine ratings',
}
bx.set(**acidity_labels)

In [None]:
# Overlaid alcohol distributions, one per rating class, with a legend
alcohol_grid = sns.FacetGrid(wine, hue='rating', height=6)
alcohol_grid = alcohol_grid.map(sns.distplot, 'alcohol')
alcohol_grid.add_legend()

Observation



There is a higher probability of good quality wine, if alcohol content is >= 12

The probability of good quality wine decreases as alcohol content decreases

In [None]:
# Linear regression fit of residual sugar against alcohol content,
# drawn in a separate panel for each rating class (bad, good)
sns.lmplot(
    data=wine,
    x='alcohol',
    y='residual_sugar',
    col='rating',
)

 
Observation

An observation can be made that in both types of wine the residual sugar content remains almost the same irrespective of change in alcohol content value.

# Building a Machine Learning Model

In [None]:
# Standardize the feature variables (StandardScaler) before modelling.
# The unscaled frame is kept as X_features; X is rebound to the scaled array.
# NOTE(review): the scaler is fitted on the full dataset here, i.e. before the
# train/test split performed later, so test-set statistics influence the
# scaling -- consider fitting on the training split only.
X_features = X
sc = StandardScaler()
sc.fit(X_features)
X = sc.transform(X_features)

In [None]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# from sklearn.metrics import classification_report
# from sklearn.tree import DecisionTreeClassifier
# model1 = DecisionTreeClassifier(random_state=1)
# model1.fit(X_train, y_train)
# y_pred1 = model1.predict(X_test)
# print(classification_report(y_test, y_pred1))

In [None]:
# Applying Machine Learning Algorithms
#
# Fit each candidate classifier on the training split, then report its
# hold-out accuracy, 10-fold cross-validated accuracy, ROC AUC, classification
# report and confusion matrix, and plot the confusion matrix and ROC curve.

lg = LogisticRegression()
gnb = GaussianNB()

# prepare models
models = [
    ('LogisticRegression', lg),
    ('Naive Bayes', gnb),
]

# evaluate each model; the lists below collect one entry per model, in order
Model = []
cvs = []
score = []
rocscore = []
for name, model in models:
    print('**************', name, '***********')
    print('\n')
    Model.append(name)
    model.fit(X_train, y_train)
    print(model)
    predictions = model.predict(X_test)
    print('\n')
    acc = accuracy_score(y_test, predictions)
    print('accuracy score', acc)
    score.append(acc * 100)
    cv = model_selection.cross_val_score(model, X, y, cv=10, scoring='accuracy').mean()
    print('Cross-val-score=', cv)
    cvs.append(cv * 100)
    print('\n')
    # ROC/AUC must be computed from continuous scores, not thresholded 0/1
    # labels: with hard labels the "curve" has a single operating point and
    # the AUC is understated. Column 1 of predict_proba is the positive class.
    # The AUC values quoted in the notebook narrative may need refreshing.
    positive_scores = model.predict_proba(X_test)[:, 1]
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, positive_scores)
    roc_auc = roc_auc_score(y_test, positive_scores)
    print('roc_auc_score', roc_auc)
    rocscore.append(roc_auc * 100)
    print('\n')
    print(classification_report(y_test, predictions))
    print('\n')
    cm = confusion_matrix(y_test, predictions)
    print(cm)
    print('\n')
    # Two stacked panels: confusion matrix on top, ROC curve below.
    fig, (ax_cm, ax_roc) = plt.subplots(2, 1, figsize=(10, 10))
    ax_cm.set_title(name)
    sns.heatmap(cm, annot=True, fmt='d', ax=ax_cm)
    ax_roc.set_title(name)
    # The original label 'AUC' % roc_auc had no placeholder, so the value was
    # never shown; format it explicitly and display the legend.
    ax_roc.plot(false_positive_rate, true_positive_rate, label='AUC = %0.2f' % roc_auc)
    ax_roc.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    ax_roc.set_xlabel('false_positive_rate')
    ax_roc.set_ylabel('true_positive_rate')
    ax_roc.legend(loc='lower right')
    fig.tight_layout()
    plt.show()

Selecting Best Model:

From the output above, the accuracy of Logistic Regression is 76%, compared to 74% for Naive Bayes. This looks like a good score.

The Logistic Regression model has the higher cross-validation score, 73%.

Overall, the performance of the Logistic Regression algorithm is good.

A Receiver Operating Characteristic (ROC) curve is a plot of the true positive rate against the false positive rate. It shows the trade-off between sensitivity and specificity.

The AUC (Area Under the Curve) score for this case is 0.75. An AUC score of 1 represents a perfect classifier, and 0.5 represents a worthless classifier.

In [None]:
# Show Mean Absolute Error, Mean Squared Error & Root Mean Squared Error
# for the Logistic Regression predictions on the test split.
from sklearn import metrics

# Predict explicitly with `lg`: the `predictions` variable left over from the
# evaluation loop belongs to the last model in that loop (Naive Bayes), not to
# the logistic regression discussed here.
lg_predictions = lg.predict(X_test)
lg_mse = metrics.mean_squared_error(y_test, lg_predictions)
# For 0/1 labels every error is exactly 1, so MAE and MSE both equal the
# misclassification rate.
print('MAE:', metrics.mean_absolute_error(y_test, lg_predictions))
print('MSE:', lg_mse)
print('RMSE:', np.sqrt(lg_mse))

In [None]:
# Intercept term of the fitted logistic regression
lg_intercept = lg.intercept_
print(lg_intercept)

In [None]:
# Show the accuracy of the logistic regression on the training split.
# LogisticRegression.score() returns mean classification accuracy, not an
# R squared value, so the output is labelled accordingly.
print('Training accuracy: ', lg.score(X_train, y_train))

In [None]:
# Correlation between the actual and the predicted training labels.
# The square root of an accuracy score is not a correlation (R = sqrt(R^2)
# only holds for a regression R squared), so compute the Pearson correlation
# of the two label vectors directly.
train_predictions = lg.predict(X_train)
print('Correlation: ', np.corrcoef(y_train, train_predictions)[0, 1])

In [None]:
# Confusion matrix of the logistic regression on the test split
y_predicted = lg.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_predicted)
cm

In [None]:
# True positive is 135(34%), false negative is 50(12%)
# False positive is 48(12%), true negative, 167(42%)

# true positives: These are cases in which we predicted yes and are actually yes.
# true negatives: We predicted no, and no in actual.
# false positives: We predicted yes, but actual is no. (Type I error)
# false negatives: We predicted no, yes in actual. (Type II error)

In [None]:
# Predicting values with Logistic Regression: refit on the training split,
# then print the predicted labels next to the actual test labels
predictions = lg.fit(X_train, y_train).predict(X_test)
print('predicted :', predictions)
print('actual', y_test)

In [None]:
# Side-by-side table of predicted and actual ratings for the test split
predictions = lg.predict(X_test)
comparison_columns = {"Prediction": predictions, "Actual": y_test}
pd.DataFrame(comparison_columns)

# Saving Model

In [None]:
# Save the selected model (logistic regression) to disk with pickle, reload
# it, and confirm the reloaded copy predicts and scores on the test split.
filename = 'finalized_model.sav'
# Persist `lg` explicitly: the `model` variable left over from the evaluation
# loop refers to the last model in that loop (Naive Bayes), not to the
# logistic regression that was selected.
with open(filename, 'wb') as model_file:
    pickle.dump(lg, model_file)

# load the model from disk
# NOTE: unpickling executes arbitrary code -- only load files you wrote yourself.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# use the loaded model to make prediction
result = loaded_model.predict(X_test)
result_accuracy = loaded_model.score(X_test, y_test)

print(result)
print(result_accuracy)

In [None]:
# An estimate of 74% the accuracy of the model on unseen data is reported

In [None]:
# Save the logistic regression with joblib, reload it, and confirm the
# reloaded copy predicts and scores on the test split.
# `sklearn.externals.joblib` was removed in scikit-learn 0.23; import the
# standalone package first and fall back to the vendored copy on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# save the model in a file
joblib.dump(lg, 'finalized_model_joblib.obj')

# load the model from a file
lg_from_joblib = joblib.load('finalized_model_joblib.obj')

# use the loaded model to make prediction
result = lg_from_joblib.predict(X_test)
result_accuracy = lg_from_joblib.score(X_test, y_test)

print(result)
print(result_accuracy)


In [None]:
# Keep only the wines rated good (rating == 1) and summarize them
good_mask = wine['rating'] == 1
wine_good = wine.loc[good_mask]
wine_good.describe()

We can see that good quality wines have:

higher levels of alcohol on average
lower volatile acidity on average 
Higher levels of sulphates on average
higher levels of residual sugar on average.

In [None]:
# Keep only the wines rated bad (rating == 0) and summarize them
bad_mask = wine['rating'] == 0
wine_bad = wine.loc[bad_mask]
wine_bad.describe()

In [None]:

# MLP for the wine data: train, serialize to JSON and HDF5, reload, evaluate.
#
# The original cell was a pasted Pima-Indians tutorial with its model-building
# code commented out: it called .to_json() on the scikit-learn estimator left
# in `model` and evaluated on an undefined `Y`. It now builds and trains a
# Keras network on this notebook's own train/test split.
from keras.models import Sequential
from keras.layers import Dense
from keras.models import model_from_json
import numpy
import os

# Keras is fed plain arrays for the labels.
keras_train_labels = numpy.asarray(y_train)
keras_test_labels = numpy.asarray(y_test)

# create model -- kept under its own name so the scikit-learn objects bound to
# `model` and `score` by earlier cells are not overwritten
keras_model = Sequential()
keras_model.add(Dense(12, input_dim=X_train.shape[1], activation='relu'))
keras_model.add(Dense(8, activation='relu'))
keras_model.add(Dense(1, activation='sigmoid'))
# Compile model
keras_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
keras_model.fit(X_train, keras_train_labels, epochs=150, batch_size=10, verbose=0)
# evaluate the model on the data it was trained on
train_scores = keras_model.evaluate(X_train, keras_train_labels, verbose=0)
print("%s: %.2f%%" % (keras_model.metrics_names[1], train_scores[1]*100))

# serialize model to JSON
model_json = keras_model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
keras_model.save_weights("model.h5")
print("Saved model to disk")

# later...

# load json and create model
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

# evaluate loaded model on test data
loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
keras_score = loaded_model.evaluate(X_test, keras_test_labels, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], keras_score[1]*100))

In [None]:
pip install -q pyyaml h5py