In [None]:
# Dependencies
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import style
import plotly.express as px
import math
import pickle

# Algorithms

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,roc_curve,roc_auc_score
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV

# Load the dataset

In [None]:
# Read the wine-quality CSV from the working directory and preview its first rows.
wine = pd.read_csv(filepath_or_buffer="winequality.csv")
wine.head(n=5)

# Data Exploration

In [None]:
# Shape of the dataset as a (rows, columns) tuple
wine.shape

In [None]:
# Column dtypes and non-null counts.
# Author's reading of the output: the data has no categorical variables, only numerical ones;
# 11 of the features are floats and 1 is an integer.
wine.info()

In [None]:
# Number of unique values in each column of the dataset
for column_name, unique_count in wine.nunique().items():
    print("Number of unique values of {}:{}".format(column_name, unique_count))

# Author's observation: 'fixed acidity' has the most unique values and 'quality' the fewest.

In [None]:
# Summary statistics of every column, to get a better idea of what is happening
wine.describe(include='all')

# Author's observations from the output:
# chlorides - similar to residual sugar: min 0.012, max 0.611
# residual sugar - min 0.9, max 15; the huge difference explains the outliers
# fixed acidity - 25th percentile 7.1, 50th percentile 7.9; this could explain the large number of outliers
# citric acid - looks uniformly distributed
# free sulfur dioxide, total sulfur dioxide - not much difference

# The mean is greater than the median (50th percentile) in all columns

# There is a large difference between the 75th percentile and the max for
# residual sugar, free sulfur dioxide and total sulfur dioxide




In [None]:
# Count the missing values in every column
print(wine.isnull().sum())

In [None]:
# Distribution of the raw quality scores.
# Author's observation: most values are concentrated in categories 5, 6 and 7;
# only a few observations fall in categories 3, 4 and 8.
fig = px.histogram(data_frame=wine, x='quality')
fig.show()

In [None]:
# Fixed acidity per quality score.
# Author's observation: fixed acidity gives no clear signal for classifying the quality.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='fixed acidity')

In [None]:
# Volatile acidity per quality score.
# Author's observation: as quality increases, volatile acidity decreases.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='volatile acidity')

In [None]:
# Citric acid per quality score.
# Author's observation: citric acid goes higher as the quality of the wine increases.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='citric acid')

In [None]:
# Residual sugar per quality score.
# Author's observation: residual sugar has little impact on the quality of the wine.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='residual sugar')

In [None]:
# Chlorides per quality score.
# Author's observation: the chloride content decreases as the quality of the wine increases.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='chlorides')

In [None]:
# Free sulfur dioxide per quality score
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='free sulfur dioxide')

In [None]:
# Total sulfur dioxide per quality score
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='total sulfur dioxide')

In [None]:
# Sulphates per quality score.
# Author's observation: the sulphates level increases with the quality of the wine.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='sulphates')

In [None]:
# Alcohol per quality score.
# Author's observation: the alcohol content increases as the quality of the wine increases.
fig = plt.figure(figsize=(10, 6))
sns.barplot(data=wine, x='quality', y='alcohol')

In [None]:
# Turn the numeric quality score into a binary class label.

# Split the wines into 'Bad' and 'Good' by a limit on the quality score:
# pd.cut puts scores in (2, 6.5] into 'Bad' and scores in (6.5, 8] into 'Good'.
# NOTE(review): a score outside (2, 8] matches no bin and becomes NaN - confirm the data never exceeds 8.
bins=(2,6.5,8)
group=['Bad','Good']
# Overwrites the original scores, so this cell must run exactly once per load of the data:
# re-running it would cut the already-encoded 0/1 values, which fall outside the bins.
wine['quality']=pd.cut(wine['quality'],bins=bins,labels=group)

# Encode the two class names as integers
label_quality=LabelEncoder()

# 'Bad' becomes 0 and 'Good' becomes 1
wine['quality']=label_quality.fit_transform(wine['quality'])

In [None]:
# Number of rows in each quality class after the encoding (0 = Bad, 1 = Good)
wine['quality'].value_counts()

In [None]:
# Class balance of the encoded target (0 = Bad, 1 = Good).
# The column is named through the `x` keyword: newer seaborn releases accept only
# `data` positionally, so a Series passed as the first argument is no longer
# interpreted as the variable to count.
sns.countplot(x='quality', data=wine)

In [None]:
# Chlorides (x axis) against alcohol (y axis).
# Author's observation: most points lie in the alcohol range 9 to 13, with a
# chloride percentage of 10% to 20%.
# `color` sets the marker colour; the previous `cmap='orange'` named no existing
# colormap and a colormap is ignored anyway when no `c` values are supplied.
plt.scatter(wine['chlorides'], wine['alcohol'], color='orange')
plt.xlabel('chlorides')
plt.ylabel('alcohol')

In [None]:
# Check the correlation between every pair of columns
sns.heatmap(data=wine.corr())

In [None]:
# Density has a strong positive correlation with residual sugar, whereas it has a strong negative correlation with alcohol.
# pH and fixed acidity are negatively correlated.
# Density and fixed acidity are positively correlated.
# Citric acid and fixed acidity are positively correlated.
# Citric acid and volatile acidity are negatively correlated.
# Free sulfur dioxide and total sulfur dioxide are positively correlated.


In [None]:
# Drop the pH column.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps the cell safe to re-run: a second execution no longer raises KeyError
# once the column is already gone.
wine = wine.drop('pH', axis=1, errors='ignore')

In [None]:
# Distribution plots of chlorides and alcohol, one curve per quality class.
#
# Author's observations:
# - for chlorides in the range 0.0-0.1 the quality of the wine is high, and at a
#   chloride value of 0.5 the quality is reduced
# - if the chlorides increase, the quality of the wine decreases
# - the quality of the wine is high in the alcohol range 9-12

chlorides_grid = sns.FacetGrid(wine, hue='quality', height=5)
chlorides_grid.map(sns.distplot, 'chlorides').add_legend()

alcohol_grid = sns.FacetGrid(wine, hue='quality', height=6)
alcohol_grid.map(sns.distplot, 'alcohol').add_legend()

In [None]:
# Box plot of citric acid per quality class, to check for outliers
sns.boxplot(data=wine, x='quality', y='citric acid')

In [None]:
# Pairwise plots of the listed features, coloured by the quality class
pair_features = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'sulphates', 'alcohol',
]
sns.pairplot(data=wine, hue='quality', vars=pair_features)

# Author's observations (the remark about shades presumably refers to a
# correlation heat map rather than to this pair plot):
# Light shades are highly correlated
# quality is highly correlated with alcohol
# alcohol is highly negatively correlated with density (-0.5)
# density is highly positively correlated with residual sugar
# volatile acidity is negatively correlated with quality
# free sulfur dioxide is highly correlated with total sulfur dioxide

In [None]:
# Annotated heat map of the pairwise correlations
corr = wine.corr()
plt.subplots(figsize=(15, 10))
diverging_cmap = sns.diverging_palette(220, 20, as_cmap=True)
sns.heatmap(
    corr,
    annot=True,
    cmap=diverging_cmap,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
)

# Author's observations:
# Darker shades are highly correlated
# quality is highly correlated with alcohol
# alcohol is negatively correlated with density (-0.5)
# density is highly positively correlated with fixed acidity
# volatile acidity is negatively correlated with quality and citric acid
# free sulfur dioxide is highly correlated with total sulfur dioxide

In [None]:
# Identification and removal of outliers.
# Author's observation: the box plot shows a lot of outliers in fixed acidity.
wine['fixed acidity'].plot(kind='box')

In [None]:
# One box plot per column to look for outliers.
# Author's observation: there are lots of outliers and they need to be removed.
collist = wine.columns.values
ncol = 12
nrows = 10
plt.figure(figsize=(ncol, 5 * ncol))
# Start at index 0 so the first column is plotted as well (the loop used to begin
# at 1, which skipped it and left the first subplot slot empty).
for i in range(len(collist)):
    plt.subplot(nrows, ncol, i + 1)
    # The values are passed through the `y` keyword; newer seaborn releases accept
    # only `data` as a positional argument.
    sns.boxplot(y=wine[collist[i]], color='green', orient='v')
# Adjust the layout once, after all axes exist, instead of on every iteration
plt.tight_layout()

In [None]:
# Removing outliers: flag them with the z-score rule
from scipy.stats import zscore

# Absolute z-score of every value, computed column by column
z = np.abs(zscore(wine))
# A value further than `threshold` standard deviations from its column mean
# is treated as an outlier
threshold = 3
# (row positions, column positions) of the flagged values
np.where(z > threshold)

In [None]:
# Keep only the rows in which every column lies within `threshold` standard
# deviations, then view the shape of the filtered data.
# `threshold` comes from the z-score cell, so the cut-off is defined in one place
# instead of being repeated here as a literal.
wine_new = wine[(z < threshold).all(axis=1)]
wine_new.shape

In [None]:
# Preview the first rows of the data that remain after the outlier filter
wine_new.head()

In [None]:
# Distribution of every column of the filtered data, to judge its skewness
plt.figure(figsize=(20, 20))
for position, column_name in enumerate(collist, start=1):
    plt.subplot(nrows, ncol, position)
    sns.distplot(wine_new[column_name])

In [None]:
# Skewness of every column of the filtered data.
# Author's observation: density, sulphates and fixed acidity are normally distributed.
wine_new.skew()

In [None]:
# Separate the features from the target: y is the target variable (the last
# column, quality) and all other columns are the features x.
# x is taken as an explicit copy because its columns are overwritten later on;
# assigning into a plain slice of wine_new triggers pandas' SettingWithCopyWarning.
x = wine_new.iloc[:, :-1].copy()
y = wine_new.iloc[:, -1]

In [None]:
# Removal of skewness using the log function.
# The skewness of all columns is measured once, up front: each column's value only
# depends on that column, so recomputing x.skew() for the whole frame on every
# iteration gave the same decisions at a much higher cost.
column_skew = x.skew()
for col in x.columns:
    # Only columns whose skewness exceeds 0.55 are transformed
    if column_skew.loc[col] > 0.55:
        # log1p computes log(1 + value), so zero values stay finite
        x[col] = np.log1p(x[col])

In [None]:
# Skewness of the features after the log transform.
# Author's observation: only density is normally distributed; the other
# independent variables are right (positively) skewed.
x.skew()

# Building a Machine Learning Model

In [None]:

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=43,
)

In [None]:
# Apply standard scaling (zero mean, unit variance per feature).
# The scaler is fitted on the training split only and those statistics are reused
# for the test split. Re-fitting on x_test would scale the two splits differently
# and leak test-set statistics into the evaluation.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# Apply the machine-learning algorithms: train and evaluate every candidate
# classifier on the same train/test split.
from sklearn.pipeline import make_pipeline

lg = LogisticRegression()
gnb = GaussianNB()

# (display name, estimator) pairs to evaluate
models = [
    ('LogisticRegression', lg),
    ('Naive Bayes', gnb),
]

# Per-model results, collected as parallel lists; scores are stored as percentages
Model = []
cvs = []
score = []
rocscore = []

for name, model in models:
    print('**************', name, '***********')
    print('\n')
    Model.append(name)

    model.fit(x_train, y_train)
    print(model)

    # Hard class labels, used for accuracy, the report and the confusion matrix
    predictions = model.predict(x_test)
    # Probability of the second class in model.classes_ (label 1 for the 0/1 target).
    # ROC analysis needs a continuous score; already-thresholded labels reduce the
    # curve to a single operating point.
    positive_proba = model.predict_proba(x_test)[:, 1]
    print('\n')

    acc = accuracy_score(y_test, predictions)
    print('accuracy score', acc)
    score.append(acc * 100)

    # Cross-validate on the unscaled features with the scaler inside a pipeline, so
    # every fold is standardised from its own training part only - matching how the
    # model above was trained. cross_val_score works on clones of the estimator, so
    # the fitted `model` is left untouched.
    cv = model_selection.cross_val_score(
        make_pipeline(StandardScaler(), model), x, y, cv=10, scoring='accuracy'
    ).mean()
    print('Cross-val-score=', cv)
    cvs.append(cv * 100)
    print('\n')

    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, positive_proba)
    roc_auc = roc_auc_score(y_test, positive_proba)
    print('roc_auc_score', roc_auc)
    rocscore.append(roc_auc * 100)
    print('\n')

    print(classification_report(y_test, predictions))
    print('\n')

    cm = confusion_matrix(y_test, predictions)
    print(cm)
    print('\n')

    # Confusion matrix and ROC curve side by side
    fig, (ax_cm, ax_roc) = plt.subplots(1, 2, figsize=(14, 5))
    sns.heatmap(cm, annot=True, fmt='d', ax=ax_cm)
    ax_cm.set_title(name)

    ax_roc.plot(false_positive_rate, true_positive_rate, label='AUC = %0.2f' % roc_auc)
    # Diagonal = performance of a random classifier
    ax_roc.plot([0, 1], [0, 1], 'k--')
    ax_roc.set_title(name)
    ax_roc.set_xlabel('false_positive_rate')
    ax_roc.set_ylabel('true_positive_rate')
    ax_roc.legend(loc='lower right')
    plt.show()

# Summary table of the collected scores (all in percent), one row per model
pd.DataFrame({
    'Model': Model,
    'Accuracy score': score,
    'Cross-val-score': cvs,
    'roc_auc_score': rocscore,
})

In [None]:
# Selecting the best model
# From the results above, the accuracy of both Logistic Regression and Naive Bayes is 92%, which looks like a good score.
# The Logistic Regression model has the higher cross-validation score, 87%.
# Overall, the Logistic Regression algorithm performs well.

In [None]:
# Show Mean Absolute Error, Mean Squared Error & Root Mean Squared Error of the
# logistic-regression predictions on the test split.
# The predictions are recomputed from `lg` here: the `predictions` variable left
# behind by the evaluation loop belongs to whichever model the loop handled last.
# With 0/1 labels every error has magnitude 1, so MAE and MSE both equal the
# share of misclassified rows.
from sklearn import metrics
predictions = lg.predict(x_test)
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

In [None]:
# Intercept (bias) term of the fitted logistic-regression model
print(lg.intercept_)

In [None]:
# Show the accuracy on the training split.
# For a classifier, score() returns the mean accuracy on the given data, not an
# R squared value, so the printed label now names what is actually shown.
print('Training accuracy: ', lg.score(x_train, y_train))

In [None]:
# Correlation between the predicted and the actual training labels.
# Taking the square root of score() only yields a correlation for the R squared of
# a linear regression; lg.score() is an accuracy, so the Pearson correlation is
# computed directly instead (for two 0/1 variables this is the phi coefficient).
print('Correlation: ', np.corrcoef(y_train, lg.predict(x_train))[0, 1])

In [None]:
# Predict the test labels with logistic regression and print them next to the
# actual labels
lg.fit(x_train, y_train)
predictions = lg.predict(x_test)
# Print the array computed above; the name `pred` used here before is not
# assigned anywhere in the notebook and raised a NameError
print('predicted :', predictions)
print('actual', y_test)

In [None]:
# Logistic-regression predictions for the test split, shown side by side with the actual labels
predictions = lg.predict(x_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

# Saving Model

In [None]:
# Save the logistic-regression model to disk and verify the round trip.
# `lg` is named explicitly: `model` is only the variable left over from the
# evaluation loop and refers to whichever estimator that loop handled last.
filename = 'finalized_model.sav'
# The with-blocks close the file handles, which bare open() calls never did
with open(filename, 'wb') as model_file:
    pickle.dump(lg, model_file)

# Load the model from disk.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Use the loaded model to make predictions and score it on the test split
result = loaded_model.predict(x_test)
result_accuracy = loaded_model.score(x_test, y_test)

print(result)
print(result_accuracy)


# An estimate of the accuracy of the model on unseen data is reported

In [None]:
# Save the model using joblib.
# sklearn.externals.joblib was removed from scikit-learn (0.23); the standalone
# joblib package is used instead, with the old vendored copy kept only as a
# fallback for legacy installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save the logistic-regression model in a file (`rfc`, the name dumped here
# before, is not defined anywhere in the notebook)
joblib.dump(lg, 'quality of wine.obj')

# Load the model from the file
lg_from_joblib = joblib.load('quality of wine.obj')

# Use the loaded model to make predictions
lg_from_joblib.predict(x_test)