In [23]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix


In [None]:

# Global figure styling: thicker axes frame and larger label text.
plt.rc('axes', linewidth = 1.5, labelsize = 14)
# Both tick axes share the same label size and major-tick geometry.
for tickAxis in ('xtick', 'ytick'):
    plt.rc(tickAxis, labelsize = 14)
    plt.rc(tickAxis + '.major', size = 3, width = 1.5)

In [2]:
# Read the red-wine quality table from the input directory and preview the first five rows.
wineData = pd.read_csv('../input/winequality-red.csv')
wineData.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# Distinct quality scores present in the data, in order of first appearance.
wineData['quality'].unique()

array([5, 6, 7, 4, 8, 3])

Looks like wine quality in this dataset ranges from 3 to 8. Let's look at the distribution:

In [None]:
# Label a wine as 'good' (1) if its quality is greater than or equal to 7, and 'not good' (0) otherwise.

In [5]:
# Boolean label: True when the quality score is at least 7.
isGood = wineData['quality'].ge(7)
wineData['category'] = isGood
wineData.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,category
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,False
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,False
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,False
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,False
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,False


In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
# Two-feature warm-up: predict the binary 'category' label from the two acidity columns.
X = wineData[['fixed acidity', 'volatile acidity']].values
# Use the builtin int: the np.int alias was deprecated in NumPy 1.20 and removed in
# NumPy 1.24, where referencing it raises AttributeError. np.int was only ever an
# alias for builtin int, so the resulting dtype is unchanged.
y = wineData['category'].values.astype(int)

# Standardise the features before fitting the model.
scaler = StandardScaler()
Xstan = scaler.fit_transform(X)

# Standardised features plus the label in a single frame (for plotting).
# Built in one constructor call rather than by growing an empty DataFrame.
dataStan = pd.DataFrame({
    'fixed acidity(stan)': Xstan[:, 0],
    'volatile acidity(stan)': Xstan[:, 1],
    'category': y,
})
dataStan.head()

Unnamed: 0,fixed acidity(stan),volatile acidity(stan),category
0,-0.52836,0.961877,0
1,-0.298547,1.967442,0
2,-0.298547,1.297065,0
3,1.654856,-1.384443,0
4,-0.52836,0.961877,0


In [8]:
# Fit a default logistic regression on the standardised features;
# fit() returns the estimator, so construction and fitting can be chained.
logReg = LogisticRegression().fit(Xstan, y)
# Display the learned weights followed by the bias term.
(logReg.coef_, logReg.intercept_)



(array([[ 0.11854528, -0.98843455]]), array([-2.16578589]))

In [9]:
# Per-sample class probabilities on the training features (one column per class).
classProbs = logReg.predict_proba(Xstan)
classProbs

array([[0.96004294, 0.03995706],
       [0.98441694, 0.01558306],
       [0.97020716, 0.02979284],
       ...,
       [0.90069194, 0.09930806],
       [0.95155331, 0.04844669],
       [0.75417336, 0.24582664]])

In [10]:
# Hard class predictions on the training features, stored next to the true label.
yhat = logReg.predict(Xstan)
dataStan = dataStan.assign(predict=yhat)
dataStan.head()

Unnamed: 0,fixed acidity(stan),volatile acidity(stan),category,predict
0,-0.52836,0.961877,0,0
1,-0.298547,1.967442,0,0
2,-0.298547,1.297065,0,0
3,1.654856,-1.384443,0,0
4,-0.52836,0.961877,0,0


In [13]:
# Use the first eleven columns of the table as the feature matrix.
featureCols = wineData.columns[:11]
X = wineData[featureCols].values

In [14]:
# Binary target as integers (True -> 1, False -> 0).
# Use the builtin int: the np.int alias was deprecated in NumPy 1.20 and removed in
# NumPy 1.24, where referencing it raises AttributeError. The dtype is unchanged.
y = wineData['category'].values.astype(int)

# Standardise the feature matrix before fitting the model.
scaler = StandardScaler()
Xstan = scaler.fit_transform(X)

In [15]:
# Wrap the standardised matrix in a frame that reuses the original feature names,
# then attach the integer label as the last column.
featureFrame = pd.DataFrame(Xstan, columns=wineData.columns[:11])
dataStan = featureFrame.assign(category=y)
dataStan.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,category
0,-0.52836,0.961877,-1.391472,-0.453218,-0.243707,-0.466193,-0.379133,0.558274,1.288643,-0.579207,-0.960246,0
1,-0.298547,1.967442,-1.391472,0.043416,0.223875,0.872638,0.624363,0.028261,-0.719933,0.12895,-0.584777,0
2,-0.298547,1.297065,-1.18607,-0.169427,0.096353,-0.083669,0.229047,0.134264,-0.331177,-0.048089,-0.584777,0
3,1.654856,-1.384443,1.484154,-0.453218,-0.26496,0.107592,0.4115,0.664277,-0.979104,-0.46118,-0.584777,0
4,-0.52836,0.961877,-1.391472,-0.453218,-0.243707,-0.466193,-0.379133,0.558274,1.288643,-0.579207,-0.960246,0


In [16]:
# Refit a default logistic regression on the standardised feature matrix;
# fit() returns the estimator, so construction and fitting can be chained.
logReg = LogisticRegression().fit(Xstan, y)
# Display the bias term followed by the per-feature weights.
(logReg.intercept_, logReg.coef_)



(array([-2.74747858]),
 array([[ 0.44832583, -0.44980744,  0.11472068,  0.32097755, -0.39780712,
          0.09977471, -0.51371325, -0.46204675,  0.02447459,  0.61383442,
          0.79008202]]))

In [21]:
# Hard class predictions from the fitted model, stored next to the true label.
yhat = logReg.predict(Xstan)
dataStan = dataStan.assign(predict=yhat)
dataStan.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,category,predict
0,-0.52836,0.961877,-1.391472,-0.453218,-0.243707,-0.466193,-0.379133,0.558274,1.288643,-0.579207,-0.960246,0,0
1,-0.298547,1.967442,-1.391472,0.043416,0.223875,0.872638,0.624363,0.028261,-0.719933,0.12895,-0.584777,0,0
2,-0.298547,1.297065,-1.18607,-0.169427,0.096353,-0.083669,0.229047,0.134264,-0.331177,-0.048089,-0.584777,0,0
3,1.654856,-1.384443,1.484154,-0.453218,-0.26496,0.107592,0.4115,0.664277,-0.979104,-0.46118,-0.584777,0,0
4,-0.52836,0.961877,-1.391472,-0.453218,-0.243707,-0.466193,-0.379133,0.558274,1.288643,-0.579207,-0.960246,0,0
