In [97]:
# import datasets
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm

In [77]:
#import data
data_path = 'winequality-red.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [80]:
df.shape

(1599, 12)

In [78]:
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [5]:
#drop the string type column
df = df.drop(['condition'], axis=1)
df.shape

(52, 26)

In [6]:
df = df.dropna()

In [139]:
# Identify x,y in the model
feature = 'quality'

# standardize data
X = df.drop(feature, axis=1)

scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

y = df[feature]

In [102]:
#fit model
lm = linear_model.LinearRegression()
lm.fit(X,y)

In [141]:
X.shape

(1599, 11)

In [142]:
lm.coef_

array([ 0.28239325, -1.58204178, -0.18256395,  0.23843654, -1.12266087,
        0.30965466, -0.92387606, -0.24354145, -0.52533949,  1.53027847,
        1.79528504])

In [143]:
lm.intercept_

5.71255299667039

In [144]:
# evaluate the model
y_pred = lm.predict(X)
print('lm_mse = ',mean_squared_error(y,y_pred))

lm_mse =  0.41676716722140805


In [145]:
# Using L2 regularization/ Ridge Regression
ridge = linear_model.Ridge(alpha = 0.8)
ridge.fit(X,y)

In [146]:
ridge.coef_

array([ 0.33460698, -1.5389554 , -0.14399836,  0.2352939 , -0.99738647,
        0.27735934, -0.86320942, -0.30064041, -0.44063407,  1.43731333,
        1.75580503])

In [147]:
ridge.intercept_

5.675994359027349

In [148]:
y_pred = ridge.predict(X)
print('ridge_mse = ',mean_squared_error(y,y_pred))

ridge_mse =  0.41699956209915584


In [149]:
# Using L1 regularization/ Lasso Regression
lasso = linear_model.Lasso(alpha = 0.005)
lasso.fit(X,y)

In [150]:
lasso.coef_

array([ 0.14247746, -1.51436698,  0.        ,  0.        , -0.        ,
       -0.        , -0.26047956, -0.        , -0.        ,  0.72930993,
        1.85819644])

In [151]:
lasso.intercept_

5.327747469905474

In [152]:
y_pred = lasso.predict(X)
print('lasso_mse = ',mean_squared_error(y,y_pred))

lasso_mse =  0.4318786928702726


In [124]:
coef = pd.DataFrame({\
    'Columns':df.drop(feature, axis=1).columns,'Linear':np.round(lm.coef_,3),\
    'Ridge':np.round(ridge.coef_,3),\
    'Lasso':np.round(lasso.coef_,3)
})
coef

Unnamed: 0,Columns,Linear,Ridge,Lasso
0,fixed acidity,0.282,0.335,0.142
1,volatile acidity,-1.582,-1.539,-1.514
2,citric acid,-0.183,-0.144,0.0
3,residual sugar,0.238,0.235,0.0
4,chlorides,-1.123,-0.997,-0.0
5,free sulfur dioxide,0.31,0.277,-0.0
6,total sulfur dioxide,-0.924,-0.863,-0.26
7,density,-0.244,-0.301,-0.0
8,pH,-0.525,-0.441,-0.0
9,sulphates,1.53,1.437,0.729


In [122]:
mse = pd.DataFrame({'ff':df.drop(feature, axis=1).columns,'Linear':np.round(lm.coef_,2),'Ridge':np.round(ridge.coef_,2),'Lasso':np.round(lasso.coef_,2)})

In [156]:
# Compare with statistical methods
X = pd.DataFrame(X, columns=df.drop(feature, axis=1).columns)
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                quality   R-squared:                       0.361
Model:                            OLS   Adj. R-squared:                  0.356
Method:                 Least Squares   F-statistic:                     81.35
Date:                Wed, 25 Oct 2023   Prob (F-statistic):          1.79e-145
Time:                        20:07:05   Log-Likelihood:                -1569.1
No. Observations:                1599   AIC:                             3162.
Df Residuals:                    1587   BIC:                             3227.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    5.7126 