In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

plt.style.use('fivethirtyeight')

%matplotlib inline

# Reading the data

In [2]:
data = pd.read_csv('winequality-white.csv', sep = ';')
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Implementing the Solution

## Adding an extra Column

In [3]:
X = np.append(arr = np.ones((X.shape[0], 1)), values = X, axis = 1)

## Splitting the data

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Scaling the data

In [5]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
X_train

array([[ 0.00000000e+00, -1.00559005e+00,  7.05693804e-01, ...,
         3.30592878e-01, -1.22931277e+00,  1.29246750e+00],
       [ 0.00000000e+00, -4.14558693e-01, -3.75130561e-01, ...,
        -5.77045334e-04, -8.80043503e-01, -5.75105734e-01],
       [ 0.00000000e+00, -4.14558693e-01, -1.78617041e-01, ...,
         3.96826863e-01,  2.26337994e+00,  6.42876807e-01],
       ...,
       [ 0.00000000e+00,  2.94678933e-01, -6.69900843e-01, ...,
        -1.45772471e+00,  1.67764310e-01, -1.06229875e+00],
       [ 0.00000000e+00, -4.14558693e-01, -8.66414364e-01, ...,
         1.12540069e+00,  4.29716263e-01,  2.36882627e-01],
       [ 0.00000000e+00,  1.35853537e+00, -5.71644082e-01, ...,
        -5.30448923e-01,  3.42398946e-01, -8.18702242e-01]])

In [7]:
X_test

array([[ 0.        ,  0.76750402, -1.35769817, ...,  0.6617628 ,
         2.69996653, -0.73750341],
       [ 0.        , -0.76917751, -0.7681576 , ..., -0.06681103,
         0.51703358, -0.89990108],
       [ 0.        , -0.88738378,  0.41092352, ...,  0.6617628 ,
        -0.61809155,  1.536064  ],
       ...,
       [ 0.        , -1.24200259, -1.55421169, ...,  0.86046476,
        -1.40394741,  0.72407564],
       [ 0.        , -0.76917751, -0.08036028, ...,  1.45657062,
         0.08044699, -0.41270806],
       [ 0.        , -0.76917751, -0.08036028, ..., -0.199279  ,
        -1.05467814,  0.23688263]])

## Linear regression

In [8]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)
predictions = regressor.predict(X_test)

In [9]:
predictions

array([5.80561034, 5.5143506 , 6.39142672, ..., 6.3493164 , 5.7112724 ,
       6.1595596 ])

In [10]:
y_test

1160    6
4569    6
160     7
4881    6
980     6
       ..
1928    6
1304    5
4532    6
2539    5
3889    6
Name: quality, Length: 1225, dtype: int64

# Evaluating and improving the model

In [11]:
r2_score(y_test, predictions)

0.30027460703142106

## Backwards elimination

In [12]:
X_opt = X[:, [0, 1, 2, 4, 6, 8, 9, 10, 11]]
X_opt = X_opt.astype(np.float64)
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,quality,R-squared:,0.282
Model:,OLS,Adj. R-squared:,0.281
Method:,Least Squares,F-statistic:,239.7
Date:,"Sat, 23 May 2020",Prob (F-statistic):,0.0
Time:,13:00:45,Log-Likelihood:,-5544.1
No. Observations:,4898,AIC:,11110.0
Df Residuals:,4889,BIC:,11160.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,154.1062,18.100,8.514,0.000,118.622,189.591
x1,0.0681,0.020,3.333,0.001,0.028,0.108
x2,-1.8881,0.110,-17.242,0.000,-2.103,-1.673
x3,0.0828,0.007,11.370,0.000,0.069,0.097
x4,0.0033,0.001,4.950,0.000,0.002,0.005
x5,-154.2913,18.344,-8.411,0.000,-190.254,-118.329
x6,0.6942,0.103,6.717,0.000,0.492,0.897
x7,0.6285,0.100,6.287,0.000,0.433,0.824
x8,0.1932,0.024,8.021,0.000,0.146,0.240

0,1,2,3
Omnibus:,114.194,Durbin-Watson:,1.621
Prob(Omnibus):,0.0,Jarque-Bera (JB):,251.255
Skew:,0.075,Prob(JB):,2.76e-55
Kurtosis:,4.099,Cond. No.,99500.0
