In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn import metrics 
import matplotlib.pyplot as plt 
import numpy as np 
import seaborn as sns
from sklearn.metrics import mean_squared_error

In [2]:
# Load the white-wine dataset; the CSV uses ';' as its field separator.
df_white = pd.read_csv(
    'winequality-white.csv',
    sep=";",
)
df_white.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
# Correlate every column with white-wine quality, then drop the target's own
# entry so that only the feature-vs-quality correlations remain.
correlations = (
    df_white
    .corr()
    .loc[:, 'quality']
    .drop(labels='quality')
)
print(correlations)

fixed acidity          -0.113663
volatile acidity       -0.194723
citric acid            -0.009209
residual sugar         -0.097577
chlorides              -0.209934
free sulfur dioxide     0.008158
total sulfur dioxide   -0.174737
density                -0.307123
pH                      0.099427
sulphates               0.053678
alcohol                 0.435575
Name: quality, dtype: float64


In [4]:
def get_features(correlation_threshold, corrs=None):
    """Return the labels whose absolute correlation exceeds a threshold.

    Parameters
    ----------
    correlation_threshold : float
        Exclusive lower bound on the absolute correlation value.
    corrs : pandas.Series, optional
        Correlation values indexed by feature name. When omitted, the
        notebook-level ``correlations`` variable is read at call time, which
        preserves the original behaviour. Passing it explicitly avoids
        depending on whichever cell assigned ``correlations`` last.

    Returns
    -------
    pandas.Index
        Labels of the entries with ``abs(value) > correlation_threshold``.
    """
    if corrs is None:
        # Backward-compatible fallback to the global set by an earlier cell.
        corrs = correlations
    abs_corrs = corrs.abs()
    return abs_corrs[abs_corrs > correlation_threshold].index

In [5]:
# Target column first, then the feature matrix restricted to the columns whose
# absolute correlation with quality is above 0.05. get_features reads the
# notebook-level `correlations`, which at this point holds the white-wine values.
y = df_white['quality']
features = get_features(0.05)
x = df_white.loc[:, features]
features

Index(['fixed acidity', 'volatile acidity', 'residual sugar', 'chlorides',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'],
      dtype='object')

In [6]:
# Split the white-wine features/target into train and test parts; the fixed
# random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=3,
)

In [7]:
# Fit a linear regression on the white-wine training split (fit returns the
# estimator itself, so construction and fitting can be chained).
regressor = LinearRegression().fit(x_train, y_train)

# One coefficient per selected feature, in the column order of x_train.
print(regressor.coef_)

[ 7.19526025e-02 -2.00854758e+00  8.84222120e-02  8.13364495e-02
  4.40648710e-04 -1.59972407e+02  7.49071966e-01  6.62392132e-01
  1.85418147e-01]


In [8]:
# Tabulate the fitted coefficients against the feature names they belong to.
coeffecients = pd.DataFrame(
    regressor.coef_,
    index=features,
    columns=['Coeffecient'],
)
print(coeffecients)

                      Coeffecient
fixed acidity            0.071953
volatile acidity        -2.008548
residual sugar           0.088422
chlorides                0.081336
total sulfur dioxide     0.000441
density               -159.972407
pH                       0.749072
sulphates                0.662392
alcohol                  0.185418


In [9]:
# Predict quality for both white-wine splits with the fitted model.
train_pred, test_pred = (
    regressor.predict(split) for split in (x_train, x_test)
)

In [10]:
# RMSE on the training and test splits. scikit-learn's signature is
# mean_squared_error(y_true, y_pred), so the observed values are passed first
# (the value is unchanged because the squared error is symmetric).
train_rmse = mean_squared_error(y_train, train_pred) ** 0.5
print(train_rmse)
test_rmse = mean_squared_error(y_test, test_pred) ** 0.5
print(test_rmse)

# The root-mean-square error (RMSE) is the square root of the mean squared
# difference between the values predicted by a model and the values actually
# observed; it is expressed in the same units as the target (quality points).

0.7502105513563052
0.7577821690235869


In [11]:
# Round the test-set predictions to the nearest whole number.
# np.round is used because the np.round_ alias was removed in NumPy 2.0.
predicted_data = np.round(test_pred)

# The error metrics below are computed on the raw (unrounded) predictions.
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, test_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, test_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, test_pred)))

[5. 6. 6. ... 6. 6. 5.]
Mean Absolute Error: 0.589241235575669
Mean Squared Error: 0.5742338156900919
Root Mean Squared Error: 0.7577821690235869


### ----------------------------------------------------------------

In [12]:
# Load the red-wine dataset; the CSV uses ';' as its field separator.
df_red = pd.read_csv(
    'winequality-red.csv',
    sep=";",
)
df_red.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [13]:
# Correlate every column with red-wine quality, then drop the target's own
# entry. This rebinds the notebook-level `correlations`, replacing the
# white-wine values computed earlier.
correlations = (
    df_red
    .corr()
    .loc[:, 'quality']
    .drop(labels='quality')
)
print(correlations)

fixed acidity           0.124052
volatile acidity       -0.390558
citric acid             0.226373
residual sugar          0.013732
chlorides              -0.128907
free sulfur dioxide    -0.050656
total sulfur dioxide   -0.185100
density                -0.174919
pH                     -0.057731
sulphates               0.251397
alcohol                 0.476166
Name: quality, dtype: float64


In [14]:
# Target column first, then the feature matrix restricted to the columns whose
# absolute correlation with quality is above 0.05. get_features reads the
# notebook-level `correlations`, which at this point holds the red-wine values.
b = df_red['quality']
features = get_features(0.05)
a = df_red.loc[:, features]
features

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'chlorides',
       'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH',
       'sulphates', 'alcohol'],
      dtype='object')

In [15]:
# Split the red-wine features/target into train and test parts; the fixed
# random_state makes the split repeatable.
a_train, a_test, b_train, b_test = train_test_split(
    a,
    b,
    random_state=3,
)

In [16]:
# Fit a linear regression on the red-wine training split (fit returns the
# estimator itself, so construction and fitting can be chained).
regressor = LinearRegression().fit(a_train, b_train)

# One coefficient per selected feature, in the column order of a_train.
print(regressor.coef_)

[ 0.01773723 -0.99256049 -0.13962865 -1.59094279  0.00559652 -0.00351973
  0.76859036 -0.43741414  0.81288805  0.30148385]


In [17]:
# Tabulate the fitted coefficients against the feature names they belong to.
coeffecients = pd.DataFrame(
    regressor.coef_,
    index=features,
    columns=['Coeffecient'],
)
print(coeffecients)

                      Coeffecient
fixed acidity            0.017737
volatile acidity        -0.992560
citric acid             -0.139629
chlorides               -1.590943
free sulfur dioxide      0.005597
total sulfur dioxide    -0.003520
density                  0.768590
pH                      -0.437414
sulphates                0.812888
alcohol                  0.301484


In [18]:
# Predict quality for both red-wine splits with the fitted model.
train_pred, test_pred = (
    regressor.predict(split) for split in (a_train, a_test)
)

In [19]:
# RMSE on the training and test splits. scikit-learn's signature is
# mean_squared_error(y_true, y_pred), so the observed values are passed first
# (the value is unchanged because the squared error is symmetric).
train_rmse = mean_squared_error(b_train, train_pred) ** 0.5
print(train_rmse)
test_rmse = mean_squared_error(b_test, test_pred) ** 0.5
print(test_rmse)

0.6525240995947054
0.627538153923098


In [20]:
# Round the test-set predictions to the nearest whole number.
# np.round is used because the np.round_ alias was removed in NumPy 2.0.
predicted_data = np.round(test_pred)

# The error metrics below are computed on the raw (unrounded) predictions.
print('Mean Absolute Error:', metrics.mean_absolute_error(b_test, test_pred))
print('Mean Squared Error:', metrics.mean_squared_error(b_test, test_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(b_test, test_pred)))

[5. 6. 6. 6. 5. 5. 5. 6. 6. 6. 6. 5. 5. 5. 6. 5. 6. 5. 6. 6. 6. 6. 6. 5.
 5. 5. 5. 7. 6. 6. 6. 6. 5. 6. 5. 6. 7. 5. 5. 6. 5. 5. 6. 5. 5. 6. 6. 5.
 5. 5. 5. 6. 6. 6. 5. 5. 5. 5. 5. 5. 5. 5. 5. 6. 6. 7. 5. 6. 5. 6. 6. 6.
 6. 5. 6. 6. 5. 6. 6. 6. 6. 5. 5. 6. 6. 5. 5. 6. 6. 6. 6. 6. 6. 5. 5. 5.
 5. 5. 5. 7. 6. 6. 6. 6. 6. 5. 5. 5. 5. 6. 6. 5. 6. 6. 5. 5. 5. 5. 6. 6.
 6. 5. 5. 5. 6. 6. 6. 6. 5. 5. 5. 5. 6. 6. 7. 6. 6. 6. 5. 5. 6. 7. 5. 5.
 6. 5. 6. 6. 7. 5. 5. 5. 6. 5. 5. 6. 6. 5. 5. 6. 6. 6. 6. 6. 6. 5. 6. 5.
 5. 6. 6. 7. 5. 6. 6. 5. 6. 6. 5. 6. 7. 5. 6. 6. 6. 6. 6. 5. 6. 6. 6. 7.
 5. 5. 5. 5. 5. 6. 6. 5. 5. 5. 6. 6. 5. 6. 5. 6. 5. 5. 6. 6. 6. 6. 6. 6.
 6. 5. 5. 5. 5. 5. 6. 5. 6. 5. 7. 5. 5. 5. 6. 6. 5. 6. 5. 5. 5. 6. 6. 5.
 6. 6. 6. 5. 7. 6. 6. 5. 5. 6. 5. 5. 6. 5. 6. 5. 6. 5. 6. 5. 6. 6. 6. 6.
 5. 6. 5. 6. 5. 5. 6. 5. 6. 6. 6. 6. 5. 5. 5. 5. 6. 5. 5. 6. 5. 6. 5. 5.
 5. 6. 5. 7. 5. 5. 6. 6. 5. 6. 7. 7. 6. 6. 6. 6. 6. 6. 5. 6. 5. 5. 6. 5.
 6. 6. 6. 5. 5. 6. 5. 5. 6. 6. 7. 5. 5. 5. 6. 5. 6.