# 1. Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR


import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the wine-quality data; the file uses ';' as its field separator.
df = pd.read_csv("winequality.csv", sep=';')

# 2. Cleaning the data

In [3]:
# Alcohol is cast to text so that malformed readings can be filtered out by string
# length: only values shorter than 5 characters survive, then the column is
# converted back to float.
# NOTE(review): the length filter would also drop a well-formed reading written
# with 5 or more characters (e.g. "10.55") -- confirm that this is intended.
df['alcohol'] = df['alcohol'].astype('str')
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments below never write into a slice of another DataFrame.
df = df.loc[df['alcohol'].str.len() < 5].copy()
df['alcohol'] = df['alcohol'].astype(float)

# Rescale two columns: total sulfur dioxide is divided by 100 and chlorides is
# multiplied by 100.
# NOTE: this cell rewrites `df` in place, so running it twice without reloading
# the CSV applies the rescaling twice.
df['total sulfur dioxide'] = df['total sulfur dioxide'] / 100
df['chlorides'] = df['chlorides'] * 100


print("Data size: ", len(df))

Data size:  6421


In [4]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,White,7.0,0.27,0.36,20.7,4.5,45.0,1.7,1.001,3.0,0.45,8.8,6
1,White,6.3,0.3,0.34,1.6,4.9,14.0,1.32,0.994,3.3,0.49,9.5,6
2,White,8.1,0.28,0.4,6.9,5.0,30.0,0.97,0.9951,3.26,0.44,10.1,6
3,White,7.2,0.23,0.32,8.5,5.8,47.0,1.86,0.9956,3.19,0.4,9.9,6
4,White,7.2,0.23,0.32,8.5,5.8,47.0,1.86,0.9956,3.19,0.4,9.9,6


In [5]:
# Remove outliers: a row is kept only if, for every one of the 11 columns at
# positions 1-11, its value is at or below that column's 99th percentile.
# The percentiles are always computed on the full cleaned frame `df`.
feature_cols = df.columns[1:12]

# Accumulate a single boolean mask on df's own index, so the filter never has to
# align a mask from one frame against a different, already-shrunk frame.
within_limits = pd.Series(True, index=df.index)
for col in feature_cols:
    within_limits &= df[col] <= df[col].quantile(.99)

# Independent copy: later edits to df_new cannot touch df.
df_new = df.loc[within_limits].copy()

print("Total of data after removing outliers: ", len(df_new))

Total of data after removing outliers:  5886


In [6]:
# Preview the first rows that remain after outlier removal.
df_new.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1,White,6.3,0.3,0.34,1.6,4.9,14.0,1.32,0.994,3.3,0.49,9.5,6
2,White,8.1,0.28,0.4,6.9,5.0,30.0,0.97,0.9951,3.26,0.44,10.1,6
3,White,7.2,0.23,0.32,8.5,5.8,47.0,1.86,0.9956,3.19,0.4,9.9,6
4,White,7.2,0.23,0.32,8.5,5.8,47.0,1.86,0.9956,3.19,0.4,9.9,6
5,White,8.1,0.28,0.4,6.9,5.0,30.0,0.97,0.9951,3.26,0.44,10.1,6


# 3. Splitting the data

In this section we will try several models. First, we filter the data to keep only the white wines and split it into training and testing sets.

In [7]:
def default_setting(x_cols, wine_type='White', test_size=0.2, random_state=0):
    """Build a train/test split from ``df_new`` for one wine type.

    Parameters
    ----------
    x_cols : array-like
        Positional selector (integer positions or a boolean mask) applied to the
        11 candidate feature columns, i.e. columns 1-11 of ``df_new``.
    wine_type : str, optional
        Value of the ``type`` column to keep. Defaults to ``'White'``.
    test_size : float, optional
        Fraction of rows held out for testing. Defaults to ``0.2``.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. Defaults to ``0``.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    # Filter once and reuse the result for both features and target.
    wines = df_new.loc[df_new['type'] == wine_type]

    X = wines.iloc[:, 1:12].iloc[:, x_cols].values
    # Column 12 is used as the regression target.
    y = wines.iloc[:, 12].values

    return train_test_split(X, y, test_size=test_size, random_state=random_state)

In [8]:
# Preview the white-wine feature columns selected the same way default_setting does when given all 11 positions.
df_new.loc[df_new['type'] == 'White'].iloc[:, 1:12].iloc[:, np.arange(11)].head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
1,6.3,0.3,0.34,1.6,4.9,14.0,1.32,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,5.0,30.0,0.97,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,5.8,47.0,1.86,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,5.8,47.0,1.86,0.9956,3.19,0.4,9.9
5,8.1,0.28,0.4,6.9,5.0,30.0,0.97,0.9951,3.26,0.44,10.1


In [9]:
# Split the white-wine rows using all 11 feature positions.
X_train, X_test, y_train, y_test = default_setting(np.arange(11))
# Show the first 12 column labels of df for reference.
df.columns[:12]

Index(['type', 'fixed acidity', 'volatile acidity', 'citric acid',
       'residual sugar', 'chlorides', 'free sulfur dioxide',
       'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'],
      dtype='object')

# 4. Modelling

We will try a few regression models.

In [10]:
def create_model(kind, x_train, y_train, degree=0, alpha=0.01):
    """Build the regressor named by ``kind`` and fit it on the training data.

    Parameters
    ----------
    kind : str
        One of ``'linear'``, ``'poly'``, ``'svr'``, ``'ridge'``, ``'lasso'`` or
        ``'elastic'``.
    x_train, y_train
        Training features and targets, passed to ``model.fit``.
    degree : int, optional
        Polynomial degree; only used when ``kind == 'poly'``.
    alpha : float, optional
        Passed as ``alpha`` to Ridge, Lasso and ElasticNet; ignored otherwise.

    Returns
    -------
    The fitted estimator. For ``'poly'`` this is a ``LinearRegression`` fitted on
    the polynomial expansion of ``x_train``. The expansion object is not
    returned, so inputs must be expanded with the same degree before calling
    ``predict`` on the result.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the supported names.
    """
    if kind == 'linear':
        model = LinearRegression()

    elif kind == 'poly':
        model = LinearRegression()
        polynomial_features = PolynomialFeatures(degree=degree)
        x_train = polynomial_features.fit_transform(x_train)

    elif kind == 'svr':
        model = SVR()

    elif kind == 'ridge':
        model = Ridge(alpha=alpha)

    elif kind == 'lasso':
        model = Lasso(alpha=alpha)

    elif kind == 'elastic':
        model = ElasticNet(alpha=alpha)

    else:
        # Fail with a clear message instead of an UnboundLocalError at model.fit.
        raise ValueError(
            "Unknown model kind {!r}; expected one of 'linear', 'poly', 'svr', "
            "'ridge', 'lasso', 'elastic'".format(kind)
        )

    model.fit(x_train, y_train)
    return model


def test_score(model, x, y):
    """Print the MSE and R2 of ``model``'s predictions for ``x`` against ``y``.

    Nothing is returned; the two metrics are only written to stdout.
    """
    predictions = model.predict(x)

    # Compute both metrics before printing anything.
    metrics = (
        ("MSE: ", mean_squared_error(y, predictions)),
        ('R2: ', r2_score(y, predictions)),
    )
    for label, value in metrics:
        print(label, value)
    

def poly_score(model, x, y, degree):
    """Print the MSE and R2 of a polynomial-regression ``model``.

    ``x`` is first expanded with ``PolynomialFeatures(degree=degree)`` and the
    expanded matrix is passed to ``model.predict``. Nothing is returned.
    """
    expanded = PolynomialFeatures(degree=degree).fit_transform(x)
    predictions = model.predict(expanded)

    # Compute both metrics before printing anything.
    metrics = (
        ("MSE: ", mean_squared_error(y, predictions)),
        ('R2: ', r2_score(y, predictions)),
    )
    for label, value in metrics:
        print(label, value)

In [11]:
def main_features(model, x, y, num_features):
    """Select the ``num_features`` top features with recursive feature elimination.

    Parameters
    ----------
    model
        Estimator handed to ``RFE`` as the ranking model.
    x, y
        Feature matrix and targets the selector is fitted on. The printed
        feature names are taken from columns 1-11 of ``df_new``, so ``x`` is
        expected to hold those 11 columns in the same order.
    num_features : int
        Number of features to keep.

    Returns
    -------
    The boolean support mask of the fitted selector (True for kept features).
    """
    # Keyword form: newer scikit-learn releases reject positional use of this argument.
    rfe = RFE(model, n_features_to_select=num_features)
    # Fit on the data passed in, not on whatever X_train/y_train happen to be global.
    fit = rfe.fit(x, y)
    print("The main {} features are:".format(fit.n_features_))
    print(df_new.iloc[:, 1:12].columns[fit.support_].values)
    print("Feature Ranking: {}".format(fit.ranking_))

    return fit.support_

## Linear regression

In [12]:
# Baseline: linear regression fitted on all 11 features.
X_train, X_test, y_train, y_test = default_setting(np.arange(11))
model = 'linear'
model_base = create_model(model, X_train, y_train)

print('On train dataset')
test_score(model_base, X_train, y_train)

print("\n")
print('On test dataset')
test_score(model_base, X_test, y_test)

# Reducing the number of features: for each target size, rank the features with
# RFE on the full split, then refit the same kind of model on the kept columns.
for num_features in (3, 5):
    print("\nFinding the most {} important features".format(num_features))
    # The ranking needs all 11 columns, so restore the full-feature split first.
    X_train, X_test, y_train, y_test = default_setting(np.arange(11))
    feature_array = main_features(model_base, X_train, y_train, num_features=num_features)
    X_train, X_test, y_train, y_test = default_setting(feature_array)

    print("\nTesting with only important features")
    print('On train dataset')
    model_new = create_model(model, X_train, y_train)
    test_score(model_new, X_train, y_train)

    print("\n")
    print('On test dataset')
    test_score(model_new, X_test, y_test)

On train dataset
MSE:  0.5750831972568101
R2:  0.26707927407737264


On test dataset
MSE:  0.5533341715095279
R2:  0.2819607169319578

Finding the most 3 important features
The main 3 features are:
['volatile acidity' 'sulphates' 'alcohol']
Feature Ranking: [4 1 8 5 7 6 3 9 2 1 1]

Testing with only important features
On train dataset
MSE:  0.6088500810379663
R2:  0.22404471996227782


On test dataset
MSE:  0.5714307768499923
R2:  0.25847748709783447

Finding the most 5 important features
The main 5 features are:
['volatile acidity' 'total sulfur dioxide' 'pH' 'sulphates' 'alcohol']
Feature Ranking: [2 1 6 3 5 4 1 7 1 1 1]

Testing with only important features
On train dataset
MSE:  0.6025438720070695
R2:  0.23208173325494885


On test dataset
MSE:  0.5708250383599466
R2:  0.2592635293368899


## Polynomial Regression

In [13]:
# Fit a degree-2 polynomial regression on all 11 features.
X_train, X_test, y_train, y_test = default_setting(np.arange(11))
degrees = 2
model_base = create_model('poly', X_train, y_train, degree=degrees)

# poly_score expands its inputs with the same degree before predicting.
print('On train dataset')
poly_score(model_base, X_train, y_train, degree=degrees)
        
print("\n")
print('On test dataset')
poly_score(model_base, X_test, y_test, degree=degrees)

On train dataset
MSE:  0.4912627783685473
R2:  0.373905073460432


On test dataset
MSE:  0.5089733335799963
R2:  0.33952597478748137


In [14]:
# Repeat the polynomial experiment with degree 3 on all 11 features.
# NOTE: the recorded output of this cell shows a far larger test error than train
# error, i.e. this fit does not generalise to the held-out rows.
X_train, X_test, y_train, y_test = default_setting(np.arange(11))
degrees = 3
model_base = create_model('poly', X_train, y_train, degree=degrees)

# poly_score expands its inputs with the same degree before predicting.
print('On train dataset')
poly_score(model_base, X_train, y_train, degree=degrees)
        
print("\n")
print('On test dataset')
poly_score(model_base, X_test, y_test, degree=degrees)

On train dataset
MSE:  0.41447971582004606
R2:  0.4717620413044765


On test dataset
MSE:  4922.5403704722185
R2:  -6386.780730846306


## SVR

In [15]:
# Fit a support vector regressor with its default settings on all 11 features.
X_train, X_test, y_train, y_test = default_setting(np.arange(11))
model_base = create_model('svr', X_train, y_train)
print('On train dataset')
test_score(model_base, X_train, y_train)
        
print("\n")
print('On test dataset')
test_score(model_base, X_test, y_test)

On train dataset
MSE:  0.39268692753412787
R2:  0.49953608562812013


On test dataset
MSE:  0.5099454914351067
R2:  0.33826444501893627


## Ridge

In [16]:
# Baseline: ridge regression (create_model's default alpha) on all 11 features.
X_train, X_test, y_train, y_test = default_setting(np.arange(11))
model = 'ridge'
model_base = create_model(model, X_train, y_train)

print('On train dataset')
test_score(model_base, X_train, y_train)

print("\n")
print('On test dataset')
test_score(model_base, X_test, y_test)

# Reducing the number of features: for each target size, rank the features with
# RFE on the full split, then refit the same kind of model on the kept columns.
for num_features in (3, 5):
    print("\nFinding the most {} important features".format(num_features))
    # The ranking needs all 11 columns, so restore the full-feature split first.
    X_train, X_test, y_train, y_test = default_setting(np.arange(11))
    feature_array = main_features(model_base, X_train, y_train, num_features=num_features)
    X_train, X_test, y_train, y_test = default_setting(feature_array)

    print("\nTesting with only important features")
    print('On train dataset')
    model_new = create_model(model, X_train, y_train)
    test_score(model_new, X_train, y_train)

    print("\n")
    print('On test dataset')
    test_score(model_new, X_test, y_test)

On train dataset
MSE:  0.5750832003057419
R2:  0.26707927019163014


On test dataset
MSE:  0.5533367972573243
R2:  0.2819573096056758

Finding the most 3 important features
The main 3 features are:
['volatile acidity' 'sulphates' 'alcohol']
Feature Ranking: [4 1 8 5 7 6 3 9 2 1 1]

Testing with only important features
On train dataset
MSE:  0.6088500840287354
R2:  0.22404471615066124


On test dataset
MSE:  0.5714322146084538
R2:  0.25847562137706737

Finding the most 5 important features
The main 5 features are:
['volatile acidity' 'total sulfur dioxide' 'pH' 'sulphates' 'alcohol']
Feature Ranking: [2 1 6 3 5 4 1 7 1 1 1]

Testing with only important features
On train dataset
MSE:  0.6025438752432974
R2:  0.23208172913050473


On test dataset
MSE:  0.5708264822068618
R2:  0.2592616557153836


## Lasso

In [17]:
# Baseline: lasso regression (create_model's default alpha) on all 11 features.
X_train, X_test, y_train, y_test = default_setting(np.arange(11))
model = 'lasso'
model_base = create_model(model, X_train, y_train)

print('On train dataset')
test_score(model_base, X_train, y_train)

print("\n")
print('On test dataset')
test_score(model_base, X_test, y_test)

# Reducing the number of features: for each target size, rank the features with
# RFE on the full split, then refit the same kind of model on the kept columns.
for num_features in (3, 5):
    print("\nFinding the most {} important features".format(num_features))
    # The ranking needs all 11 columns, so restore the full-feature split first.
    X_train, X_test, y_train, y_test = default_setting(np.arange(11))
    feature_array = main_features(model_base, X_train, y_train, num_features=num_features)
    X_train, X_test, y_train, y_test = default_setting(feature_array)

    print("\nTesting with only important features")
    print('On train dataset')
    model_new = create_model(model, X_train, y_train)
    test_score(model_new, X_train, y_train)

    print("\n")
    print('On test dataset')
    test_score(model_new, X_test, y_test)

On train dataset
MSE:  0.5910029220721935
R2:  0.24679021620916164


On test dataset
MSE:  0.5732742561763755
R2:  0.2560852788410124

Finding the most 3 important features
The main 3 features are:
['fixed acidity' 'volatile acidity' 'alcohol']
Feature Ranking: [1 1 9 2 3 4 8 5 7 6 1]

Testing with only important features
On train dataset
MSE:  0.6189131539252885
R2:  0.2112197326895322


On test dataset
MSE:  0.5867232618485316
R2:  0.23863305035407756

Finding the most 5 important features
The main 5 features are:
['fixed acidity' 'volatile acidity' 'residual sugar' 'chlorides' 'alcohol']
Feature Ranking: [1 1 7 1 1 2 6 3 5 4 1]

Testing with only important features
On train dataset
MSE:  0.6001976871384732
R2:  0.2350718528154624


On test dataset
MSE:  0.5761804310320058
R2:  0.2523140537527183


In [18]:
# Use a lasso model only to rank the features; the reduced feature sets are then
# refitted with plain linear regression.
X_train, X_test, y_train, y_test = default_setting(np.arange(11))
model_base = create_model('lasso', X_train, y_train)

print('On train dataset')
test_score(model_base, X_train, y_train)

print("\n")
print('On test dataset')
test_score(model_base, X_test, y_test)

# Reducing the number of features: rank with RFE on the full split, then fit a
# linear regression on the kept columns.
for num_features in (3, 5):
    print("\nFinding the most {} important features".format(num_features))
    # The ranking needs all 11 columns, so restore the full-feature split first.
    X_train, X_test, y_train, y_test = default_setting(np.arange(11))
    feature_array = main_features(model_base, X_train, y_train, num_features=num_features)
    X_train, X_test, y_train, y_test = default_setting(feature_array)

    print("\nTesting with only important features")
    print('On train dataset')
    model = create_model('linear', X_train, y_train)
    test_score(model, X_train, y_train)

    print("\n")
    print('On test dataset')
    test_score(model, X_test, y_test)

On train dataset
MSE:  0.5910029220721935
R2:  0.24679021620916164


On test dataset
MSE:  0.5732742561763755
R2:  0.2560852788410124

Finding the most 3 important features
The main 3 features are:
['fixed acidity' 'volatile acidity' 'alcohol']
Feature Ranking: [1 1 9 2 3 4 8 5 7 6 1]

Testing with only important features
On train dataset
MSE:  0.6075709747646293
R2:  0.2256748901756631


On test dataset
MSE:  0.5699905739460617
R2:  0.26034637991858645

Finding the most 5 important features
The main 5 features are:
['fixed acidity' 'volatile acidity' 'residual sugar' 'chlorides' 'alcohol']
Feature Ranking: [1 1 7 1 1 2 6 3 5 4 1]

Testing with only important features
On train dataset
MSE:  0.5886091314100048
R2:  0.24984100746546511


On test dataset
MSE:  0.5585034484853894
R2:  0.27525275612844935


## ElasticNet

In [19]:
# Baseline: elastic net (create_model's default alpha) on all 11 features.
model_kind = 'elastic'

X_train, X_test, y_train, y_test = default_setting(np.arange(11))
model_base = create_model(model_kind, X_train, y_train)


print('On train dataset')
test_score(model_base, X_train, y_train)

print("\n")
print('On test dataset')
test_score(model_base, X_test, y_test)

# Reducing the number of features: for each target size, rank the features with
# RFE on the full split, then refit the same kind of model on the kept columns.
for num_features in (3, 5):
    print("\nFinding the most {} important features".format(num_features))
    # The ranking needs all 11 columns, so restore the full-feature split first.
    X_train, X_test, y_train, y_test = default_setting(np.arange(11))
    feature_array = main_features(model_base, X_train, y_train, num_features=num_features)
    X_train, X_test, y_train, y_test = default_setting(feature_array)

    print("\nTesting with only important features")
    print('On train dataset')
    model = create_model(model_kind, X_train, y_train)
    test_score(model, X_train, y_train)

    print("\n")
    print('On test dataset')
    test_score(model, X_test, y_test)

On train dataset
MSE:  0.5874552434616958
R2:  0.2513115918900183


On test dataset
MSE:  0.5689490896041015
R2:  0.26169787185371773

Finding the most 3 important features
The main 3 features are:
['volatile acidity' 'total sulfur dioxide' 'alcohol']
Feature Ranking: [2 1 9 5 6 8 1 7 4 3 1]

Testing with only important features
On train dataset
MSE:  0.6162524768738913
R2:  0.2146106600636194


On test dataset
MSE:  0.5861594839406501
R2:  0.23936464204970798

Finding the most 5 important features
The main 5 features are:
['fixed acidity' 'volatile acidity' 'total sulfur dioxide' 'sulphates'
 'alcohol']
Feature Ranking: [1 1 7 3 4 6 1 5 2 1 1]

Testing with only important features
On train dataset
MSE:  0.6123790111153372
R2:  0.21954723854328606


On test dataset
MSE:  0.5843825828403038
R2:  0.2416704544463919


In [25]:
# Degree-2 polynomial regression restricted to the columns in `feature_array`.
# NOTE(review): `feature_array` is not defined in this cell; it is whatever mask an
# earlier cell left behind (presumably the 5-feature ElasticNet selection). The
# execution count also jumps from 19 to 25 -- confirm on a fresh Restart & Run All.
X_train, X_test, y_train, y_test = default_setting(feature_array)
degrees = 2
model_base = create_model('poly', X_train, y_train, degree=degrees)

# poly_score expands its inputs with the same degree before predicting.
print('On train dataset')
poly_score(model_base, X_train, y_train, degree=degrees)
        
print("\n")
print('On test dataset')
poly_score(model_base, X_test, y_test, degree=degrees)

On train dataset
MSE:  0.556090897626018
R2:  0.291284206683192


On test dataset
MSE:  0.5422583113630557
R2:  0.29633341084536424
