In [172]:
import pandas as pd
import numpy as np

In [173]:
dataset = pd.read_csv('winequality-red.csv')

**I added 'NaN' values to the iris dataset so that the imputer can be understood.**

In [174]:
dataset.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


# Data Slicing
**We slice the arguments and the dependent variable in the dataset to give the model**

In [175]:
X = dataset.iloc[:,:-1]  #independent variable
y = dataset.iloc[:,-1:]  #dependent variable

In [176]:
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [177]:
y

Unnamed: 0,quality
0,5
1,5
2,5
3,6
4,5
...,...
1594,5
1595,6
1596,6
1597,5


# Missing Values

**Rows containing 'NaN' values can be removed from the dataset.**

In [178]:
dataset.dropna()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
5,7.4,0.660,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,5
6,7.9,0.600,0.06,1.6,0.069,15.0,59.0,0.99640,3.30,0.46,9.4,5
7,7.3,0.650,0.00,1.2,0.065,15.0,21.0,0.99460,3.39,0.47,10.0,7
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


# Imputer
**Instead of losing rows with 'NaN' values, we can replace them with the average of the columns they are in.**

In [179]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer = imputer.fit(X)
X = imputer.transform(X)

In [180]:
X = pd.DataFrame(data = X, index = range(len(X)), columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar','chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density','pH', 'sulphates', 'alcohol'])
X

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,8.320213,0.7000,0.00000,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.800000,0.5276,0.00000,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.800000,0.7600,0.27112,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.200000,0.2800,0.56000,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.400000,0.7000,0.00000,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.200000,0.6000,0.08000,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.900000,0.5500,0.10000,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.300000,0.5100,0.13000,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.900000,0.6450,0.12000,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [181]:
y = pd.DataFrame(data = y, index = range(len(y)), columns = ['Setosa','Versicolor','Virginica'])
y

Unnamed: 0,Setosa,Versicolor,Virginica
0,,,
1,,,
2,,,
3,,,
4,,,
...,...,...,...
1594,,,
1595,,,
1596,,,
1597,,,


# Encoder
**There should be no categorical data in the data set in order for the regression algorithm to work, label encoder or one hot encoder methods can be used to convert categorical data into numerical data, I will not use it because there is no ketagoric data in our data set.**

In [182]:
"""
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X[:,0] = le.fit_transform(X[:,0])


from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
y = ohe.fit_transform(y).toarray()
"""


'\nfrom sklearn.preprocessing import LabelEncoder\nle = LabelEncoder()\nX[:,0] = le.fit_transform(X[:,0])\n\n\nfrom sklearn.preprocessing import OneHotEncoder\nohe = OneHotEncoder()\ny = ohe.fit_transform(y).toarray()\n'

# Splitting

In [183]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y,test_size=0.33, random_state=0)

In [184]:
x_train

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
243,15.0,0.210,0.44,2.2,0.075,10.0,24.0,1.00005,3.07,0.84,9.2
401,7.7,0.260,0.30,1.7,0.059,20.0,38.0,0.99490,3.29,0.47,10.8
538,12.9,0.350,0.49,5.8,0.066,5.0,35.0,1.00140,3.20,0.66,12.0
678,8.3,0.780,0.10,2.6,0.081,45.0,87.0,0.99830,3.48,0.53,10.0
1069,8.0,0.620,0.35,2.8,0.086,28.0,52.0,0.99700,3.31,0.62,10.8
...,...,...,...,...,...,...,...,...,...,...,...
763,9.3,0.655,0.26,2.0,0.096,5.0,35.0,0.99738,3.25,0.42,9.6
835,7.6,0.665,0.10,1.5,0.066,27.0,55.0,0.99655,3.39,0.51,9.3
1216,7.9,0.570,0.31,2.0,0.079,10.0,79.0,0.99677,3.29,0.69,9.5
559,13.0,0.470,0.49,4.3,0.085,6.0,47.0,1.00210,3.30,0.68,12.7


In [185]:
y_test

Unnamed: 0,Setosa,Versicolor,Virginica
1109,,,
1032,,,
1002,,,
487,,,
979,,,
...,...,...,...
1283,,,
195,,,
977,,,
708,,,


# Scaling

In [186]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(x_train)
X_test = scaler.fit_transform(x_test)

In [187]:
X_train

array([[ 3.79782017, -1.73659008,  0.8570712 , ..., -1.55136219,
         1.01399624, -1.14014938],
       [-0.36476907, -1.46457323,  0.14233674, ..., -0.12478467,
        -1.08185338,  0.35801792],
       [ 2.60036299, -0.97494291,  1.1123335 , ..., -0.70838456,
        -0.00560628,  1.4816434 ],
       ...,
       [-0.25072553,  0.2219312 ,  0.1933892 , ..., -0.12478467,
         0.16432747, -0.85924301],
       [ 2.65738476, -0.32210249,  1.1123335 , ..., -0.05994023,
         0.10768289,  2.13709159],
       [ 0.83268811,  2.45246933,  0.24444166, ..., -0.3841624 ,
        -1.0252088 , -0.95287847]])

In [188]:
X_test

array([[ 1.47122951, -0.3233002 ,  0.83746842, ..., -0.9377904 ,
         0.69913763,  0.34550363],
       [-0.10551443,  1.75897453, -1.40054167, ...,  0.29184145,
        -0.79511217, -0.78669757],
       [ 0.47846481, -1.39418434,  0.31700096, ..., -0.35533321,
         1.21887668,  1.19465453],
       ...,
       [ 0.06967934,  0.39062257,  0.10881397, ..., -1.06722533,
        -0.99001431, -1.25844807],
       [-0.2807082 ,  0.12290153, -0.77598071, ...,  0.16240652,
        -0.27537311,  1.10030443],
       [-1.21507498,  1.55074706, -1.40054167, ...,  1.78034316,
        -0.27537311, -0.40929717]])