In [16]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# configure seaborn's global plot styling for the rest of the notebook;
# color_codes=True is passed explicitly (per the seaborn docs this remaps
# matplotlib's shorthand color codes such as "b"/"g"/"r" to the seaborn palette)
sns.set(color_codes=True)

<br>
<br>
<br>

### Data Collection

In [17]:
# loading the train and test CSV files (train first, then test)
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

In [18]:
# previewing the first five rows of the training data
train_data.head(n=5)

Unnamed: 0,Id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0,8.0,0.5,0.39,2.2,0.073,30.0,39.0,0.99572,3.33,0.77,12.1,6
1,1,9.3,0.3,0.73,2.3,0.092,30.0,67.0,0.99854,3.32,0.67,12.8,6
2,2,7.1,0.51,0.03,2.1,0.059,3.0,12.0,0.9966,3.52,0.73,11.3,7
3,3,8.1,0.87,0.22,2.6,0.084,11.0,65.0,0.9973,3.2,0.53,9.8,5
4,4,8.5,0.36,0.3,2.3,0.079,10.0,45.0,0.99444,3.2,1.36,9.5,6


In [19]:
# previewing the first five rows of the test data
test_data.head(n=5)

Unnamed: 0,Id,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,2056,7.2,0.51,0.01,2.0,0.077,31.0,54.0,0.99748,3.39,0.59,9.8
1,2057,7.2,0.755,0.15,2.0,0.102,14.0,35.0,0.99586,3.33,0.68,10.0
2,2058,8.4,0.46,0.4,2.0,0.065,21.0,50.0,0.99774,3.08,0.65,9.5
3,2059,8.0,0.47,0.4,1.8,0.056,14.0,25.0,0.9948,3.3,0.65,11.7
4,2060,6.5,0.34,0.32,2.1,0.044,8.0,94.0,0.99356,3.23,0.48,12.8


<br>
<br>
<br>

### Data Preprocessing

In [20]:
from sklearn.preprocessing import RobustScaler, StandardScaler

In [21]:
# feature frame for training: every column except the 'Id' column
# and the 'quality' target
data_1 = train_data.drop(columns=['Id', 'quality'])
data_1.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,8.0,0.5,0.39,2.2,0.073,30.0,39.0,0.99572,3.33,0.77,12.1
1,9.3,0.3,0.73,2.3,0.092,30.0,67.0,0.99854,3.32,0.67,12.8
2,7.1,0.51,0.03,2.1,0.059,3.0,12.0,0.9966,3.52,0.73,11.3
3,8.1,0.87,0.22,2.6,0.084,11.0,65.0,0.9973,3.2,0.53,9.8
4,8.5,0.36,0.3,2.3,0.079,10.0,45.0,0.99444,3.2,1.36,9.5


In [22]:
# building the scaler that is fitted on the training features below;
# the arguments spell out StandardScaler's documented defaults
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [23]:
# scaling data: fit the scaler on the training features and transform them.
# fit_transform returns a plain array, so the result is wrapped straight back
# into a DataFrame in a single expression (the name is never bound to the
# intermediate array). Both the column labels and the row index of data_1 are
# carried over so that later index-aligned operations (e.g. pd.concat with
# axis=1) line up with the original rows even if the index is not 0..n-1.
data_1_scaled = pd.DataFrame(
    scaler.fit_transform(data_1),
    columns=data_1.columns,
    index=data_1.index,
)
data_1_scaled.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.214182,-0.159429,0.663801,-0.23163,-0.373311,1.303453,-0.310649,-0.562501,0.136562,0.933169,1.638215
1,0.548293,-1.314684,2.470184,-0.115164,0.427602,1.303453,0.539042,0.980993,0.066281,0.208049,2.318768
2,-0.742049,-0.101666,-1.248839,-0.348097,-0.963457,-1.394584,-1.129995,-0.080843,1.471893,0.643121,0.86044
3,-0.15553,1.977792,-0.23939,0.234236,0.090375,-0.595165,0.47835,0.302294,-0.777086,-0.807119,-0.597888
4,0.079077,-0.968107,0.185641,-0.115164,-0.120391,-0.695093,-0.128573,-1.263094,-0.777086,5.211378,-0.889553


In [24]:
# re-attaching the 'quality' target as the last column of the scaled features
target_column = train_data[['quality']]
final_data = pd.concat(objs=[data_1_scaled, target_column], axis="columns")
final_data.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,-0.214182,-0.159429,0.663801,-0.23163,-0.373311,1.303453,-0.310649,-0.562501,0.136562,0.933169,1.638215,6
1,0.548293,-1.314684,2.470184,-0.115164,0.427602,1.303453,0.539042,0.980993,0.066281,0.208049,2.318768,6
2,-0.742049,-0.101666,-1.248839,-0.348097,-0.963457,-1.394584,-1.129995,-0.080843,1.471893,0.643121,0.86044,7
3,-0.15553,1.977792,-0.23939,0.234236,0.090375,-0.595165,0.47835,0.302294,-0.777086,-0.807119,-0.597888,5
4,0.079077,-0.968107,0.185641,-0.115164,-0.120391,-0.695093,-0.128573,-1.263094,-0.777086,5.211378,-0.889553,6


In [25]:
# saving the scaled training set (features + 'quality') as a csv file.
# It gets its own file name: the scaled test set is exported to
# "../data/abc.csv", and writing the training data to that same path
# would cause it to be overwritten when the notebook is run top to bottom.
# index=False keeps the row index out of the file.
final_data.to_csv("../data/train_scaled.csv", index=False)

<br>
<br>
<br>

In [26]:
# test features: the test frame without its 'Id' column
data_2 = test_data.drop(columns=['Id'])
data_2.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.2,0.51,0.01,2.0,0.077,31.0,54.0,0.99748,3.39,0.59,9.8
1,7.2,0.755,0.15,2.0,0.102,14.0,35.0,0.99586,3.33,0.68,10.0
2,8.4,0.46,0.4,2.0,0.065,21.0,50.0,0.99774,3.08,0.65,9.5
3,8.0,0.47,0.4,1.8,0.056,14.0,25.0,0.9948,3.3,0.65,11.7
4,6.5,0.34,0.32,2.1,0.044,8.0,94.0,0.99356,3.23,0.48,12.8


In [27]:
# transforming the test features with the already-fitted scaler
# (transform only, no re-fitting on the test data).
# transform returns a plain array, so it is wrapped back into a DataFrame in
# a single expression, keeping data_2's column labels and row index; the name
# is never bound to the intermediate array.
data_2_scaled = pd.DataFrame(
    scaler.transform(data_2),
    columns=data_2.columns,
    index=data_2.index,
)
data_2_scaled.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,-0.683397,-0.101666,-1.355097,-0.464564,-0.204698,1.403381,0.144543,0.400815,0.558245,-0.372047,-0.597888
1,-0.683397,1.31352,-0.611292,-0.464564,0.849135,-0.295383,-0.432034,-0.485873,0.136562,0.280561,-0.403444
2,0.020425,-0.39048,0.71693,-0.464564,-0.710537,0.404108,0.023158,0.543122,-1.620453,0.063025,-0.889553
3,-0.214182,-0.332717,0.71693,-0.697497,-1.089917,-0.295383,-0.735495,-1.066052,-0.07428,0.063025,1.249328
4,-1.093961,-1.083633,0.291899,-0.348097,-1.595756,-0.894947,1.358388,-1.744751,-0.566244,-1.169679,2.318768


In [28]:
# exporting the scaled test features to csv, without the row index
# NOTE(review): confirm no other cell writes to this same path, since a
# later write to the same file replaces its contents
data_2_scaled.to_csv(path_or_buf="../data/abc.csv", index=False)