In [1]:
import pandas as pd
import numpy as np

In [2]:
# header=None tells pandas not to treat the first row as column names,
# so the columns get default integer labels until they are renamed.
df_wine = pd.read_csv(
    'wine.data',
    header=None,
)

In [13]:
# Names for the 14 columns of the wine data: the 'label' column first,
# followed by the 13 measurement columns.
columns_list = [
    'label',
    'alcohol',
    'malic acid',
    'ash',
    'alcalinity of ash',
    'magnesium',
    'total phenols',
    'flavanoids',
    'nonflavanoid phenols',
    'proanthocyanins',
    'color intensity',
    'hue',
    'od280/od315 of diluted wines',
    'proline',
]

In [14]:
# Replace the default integer column labels with the descriptive names.
df_wine.columns = pd.Index(columns_list)

In [15]:
# Distinct values of the 'label' column, returned as a sorted array.
np.unique(df_wine['label'].to_numpy())

array([1, 2, 3])

# Split the data using scikit-learn's `train_test_split` function

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Target vector: the first column ('label').
y = df_wine.iloc[:, 0]
# Feature matrix: every column after the first.
x = df_wine.iloc[:, 1:]

In [18]:
# Split into training and test datasets.
#   test_size=0.3   -> 30% of the rows are held out as test data
#   random_state=0  -> fixed random seed, so the split is repeatable
#   stratify=y      -> class proportions of the original dataset are
#                      maintained in both partitions
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

In [19]:
# Class counts in the training split, then in the full dataset, so the
# effect of stratification can be compared side by side.
print(y_train.value_counts(), y.value_counts(), sep='\n')

label
2    50
1    41
3    33
Name: count, dtype: int64
label
2    71
1    59
3    48
Name: count, dtype: int64


# Bringing features onto the same scale (standardization vs. normalization)

In [25]:
# Every column name except the target column 'label'.
features_list = [name for name in columns_list if name != 'label']

In [27]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalization: fit the scaler on the training features and
# rescale them in one step.
mms = MinMaxScaler()
x_train_norm = mms.fit_transform(x_train)

# Wrap the scaled values in a DataFrame with the feature names restored.
# No index is passed, so the frame gets a fresh 0..n-1 index rather than
# x_train's original row labels.
x_train_norm_df = pd.DataFrame(x_train_norm, columns=features_list)

x_train_norm_df

Unnamed: 0,alcohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,od280/od315 of diluted wines,proline
0,0.646199,0.832016,0.424837,0.462366,0.271605,0.351724,0.097046,0.68,0.189873,0.236234,0.457447,0.285714,0.194009
1,0.687135,0.156126,0.653595,0.435484,0.765432,0.679310,0.506329,0.74,0.294304,0.325044,0.819149,0.633700,0.682596
2,0.678363,0.150198,0.653595,0.596774,0.382716,0.696552,0.613924,0.32,0.620253,0.351687,0.755319,0.527473,0.718260
3,0.614035,0.209486,0.620915,0.408602,0.493827,0.472414,0.462025,0.32,0.354430,0.218472,0.659574,0.586081,0.582739
4,0.347953,0.339921,0.326797,0.381720,0.296296,0.220690,0.067511,1.00,0.164557,0.476021,0.265957,0.113553,0.297432
...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,0.195906,0.077075,0.529412,0.677419,0.098765,0.351724,0.261603,0.54,0.310127,0.040853,0.882979,0.531136,0.251070
120,0.581871,0.626482,0.509804,0.623656,0.395062,0.282759,0.086498,0.60,0.313291,0.493783,0.234043,0.106227,0.336662
121,0.722222,0.849802,0.346405,0.462366,0.123457,0.000000,0.000000,0.54,0.082278,0.280639,0.106383,0.021978,0.097718
122,0.160819,0.069170,0.392157,0.516129,0.382716,0.827586,0.379747,0.00,0.389241,0.130551,0.542553,0.681319,0.433666


In [26]:
from sklearn.preprocessing import StandardScaler
# Standardization: fit the scaler on the training features and rescale
# them in one step.
stdsc = StandardScaler()
x_train_std = stdsc.fit_transform(x_train)

# Wrap the scaled values in a DataFrame with the feature names restored.
# No index is passed, so the frame gets a fresh 0..n-1 index rather than
# x_train's original row labels.
x_train_std_df = pd.DataFrame(x_train_std, columns=features_list)

x_train_std_df

Unnamed: 0,alcohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,od280/od315 of diluted wines,proline
0,0.712259,2.220487,-0.130259,0.059629,-0.504327,-0.528316,-1.240000,0.841180,-1.052151,-0.292189,-0.200170,-0.821641,-0.629464
1,0.882292,-0.704572,1.175336,-0.090655,2.341479,1.016759,0.662995,1.088743,-0.492935,0.131521,1.339826,0.549313,1.475688
2,0.845856,-0.730230,1.175336,0.811048,0.135979,1.098079,1.163267,-0.644195,1.252496,0.258634,1.068062,0.130811,1.629349
3,0.578661,-0.473646,0.988823,-0.240939,0.776285,0.040922,0.457000,-0.644195,-0.170963,-0.376931,0.660416,0.361708,1.045438
4,-0.526554,0.090839,-0.689799,-0.391223,-0.362037,-1.146346,-1.377330,2.161513,-1.187719,0.851827,-1.015462,-1.499903,-0.183848
...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,-1.158106,-1.046684,0.466585,1.261899,-1.500359,-0.528316,-0.474879,0.263534,-0.408206,-1.224349,1.611590,0.145242,-0.383607
120,0.445064,1.330995,0.354677,0.961331,0.207124,-0.853595,-1.289047,0.511097,-0.391260,0.936569,-1.151344,-1.528765,-0.014821
121,1.028035,2.297462,-0.577891,0.059629,-1.358069,-2.187238,-1.691226,0.263534,-1.628313,-0.080334,-1.694872,-1.860680,-1.044348
122,-1.303849,-1.080895,-0.316772,0.360196,0.135979,1.716108,0.074440,-1.964528,0.015443,-0.796403,0.162182,0.736917,0.403136


In [54]:
# comparison of the differences in these methods
#
# The comparison is built in its own DataFrame instead of adding columns
# to x_train. Writing 'alcohol_std' / 'alcohol_norm' into x_train would
# leave it with 15 columns, so re-running either scaler cell afterwards
# would fit on the derived columns and then fail when the 13 names in
# features_list are assigned; any later model fit on x_train would also
# see the duplicated feature.
#
# The scaled frames carry a fresh 0..n-1 index while x_train keeps its
# original row labels, so values are taken positionally (.to_numpy()) and
# x_train's index is re-attached to keep rows aligned.
alcohol_scaling_df = pd.DataFrame(
    {
        'alcohol': x_train['alcohol'].to_numpy(),
        'alcohol_std': x_train_std_df['alcohol'].to_numpy(),
        'alcohol_norm': x_train_norm_df['alcohol'].to_numpy(),
    },
    index=x_train.index,
)

alcohol_scaling_df.head(10)


Unnamed: 0,alcohol,alcohol_std,alcohol_norm
143,13.62,0.712259,0.646199
33,13.76,0.882292,0.687135
30,13.73,0.845856,0.678363
34,13.51,0.578661,0.614035
135,12.6,-0.526554,0.347953
114,12.08,-1.158106,0.195906
108,12.22,-0.988073,0.236842
134,12.51,-0.635862,0.321637
92,12.69,-0.417247,0.374269
103,11.82,-1.473882,0.119883
