# Data Preprocessing

In [2]:
import numpy as np
from sklearn import preprocessing

In [3]:
# Sample matrix used by every preprocessing example below: 3 rows by 4 columns.
# Mixing ints and floats in the literals still produces a float array.
rows = [
    [1, 3, 0.5, -10],
    [15, 20, 0.3, 3],
    [0.7, -12, -0.1, 5.0],
]
data = np.array(rows)

In [4]:
# Echo the sample array so its values are visible before any transformation.
data

array([[  1. ,   3. ,   0.5, -10. ],
       [ 15. ,  20. ,   0.3,   3. ],
       [  0.7, -12. ,  -0.1,   5. ]])

# Mean Removal

It is usually good practice to remove the mean from the raw data so that each feature is centered on zero; this removes any bias (constant offset) from the features.

In [5]:
# Per-column statistics of the raw data (axis=0 reduces over the rows),
# shown as a baseline before standardization.
column_means = data.mean(axis=0)
column_stds = data.std(axis=0)
print("Mean=", column_means)
print("Std Deviation=", column_stds)

Mean= [ 5.56666667  3.66666667  0.23333333 -0.66666667]
Std Deviation= [ 6.67149825 13.0724477   0.24944383  6.64997911]


In [6]:
# Standardize each column, then re-check the per-column statistics: the means
# should come out at (numerically) zero and the standard deviations at one.
data_standardized = preprocessing.scale(data)
standardized_means = data_standardized.mean(axis=0)
standardized_stds = data_standardized.std(axis=0)
print("\n Mean=", standardized_means)
print("Std Deviation=", standardized_stds)


 Mean= [ 7.40148683e-17  0.00000000e+00 -1.48029737e-16 -7.40148683e-17]
Std Deviation= [1. 1. 1. 1.]


In [7]:
# Inspect the standardized values themselves, not just their summary statistics.
data_standardized

array([[-0.68450391, -0.05099785,  1.06904497, -1.40351318],
       [ 1.41397524,  1.24944721,  0.26726124,  0.55138018],
       [-0.72947132, -1.19844937, -1.33630621,  0.852133  ]])

# Min-Max Scaler

In [9]:
# Min-max scaling: linearly rescale every column into the [0, 1] range, so the
# column minimum maps to 0 and the column maximum maps to 1.
data_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
# Fit and transform on the same array (equivalent to fit_transform).
data_scaled = data_scaler.fit(data).transform(data)
print("MinMax scaled data\n", data_scaled)

MinMax scaled data
 [[0.02097902 0.46875    1.         0.        ]
 [1.         1.         0.66666667 0.86666667]
 [0.         0.         0.         1.        ]]


# Normalization

In [10]:
# L1 normalization works row by row (axis=1, the default, spelled out here):
# each row is divided by the sum of its absolute values, so those absolute
# values add up to 1 afterwards.
data_normalized = preprocessing.normalize(data, norm="l1", axis=1)
print('\nL1 normalized data:\n', data_normalized)




L1 normalized data:
 [[ 0.06896552  0.20689655  0.03448276 -0.68965517]
 [ 0.39164491  0.52219321  0.0078329   0.07832898]
 [ 0.03932584 -0.6741573  -0.00561798  0.28089888]]


# Binarization

In [11]:
# Binarization: entries above the threshold become 1 and the rest become 0.
binarizer = preprocessing.Binarizer(threshold=2)
data_binarized = binarizer.transform(data)

# Print the input next to the result so the thresholding is easy to verify.
print('Original Data', data)
print('\nBinarized data:\n', data_binarized)

Original Data [[  1.    3.    0.5 -10. ]
 [ 15.   20.    0.3   3. ]
 [  0.7 -12.   -0.1   5. ]]

Binarized data:
 [[0. 1. 0. 0.]
 [1. 1. 0. 1.]
 [0. 0. 0. 1.]]


# Label Encoding

In supervised learning, labels usually come in human-readable form. It is often necessary to transform those labels into numeric values before they can be used by a learning algorithm.

In [13]:
# Fit an encoder on a list of labels that contains duplicates; the encoder
# keeps one entry per distinct label.
input_classes = ['audi', 'ford', 'audi', 'toyota', 'ford', 'bmw']
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(input_classes)

# Show each learned class next to the integer it is encoded as
# (its position in classes_).
print('\nClass mapping:')
for code, class_name in enumerate(label_encoder.classes_):
    print(class_name, code)



Class mapping:
audi 0
bmw 1
ford 2
toyota 3


In [14]:
# NOTE: enumerate() returns a lazy iterator, so this cell displays only the
# iterator object itself, not the (index, class) pairs. Wrap the call in
# list(...) to see the pairs.
enumerate(label_encoder.classes_)

<enumerate at 0x1a216147e0>

In [15]:
# The distinct labels learned by fit(); a label's position in this array is
# the integer it is encoded as.
label_encoder.classes_

array(['audi', 'bmw', 'ford', 'toyota'], dtype='<U6')

In [16]:

# Encode a few labels with the already-fitted encoder. Every label passed to
# transform() must have been present when the encoder was fitted.
labels = ['toyota', 'ford', 'audi']
encoded_labels = label_encoder.transform(labels)
print('\nLabels =', labels)
# .tolist() converts the result to plain Python ints, so the printed list
# reads [3, 2, 0] on every NumPy version. list(encoded_labels) would keep
# NumPy scalars, which NumPy 2 prints as np.int64(3), np.int64(2), ...
print('\ncoded labels =', encoded_labels.tolist())



Labels = ['toyota', 'ford', 'audi']

coded labels = [3, 2, 0]
