feature scaling
standardizing features


In [28]:
from sklearn import preprocessing
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd

In [26]:
# Single-column feature matrix: five observations of one feature.
feature = np.array([-500.5, -100.1, 0, 100.1, 500.5]).reshape(-1, 1)

In [3]:
# Min-max scaling maps every value linearly into [0, 1] (a [-1, 1] range is
# also common), based on its position between the feature's minimum and maximum:
#   x'_i = (x_i - min(x)) / (max(x) - min(x))

# Learn the feature's min and max first, then rescale the same data.
minimun_scale = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit(feature)
scaled_feature = minimun_scale.transform(feature)

scaled_feature

array([[0. ],
       [0.4],
       [0.5],
       [0.6],
       [1. ]])

In [4]:
# Standardize a feature so that it has a mean of 0 and a standard deviation of 1.
x_std_sc = np.array([[-10111.1],
                     [-310.2],
                     [450.5],
                     [610.6],
                     [9090.9]])

scaler_std_sc = preprocessing.StandardScaler()

standardized_std_sc = scaler_std_sc.fit_transform(x_std_sc)

# Round instead of truncating: floating-point error can leave the standard
# deviation at 0.9999999999999999, which int() would report as 0.
print(round(float(standardized_std_sc.mean())))
print(round(float(standardized_std_sc.std())))


0
1


In [5]:
# If the data has significant outliers, use a robust scaler, which relies on
# the median and quartile range instead.
robust_scaler = preprocessing.RobustScaler()
robust_scaler.fit(x_std_sc).transform(x_std_sc)

array([[-11.47002606],
       [ -0.82612945],
       [  0.        ],
       [  0.17387055],
       [  9.3835795 ]])

In [6]:
# Rescale the feature values of each individual observation (row) to unit norm,
# i.e. a total length of 1 per observation, not across the entire feature.
features_norm = np.array([
    [0.4, 0.6],
    [1.2, 4.4],
    [1.6, 21.2],
    [1.9, 44.4],
    [12.8, 0.1],
])

# Euclidean norm ("l2"), which is the Normalizer default.
normalizer = preprocessing.Normalizer(norm="l2")
normalizer.transform(features_norm)

# Manhattan norm ("l1") version.
normalizer_man = preprocessing.Normalizer(norm="l1")
normalizer_man.transform(features_norm)

array([[0.4       , 0.6       ],
       [0.21428571, 0.78571429],
       [0.07017544, 0.92982456],
       [0.04103672, 0.95896328],
       [0.99224806, 0.00775194]])

In [13]:
# Create polynomial and interaction features.
# For a two-dimensional input sample of the form [a, b], the degree-2
# polynomial features are [1, a, b, a^2, ab, b^2].
feature_poly = np.array([[4, 6], [3, 7], [1, 2], [1, 5]])

scaler_poly = preprocessing.PolynomialFeatures(degree=2)

scaler_poly.fit_transform(feature_poly)


array([[ 1.,  4.,  6., 16., 24., 36.],
       [ 1.,  3.,  7.,  9., 21., 49.],
       [ 1.,  1.,  2.,  1.,  2.,  4.],
       [ 1.,  1.,  5.,  1.,  5., 25.]])

In [16]:
# Apply a custom transformation to one or more features.
feature_tran = np.array([[1, 2], [3, 7], [10, 11], [100, 150]])

def add_negative(arr):
    """Return ``arr`` with -5 added to it (element-wise for NumPy arrays)."""
    offset = -5
    return arr + offset

# Wrap add_negative in a FunctionTransformer and apply it to the feature matrix.
neg_transformer = preprocessing.FunctionTransformer(func=add_negative)

neg_transformer.transform(feature_tran)

array([[ -4,  -3],
       [ -2,   2],
       [  5,   6],
       [ 95, 145]])

In [23]:
# Break a numerical feature up into discrete bins.
# Binarizer(threshold=value) uses `value` as the cut-off between two bins;
# np.digitize handles multiple bins.
bin_nums = np.array([[1],
                     [12],
                     [34],
                     [1000],
                     [20345]])

# Pass `threshold` by keyword: the Binarizer constructor arguments are
# keyword-only in scikit-learn >= 1.0, so Binarizer(40) raises TypeError there.
binarizer = preprocessing.Binarizer(threshold=40)

binarizer.fit_transform(bin_nums)

# Bin edges 13 and 1001 produce three bins: x < 13, 13 <= x < 1001, x >= 1001.
np.digitize(bin_nums, bins=[13, 1001])

array([[0],
       [0],
       [1],
       [1],
       [2]])

In [32]:
# Cluster observations so that similar observations are grouped together.

# Simulated data: 50 observations with 2 features, generated around 3 centers.
features_cl, _ = make_blobs(n_samples=50,
                            n_features=2,
                            centers=3,
                            random_state=1)

cluster_df = pd.DataFrame(features_cl, columns=["feature_1", "feature_2"])

# n_init is set explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4 (with a FutureWarning in 1.2/1.3), so pinning it keeps the
# clustering the same across versions.
cluster = KMeans(n_clusters=3, n_init=10, random_state=0)
cluster.fit(features_cl)
cluster_df["group"] = cluster.predict(features_cl)
cluster_df.head(5)

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,0
1,-7.28721,-8.353986,2
2,-6.943061,-7.023744,2
3,-7.440167,-8.791959,2
4,-6.641388,-8.075888,2


In [37]:
# Delete observations (rows) that contain missing values.
features_nan = np.array([
    [1.1, 11.1],
    [2.2, 22.2],
    [3.3, 33.3],
    [4.4, 44.4],
    [np.nan, 55],
])

# NumPy: keep only the rows in which no element is NaN.
features_nan[~np.any(np.isnan(features_nan), axis=1)]

# pandas alternative: dropna() removes every row that has a missing value.
nan_df = pd.DataFrame(data=features_nan, columns=["feature_1", "feature_2"])
nan_df.dropna(axis=0, how="any")

Unnamed: 0,feature_1,feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


In [39]:
#You have missing values in your data and want to fill in or predict their values

# feature_miss, _ = make_blobs(n_samples=1000, n_features=2, random_state=1)
# scaler_miss = preprocessing.StandardScaler()
# stand_features_miss = scaler_miss.fit_transform(feature_miss)
# true_value = stand_features_miss[0,0]
# stand_features_miss[0,0] = np.nan

# feature_knn_in = K