# Chapter 04

In [30]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

### 4.1 Rescaling a feature

$$ x'_i = \frac{x_i - min(x)}{max(x) - min(x)} $$

In [2]:
feature = np.array(
    [[-500.5],
     [-100.1],
     [0],
     [100.1],
     [900.9]]
)

scale = preprocessing.MinMaxScaler(feature_range = (0, 1))
scaled_feature = scale.fit_transform(feature)
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

### 4.2 Standardizing a feature


$$ x'_i = \frac{x_i-\overline{x}}{\sigma} $$

In [10]:
scaler = preprocessing.StandardScaler()
standardized = scaler.fit_transform(feature)
standardized

array([[-1.26687088],
       [-0.39316683],
       [-0.17474081],
       [ 0.0436852 ],
       [ 1.79109332]])

In [9]:
print('Mean: ', round(standardized.mean()))
print('Standard deviation: ', round(standardized.std()))

Mean:  0
Standard deviation:  1


In [14]:
# Robust scaler
robust = preprocessing.RobustScaler()
robust.fit_transform(feature)

array([[-2.5],
       [-0.5],
       [ 0. ],
       [ 0.5],
       [ 4.5]])

### 4.3 Normalizing observations

$$ ||x||_2 = \sqrt{x^2_1 + x^2_2 + ... + x^2_n} $$

In [19]:
features = np.array(
    [[0.5, 0.5],
     [1.1, 3.4],
     [1.5, 20.2],
     [1.63, 34.4],
     [10.9, 3.3]]
)

euclidean = preprocessing.Normalizer(norm='l2').transform(features)
euclidean


array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [22]:
euclidean[0, 0] + euclidean[0, 1]

1.414213562373095

$$ ||x||_1 = \sum_{i=1}^n |x_i| $$

In [23]:
manhattan = preprocessing.Normalizer(norm='l1').transform(features)
manhattan

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [24]:
manhattan[0, 0] + manhattan[0, 1]

1.0

### 4.4 Generating polynomial and interaction features

In [25]:
features = np.array(
    [[2, 3],
     [2, 3],
     [2, 3]]
)

poly_int = preprocessing.PolynomialFeatures(degree=2, include_bias=False)
poly_int.fit_transform(features)

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [26]:
poly_int = preprocessing.PolynomialFeatures(degree=2,
                                            interaction_only=True,
                                            include_bias=False)
poly_int.fit_transform(features)

array([[2., 3., 6.],
       [2., 3., 6.],
       [2., 3., 6.]])

### 4.5 Transforming features

In [27]:
def add_ten(x):
    return x + 10

transformer = preprocessing.FunctionTransformer(add_ten)
transformer.transform(features)

array([[12, 13],
       [12, 13],
       [12, 13]])

In [29]:
df = pd.DataFrame(
    features, columns=['a', 'b']
)

df.apply(add_ten)

Unnamed: 0,a,b
0,12,13
1,12,13
2,12,13


### 4.6 Detecting outliers

In [33]:
features, _ = make_blobs(
    n_samples =10,
    n_features=2,
    centers=1,
    random_state=1
)

features[0, 0] = 1e4
features[0, 1] = 1e4

outlier_detector = EllipticEnvelope(contamination=.1)
outlier_detector.fit_predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [38]:
feature = features[:, 0]

def outliers(x):
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower = q1 - (iqr * 1.5)
    upper = q3 + (iqr * 1.5)
    return np.where((x > upper) | (x < lower))

outliers(feature)

(array([0]),)

### 4.7 Handling outliers

In [39]:
houses = pd.DataFrame()
houses['Price'] = [534433, 392333, 293222, 4322032]
houses['Bathrooms'] = [2, 3.5, 2, 116]
houses['Square_Feet'] = [1500, 2500, 1500, 48000]

houses[houses['Bathrooms'] < 20]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [40]:
houses['Outlier'] = np.where(houses['Bathrooms'] < 20, 0, 1)
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [41]:
houses['Log_Square_Feet'] = [np.log(x) for x in houses['Square_Feet']]
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier,Log_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


### 4.8 Discretizating features

In [53]:
from sklearn.preprocessing import Binarizer

age = np.array(
    [[6],
     [12],
     [20],
     [36],
     [65]]
)

binarizer = Binarizer(threshold=18)
binarizer.fit_transform(age)


array([[0],
       [0],
       [1],
       [1],
       [1]])

In [55]:
np.digitize(age, bins=[20, 30, 64])

array([[0],
       [0],
       [1],
       [2],
       [3]])

In [56]:
np.digitize(age, bins=[18])

array([[0],
       [0],
       [1],
       [1],
       [1]])

### 4.9 Grouping observations using clustering

In [57]:
from sklearn import cluster
from sklearn.cluster import KMeans

features, _ = make_blobs(
    n_samples=50,
    n_features=2,
    centers=3,
    random_state=1
)

data = pd.DataFrame(features, columns=['a', 'b'])
clusterer = KMeans(3, random_state=0)
clusterer.fit(features)
data['cluster'] = clusterer.predict(features)
data.head(5)


Unnamed: 0,a,b,cluster
0,-9.877554,-3.336145,0
1,-7.28721,-8.353986,2
2,-6.943061,-7.023744,2
3,-7.440167,-8.791959,2
4,-6.641388,-8.075888,2


### 4.10 Deleting observations with missing values

In [58]:
features = np.array(
    [[1.1, 11.1],
     [2.2, 22.2],
     [3.3, 33.3],
     [4.4, 44.4],
     [np.nan, 55]]
)

features[~np.isnan(features).any(axis=1)]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [59]:
dataframe = pd.DataFrame(features, columns=['a', 'b'])
dataframe.dropna()

Unnamed: 0,a,b
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


### 4.11 Imputing missing values

In [63]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

features, _ = make_blobs(
    n_samples=1000,
    n_features=2,
    random_state=1
)

scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

true_value = standardized_features[0, 0]
standardized_features[0, 0] = np.nan

features_knn_imputed = KNNImputer(n_neighbors=5).fit_transform(standardized_features)

print('True value: ', true_value)
print('Imputed value: ', features_knn_imputed[0, 0])

True value:  0.8730186113995938
Imputed value:  1.0959262913919632


In [70]:
from sklearn.impute import SimpleImputer

mean_imputer = SimpleImputer(strategy='mean')
features_mean_immputed = mean_imputer.fit_transform(features)

print('True value: ', true_value)
print('Imputed value: ', features_mean_immputed[0, 0])

True value:  0.8730186113995938
Imputed value:  -3.058372724614996
