4.8 Discretizating Features

In [1]:
import numpy as np
from sklearn.preprocessing import Binarizer
# Create feature
age = np . array ([[ 6 ],
                   [ 12 ],
                   [ 20 ],
                   [ 36 ],
                   [ 65 ]])
# Create binarizer
binarizer = Binarizer ( threshold=18 )

In [2]:
# Transform feature
binarizer . fit_transform ( age )

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [3]:
# Bin feature
np . digitize ( age , bins= [ 20 , 30 , 64 ])

array([[0],
       [0],
       [1],
       [2],
       [3]])

In [4]:
# Bin feature
np . digitize ( age , bins= [ 20 , 30 , 64 ], right=True )

array([[0],
       [0],
       [0],
       [2],
       [3]])

In [5]:
np . digitize ( age , bins= [ 18 ])

array([[0],
       [0],
       [1],
       [1],
       [1]])

4.9 Grouping Observations Using Clustering

In [6]:
# Load libraries
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
# Make simulated feature matrix
features, _ = make_blobs(n_samples = 50,
n_features = 2,
centers = 3,
random_state = 1)
# Create DataFrame
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])
# Make k-means clusterer
clusterer = KMeans(3, random_state=0)
# Fit clusterer
clusterer.fit(features)
# Predict values
dataframe["group"] = clusterer.predict(features)
# View first few observations
dataframe.head(10)

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0
5,-0.794152,2.104951,1
6,-2.760179,5.551214,1
7,-9.946905,-4.590344,2
8,-0.52579,3.306599,1
9,-1.981977,4.022436,1


In [7]:
# Load library
import pandas as pd
# Load data
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])
# Remove observations with missing values
dataframe.dropna()

Unnamed: 0,feature_1,feature_2
0,-9.877554,-3.336145
1,-7.28721,-8.353986
2,-6.943061,-7.023744
3,-7.440167,-8.791959
4,-6.641388,-8.075888
5,-0.794152,2.104951
6,-2.760179,5.551214
7,-9.946905,-4.590344
8,-0.52579,3.306599
9,-1.981977,4.022436


4.10 Deleting Observations with Missing Values

In [8]:
# Load library
import numpy as np
# Create feature matrix
features = np.array([[1.1, 11.1],
                    [2.2, 22.2],
                    [3.3, 33.3],
                    [4.4, 44.4],
                    [np.nan, 55]])
# Keep only observations that are not (denoted by ~) missing
features[~np.isnan(features).any(axis=1)]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [9]:
# Load library
import pandas as pd
# Load data
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])
# Remove observations with missing values
dataframe.dropna()

Unnamed: 0,feature_1,feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


4.11 Imputing Missing Values

In [10]:
# Load libraries
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
# Make a simulated feature matrix
features, _ = make_blobs(n_samples = 1000,
n_features = 2,
random_state = 1)
# Standardize the features
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)
# Replace the first feature's first value with a missing value
true_value = standardized_features[0,0]
standardized_features[0,0] = np.nan
# Predict the missing values in the feature matrix
knn_imputer = KNNImputer(n_neighbors=5)
features_knn_imputed = knn_imputer.fit_transform(standardized_features)
# Compare true and imputed values
print("True Value:", true_value)
print("Imputed Value:", features_knn_imputed[0,0])

True Value: 0.8730186113995938
Imputed Value: 1.0959262913919632


In [11]:
# Load libraries
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
# Make a simulated feature matrix
features, _ = make_blobs(n_samples = 1000,
n_features = 2,
random_state = 1)
# Standardize the features
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)
# Replace the first feature's first value with a missing value
true_value = standardized_features[0,0]
standardized_features[0,0] = np.nan
# Create imputer using the "mean" strategy
mean_imputer = SimpleImputer(strategy="mean")
# Impute values
features_mean_imputed = mean_imputer.fit_transform(features)
# Compare true and imputed values
print("True Value:", true_value)
print("Imputed Value:", features_mean_imputed[0,0])

True Value: 0.8730186113995938
Imputed Value: -3.058372724614996
