## Standardization

In [1]:
# Load libraries
import numpy as np
from sklearn import preprocessing
# Create feature
feature = np.array([[-500.5],
[-100.1],
[0],
[100.1],
[900.9]])
# Create scaler
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [2]:
# Scale feature
scaled_feature = minmax_scale.fit_transform(feature)
# Show feature
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

In [3]:
# Load libraries
import numpy as np
from sklearn import preprocessing
# Create feature
x = np.array([[-1000.1],
[-200.2],
[500.5],
[600.6],
[9000.9]])
# Create scaler
scaler = preprocessing.StandardScaler()
# Transform the feature
standardized = scaler.fit_transform(x)
# Show feature
standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [5]:
# Print mean and standard deviation
print("Mean:", round(standardized.mean()))
print("Standard deviation:", standardized.std())

Mean: 0.0
Standard deviation: 1.0


In [6]:
# Create scaler
robust_scaler = preprocessing.RobustScaler()
# Transform feature
robust_scaler.fit_transform(x)

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

## Normalization

In [8]:
# Load libraries
import numpy as np
from sklearn.preprocessing import Normalizer
# Create feature matrix
features = np.array([[0.5, 0.5],
[1.1, 3.4],
[1.5, 20.2],
[1.63, 34.4],
[10.9, 3.3]])
# Create normalizer
normalizer = Normalizer(norm="l2")
# Transform feature matrix
normalizer.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [9]:
# Transform feature matrix
features_l2_norm = Normalizer(norm="l2").transform(features)
# Show feature matrix
features_l2_norm

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [11]:
#Transform feature matrix
features_l1_norm = Normalizer(norm="l1").transform(features)
# Show feature matrix
features_l1_norm

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [12]:
#Print sum
print("Sum of the first observation\'s values:",
features_l1_norm[0, 0] + features_l1_norm[0, 1])

Sum of the first observation's values: 1.0


## One-hot encode the feature

In [19]:
# Import libraries
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
# Create feature
feature = np.array([["Texas"],["California"],["Texas"],["Delaware"],["Texas"]])
# Create one-hot encoder
one_hot = LabelBinarizer()
# One-hot encode feature
one_hot.fit_transform(feature)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [20]:
#We can use the classes_ method to output the classes:
# View feature classes
one_hot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [21]:
#If we want to reverse the one-hot encoding, we can use inverse_transform:
# Reverse one-hot encoding
one_hot.inverse_transform(one_hot.transform(feature))

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

In [22]:
#We can even use pandas to one-hot encode the feature:
# Import library
import pandas as pd
# Create dummy variables from feature
pd.get_dummies(feature[:,0])

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [23]:
# One helpful ability of scikit-learn is to handle a situation where each observation lists multiple classes:
# Create multiclass feature
multiclass_feature = [("Texas", "Florida"),
("California", "Alabama"),
("Texas", "Florida"),
("Delware", "Florida"),
("Texas", "Alabama")]
# Create multiclass one-hot encoder
one_hot_multiclass = MultiLabelBinarizer()
# One-hot encode multiclass feature
one_hot_multiclass.fit_transform(multiclass_feature)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

In [24]:
# View classes
one_hot_multiclass.classes_

array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'],
      dtype=object)

In [28]:
import pandas
from sklearn import linear_model
# Define a toy dataset of apartment rental prices in
# New York, San Francisco, and Seattle

df = pd.DataFrame({
 'City': ['SF', 'SF', 'SF', 'NYC', 'NYC', 'NYC',
 'Seattle', 'Seattle', 'Seattle'],
 'Rent': [3999, 4000, 4001, 3499, 3500, 3501, 2499, 2500, 2501]
 })
df['Rent'].mean()

3333.3333333333335

In [31]:
# Convert the categorical variables in the DataFrame to one-hot encoding
# and fit a linear regression model
one_hot_df = pd.get_dummies(df, prefix=['city'])
one_hot_df

Unnamed: 0,Rent,city_NYC,city_SF,city_Seattle
0,3999,0,1,0
1,4000,0,1,0
2,4001,0,1,0
3,3499,1,0,0
4,3500,1,0,0
5,3501,1,0,0
6,2499,0,0,1
7,2500,0,0,1
8,2501,0,0,1


In [36]:
from sklearn.linear_model import LinearRegression


In [38]:
model = linear_model.LinearRegression()

In [39]:
model.fit(one_hot_df[['city_NYC', 'city_SF', 'city_Seattle']],one_hot_df['Rent'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [40]:
model.coef_

array([ 166.66666667,  666.66666667, -833.33333333])

In [41]:
model.intercept_

3333.3333333333335

In [42]:
# Train a linear regression model on dummy code
# Specify the 'drop_first' flag to get dummy coding
dummy_df = pd.get_dummies(df, prefix=['city'], drop_first=True)
dummy_df

Unnamed: 0,Rent,city_SF,city_Seattle
0,3999,1,0
1,4000,1,0
2,4001,1,0
3,3499,0,0
4,3500,0,0
5,3501,0,0
6,2499,0,1
7,2500,0,1
8,2501,0,1


In [44]:
model.fit(dummy_df[['city_SF', 'city_Seattle']], dummy_df['Rent'])



LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [45]:
model.coef_

array([  500., -1000.])

In [46]:
model.intercept_

3500.0

In [52]:
# Linear regression with effect coding
effect_df = dummy_df.copy()
effect_df.loc[3:5, ['city_SF', 'city_Seattle']] = -1.0
effect_df


Unnamed: 0,Rent,city_SF,city_Seattle
0,3999,1.0,0.0
1,4000,1.0,0.0
2,4001,1.0,0.0
3,3499,-1.0,-1.0
4,3500,-1.0,-1.0
5,3501,-1.0,-1.0
6,2499,0.0,1.0
7,2500,0.0,1.0
8,2501,0.0,1.0


In [48]:
model.fit(effect_df[['city_SF', 'city_Seattle']], effect_df['Rent'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [49]:
model.coef_

array([ 666.66666667, -833.33333333])

In [50]:
model.intercept_

3333.3333333333335

In [54]:
## Encoding Ordinal Categorical Features

In [53]:
# Load library
import pandas as pd
# Create features
dataframe = pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium", "High"]})

In [55]:
# Create mapper
scale_mapper = {"Low":1,"Medium":2,"High":3}
# Replace feature values with scale
dataframe["Score"].replace(scale_mapper)

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

In [57]:
dataframe = pd.DataFrame({"Score": ["Low","Low","Medium","Medium","High","Barely More Than Medium"]})

In [58]:
scale_mapper = {"Low":1,"Medium":2,"Barely More Than Medium": 3,"High":4}

In [59]:
dataframe["Score"].replace(scale_mapper)

0    1
1    1
2    2
3    2
4    4
5    3
Name: Score, dtype: int64

In [60]:
#In this example, the distance between Low and Medium is the same as the distance
#between Medium and Barely More Than Medium, which is almost certainly not accurate.
#The best approach is to be conscious about the numerical values mapped to
#classes:
scale_mapper = {"Low":1,
"Medium":2,
"Barely More Than Medium": 2.1,
"High":3}
dataframe["Score"].replace(scale_mapper)

0    1.0
1    1.0
2    2.0
3    2.0
4    3.0
5    2.1
Name: Score, dtype: float64

In [61]:
#Problem
#Given a set of features, you want to reduce the number of features while retaining the
#variance in the data.
#Solution
#Use principal component analysis with scikit’s PCA:
# Load libraries
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import datasets
# Load the data
digits = datasets.load_digits()
# Standardize the feature matrix
features = StandardScaler().fit_transform(digits.data)
# Create a PCA that will retain 99% of variance
pca = PCA(n_components=0.99, whiten=True)
# Conduct PCA
features_pca = pca.fit_transform(features)
# Show results
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_pca.shape[1])

Original number of features: 64
Reduced number of features: 54


In [62]:
## Reducing Features by Maximizing Class Separability

In [63]:
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Load Iris flower dataset:
iris = datasets.load_iris()
features = iris.data
target = iris.target
# Create and run an LDA, then use it to transform the features
lda = LinearDiscriminantAnalysis(n_components=1)
features_lda = lda.fit(features, target).transform(features)
# Print the number of features
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_lda.shape[1])

Original number of features: 4
Reduced number of features: 1


In [64]:
lda.explained_variance_ratio_

array([0.99147248])

In [67]:
# Create and run LDA
lda = LinearDiscriminantAnalysis(n_components=None)
features_lda = lda.fit(features, target)
# Create array of explained variance ratios
lda_var_ratios = lda.explained_variance_ratio_
# Create function
def select_n_components(var_ratio, goal_var: float) -> int:
# Set initial variance explained so far
    total_variance = 0.0
# Set initial number of features
    n_components = 0
# For the explained variance of each feature:
    for explained_variance in var_ratio:
# Add the explained variance to the total
        total_variance += explained_variance
# Add one to the number of components
        n_components += 1
# If we reach our goal level of explained variance
        if total_variance >= goal_var:
# End the loop
           break
# Return the number of components
    return n_components
# Run function
select_n_components(lda_var_ratios, 0.95)

1

In [None]:
## Reducing Features Using Matrix Factorization

In [68]:
# Load libraries
from sklearn.decomposition import NMF
from sklearn import datasets
# Load the data
digits = datasets.load_digits()
# Load feature matrix
features = digits.data
# Create, fit, and apply NMF
nmf = NMF(n_components=10, random_state=1)
features_nmf = nmf.fit_transform(features)
# Show results
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_nmf.shape[1])

Original number of features: 64
Reduced number of features: 10


In [69]:
## Reducing Features on Sparse Data using Truncated Singular Value Decomposition (TSVD)

In [72]:
# Load libraries
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn import datasets
import numpy as np
# Load the data
digits = datasets.load_digits()
# Standardize feature matrix
features = StandardScaler().fit_transform(digits.data)
# Make sparse matrix
features_sparse = csr_matrix(features)
# Create a TSVD
tsvd = TruncatedSVD(n_components=10)
# Conduct TSVD on sparse matrix
features_sparse_tsvd = tsvd.fit(features_sparse).transform(features_sparse)
# Show results
print("Original number of features:", features_sparse.shape[1])
print("Reduced number of features:", features_sparse_tsvd.shape[1])

Original number of features: 64
Reduced number of features: 10


In [73]:
# Sum of first three components' explained variance ratios
tsvd.explained_variance_ratio_[0:3].sum()

0.3003938533399548

In [74]:
# Create and run an TSVD with one less than number of features
tsvd = TruncatedSVD(n_components=features_sparse.shape[1]-1)
features_tsvd = tsvd.fit(features)
# List of explained variances
tsvd_var_ratios = tsvd.explained_variance_ratio_
# Create a function
def select_n_components(var_ratio, goal_var):
    # Set initial variance explained so far
    total_variance = 0.0
# Set initial number of features
    n_components = 0
# For the explained variance of each feature:
    for explained_variance in var_ratio:
# Add the explained variance to the total
        total_variance += explained_variance
# Add one to the number of components
        n_components += 1
# If we reach our goal level of explained variance
        if total_variance >= goal_var:
# End the loop
            break
# Return the number of components
    return n_components
# Run function
select_n_components(tsvd_var_ratios, 0.95)

40

In [None]:
## Thresholding Numerical Feature Variance

In [75]:
#Load libraries
from sklearn import datasets
from sklearn.feature_selection import VarianceThreshold
# import some data to play with
iris = datasets.load_iris()
# Create features and target
features = iris.data
target = iris.target
# Create thresholder
thresholder = VarianceThreshold(threshold=.5)
# Create high variance feature matrix
features_high_variance = thresholder.fit_transform(features)
# View high variance feature matrix
features_high_variance[0:3]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2]])

In [76]:
# View variances
thresholder.fit(features).variances_

array([0.68112222, 0.18675067, 3.09242489, 0.57853156])

In [78]:
#Finally, if the features have been standardized (to mean zero and unit variance), then
#for obvious reasons variance thresholding will not work correctly:
# Load library
from sklearn.preprocessing import StandardScaler
# Standardize feature matrix
scaler = StandardScaler()
features_std = scaler.fit_transform(features)
# Caculate variance of each feature
selector = VarianceThreshold()
selector.fit(features_std).variances_


array([1., 1., 1., 1.])

In [None]:
## Thresholding Binary Feature Variance

In [79]:
# Load library
from sklearn.feature_selection import VarianceThreshold
# Create feature matrix with:
# Feature 0: 80% class 0
# Feature 1: 80% class 1
# Feature 2: 60% class 0, 40% class 1
features = [[0, 1, 0],
[0, 1, 1],
[0, 1, 0],
[0, 1, 1],
[1, 0, 0]]
# Run threshold by variance
thresholder = VarianceThreshold(threshold=(.75 * (1 - .75)))
thresholder.fit_transform(features)

array([[0],
       [1],
       [0],
       [1],
       [0]])

In [None]:
## Handling Highly Correlated Features

In [80]:
# Load libraries
import pandas as pd
import numpy as np
# Create feature matrix with two highly correlated features
features = np.array([[1, 1, 1],
[2, 2, 0],
[3, 3, 1],
[4, 4, 0],
[5, 5, 1],
[6, 6, 0],
[7, 7, 1],
[8, 7, 0],
[9, 7, 1]])
# Convert feature matrix into DataFrame
dataframe = pd.DataFrame(features)
# Create correlation matrix
corr_matrix = dataframe.corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape),
k=1).astype(np.bool))
# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
# Drop features
dataframe.drop(dataframe.columns[to_drop], axis=1).head(3)

Unnamed: 0,0,2
0,1,1
1,2,0
2,3,1


In [81]:
# Correlation matrix
dataframe.corr()

Unnamed: 0,0,1,2
0,1.0,0.976103,0.0
1,0.976103,1.0,-0.034503
2,0.0,-0.034503,1.0


In [82]:
# Upper triangle of correlation matrix
upper

Unnamed: 0,0,1,2
0,,0.976103,0.0
1,,,0.034503
2,,,


In [None]:
## Removing Irrelevant Features for Classification

In [83]:
#Load libraries
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
# Load data
iris = load_iris()
features = iris.data
target = iris.target
# Convert to categorical data by converting data to integers
features = features.astype(int)
# Select two features with highest chi-squared statistics
chi2_selector = SelectKBest(chi2, k=2)
features_kbest = chi2_selector.fit_transform(features, target)
# Show results
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])


Original number of features: 4
Reduced number of features: 2


In [84]:
#If the features are quantitative, compute the ANOVA F-value between each feature and the target vector:
# Select two features with highest F-values
fvalue_selector = SelectKBest(f_classif, k=2)
features_kbest = fvalue_selector.fit_transform(features, target)

In [85]:
# Show results
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 2


In [88]:
# Instead of selecting a specific number of features, we can also use 
# SelectPercentile 
# to select the top n percent of features:
# Load library
from sklearn.feature_selection import SelectPercentile
# Select top 75% of features with highest F-values
fvalue_selector = SelectPercentile(f_classif, percentile=75)
features_kbest = fvalue_selector.fit_transform(features, target)
# Show results
print("Original number of features:", features.shape[1])
print("Reduced number of features:", features_kbest.shape[1])

Original number of features: 4
Reduced number of features: 3


  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [None]:
## Recursively Eliminating Features

In [90]:
# Load libraries
import warnings
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn import datasets, linear_model
# Suppress an annoying but harmless warning
warnings.filterwarnings(action="ignore", module="scipy",
message="^internal gelsd")
# Generate features matrix, target vector, and the true coefficients
features, target = make_regression(n_samples = 10000,
n_features = 100,
n_informative = 2,
random_state = 1)
# Create a linear regression
ols = linear_model.LinearRegression()
# Recursively eliminate features
rfecv = RFECV(estimator=ols, step=1, scoring="neg_mean_squared_error")
rfecv.fit(features, target)
rfecv.transform(features)

array([[ 0.00850799,  0.7031277 , -0.34606121],
       [-1.07500204,  2.56148527, -1.8392567 ],
       [ 1.37940721, -1.77039484, -0.90016708],
       ...,
       [-0.80331656, -1.60648007, -1.28329706],
       [ 0.39508844, -1.34564911,  0.85012142],
       [-0.55383035,  0.82880112,  0.27741159]])

In [91]:
# Number of best features
rfecv.n_features_

3

In [92]:
#We can also see which of those features we should keep:
# Which categories are best
rfecv.support_

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False])

In [93]:
#We can even view the rankings of the features:
# Rank features best (1) to worst
rfecv.ranking_

array([66, 91, 78, 67, 41,  1, 73, 57, 15, 90, 23, 18, 33, 45, 27, 92, 12,
       36,  4, 52, 13, 26, 61, 63, 17,  7, 55, 94,  5, 34, 64, 20, 60, 29,
        8, 10, 46, 81, 84,  1, 40, 75, 50, 98, 47, 39, 28, 22, 96, 21, 79,
       72, 53, 11, 82, 19, 65, 51, 32, 74, 68, 76,  2, 71, 88, 95,  6, 24,
       42, 70, 48, 80, 62, 58, 85, 56, 38, 44,  1,  3, 30, 25,  9, 97, 16,
       59, 49, 69, 77, 86, 87, 89, 37, 43, 35, 31, 54, 14, 83, 93])