# Top 10 functions from Feature Engine package
[Documentation](https://feature-engine.trainindata.com/en/latest/api_doc/index.html)

In [10]:
!pip install feature_engine --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/378.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/378.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m368.6/378.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m378.6/378.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
# Initial Imports
import pandas as pd
import seaborn as sns

In [92]:
# Load data
df = sns.load_dataset('attention', index_col=[0])
df.head(3)

# X and y
X = df.drop('score', axis=1)
y = df['score']

## Mean Encoder
Instead of creating hundreds of columns with one-hot encoding, MeanEncoder replaces categories with the mean of the target variable.

In [86]:
from feature_engine.encoding import MeanEncoder

e = MeanEncoder(variables=['attention'])
X_encoded = e.fit_transform(X, y)

In [16]:
X_encoded.sample(3)

Unnamed: 0,subject,attention,solutions
4,5,5.116667,1
52,13,6.8,3
9,10,5.116667,1


## Rare Label Encoder
Sometimes categories appear only a handful of times. Instead of letting these rare categories confuse your model, RareLabelEncoder groups them into a single "Rare" label.

In [65]:
from feature_engine.encoding import RareLabelEncoder

# Add line at the end of the df [1, 'not-focused', 0, 0]
df.loc[len(df)] = [1, 'not-focused', 0, 0]
# X and y
X = df.drop('score', axis=1)
y = df['score']


r = RareLabelEncoder(tol=0.05, n_categories=2)
X_encoded = r.fit_transform(X)
X_encoded

Unnamed: 0,subject,attention,solutions
0,1,divided,1
1,2,divided,1
2,3,divided,1
3,4,divided,1
4,5,divided,1
...,...,...,...
56,17,focused,3
57,18,focused,3
58,19,focused,3
59,20,focused,3


## OrdinalEncoder
When categories have a natural order, OrdinalEncoder comes to the rescue. For example, education levels: high school < bachelor < master < PhD.

In [67]:
from feature_engine.encoding import OrdinalEncoder

o = OrdinalEncoder(encoding_method='arbitrary') #you can also use "ordered"
X_encoded = o.fit_transform(X)
X_encoded

Unnamed: 0,subject,attention,solutions
0,1,0,1
1,2,0,1
2,3,0,1
3,4,0,1
4,5,0,1
...,...,...,...
56,17,1,3
57,18,1,3
58,19,1,3
59,20,1,3


## DecisionTreeEncoder
DecisionTreeEncoder uses decision trees to find smart ways of encoding categories. It basically asks: "How do categories split the target variable?"

In [68]:
from feature_engine.encoding import DecisionTreeEncoder

dt = DecisionTreeEncoder(random_state=42)
X_encoded = dt.fit_transform(X, y)
X_encoded

Unnamed: 0,subject,attention,solutions
0,1,5.116667,1
1,2,5.116667,1
2,3,5.116667,1
3,4,5.116667,1
4,5,5.116667,1
...,...,...,...
56,17,6.800000,3
57,18,6.800000,3
58,19,6.800000,3
59,20,6.800000,3


## MeanMedianImputer
Missing data is everywhere. MeanMedianImputer fills numeric gaps using either the mean or median.

In [69]:
from feature_engine.imputation import MeanMedianImputer
import numpy as np

# Add line at the end of the df [1, 'not-focused', NA, 0]
df.loc[len(df)] = [1, 'not-focused', np.nan, 0]
# X and y
X = df.drop('score', axis=1)
y = df['score']

imp = MeanMedianImputer(imputation_method='median')
X_imputed = imp.fit_transform(X)
X_imputed.tail()

Unnamed: 0,subject,attention,solutions
57,18,focused,3.0
58,19,focused,3.0
59,20,focused,3.0
60,1,not-focused,0.0
61,1,not-focused,2.0


## ArbitraryNumberImputer
Sometimes you want missing values to stand out. ArbitraryNumberImputer lets you fill them with a specific number, like -999.

In [74]:
from feature_engine.imputation import ArbitraryNumberImputer

# Add line at the end of the df [1, 'not-focused', NA, 0]
df.loc[len(df)] = [1, 'not-focused', np.nan, 0]
# X and y
X = df.drop('score', axis=1)
y = df['score']

imp = ArbitraryNumberImputer(arbitrary_number=-999)
X_imputed = imp.fit_transform(X)
X_imputed.tail()

Unnamed: 0,subject,attention,solutions
56,17,focused,3.0
57,18,focused,3.0
58,19,focused,3.0
59,20,focused,3.0
60,1,not-focused,-999.0


## MissingIndicator
Want to flag missing values instead of filling them? MissingIndicator creates binary columns that signal whether a value was missing.

In [75]:
from feature_engine.imputation import AddMissingIndicator

# Add line at the end of the df [1, 'not-focused', NA, 0]
df.loc[len(df)] = [1, 'not-focused', np.nan, 0]
# X and y
X = df.drop('score', axis=1)
y = df['score']


mi = AddMissingIndicator()
X_with_flags = mi.fit_transform(X)

In [76]:
X_with_flags

Unnamed: 0,subject,attention,solutions,solutions_na
0,1,divided,1.0,0
1,2,divided,1.0,0
2,3,divided,1.0,0
3,4,divided,1.0,0
4,5,divided,1.0,0
...,...,...,...,...
57,18,focused,3.0,0
58,19,focused,3.0,0
59,20,focused,3.0,0
60,1,not-focused,,1


## LogTransformer
Many datasets have skewed distributions. LogTransformer applies a logarithm to bring things closer to normal.

In [88]:
from feature_engine.transformation import LogTransformer

# X and y
X = df.drop('score', axis=1)
y = df['score']

lt = LogTransformer(variables=['solutions'])
X_transformed = lt.fit_transform(X)
X_transformed.sample(3)

Unnamed: 0,subject,attention,solutions
37,18,focused,0.693147
5,6,divided,0.0
15,16,focused,0.0


## PowerTransformer
When a simple log isn't enough, PowerTransformer offers Box-Cox and Yeo-Johnson transformations.


In [90]:
from feature_engine.transformation import PowerTransformer

pt = PowerTransformer()
X_transformed = pt.fit_transform(X)
X_transformed.sample(3)


Unnamed: 0,subject,attention,solutions
41,1.414214,divided,1.732051
55,4.0,focused,1.732051
56,4.123106,focused,1.732051


## Winsorizer
Outliers can ruin your model's day. Winsorizer caps extreme values to reduce their impact.

In [101]:
from feature_engine.outliers import Winsorizer

# Add line at the end of the df [100, 'winsorize this', 0, 0]
df.loc[len(df)] = [100, 'winsorize this', 0, 0]
# X and y
X = df.drop('score', axis=1)
y = df['score']

w = Winsorizer(capping_method='iqr')
X_winsorized = w.fit_transform(X)

In [102]:
X_winsorized.tail()

Unnamed: 0,subject,attention,solutions
58,19,focused,3
59,20,focused,3
60,31,winsorize this,0
61,31,winsorize this,0
62,31,winsorize this,0
