In [1]:
# %pip install feature-engine
# %pip install category_encoders
# %pip install --upgrade scikit-learn==1.5

# Target Encoder: A powerful categorical encoder for your feature engineering toolkit

# Implementing Target Encoding in Python

In [2]:
import sklearn
sklearn.__version__

'1.5.0'

In [3]:
import feature_engine
feature_engine.__version__

'1.8.0'

In [4]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import TargetEncoder
from feature_engine.encoding import MeanEncoder
from category_encoders import TargetEncoder as CE_TargetEncoder

# Make scikit-learn transformers return pandas DataFrames instead of NumPy arrays
from sklearn import set_config
set_config(transform_output='pandas')

In [5]:
def show_fit_attributes(transformer):
    """Return the names of the fitted attributes exposed by ``transformer``.

    Following the scikit-learn naming convention, an attribute is treated as
    "fitted" when its name ends with an underscore and does not start with one.
    Names are collected from ``dir(transformer)`` in the order it yields them.
    """
    fitted_names = []
    for name in dir(transformer):
        is_private = name.startswith('_')
        if not is_private and name.endswith('_'):
            fitted_names.append(name)
    return fitted_names

## Continuous target

In [6]:
# Load dataset
X,y = datasets.fetch_openml(name='employee_salaries', return_X_y=True, as_frame=True)

# Filter target categorical columns
target_categorical_features = ['gender', 'department', 'division', 'assignment_category']
X = X[target_categorical_features].astype(str)

# Split data set into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display dataset
X.join(y)

Unnamed: 0,gender,department,division,assignment_category,current_annual_salary
0,F,POL,MSB Information Mgmt and Tech Division Records...,Fulltime-Regular,69222.18
1,M,POL,ISB Major Crimes Division Fugitive Section,Fulltime-Regular,97392.47
2,F,HHS,Adult Protective and Case Management Services,Fulltime-Regular,104717.28
3,M,COR,PRRS Facility and Security,Fulltime-Regular,52734.57
4,M,HCA,Affordable Housing Programs,Fulltime-Regular,93396.00
...,...,...,...,...,...
9223,F,HHS,School Based Health Centers,Fulltime-Regular,72094.53
9224,F,FRS,Human Resources Division,Fulltime-Regular,169543.85
9225,M,HHS,Child and Adolescent Mental Health Clinic Serv...,Parttime-Regular,102736.52
9226,M,CCL,Council Central Staff,Fulltime-Regular,153747.50


### Scikit-learn

In [7]:
X_train.head()

Unnamed: 0,gender,department,division,assignment_category
5821,M,DOT,Transit Silver Spring Ride On,Fulltime-Regular
1323,M,COR,DS MCCF Unit 2 Security,Fulltime-Regular
5563,M,FRS,Recruit Training,Fulltime-Regular
4840,F,HHS,Child Welfare Services,Fulltime-Regular
4514,M,DGS,Facilities Maintenance,Fulltime-Regular


In [8]:
# Instantiate encoder
encoder = TargetEncoder(categories='auto', target_type='continuous', smooth='auto', cv=5, random_state=42)

# Fit train & transform the training set
# NOTE: scikit-learn documents that fit_transform uses cross fitting (cv=5 folds here),
# so these encodings are expected to differ from calling transform on the same rows
# after fitting — compare with the output of the following cell.
encoder.fit_transform(X_train, y_train).head()

Unnamed: 0,gender,department,division,assignment_category
5821,74640.410204,59288.367294,49885.254697,76911.614947
1323,75069.86267,73915.116308,69796.057282,77137.069255
5563,75069.86267,78503.534748,45644.20485,77137.069255
4840,70998.162571,74744.382454,74635.970939,76846.68785
4514,74495.572368,73439.814758,63980.844563,76846.68785


In [9]:
# Transform training data
encoder.transform(X_train).head()

Unnamed: 0,gender,department,division,assignment_category
5821,74995.001783,59325.915073,50273.168664,77139.986577
1323,74995.001783,73596.926834,67742.0973,77139.986577
5563,74995.001783,79124.649869,45589.801377,77139.986577
4840,71008.836746,74254.10207,73334.526543,77139.986577
4514,74995.001783,74308.919016,65006.962776,77139.986577


In [10]:
# Show learned attributes
print(show_fit_attributes(encoder))

['categories_', 'classes_', 'encodings_', 'feature_names_in_', 'infrequent_categories_', 'n_features_in_', 'target_mean_', 'target_type_']


In [11]:
# Pair every category of the feature at index 3 with its learned encoding
dict(zip(encoder.categories_[3], encoder.encodings_[3]))

{'Fulltime-Regular': 77139.98657661081, 'Parttime-Regular': 35094.57315951283}

### Feature-engine

In [12]:
# Instantiate encoder
encoder = MeanEncoder(smoothing='auto')

# Fit train data
encoder.fit(X_train, y_train)

# Transform test data
# NOTE(review): the recorded output contains missing values in `division`; presumably
# these are categories that never occurred in X_train and are left unencoded by
# MeanEncoder's default handling of unseen categories — confirm against the docs.
encoder.transform(X_test)



Unnamed: 0,gender,department,division,assignment_category
1183,74995.001783,50292.908573,50890.388846,35094.573160
1038,71008.836746,71694.251078,61235.477028,77139.986577
9217,71008.836746,73596.926834,89959.390651,77139.986577
908,71008.836746,98046.001244,,77139.986577
8492,74995.001783,74254.102070,68943.425281,77139.986577
...,...,...,...,...
1530,74995.001783,79124.649869,77579.344869,77139.986577
5961,71008.836746,74254.102070,71450.691565,77139.986577
7807,74995.001783,59325.915073,91205.478328,77139.986577
7826,74995.001783,74254.102070,83709.145928,35094.573160


In [13]:
# Show learned attributes
print(show_fit_attributes(encoder))

['encoder_dict_', 'feature_names_in_', 'n_features_in_', 'variables_']


### Category_encoders

In [14]:
# Instantiate encoder
encoder = CE_TargetEncoder(smoothing=10)

# Fit train data
encoder.fit(X_train, y_train)

# Transform test data
encoder.transform(X_test)

Unnamed: 0,gender,department,division,assignment_category
1183,74995.355410,50246.415333,68433.377330,35077.198825
1038,71007.995978,71693.319079,67105.884602,77140.482458
9217,71007.995978,73597.247404,75923.058598,77140.482458
908,71007.995978,88707.018113,73356.952204,77140.482458
8492,74995.355410,74254.684452,68925.794058,77140.482458
...,...,...,...,...
1530,74995.355410,79128.717290,76397.913822,77140.482458
5961,71007.995978,74254.684452,72853.197705,77140.482458
7807,74995.355410,59316.575117,77905.955228,77140.482458
7826,74995.355410,74254.684452,81633.421189,35077.198825


In [15]:
# Show learned attributes
show_fit_attributes(encoder)

['feature_names_in_', 'feature_names_out_', 'n_features_in_']

## Binary target

In [16]:
# Load dataset
X,y = datasets.fetch_openml(name='churn', version=1, return_X_y=True, as_frame=True)

# Filter target categorical columns
target_categorical_features = ['state', 'area_code', 'phone_number', 'international_plan', 'voice_mail_plan']
X = X[target_categorical_features].astype(str)
y = y.astype(int)

# Split data set into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display dataset
X.join(y)

Unnamed: 0,state,area_code,phone_number,international_plan,voice_mail_plan,class
0,16,415,2845,0,1,0
1,35,415,2301,0,1,0
2,31,415,1616,0,0,0
3,35,408,2510,1,0,0
4,36,415,155,1,0,0
...,...,...,...,...,...,...
4995,11,408,2000,0,1,0
4996,49,415,394,0,0,1
4997,7,415,313,0,0,0
4998,7,510,3471,0,0,0


### Scikit-learn

In [17]:
# Instantiate encoder
encoder = TargetEncoder(categories='auto', target_type='binary', smooth='auto', cv=5, random_state=42)

# Fit train data
encoder.fit(X_train, y_train)

# Transform test data
encoder.transform(X_test)

Unnamed: 0,state,area_code,phone_number,international_plan,voice_mail_plan
1501,0.086227,0.143998,0.142,0.113224,0.164094
2586,0.155202,0.141787,0.142,0.113224,0.164094
2653,0.146983,0.140423,0.142,0.113224,0.081895
1055,0.181273,0.141787,0.142,0.113224,0.164094
705,0.061178,0.141787,0.142,0.113224,0.164094
...,...,...,...,...,...
4711,0.205309,0.141787,0.142,0.430523,0.164094
2313,0.208755,0.141787,0.142,0.113224,0.164094
3214,0.146983,0.143998,0.142,0.430523,0.164094
2732,0.130589,0.140423,0.142,0.430523,0.164094


### Feature-engine

In [18]:
# Instantiate encoder
encoder = MeanEncoder(smoothing='auto')

# Fit train data
encoder.fit(X_train, y_train)

# Transform test data
# NOTE(review): in the recorded output the whole `phone_number` column is missing;
# presumably every test value is a category unseen during fit and is left unencoded
# by MeanEncoder's default handling of unseen categories — confirm against the docs.
encoder.transform(X_test)



Unnamed: 0,state,area_code,phone_number,international_plan,voice_mail_plan
1501,0.086227,0.143998,,0.113224,0.164094
2586,0.155202,0.141787,,0.113224,0.164094
2653,0.146983,0.140423,,0.113224,0.081895
1055,0.181273,0.141787,,0.113224,0.164094
705,0.061178,0.141787,,0.113224,0.164094
...,...,...,...,...,...
4711,0.205309,0.141787,,0.430523,0.164094
2313,0.208755,0.141787,,0.113224,0.164094
3214,0.146983,0.143998,,0.430523,0.164094
2732,0.130589,0.140423,,0.430523,0.164094


### Category_encoders

In [19]:
# Instantiate encoder
encoder = CE_TargetEncoder(smoothing=10)

# Fit train data
encoder.fit(X_train, y_train)

# Transform test data
encoder.transform(X_test)

Unnamed: 0,state,area_code,phone_number,international_plan,voice_mail_plan
1501,0.086091,0.144000,0.142,0.113218,0.164103
2586,0.155336,0.141787,0.142,0.113218,0.164103
2653,0.147018,0.140421,0.142,0.113218,0.081860
1055,0.181774,0.141787,0.142,0.113218,0.164103
705,0.061416,0.141787,0.142,0.113218,0.164103
...,...,...,...,...,...
4711,0.206157,0.141787,0.142,0.432133,0.164103
2313,0.209725,0.141787,0.142,0.113218,0.164103
3214,0.147018,0.144000,0.142,0.432133,0.164103
2732,0.130520,0.140421,0.142,0.432133,0.164103


In [20]:
# Left disabled on purpose: uncomment to inspect the fitted encoder's `mapping` attribute
# encoder.mapping

## Multiclass

In [21]:
# Load dataset (features X and target y as pandas objects)
X,y = datasets.fetch_openml(name='eucalyptus', version=1, return_X_y=True, as_frame=True)

# Keep only the categorical columns to encode, cast to str
target_categorical_features = ['Abbrev', 'Locality', 'Map_Ref']
X = X[target_categorical_features].astype(str)

# Split data set into train and test data (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display dataset
X.join(y)

Unnamed: 0,Abbrev,Locality,Map_Ref,Utility
0,Cra,Central_Hawkes_Bay,N135_382/137,good
1,Cra,Central_Hawkes_Bay,N135_382/137,best
2,Cra,Central_Hawkes_Bay,N135_382/137,low
3,Cra,Central_Hawkes_Bay,N135_382/137,good
4,Cra,Central_Hawkes_Bay,N135_382/137,good
...,...,...,...,...
731,WSh,Southern_Hawkes_Bay,N151_922/226,average
732,WSh,Southern_Hawkes_Bay,N151_922/226,good
733,WSh,Southern_Hawkes_Bay,N151_922/226,good
734,WSh,Southern_Hawkes_Bay,N151_922/226,good


### Scikit-learn

In [22]:
# Instantiate encoder
encoder = TargetEncoder(categories='auto', target_type='multiclass', smooth='auto', cv=5, random_state=42)

# Fit train data
encoder.fit(X_train, y_train)

# Transform test data
encoder.transform(X_test)

Unnamed: 0,Abbrev_average,Abbrev_best,Abbrev_good,Abbrev_low,Abbrev_none,Locality_average,Locality_best,Locality_good,Locality_low,Locality_none,Map_Ref_average,Map_Ref_best,Map_Ref_good,Map_Ref_low,Map_Ref_none
669,0.000000,0.000000,0.000000,0.000000,1.000000,0.090395,0.077402,0.154916,0.025947,0.647583,0.000000,0.000000,0.000000,0.000000,1.000000
33,0.057999,0.266275,0.434447,0.216451,0.000000,0.057999,0.266275,0.434447,0.216451,0.000000,0.057999,0.266275,0.434447,0.216451,0.000000
549,0.296651,0.039369,0.272889,0.117164,0.272370,0.198772,0.074079,0.306694,0.147699,0.272570,0.296651,0.039369,0.272889,0.117164,0.272370
199,0.130372,0.238036,0.259712,0.238108,0.130930,0.247844,0.169717,0.183496,0.247325,0.150757,0.257803,0.172100,0.207439,0.231642,0.129920
264,0.102533,0.118968,0.221259,0.168982,0.386788,0.198772,0.074079,0.306694,0.147699,0.272570,0.102533,0.118968,0.221259,0.168982,0.386788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,0.155713,0.079692,0.516814,0.079739,0.158590,0.285768,0.068439,0.520560,0.017239,0.102893,0.155713,0.079692,0.516814,0.079739,0.158590
329,0.320908,0.070645,0.505374,0.000000,0.094628,0.285768,0.068439,0.520560,0.017239,0.102893,0.320908,0.070645,0.505374,0.000000,0.094628
599,0.117336,0.251774,0.505374,0.070657,0.047616,0.117336,0.251774,0.505374,0.070657,0.047616,0.117336,0.251774,0.505374,0.070657,0.047616
259,0.102533,0.118968,0.221259,0.168982,0.386788,0.198772,0.074079,0.306694,0.147699,0.272570,0.102533,0.118968,0.221259,0.168982,0.386788


# Showcasing a pipeline with mixed data using TargetEncoder

In [23]:
import time
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, TargetEncoder
from feature_engine.encoding import MeanEncoder
from category_encoders import TargetEncoder as CE_TargetEncoder

In [24]:
def clf_metrics(y_true, y_pred, y_proba=None):
    """
    Calculate various performance metrics for a classification model.

    Recall, precision and F1 are computed with the default settings of the
    corresponding ``sklearn.metrics`` functions.

    Args:
    y_true (array-like): True labels.
    y_pred (array-like): Predicted labels.
    y_proba (array-like, optional): Predicted probabilities for the positive class.
        When omitted (None), ROC_AUC is not computed.

    Returns:
    dict: Metric name -> value with the keys 'Accuracy', 'Balanced Accuracy',
    'Recall', 'Precision' and 'F1', plus 'ROC_AUC' only when ``y_proba`` is given.
    """
    dict_metrics = {
        'Accuracy': metrics.accuracy_score(y_true, y_pred),
        'Balanced Accuracy': metrics.balanced_accuracy_score(y_true, y_pred),
        'Recall': metrics.recall_score(y_true, y_pred),
        # Key spelling fixed (was 'Precison'); it becomes a column header downstream
        'Precision': metrics.precision_score(y_true, y_pred),
        'F1': metrics.f1_score(y_true, y_pred),
    }

    # ROC AUC needs scores rather than hard labels, so it is optional
    if y_proba is not None:
        dict_metrics['ROC_AUC'] = metrics.roc_auc_score(y_true, y_proba)

    return dict_metrics

In [25]:
# Fetch the adult census dataset (OpenML version 4) as pandas objects
X, y = datasets.fetch_openml(name='adult', version=4, as_frame=True, return_X_y=True)

# Feature lists ('education-num' is deliberately left out because it's redundant)
NUMERIC_FEATURES = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
CATEGORICAL_FEATURES = list(X.select_dtypes(exclude='number').columns)

# Cast category -> str, then turn the resulting literal 'nan' strings back into
# real missing values
X[CATEGORICAL_FEATURES] = X[CATEGORICAL_FEATURES].astype(str).replace('nan', np.nan)

# Rule-based binary target: 1 when the label is '>50K', 0 otherwise
y = (y == '>50K').astype(int)

# Hold out 20% of the rows as the test set (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show training data
X_train.join(y_train).head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass,education,marital-status,occupation,relationship,race,sex,native-country,class
37193,42,143582.0,4,0.0,0.0,48,Private,7th-8th,Married-civ-spouse,Other-service,Other-relative,Asian-Pac-Islander,Female,,0
31093,23,227471.0,9,0.0,0.0,24,Private,HS-grad,Never-married,Handlers-cleaners,Own-child,White,Male,United-States,0
33814,38,111128.0,12,0.0,0.0,40,Private,Assoc-acdm,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,1
14500,30,319280.0,9,0.0,0.0,40,Local-gov,HS-grad,Married-civ-spouse,Protective-serv,Husband,White,Male,United-States,1
23399,43,190786.0,9,0.0,0.0,40,Private,HS-grad,Married-civ-spouse,Other-service,Husband,White,Male,United-States,1


In [26]:
# Per-column missing values: absolute count and share of all rows,
# keeping only the columns that actually have gaps (largest share first)
(
    X.isna().sum()
    .rename('missing_count')
    .to_frame()
    .assign(missing_pct=lambda counts: counts['missing_count'] / len(X))
    .sort_values(by='missing_pct', ascending=False)
    .loc[lambda counts: counts['missing_count'] > 0]
    .style.format('{:.2%}', subset=['missing_pct'])
)

Unnamed: 0,missing_count,missing_pct
occupation,2809,5.75%
workclass,2799,5.73%
native-country,857,1.75%


In [27]:
# Set the numeric processor
numeric_processor = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Set the categorical processor for categorical values
categorical_processor = TargetEncoder(categories='auto', target_type='binary', smooth='auto', cv=5, random_state=42)

# Set an overall preprocessor
preprocessor = ColumnTransformer([
    ('numeric', numeric_processor, NUMERIC_FEATURES),
    ('categorical', categorical_processor, CATEGORICAL_FEATURES)
])

# Create the final model pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(class_weight='balanced', max_iter=500, random_state=42))
])

# Fit the pipeline
model_pipeline.fit(X_train, y_train)

In [28]:
# Compute metrics
y_pred = model_pipeline.predict(X_test)
y_proba = model_pipeline.predict_proba(X_test)[:, 1]
clf_metrics(y_test, y_pred, y_proba)

{'Accuracy': 0.7975227761285699,
 'Balanced Accuracy': 0.8114646914388979,
 'Recall': 0.838655462184874,
 'Precison': 0.5559888579387187,
 'F1': 0.668676716917923,
 'ROC_AUC': 0.8961481181997769}

In [29]:
# List all encoders we're going to evaluate
categorical_processors = dict(
    onehot=OneHotEncoder(handle_unknown='ignore', sparse_output=False),
    ordinal=make_pipeline(
        SimpleImputer(strategy='constant', fill_value='Missing'),
        OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
    ),
    target_sklearn=TargetEncoder(categories='auto', target_type='binary', smooth='auto', cv=5, random_state=42),
    target_fe=make_pipeline(
        SimpleImputer(strategy='constant', fill_value='Missing'),
        # unseen='encode' maps categories absent from the training data to the prior
        # (overall target mean) instead of NaN, which LogisticRegression cannot handle
        MeanEncoder(smoothing='auto', unseen='encode'),
    ),
    target_ce=CE_TargetEncoder(smoothing=10)
)

# Evaluating the performance of each encoder
results = []

for encoder_name, encoder in categorical_processors.items():
    # Swap the categorical step of the shared pipeline for the encoder under test
    model_pipeline.set_params(preprocessor__categorical=encoder)

    # Measure the fitting time with a monotonic, high-resolution clock
    # (time.time() is a wall clock and can jump if the system time is adjusted)
    start_time = time.perf_counter()
    model_pipeline.fit(X_train, y_train)
    fit_time = time.perf_counter() - start_time

    # Compute metrics on the held-out test set
    y_pred = model_pipeline.predict(X_test)
    y_proba = model_pipeline.predict_proba(X_test)[:, 1]
    model_metrics = clf_metrics(y_test, y_pred, y_proba)
    model_metrics['encoder'] = encoder_name     # identify the encoder
    model_metrics['fit_time'] = fit_time        # fit time in seconds

    # Append metrics to list
    results.append(model_metrics)

# Convert list of results into a dataframe (one row per encoder)
df_metrics = pd.DataFrame(results).set_index('encoder').round(4)

# Display the results
display(df_metrics)

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,Recall,Precison,F1,ROC_AUC,fit_time
encoder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
onehot,0.8043,0.8186,0.8466,0.5657,0.6782,0.9023,3.7019
ordinal,0.7319,0.7373,0.7479,0.4685,0.5761,0.8148,2.8261
target_sklearn,0.7975,0.8115,0.8387,0.556,0.6687,0.8961,1.4781
target_fe,0.7979,0.8115,0.8378,0.5567,0.6689,0.8961,2.0669
target_ce,0.7979,0.8113,0.8374,0.5567,0.6688,0.8961,2.8551
