**Author:** Cainã Max Couto da Silva  
**LinkedIn:** @cmcouto-silva

In [None]:
%pip install scikit-learn==1.2.2
%pip install feature-engine==1.6.1

Collecting feature-engine==1.6.1
  Downloading feature_engine-1.6.1-py2.py3-none-any.whl (326 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m326.6/326.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature-engine
Successfully installed feature-engine-1.6.1


# **Settings**

Wrap output text on Colab:

In [None]:
from IPython.display import HTML, display

def set_css(*_args, **_kwargs):
  """Inject a <style> tag so that <pre> output wraps instead of overflowing.

  Intended to be used as an IPython ``pre_run_cell`` callback (registered
  right below), so the style is re-emitted before every cell runs.

  IPython may call ``pre_run_cell`` callbacks with an execution-info object;
  any positional/keyword arguments are accepted and ignored so the callback
  works whether or not IPython passes one.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

## **Libraries**

In [None]:
import pickle
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# For displaying pipelines
from sklearn import set_config
set_config(display='diagram')
set_config(transform_output="pandas")

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler

## **Data**

In [None]:
# Create simulated data set

df_train = pd.DataFrame({
    'tool_id': [1,2,3,4,5],
    'temperature': [180,100,120,np.nan,90],
    'pressure': [13000,5000,11000,4500,np.nan],
    'due_maintenance': ['Yes', 'No', 'Yes', 'Yes', 'No'],
    'age_status': ['old','new','old','old','new'],
    'failed':[True,False,True,False,False]
}).set_index('tool_id')

df_test = pd.DataFrame({
    'tool_id': [6,7,8],
    'temperature': [85,110,np.nan],
    'pressure': [6000,10500,3300],
    'due_maintenance': ['Yes', 'Yes', 'No'],
    'age_status': ['new', 'old','ancient'],
    'failed':[False,True,False]
}).set_index('tool_id')

df_future_unique = pd.DataFrame({
    'tool_id': [10],
    'temperature': [12],
    'pressure': [7500],
    'due_maintenance': ['No'],
    'age_status': ['new'],
}).set_index('tool_id')

print('Train data')
display(df_train)
print()

print('Test data')
display(df_test)
print()

print('Future data')
display(df_future_unique)

Train data


Unnamed: 0_level_0,temperature,pressure,due_maintenance,age_status,failed
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,180.0,13000.0,Yes,old,True
2,100.0,5000.0,No,new,False
3,120.0,11000.0,Yes,old,True
4,,4500.0,Yes,old,False
5,90.0,,No,new,False



Test data


Unnamed: 0_level_0,temperature,pressure,due_maintenance,age_status,failed
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6,85.0,6000,Yes,new,False
7,110.0,10500,Yes,old,True
8,,3300,No,ancient,False



Future data


Unnamed: 0_level_0,temperature,pressure,due_maintenance,age_status
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,12,7500,No,new


In [None]:
NUMERICAL_FEATURES = [
    'temperature',
    'pressure'
]

CATEGORICAL_FEATURES = [
    'due_maintenance',
    'age_status'
]

FEATURES = NUMERICAL_FEATURES + CATEGORICAL_FEATURES

TARGET = 'failed'

# Manual preprocessing

**Note:** I do not recommend a manual approach like this to anyone. It's just for didactic purposes.

We should handle:
- Missing values
- Numerical features
- Categorical features

## Numerical features

In [None]:
# Train features and target
X_train = df_train[NUMERICAL_FEATURES]
y_train = df_train[TARGET]

# Test features and target
X_test = df_test[NUMERICAL_FEATURES]
y_test = df_test[TARGET]

# Instance with unknown target
X_new = df_future_unique[NUMERICAL_FEATURES]

In [None]:
# Instantiate model
model = LogisticRegression()

### Missing values

In [None]:
# If missing values are present, training fails
try:
  model.fit(X_train, y_train)
except Exception as e:
  print(e)

Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


In [None]:
# Compute mean for every column
training_numerical_means = X_train.mean()
training_numerical_means

temperature     122.5
pressure       8375.0
dtype: float64

In [None]:
# Apply our imputation method to train, test, and new data
X_train_imputed = X_train.fillna(training_numerical_means)
X_test_imputed = X_test.fillna(training_numerical_means)
X_new_imputed = X_new.fillna(training_numerical_means)

In [None]:
# Without missing values, the training runs ok
try:
  model.fit(X_train_imputed, y_train)
except Exception as e:
  print(e)

In [None]:
# Predict train, test, and new data
try:
  print('Train predictions:', model.predict(X_train_imputed))
  print('Test predictions:', model.predict(X_test_imputed))
  print('New predictions:', model.predict(X_new_imputed))
except Exception as e:
  print(e)

Train predictions: [ True False  True False  True]
Test predictions: [ True  True False]
New predictions: [ True]


### Data scaling

What if we need to scale the numerical data?

Let's consider the common standard scaler:

$$
\frac{x - \bar{x}}{\sigma}
$$

In [None]:
# Compute standard deviation for every column
training_numerical_std = X_train.std()
display(training_numerical_std)

temperature      40.311289
pressure       4269.562819
dtype: float64

In [None]:
# Applying scaling preprocessing to train, test, and new data
X_train_imputed_scaled = (X_train_imputed - training_numerical_means) / training_numerical_std
X_test_imputed_scaled = (X_test_imputed - training_numerical_means) / training_numerical_std
X_new_imputed_scaled = (X_new_imputed - training_numerical_means) / training_numerical_std

In [None]:
# Train model
model.fit(X_train_imputed_scaled, y_train)

# Predict train, test, and new data
try:
  print('Train predictions:', model.predict(X_train_imputed_scaled))
  print('Test predictions:', model.predict(X_test_imputed_scaled))
  print('New predictions:', model.predict(X_new_imputed_scaled))
except Exception as e:
  print(e)

Train predictions: [ True False  True False False]
Test predictions: [False False False]
New predictions: [False]


Let's save the preprocessing parameters so we can use them in the future/pipeline.

In [None]:
trained_preprocessor = {
    'imputer_parameters': {
      'mean': training_numerical_means
    },
    'scaler_parameters': {
      'mean': training_numerical_means,
      'std': training_numerical_std
    }
}

trained_model = model

In [None]:
# Saving preprocessing parameters & trained model
with open('num_preprocessor.pkl', 'wb') as preprocessor_file:
  pickle.dump(trained_preprocessor, preprocessor_file)

with open('num_model.pkl', 'wb') as model_file:
  pickle.dump(trained_model, model_file)

What we needed so far:
- Compute and store means (for imputation & scaling)
- Compute and store standard deviations (scaling)
- **Store** the computed preprocessing parameters (mean & std)
- Create a temporary table with imputed values
- Create another temporary table with imputed and scaled values
- Manually applying the preprocessing with the trained parameters

Wouldn't it be better to have a class that computes and stores the preprocessing parameters and can transform the data using the learned parameters?

### Custom classes

In [None]:
# A class that learns and stores parameters, and can then transform new data
class MySimpleImputer:
  """Fill missing values with the column means learned during ``fit``.

  Learned attributes (set by ``fit``):
    feature_names_in_: list of the column names of the fitted frame.
    mean_: result of ``X.mean()`` on the fitted frame.
  """

  def fit(self, X, y=None):
    """Learn the column means of ``X`` and return ``self``; ``y`` is unused."""
    self.mean_ = X.mean()
    self.feature_names_in_ = X.columns.tolist()
    return self

  def transform(self, X, y=None):
    """Return a copy of ``X`` whose missing values are replaced by ``mean_``."""
    filled = X.fillna(self.mean_)
    return filled

  def fit_transform(self, X, y=None):
    """Fit on ``X`` and return ``X`` imputed with the freshly learned means."""
    return self.fit(X).transform(X)

In [None]:
# Instantiate our imputer
my_imputer = MySimpleImputer()

In [None]:
# Trying to access attributes without .fit()
try:
  print('Trained feature names:', my_imputer.feature_names_in_)
  print('Trained averages', my_imputer.mean_)
except Exception as e:
  print(e)

'MySimpleImputer' object has no attribute 'feature_names_in_'


In [None]:
# Fit our imputer preprocessor
my_imputer.fit(X_train)

<__main__.MySimpleImputer at 0x7f1274620460>

In [None]:
# Trying to access attributes after .fit()
try:
  print('Trained feature names:', my_imputer.feature_names_in_)
  print('Trained averages', my_imputer.mean_.to_dict())
except Exception as e:
  print(e)

Trained feature names: ['temperature', 'pressure']
Trained averages {'temperature': 122.5, 'pressure': 8375.0}


In [None]:
# Transform train, test, and new data
try:
  my_imputer.transform(X_train)
  my_imputer.transform(X_test)
  my_imputer.transform(X_new)
except Exception as e:
  print(e)

In [None]:
print('Test features without imputation')
display(X_test)
print('\n')

print('Test features after imputation')
my_imputer.transform(X_test)

Test features without imputation


Unnamed: 0_level_0,temperature,pressure
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6,85.0,6000
7,110.0,10500
8,,3300




Test features after imputation


Unnamed: 0_level_0,temperature,pressure
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6,85.0,6000
7,110.0,10500
8,122.5,3300


In [None]:
class MyStandardScaler():
  """Standardize columns as ``(x - mean) / std`` using statistics from ``fit``.

  Learned attributes (set by ``fit``):
    feature_names_in_: list of the column names of the fitted frame.
    mean_: result of ``X.mean()`` on the fitted frame.
    std_: result of ``X.std()`` on the fitted frame (pandas' default,
      i.e. the sample standard deviation).
  """

  def fit(self, X, y=None):
    """Learn per-column mean and standard deviation; ``y`` is unused."""
    self.feature_names_in_ = X.columns.tolist()
    self.mean_ = X.mean()
    self.std_ = X.std()
    return self

  def transform(self, X, y=None):
    """Return ``X`` centered by ``mean_`` and divided by ``std_``."""
    # A constant column has std 0; dividing by it would produce NaN/inf.
    # Use 1 as the divisor there so the column simply becomes all zeros.
    scale = self.std_.where(self.std_ != 0, 1.0)
    return (X - self.mean_) / scale

  def fit_transform(self, X, y=None):
    """Fit on ``X`` and return the standardized ``X``."""
    return self.fit(X).transform(X)

In [None]:
# Fit each preprocessor on the training features, then chain them by hand.
# NOTE: the scaler is fitted on the raw X_train (missing values still present),
# not on the imputed data. A Pipeline would instead feed the imputer's output
# into the scaler's fit, so the learned std can differ between the two approaches.
my_imputer = MySimpleImputer().fit(X_train)
my_scaler = MyStandardScaler().fit(X_train)

# Apply the fitted steps in order: impute first, then scale
X_test_transformed = my_scaler.transform( my_imputer.transform(X_test) )
X_new_transformed = my_scaler.transform( my_imputer.transform(X_new) )

In [None]:
model_pipeline = make_pipeline(MySimpleImputer(), MyStandardScaler(), LogisticRegression())
display(model_pipeline)

In [None]:
# Train model pipeline
model_pipeline.fit(X_train, y_train)

In [None]:
# Predict train, test, and new data with the pipeline fitted above.
# The pipeline imputes and scales internally, so it receives the raw features
# (previously this cell called the older standalone `model` on manually
# preprocessed frames, so the pipeline was never actually exercised).
try:
  print('Train predictions:', model_pipeline.predict(X_train))
  print('Test predictions:', model_pipeline.predict(X_test))
  print('New predictions:', model_pipeline.predict(X_new))
except Exception as e:
  print(e)

Train predictions: [ True False  True False False]
Test predictions: [False False False]
New predictions: [False]


In [None]:
# Save the fitted imputer and scaler objects (one pickle file each)
with open('num_imputer.pkl', 'wb') as preprocessor_file:
  pickle.dump(my_imputer, preprocessor_file)

with open('num_scaler.pkl', 'wb') as preprocessor_file:
  pickle.dump(my_scaler, preprocessor_file)

## Categorical features

Extract categorical features

In [None]:
# Train features and target
X_train = df_train[CATEGORICAL_FEATURES]
y_train = df_train[TARGET]

# Test features and target
X_test = df_test[CATEGORICAL_FEATURES]
y_test = df_test[TARGET]

# Instance with unknown target
X_new = df_future_unique[CATEGORICAL_FEATURES]

Avoid using functions like pd.get_dummies, since they are not suitable for model reproducibility:

In [None]:
# Train data
print('Train data')
display(X_train)
print()

# Test data
print('Test data')
display(X_test)
print()

# New data
print('New data')
display(X_new)

Train data


Unnamed: 0_level_0,due_maintenance,age_status
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Yes,old
2,No,new
3,Yes,old
4,Yes,old
5,No,new



Test data


Unnamed: 0_level_0,due_maintenance,age_status
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6,Yes,new
7,Yes,old
8,No,ancient



New data


Unnamed: 0_level_0,due_maintenance,age_status
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10,No,new


In [None]:
# Transform data using pd.get_dummies (not recommended!)
X_train_transformed = pd.get_dummies(X_train)
X_test_transformed = pd.get_dummies(X_test)
X_new_transformed = pd.get_dummies(X_new)

In [None]:
# Train data (transformed)
print('Train data')
display(X_train_transformed)
print()

# Test data (transformed) - new column
print('Test data')
display(X_test_transformed)
print()

# New data - missing columns
print('New data')
display(X_new_transformed)

Train data


Unnamed: 0_level_0,due_maintenance_No,due_maintenance_Yes,age_status_new,age_status_old
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,0,1
5,1,0,1,0



Test data


Unnamed: 0_level_0,due_maintenance_No,due_maintenance_Yes,age_status_ancient,age_status_new,age_status_old
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6,0,1,0,1,0
7,0,1,0,0,1
8,1,0,1,0,0



New data


Unnamed: 0_level_0,due_maintenance_No,age_status_new
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10,1,1


In [None]:
# Train model with categorical data
model.fit(X_train_transformed, y_train)

In [None]:
# Predict train data (same structure learnt)
try:
  model.predict(X_train_transformed)
except Exception as e:
  print(e)

In [None]:
# Predict test data (fails due to an extra column)
try:
  model.predict(X_test_transformed)
except Exception as e:
  print(e)

The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- age_status_ancient



In [None]:
# Predict new data (fails due to missing columns)
try:
  model.predict(X_new_transformed)
except Exception as e:
  print(e)

The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- age_status_old
- due_maintenance_Yes



Let's create a minimal class for computing dummies properly (transformation applied to train data should be applied to test and upcoming data):

In [None]:
class MyOneHotEncoder():
  """Minimal one-hot encoder that always emits the dummy columns seen in ``fit``.

  Learned attributes (set by ``fit``):
    categories_: dict mapping each input column to the set of its values.
    output_columns_: list of the dummy column names produced from the
      fitted frame by ``pd.get_dummies``.
  """

  def fit(self, X, y=None):
    """Record the categories and dummy column names of ``X``; ``y`` is unused.

    Returns ``self`` so calls can be chained, e.g. ``MyOneHotEncoder().fit(X)``.
    """
    self.categories_ = {feature: set(X[feature]) for feature in X.columns}
    self.output_columns_ = pd.get_dummies(X).columns.tolist()
    return self

  def transform(self, X, y=None):
    """One-hot encode ``X`` using exactly the columns learned in ``fit``.

    Dummy columns for categories unseen at fit time are dropped, and learned
    columns that are absent from ``X`` are added and filled with 0.
    """
    X_transformed = (
        pd.get_dummies(X)
        .reindex(columns=self.output_columns_, fill_value=0)
        .astype(int)
        )
    return X_transformed

  def fit_transform(self, X, y=None):
    """Fit on ``X`` and return its one-hot encoding."""
    return self.fit(X).transform(X)

In [None]:
# Transform the categorical data
my_encoder = MyOneHotEncoder()

# Fit on the training data only; the test and new data reuse the learned columns
X_train_transformed = my_encoder.fit_transform(X_train)
X_test_transformed = my_encoder.transform(X_test)
X_new_transformed = my_encoder.transform(X_new)

In [None]:
# Train data (transformed)
print('Train data')
display(X_train_transformed)
print()

# Test data (transformed)
print('Test data')
display(X_test_transformed)
print()

# New data
print('New data')
display(X_new_transformed)

Train data


Unnamed: 0_level_0,due_maintenance_No,due_maintenance_Yes,age_status_new,age_status_old
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,0,1
5,1,0,1,0



Test data


Unnamed: 0_level_0,due_maintenance_No,due_maintenance_Yes,age_status_new,age_status_old
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,0,1
5,1,0,1,0



New data


Unnamed: 0_level_0,due_maintenance_No,due_maintenance_Yes,age_status_new,age_status_old
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,1,0,1,0


In [None]:
# Train model with processed categorical variables
model.fit(X_train_transformed, y_train)

In [None]:
# Predict train, test, and new data
try:
  print('Train predictions:', model.predict(X_train_transformed))
  print('Test predictions:', model.predict(X_test_transformed))
  print('New predictions:', model.predict(X_new_transformed))
except Exception as e:
  print(e)

Train predictions: [ True False  True  True False]
Test predictions: [ True False  True  True False]
New predictions: [False]


In [None]:
# Save categorical preprocessor and trained model
with open('cat_encoder.pkl', 'wb') as preprocessor_file:
  pickle.dump(my_encoder, preprocessor_file)

with open('cat_model.pkl', 'wb') as model_file:
  pickle.dump(model, model_file)

In [None]:
# Load preprocessor
with open('cat_encoder.pkl', 'rb') as preprocessor_file:
  categorical_preprocessor = pickle.load(preprocessor_file)

# Load logistic regression (lr) model
with open('cat_model.pkl', 'rb') as model_file:
  lr_model_cat = pickle.load(model_file)

# Process data & compute model predictions
lr_model_cat.predict( categorical_preprocessor.transform(X_new) )

array([False])

## All features

In [None]:
# Train features and target
X_train = df_train[FEATURES]
y_train = df_train[TARGET]

# Test features and target
X_test = df_test[FEATURES]
y_test = df_test[TARGET]

# Instance with unknown target
X_new = df_future_unique[FEATURES]

In [None]:
# Load trained preprocessor classes
with open('num_imputer.pkl', 'rb') as imputer_file:
  imputer = pickle.load(imputer_file)

with open('num_scaler.pkl', 'rb') as scaler_file:
  scaler = pickle.load(scaler_file)

with open('cat_encoder.pkl', 'rb') as encoder_file:
  encoder = pickle.load(encoder_file)

In [None]:
from functools import reduce

def preprocess_data(X: pd.DataFrame, numeric_features: list, categoric_features: list, numeric_preprocessors: list, categoric_preprocessor: list) -> pd.DataFrame:
  """Run already-fitted preprocessors over the numeric and categorical columns of X.

  Args:
    X: input frame containing both groups of columns.
    numeric_features: column names handed to the numeric preprocessors.
    categoric_features: column names handed to the categorical preprocessors.
    numeric_preprocessors: objects with a ``transform`` method, applied in order.
    categoric_preprocessor: objects with a ``transform`` method, applied in order.

  Returns:
    The two transformed parts concatenated column-wise (numeric part first).
  """
  def _apply_in_order(frame, steps):
    # Each step consumes the output of the previous one.
    for step in steps:
      frame = step.transform(frame)
    return frame

  numeric_part = _apply_in_order(X[numeric_features], numeric_preprocessors)
  categoric_part = _apply_in_order(X[categoric_features], categoric_preprocessor)
  return pd.concat([numeric_part, categoric_part], axis=1)

In [None]:
# Preprocess train, test, and new data using trained parameters
X_train_transformed = preprocess_data(X_train, NUMERICAL_FEATURES, CATEGORICAL_FEATURES, [imputer, scaler], [encoder])
X_test_transformed = preprocess_data(X_test, NUMERICAL_FEATURES, CATEGORICAL_FEATURES, [imputer, scaler], [encoder])
X_new_transformed = preprocess_data(X_new, NUMERICAL_FEATURES, CATEGORICAL_FEATURES, [imputer, scaler], [encoder])

# Display transformed data
print('Train data')
display(X_train_transformed)
print()

print('Test data')
display(X_test_transformed)
print()

print('New data')
display(X_new_transformed)

Train data


Unnamed: 0_level_0,temperature,pressure,due_maintenance_No,due_maintenance_Yes,age_status_new,age_status_old
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.426399,1.083249,0,1,0,1
2,-0.558156,-0.790479,1,0,1,0
3,-0.062017,0.614817,0,1,0,1
4,0.0,-0.907587,0,1,0,1
5,-0.806226,0.0,1,0,1,0



Test data


Unnamed: 0_level_0,temperature,pressure,due_maintenance_No,due_maintenance_Yes,age_status_new,age_status_old
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6,-0.930261,-0.556263,0,1,1,0
7,-0.310087,0.497709,0,1,0,1
8,0.0,-1.188646,1,0,0,0



New data


Unnamed: 0_level_0,temperature,pressure,due_maintenance_No,due_maintenance_Yes,age_status_new,age_status_old
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10,-2.741168,-0.204939,1,0,1,0


# Scikit-learn Transformers

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
def list_trained_attributes(obj):
  """Return the public attribute names of ``obj`` that end with an underscore.

  Names are taken from ``dir(obj)``; anything starting with "_" is skipped.
  """
  learned = []
  for name in dir(obj):
    if name.startswith('_'):
      continue
    if name.endswith('_'):
      learned.append(name)
  return learned

## Imputer

In [None]:
# Instantiate simple imputer
imputer = SimpleImputer(strategy='mean', fill_value=None, add_indicator=False)

In [None]:
# No trained attributes (ending with "_") are shown when .fit() is not applied
list_trained_attributes(imputer)

[]

In [None]:
# Train imputer
imputer.fit(X_train[NUMERICAL_FEATURES])

In [None]:
# Trained imputer attributes
list_trained_attributes(imputer)

['feature_names_in_', 'indicator_', 'n_features_in_', 'statistics_']

In [None]:
imputer.statistics_

array([ 122.5, 8375. ])

In [None]:
# Instantiate simple imputer with indicator
imputer = SimpleImputer(strategy='mean', fill_value=None, add_indicator=True)

# Train imputer
imputer.fit(X_train[NUMERICAL_FEATURES])

# Show trained imputer attributes
list_trained_attributes(imputer)

['feature_names_in_', 'indicator_', 'n_features_in_', 'statistics_']

In [None]:
# Show transformed data with missing indicator
imputer.transform(X_train[NUMERICAL_FEATURES])

Unnamed: 0_level_0,temperature,pressure,missingindicator_temperature,missingindicator_pressure
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,180.0,13000.0,0.0,0.0
2,100.0,5000.0,0.0,0.0
3,120.0,11000.0,0.0,0.0
4,122.5,4500.0,1.0,0.0
5,90.0,8375.0,0.0,1.0


In [None]:
# Show transformed test data with missing indicator
imputer.transform(X_test[NUMERICAL_FEATURES])

Unnamed: 0_level_0,temperature,pressure,missingindicator_temperature,missingindicator_pressure
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,85.0,6000.0,0.0,0.0
7,110.0,10500.0,0.0,0.0
8,122.5,3300.0,1.0,0.0


## Scaler

In [None]:
from sklearn.preprocessing import StandardScaler, scale

In [None]:
# Instantiate & fit standard scaler
scaler = StandardScaler().fit(X_train[NUMERICAL_FEATURES])

# List learned attributes
list_trained_attributes(scaler)

['feature_names_in_',
 'mean_',
 'n_features_in_',
 'n_samples_seen_',
 'scale_',
 'var_']

In [None]:
# Show scaled train, test, and new data
print('Scaled train data')
display (scaler.transform(X_train[NUMERICAL_FEATURES]) )
print()

print('Scaled test data')
display (scaler.transform(X_test[NUMERICAL_FEATURES]) )
print()

print('Scaled new data')
display (scaler.transform(X_new[NUMERICAL_FEATURES]) )

Scaled train data


Unnamed: 0_level_0,temperature,pressure
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.647064,1.250828
2,-0.644503,-0.912767
3,-0.071611,0.70993
4,,-1.047991
5,-0.930949,



Scaled test data


Unnamed: 0_level_0,temperature,pressure
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6,-1.074172,-0.642317
7,-0.358057,0.574705
8,,-1.372531



Scaled new data


Unnamed: 0_level_0,temperature,pressure
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10,-3.165228,-0.236643


If this preprocessing will not be part of a pipeline for model / production, you can use `scale` directly:

In [None]:
# Applying standard scale with function
scale(X_train[NUMERICAL_FEATURES])

array([[ 1.64706421,  1.2508283 ],
       [-0.64450339, -0.9127666 ],
       [-0.07161149,  0.70992957],
       [        nan, -1.04799128],
       [-0.93094934,         nan]])

In [None]:
# Trick to keep dataframe index and column names :)
X_train[NUMERICAL_FEATURES].apply(scale)

Unnamed: 0_level_0,temperature,pressure
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.647064,1.250828
2,-0.644503,-0.912767
3,-0.071611,0.70993
4,,-1.047991
5,-0.930949,


## Encoder

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder

**One-hot encoder**

In [None]:
# Instantiate and fit one-hot encoder
ohe_encoder = OneHotEncoder(handle_unknown='ignore', drop=None, sparse_output=False, min_frequency=.5, max_categories=None)
ohe_encoder.fit(df_train[CATEGORICAL_FEATURES])

In [None]:
# Transform categorical data
X_train_transformed = ohe_encoder.transform(X_train[CATEGORICAL_FEATURES])
X_test_transformed = ohe_encoder.transform(X_test[CATEGORICAL_FEATURES])
X_new_transformed = ohe_encoder.transform(X_new[CATEGORICAL_FEATURES])

# Show transformed categorical data
print('Train data')
display(X_train_transformed)
print()

print('Test data')
display(X_test_transformed)
print()

print('New data')
display(X_new_transformed)

Train data


Unnamed: 0_level_0,due_maintenance_Yes,due_maintenance_infrequent_sklearn,age_status_old,age_status_infrequent_sklearn
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0
3,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0
5,0.0,1.0,0.0,1.0



Test data


Unnamed: 0_level_0,due_maintenance_Yes,due_maintenance_infrequent_sklearn,age_status_old,age_status_infrequent_sklearn
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,1.0,0.0,0.0,1.0
7,1.0,0.0,1.0,0.0
8,0.0,1.0,0.0,0.0



New data


Unnamed: 0_level_0,due_maintenance_Yes,due_maintenance_infrequent_sklearn,age_status_old,age_status_infrequent_sklearn
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,0.0,1.0,0.0,1.0


In [None]:
# List learned attributes
list_trained_attributes(ohe_encoder)

['categories_',
 'drop_idx_',
 'feature_names_in_',
 'infrequent_categories_',
 'n_features_in_']

**Ordinal encoder**

In [None]:
# Instantiate and fit "ordinal" encoder
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
ordinal_encoder.fit(df_train[CATEGORICAL_FEATURES])

In [None]:
# Transform categorical data
X_train_transformed = ordinal_encoder.transform(X_train[CATEGORICAL_FEATURES])
X_test_transformed = ordinal_encoder.transform(X_test[CATEGORICAL_FEATURES])
X_new_transformed = ordinal_encoder.transform(X_new[CATEGORICAL_FEATURES])

# Show transformed categorical data
print('Train data')
display(X_train_transformed)
print()

print('Test data')
display(X_test_transformed)
print()

print('New data')
display(X_new_transformed)

Train data


Unnamed: 0_level_0,due_maintenance,age_status
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.0,1.0
2,0.0,0.0
3,1.0,1.0
4,1.0,1.0
5,0.0,0.0



Test data


Unnamed: 0_level_0,due_maintenance,age_status
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6,1.0,0.0
7,1.0,1.0
8,0.0,-1.0



New data


Unnamed: 0_level_0,due_maintenance,age_status
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10,0.0,0.0


In [None]:
# List learned attributes
list_trained_attributes(ordinal_encoder)

['categories_', 'feature_names_in_', 'n_features_in_']

In [None]:
print(ordinal_encoder.categories_)
print(ordinal_encoder.feature_names_in_)

[array(['No', 'Yes'], dtype=object), array(['new', 'old'], dtype=object)]
['due_maintenance' 'age_status']


**LabelEncoder**

In [None]:
# Instantiate and train label encoder with target
le_encoder = LabelEncoder()
le_encoder.fit(y_train)

In [None]:
# Transform target data
y_train_transformed = le_encoder.transform(y_train)
y_test_transformed = le_encoder.transform(y_test)

# Show transformed targets
print('Train data')
display(y_train_transformed)
print()

print('Test data')
display(y_test_transformed)

Train data


array([1, 0, 1, 0, 0])


Test data


array([0, 1, 0])

In [None]:
# List learned attributes
list_trained_attributes(le_encoder)

['classes_']

In [None]:
# Show y classes
le_encoder.classes_

array([False,  True])

In [None]:
# Inverse transform
le_encoder.inverse_transform(y_train_transformed)

array([ True, False,  True, False, False])

## Column transformers

So far, we have been subsetting the numerical and categorical data before applying our preprocessing.

Wouldn't it be easier if we could directly specify the columns to which each preprocessing step should be applied?

We can use Column Transformer for this:

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Preprocessors (transformers)
numeric_preprocessor = SimpleImputer(strategy='mean')
categorical_preprocessor = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Create Column transformer (list of tuples: step name, transformer, list of columns)
preprocessor = ColumnTransformer([
    ('numeric', numeric_preprocessor, NUMERICAL_FEATURES),
    ('categorical', categorical_preprocessor, CATEGORICAL_FEATURES),
])

# Fit preprocessor
preprocessor.fit(X_train)

In [None]:
# Transform data
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
X_new_transformed = preprocessor.transform(X_new)

# Show transformed categorical data
print('Train data')
display(X_train_transformed)
print()

print('Test data')
display(X_test_transformed)
print()

print('New data')
display(X_new_transformed)

Train data


Unnamed: 0_level_0,numeric__temperature,numeric__pressure,categorical__due_maintenance_No,categorical__due_maintenance_Yes,categorical__age_status_new,categorical__age_status_old
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,180.0,13000.0,0.0,1.0,0.0,1.0
2,100.0,5000.0,1.0,0.0,1.0,0.0
3,120.0,11000.0,0.0,1.0,0.0,1.0
4,122.5,4500.0,0.0,1.0,0.0,1.0
5,90.0,8375.0,1.0,0.0,1.0,0.0



Test data


Unnamed: 0_level_0,numeric__temperature,numeric__pressure,categorical__due_maintenance_No,categorical__due_maintenance_Yes,categorical__age_status_new,categorical__age_status_old
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6,85.0,6000.0,0.0,1.0,1.0,0.0
7,110.0,10500.0,0.0,1.0,0.0,1.0
8,122.5,3300.0,1.0,0.0,0.0,0.0



New data


Unnamed: 0_level_0,numeric__temperature,numeric__pressure,categorical__due_maintenance_No,categorical__due_maintenance_Yes,categorical__age_status_new,categorical__age_status_old
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10,12.0,7500.0,1.0,0.0,1.0,0.0


In [None]:
# List learned attributes
list_trained_attributes(preprocessor)

['feature_names_in_',
 'n_features_in_',
 'named_transformers_',
 'output_indices_',
 'sparse_output_',
 'transformers_']

In [None]:
# We can access the transformers separately as intended
preprocessor.named_transformers_

{'numeric': SimpleImputer(),
 'categorical': OneHotEncoder(handle_unknown='ignore', sparse_output=False)}

In [None]:
# Accessing and using one transformer from the Column transformer
trained_imputer = preprocessor.named_transformers_['numeric']
trained_imputer.transform(X_train[NUMERICAL_FEATURES])

Unnamed: 0_level_0,temperature,pressure
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,180.0,13000.0
2,100.0,5000.0
3,120.0,11000.0
4,122.5,4500.0
5,90.0,8375.0


An alternative (simpler) way to use Column Transformer, without specifying the step names

In [None]:
from sklearn.compose import make_column_transformer

In [None]:
# Preprocessors (transformers)
numeric_preprocessor = SimpleImputer(strategy='mean')
categorical_preprocessor = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Create Column transformer with make_column_transformer (tuples: transformer, list of columns)
preprocessor = make_column_transformer(
    (numeric_preprocessor, NUMERICAL_FEATURES),
    (categorical_preprocessor, CATEGORICAL_FEATURES),
)

# Fit preprocessor
preprocessor.fit(X_train)

In [None]:
# Transformer names
preprocessor.named_transformers_

{'simpleimputer': SimpleImputer(),
 'onehotencoder': OneHotEncoder(handle_unknown='ignore', sparse_output=False)}

# ML Pipelines

## Simple pipeline 1

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline

In [None]:
# Preprocessors (transformers)
# - numeric columns: mean imputation
# - categorical columns: one-hot encoding, ignoring categories unseen during fit
numeric_preprocessor = SimpleImputer(strategy='mean')
categorical_preprocessor = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Create Column transformer with make_column_transformer (tuples: transformer, list of columns)
preprocessor = make_column_transformer(
    (numeric_preprocessor, NUMERICAL_FEATURES),
    (categorical_preprocessor, CATEGORICAL_FEATURES),
)

# Create pipeline (list of tuples - step name, transformer/estimator)
# The last step is the estimator; every step before it must be a transformer.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression())
])

# Display the (still unfitted) pipeline
model_pipeline

Likewise, we can also use the simpler version of the Pipeline:

In [None]:
# Build the pipeline with auto-generated step names and fit it in one go (fit returns the pipeline itself)
model_pipeline = make_pipeline(preprocessor, LogisticRegression(max_iter=1000)).fit(X_train, y_train)
model_pipeline

In [None]:
# Run the fitted pipeline on the train, test, and new data.
# On the first failure, stop and print the error message instead of raising.
try:
  for _caption, _features in (
      ('Train predictions:', X_train),
      ('Test predictions:', X_test),
      ('New predictions:', X_new),
  ):
    print(_caption, model_pipeline.predict(_features))
except Exception as e:
  print(e)

Train predictions: [ True False  True False False]
Test predictions: [False  True False]
New predictions: [False]


In [None]:
# List pipeline steps (make_pipeline named them after the lowercased class names)
model_pipeline.named_steps

{'columntransformer': ColumnTransformer(transformers=[('simpleimputer', SimpleImputer(),
                                  ['temperature', 'pressure']),
                                 ('onehotencoder',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  ['due_maintenance', 'age_status'])]),
 'logisticregression': LogisticRegression(max_iter=1000)}

In [None]:
# Access the preprocessing step by name
model_pipeline['columntransformer'] # or model_pipeline.named_steps['columntransformer']

In [None]:
# Apply only the fitted preprocessing step (no model) to the test features
model_pipeline.named_steps['columntransformer'].transform(X_test)

Unnamed: 0_level_0,simpleimputer__temperature,simpleimputer__pressure,onehotencoder__due_maintenance_No,onehotencoder__due_maintenance_Yes,onehotencoder__age_status_new,onehotencoder__age_status_old
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6,85.0,6000.0,0.0,1.0,1.0,0.0
7,110.0,10500.0,0.0,1.0,0.0,1.0
8,122.5,3300.0,1.0,0.0,0.0,0.0


In [None]:
# Drill down: pipeline step -> ColumnTransformer -> fitted imputer
model_pipeline['columntransformer'].named_transformers_['simpleimputer']

In [None]:
# Reuse the imputer fitted inside the pipeline on the numeric test columns
fitted_column_transformer = model_pipeline['columntransformer']
trained_imputer = fitted_column_transformer.named_transformers_['simpleimputer']
trained_imputer.transform(X_test[NUMERICAL_FEATURES])

Unnamed: 0_level_0,temperature,pressure
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6,85.0,6000.0
7,110.0,10500.0
8,122.5,3300.0


## Simple pipeline 2

What if we need to apply multiple consecutive transformations to the same features (e.g., imputation followed by scaling)?

In [None]:
# Preprocessors (transformers)
# Numeric columns go through a small pipeline of their own: mean imputation, then standardization
numeric_preprocessor = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
    )

# Categorical columns: one-hot encoding, ignoring categories unseen during fit
categorical_preprocessor = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Create Column transformer (list of tuples: step name, transformer, list of columns)
# The step names become the prefixes of the output columns (numeric__*, categorical__*)
preprocessor = ColumnTransformer([
    ('numeric', numeric_preprocessor, NUMERICAL_FEATURES),
    ('categorical', categorical_preprocessor, CATEGORICAL_FEATURES),
])

# Create pipeline (list of tuples - step name, transformer/estimator)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression())
])

# Fit every step in order; the fitted pipeline is the cell output
model_pipeline.fit(X_train, y_train)

In [None]:
# Steps are now keyed by the names given explicitly to Pipeline ('preprocessor', 'model')
model_pipeline.named_steps

{'preprocessor': ColumnTransformer(transformers=[('numeric',
                                  Pipeline(steps=[('simpleimputer',
                                                   SimpleImputer()),
                                                  ('standardscaler',
                                                   StandardScaler())]),
                                  ['temperature', 'pressure']),
                                 ('categorical',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  ['due_maintenance', 'age_status'])]),
 'model': LogisticRegression()}

In [None]:
# Retrieve the fitted preprocessing step by its name
model_pipeline['preprocessor']

In [None]:
# Apply only the fitted preprocessing (imputation + scaling + one-hot encoding) to the test features
model_pipeline['preprocessor'].transform(X_test)

Unnamed: 0_level_0,numeric__temperature,numeric__pressure,categorical__due_maintenance_No,categorical__due_maintenance_Yes,categorical__age_status_new,categorical__age_status_old
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6,-1.200961,-0.718132,0.0,1.0,1.0,0.0
7,-0.40032,0.64254,0.0,1.0,0.0,1.0
8,0.0,-1.534536,1.0,0.0,0.0,0.0


In [None]:
# Inspect the hyperparameters of the final estimator
model_pipeline['model'].get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
# Column transformers
temperature_preprocessor = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
# NOTE(review): pressure_preprocessor is defined but never passed to `ct` below, so the
# 'pressure' column is not part of the transformed output — confirm whether this is intended
pressure_preprocessor = make_pipeline(SimpleImputer(strategy='median'), MinMaxScaler())

# `categorical_preprocessor` is the OneHotEncoder defined in a previous cell
ct = ColumnTransformer([
    ('imputer', temperature_preprocessor, ['temperature']),
    ('ohe', categorical_preprocessor, CATEGORICAL_FEATURES)
])


# A float n_components asks PCA to keep enough components to explain that share of the variance (95% here)
pipe = make_pipeline(ct, PCA(.95))

# PCA is unsupervised: y_train is accepted by fit but the transformers here do not need it
pipe.fit(X_train, y_train)
pipe.transform(X_train)

Unnamed: 0_level_0,pca0,pca1
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.878065,0.709857
2,-1.353106,0.358239
3,0.500191,-0.62946
4,0.557602,-0.573655
5,-1.582752,0.135019


## Intermediate pipeline

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Preprocessors (transformers)
# Numeric columns: mean imputation followed by standardization
numeric_preprocessor = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
    )

# Categorical columns: one-hot encoding, ignoring categories unseen during fit
categorical_preprocessor = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Create Column transformer (list of tuples: step name, transformer, list of columns)
preprocessor = ColumnTransformer([
    ('numeric', numeric_preprocessor, NUMERICAL_FEATURES),
    ('categorical', categorical_preprocessor, CATEGORICAL_FEATURES),
])

# Create pipeline (list of tuples - step name, transformer/estimator)
# PCA sits between preprocessing and the model; n_components=.9 keeps enough
# components to explain 90% of the variance of the preprocessed features
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=.9)),
    ('model', LogisticRegression())
])

# Fit every step in order; the fitted pipeline is the cell output
model_pipeline.fit(X_train, y_train)

In [None]:
# Predict train, test, and new data with pipeline
try:
  # Same order as before: train, then test, then new data; the first failure aborts the rest
  for _caption, _features in (
      ('Train predictions:', X_train),
      ('Test predictions:', X_test),
      ('New predictions:', X_new),
  ):
    print(_caption, model_pipeline.predict(_features))
except Exception as e:
  print(e)

Train predictions: [ True False  True False False]
Test predictions: [False  True False]
New predictions: [False]


## Complex pipeline

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.base import BaseEstimator, TransformerMixin

**Desired pipeline:**

1. Numeric features

- Preprocess temperature with mean imputation and standard scaler
- Preprocess pressure with median imputation and min-max scaler
- Apply PCA to both outputs

2. Categorical features

- Impute each categorical variable with its most frequent category
- Apply one-hot encoder

3. Cluster records
- Use both processed numeric and categorical variables to cluster observations using KMeans, so we can leverage the trained centroids to assign a cluster label to each observation (including new ones) as an extra feature

4. Feature selection
- Select the top *k* processed features (the code below uses `SelectKBest(k=2)`)

5. Model
- Train a predictive model for the final processed/selected features



_**Note:** such a complex pipeline doesn't make sense at all for this fake data. I'm just highlighting the possibilities for building custom and complex pipelines._

In [None]:
class KMeansTransformer(BaseEstimator, TransformerMixin):
    """Append a KMeans cluster label to the input as a new ``cluster`` column.

    Parameters
    ----------
    n_clusters : int, default=3
        Number of clusters passed to the underlying ``KMeans``.
    random_state : int, RandomState instance or None, default=None
        Seed forwarded to ``KMeans`` so the cluster assignment can be made
        reproducible. ``None`` keeps the previous (unseeded) behavior.

    Attributes
    ----------
    kmeans_ : KMeans
        The ``KMeans`` estimator fitted in :meth:`fit`.
    """

    def __init__(self, n_clusters=3, random_state=None):
        # Only store hyperparameters here. Building the KMeans object in __init__
        # would freeze its n_clusters, so a later set_params(n_clusters=...)
        # (e.g. from a grid search) would silently have no effect.
        self.n_clusters = n_clusters
        self.random_state = random_state

    @property
    def kmeans(self):
        """Backward-compatible alias for the fitted ``kmeans_`` estimator."""
        return self.kmeans_

    def fit(self, X, y=None):
        """Fit a fresh KMeans on ``X`` using the current hyperparameters.

        ``y`` is ignored; it is accepted so the transformer can be used as a
        pipeline step. Returns ``self``.
        """
        self.kmeans_ = KMeans(
            n_clusters=self.n_clusters,
            n_init='auto',
            random_state=self.random_state,
        ).fit(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the predicted cluster label in a ``cluster`` column.

        ``X`` must be a pandas DataFrame (``DataFrame.assign`` is used), with the
        same features the transformer was fitted on. Raises ``NotFittedError``
        when called before :meth:`fit`.
        """
        # Local import keeps this cell self-contained
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, 'kmeans_')
        clusters = self.kmeans_.predict(X)
        return X.assign(cluster=clusters)

In [None]:
# Example of application: learn the centroids on the train features, then label the test features
custom_kmeans = KMeansTransformer(n_clusters=4)
custom_kmeans.fit(X_train_transformed)
custom_kmeans.transform(X_test_transformed)


Unnamed: 0_level_0,numeric__temperature,numeric__pressure,categorical__due_maintenance_No,categorical__due_maintenance_Yes,categorical__age_status_new,categorical__age_status_old,cluster
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6,85.0,6000.0,0.0,1.0,1.0,0.0,0
7,110.0,10500.0,0.0,1.0,0.0,1.0,1
8,122.5,3300.0,1.0,0.0,0.0,0.0,0


In [None]:
# Preprocessing for numerical data
# temperature: mean imputation + standardization
temperature_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# pressure: median imputation + min-max scaling
pressure_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# Route each numeric column to its own sub-pipeline
numeric_transformer = ColumnTransformer([
    ('temp', temperature_transformer, ['temperature']),
    ('press', pressure_transformer, ['pressure']),

])

# Apply PCA on top of both processed numeric columns (keep 90% of the variance)
numeric_preprocessor = make_pipeline(numeric_transformer, PCA(.9))

# Preprocessing for categorical data: most-frequent imputation, then one-hot encoding
categorical_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine numeric and categorical branches; 'num'/'cat' become the output column prefixes
preprocessor = ColumnTransformer([
    ('num', numeric_preprocessor, NUMERICAL_FEATURES),
    ('cat', categorical_preprocessor, CATEGORICAL_FEATURES)
])

# Full pipeline: preprocessing -> add cluster label -> keep the 2 best features -> classifier
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('cluster', KMeansTransformer(n_clusters=4)),
    ('feat_selection', SelectKBest(k=2)),
    ('classifier', LogisticRegression())
])

# Fit every step in order; the fitted pipeline is the cell output
model_pipeline.fit(X_train, y_train)

In [None]:
# Run the fitted pipeline on the train, test, and new data.
# On the first failure, stop and print the error message instead of raising.
try:
  for _caption, _features in (
      ('Train predictions:', X_train),
      ('Test predictions:', X_test),
      ('New predictions:', X_new),
  ):
    print(_caption, model_pipeline.predict(_features))
except Exception as e:
  print(e)

Train predictions: [ True False  True False False]
Test predictions: [False  True False]
New predictions: [False]


In [None]:
# Step 1: run only the fitted preprocessing step on the training features
fitted_preprocessor = model_pipeline['preprocessor']
X_train_transformed = fitted_preprocessor.transform(X_train)
display(X_train_transformed)

# Step 2: append the cluster label using the centroids learned during fit
fitted_cluster_step = model_pipeline['cluster']
X_train_transformed_clst = fitted_cluster_step.transform(X_train_transformed)
display(X_train_transformed_clst)

Unnamed: 0_level_0,num__pca0,cat__due_maintenance_No,cat__due_maintenance_Yes,cat__age_status_new,cat__age_status_old
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.922478,0.0,1.0,0.0,1.0
2,-0.799199,1.0,0.0,1.0,0.0
3,0.009706,0.0,1.0,0.0,1.0
4,-0.122062,0.0,1.0,0.0,1.0
5,-1.010923,1.0,0.0,1.0,0.0


Unnamed: 0_level_0,num__pca0,cat__due_maintenance_No,cat__due_maintenance_Yes,cat__age_status_new,cat__age_status_old,cluster
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1.922478,0.0,1.0,0.0,1.0,2
2,-0.799199,1.0,0.0,1.0,0.0,3
3,0.009706,0.0,1.0,0.0,1.0,1
4,-0.122062,0.0,1.0,0.0,1.0,1
5,-1.010923,1.0,0.0,1.0,0.0,0


In [None]:
# Step 3: keep only the columns chosen by the fitted feature-selection step
model_pipeline['feat_selection'].transform(X_train_transformed_clst)

Unnamed: 0_level_0,num__pca0,cat__age_status_old
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.922478,1.0
2,-0.799199,0.0
3,0.009706,1.0
4,-0.122062,1.0
5,-1.010923,0.0


# Quick example with feature-engine

[Feature-engine](https://feature-engine.trainindata.com/en/latest/index.html) includes transformers for:

- Missing data imputation
- Categorical encoding
- Discretisation
- Outlier capping or removal
- Variable transformation
- Variable creation
- Variable selection
- Datetime features
- Time series
- Preprocessing

In [None]:
from feature_engine.encoding import RareLabelEncoder
from feature_engine.imputation import CategoricalImputer

In [None]:
# Example of rare label encoder
# tol: minimum frequency a category needs to be kept as-is (rarer ones are grouped)
# n_categories: minimum number of categories a variable must have for grouping to apply
# variables: columns to encode; the other columns are returned unchanged
rare_label_encoder = RareLabelEncoder(tol=.1, n_categories=1, variables=CATEGORICAL_FEATURES)
rare_label_encoder.fit_transform(X_train)

Unnamed: 0_level_0,temperature,pressure,due_maintenance,age_status
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,180.0,13000.0,Yes,old
2,100.0,5000.0,No,new
3,120.0,11000.0,Yes,old
4,,4500.0,Yes,old
5,90.0,,No,new


In [None]:
# Learned attributes: for each variable, the categories that are kept (not grouped as rare)
rare_label_encoder.encoder_dict_

{'due_maintenance': ['Yes', 'No'], 'age_status': ['old', 'new']}

In [None]:
# Transform test set: categories not kept during fit (e.g. 'ancient') are replaced by 'Rare'
rare_label_encoder.transform(X_test)

Unnamed: 0_level_0,temperature,pressure,due_maintenance,age_status
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,85.0,6000,Yes,new
7,110.0,10500,Yes,old
8,,3300,No,Rare


In [None]:
# Transform new data
# NOTE(review): other cells use X_new for the new data; df_future_unique (from the Data
# section) holds the same feature columns and no target — confirm which one is intended
rare_label_encoder.transform(df_future_unique)

Unnamed: 0_level_0,temperature,pressure,due_maintenance,age_status
tool_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,12,7500,No,new
