## Imports

In [1]:
import sys
import pandas as pd
import numpy as np
import pickle

from functools import partial

import sklearn.pipeline as skpip
from sklearn.compose import ColumnTransformer
from category_encoders import WOEEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Import my libs
sys.path.append('../')

from src.features.balance import BalanceMixin
from src.features.correlation import HighCorrelation_filter
from src.features.variance import NearZeroVar_filter, LowVar_Filter
from src.features.passthrough import Passthrough

## Load the modelling tuple (`modelisationTuple`)

In [2]:
# Deserialize the object stored in '../data/processed/modelling_tuple' and bind it to M
# (the cells below read M.train_X and M.train_y from it).
# NOTE(review): pickle.load can execute arbitrary code contained in the file --
# only load artifacts that come from a trusted source.
with open('../data/processed/modelling_tuple', mode='rb') as handle:
    M = pickle.load(handle)

## Preprocessing Steps

#### Helper function used in preprocessing

In [3]:
def _intersect(df: pd.DataFrame, target: list) -> list:
    """
    Intersection between a list and the set of columns of a dataframe.

    The result follows the column order of ``df`` (not the order of ``target``);
    entries of ``target`` that are not columns of ``df`` are ignored.

    Args:
        df (pd.DataFrame): The dataframe with which we want to make the intersection.
        target (list): The array with which we want to make the intersection.

    Returns:
        list: The columns of ``df`` that also appear in ``target``.
    """

    # Build the lookup once so each membership test is O(1) instead of a scan of the list.
    try:
        wanted = set(target)
    except TypeError:
        # target holds unhashable entries: fall back to plain list membership
        wanted = target

    return [c for c in df.columns if c in wanted]

####  Split the features to be processed by their types

In [4]:
#M.train_X.dtypes

In [5]:
# variables that do not require preprocessing
# NOTE(review): [''] looks like a "no such column" placeholder -- confirm; it selects nothing
# only as long as no column is literally named ''.
passthrough_cols = ['']

# non-ordinal / categorical variables
categorical_cols = ['']

# quantitative variables: every float/int column that is neither passthrough nor categorical.
# Filter in dataframe column order instead of using a set difference, so the resulting list is
# identical from one run to the next (iteration order of a set of strings is not stable across
# interpreter runs). dict.fromkeys keeps the de-duplication the set-based version provided.
_excluded_cols = set(passthrough_cols) | set(categorical_cols)
quantitative_cols = list(dict.fromkeys(
    c for c in M.train_X.select_dtypes(include=['float', 'int']).columns
    if c not in _excluded_cols
))

#### Quick sanity checks

In [6]:
# Count the NaN values of every column, then display only the columns that have at least one
test = M.train_X.isna().sum().to_frame(name='nb')
test.loc[test['nb'] > 0]

Unnamed: 0,nb


In [7]:
 # Check if dataframe contains infinity
# np.isinf is only defined for numeric data: restrict the check to the numeric columns so that
# an object / categorical column does not make the whole check raise a TypeError.
test = np.isinf(M.train_X.select_dtypes(include='number')).sum().to_frame('nb')
test[test['nb'] > 0]

Unnamed: 0,nb


In [8]:
 # prevalence
y = M.train_y

# Relative frequency and absolute count of each target value, sorted in descending order
class_share = y.value_counts(normalize=True, ascending=False).rename('normalized')
class_count = y.value_counts(normalize=False, ascending=False).rename('number')

# Shares are rounded to 4 decimals and then expressed in percent
out = pd.concat([np.round(class_share, 4) * 100, class_count], axis=1)
out

Unnamed: 0,normalized,number
0.0,99.82,217485
1.0,0.18,392


#### Preprocessor

In [9]:
def build_data_preprocessor(passthrough_cols: list, categorical_cols: list, quantitative_cols: list, scale=False):
    """
    Build the (unfitted) data preprocessing pipeline.

    The returned pipeline has two stages:
      1. 'lowvar_transformer': a NearZeroVar_filter applied to the whole input;
      2. 'columns_transformers': a ColumnTransformer that sends each group of columns
         (quantitative / passthrough / categorical) to its own sub-pipeline.

    Args:
        passthrough_cols (list):  list of variables that do not require preprocessing.
        categorical_cols (list):  list of non-ordinal / categorical variables.
        quantitative_cols (list): list of quantitative variables.
        scale (boolean, optional): Should we standardize the quantitative variables? Defaults to False.

    Returns:
        sklearn.pipeline.Pipeline: the assembled, not yet fitted, preprocessing pipeline.
    """

    # Selectors are callables evaluated on the data each stage actually receives, so a group
    # simply shrinks when some of its expected columns are not present.
    select_quantitative = partial(_intersect, target=quantitative_cols)
    select_passthrough = partial(_intersect, target=passthrough_cols)
    select_categorical = partial(_intersect, target=categorical_cols)

    # Steps applied to the quantitative columns; scaling is appended last and only on request
    quantitative_steps = [
        ('low-var', LowVar_Filter(equisample=True)),
        ('high-correlation', HighCorrelation_filter(equisample=True)),
        ('imputer', SimpleImputer(strategy='median')),
    ]
    if scale:
        quantitative_steps.append(('scaler', StandardScaler()))

    # One sub-pipeline per group of columns.
    # Columns that are not picked by any selector are dropped from the output.
    per_type_transformers = ColumnTransformer(
        transformers=[
            ('quantitative', skpip.Pipeline(steps=quantitative_steps), select_quantitative),
            ('passthrough', skpip.Pipeline(steps=[('passthrough', Passthrough())]), select_passthrough),
            ('categorical', skpip.Pipeline(steps=[('encoder', WOEEncoder())]), select_categorical),
        ],
        verbose_feature_names_out=False,
    )

    # Filter applied to every column before the per-type processing
    whole_frame_filter = skpip.Pipeline(steps=[
        ('NZV', NearZeroVar_filter(equisample=True)),
    ])

    return skpip.Pipeline(steps=[
        ('lowvar_transformer', whole_frame_filter),
        ('columns_transformers', per_type_transformers),
    ])

## Smoke test

In [10]:
# Build the preprocessing pipeline (no scaling) and fit it on M.train_X / M.train_y
preprocessor = build_data_preprocessor(
    passthrough_cols=passthrough_cols,
    categorical_cols=categorical_cols,
    quantitative_cols=quantitative_cols,
    scale=False,
)
preprocessor.fit(M.train_X, M.train_y)

In [11]:
# Print the type of the whole pipeline, then the type of its 'columns_transformers' step
for inspected in (preprocessor, preprocessor.named_steps['columns_transformers']):
    print(type(inspected))

<class 'sklearn.pipeline.Pipeline'>
<class 'sklearn.compose._column_transformer.ColumnTransformer'>


In [12]:
# Extract the column names from the FINAL layer of the transformer,
# i.e. the names of the features the fitted pipeline outputs
preprocessor.get_feature_names_out()

array(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
       'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V18', 'V19',
       'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28',
       'Amount'], dtype=object)

In [13]:
# Transform the training features and wrap the result in a labelled DataFrame.
# The index of M.train_X is reused: without it the new frame gets a fresh 0..n-1 index and
# no longer aligns row-wise with M.train_y when the training data carries a non-default index.
# np.asarray strips any index the transformer output may carry, so the original index is
# attached by position instead of triggering a reindex.
# NOTE(review): assumes transform() returns exactly one row per input row -- confirm for the
# custom filters.
X_transformed = preprocessor.transform(M.train_X)
X_transformed = pd.DataFrame(
    np.asarray(X_transformed),
    columns=preprocessor.get_feature_names_out(),
    index=M.train_X.index,
)