---
# Notebook para criar os encoders e gerar a base transformada usada no ajuste do modelo

---

# Imports

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import warnings

from src.utils.features_manager import get_features_by_property
from src.utils.transformers import (
    FeaturesType,
    NumericMissing,
    OptBinningEncoder,
    BuildFeatures
)

# Pandas display settings: show every column and allow long cell contents
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", 200)

# NOTE(review): this silences every warning for the whole session, including
# pandas chained-assignment warnings; consider narrowing the filter.
warnings.filterwarnings("ignore")

In [2]:
# Use the parent of the current working directory as the project root
notebook_dir = Path().resolve()
project_root = notebook_dir.parents[0]

In [3]:
# Read data/interim/train.parquet (relative to the project root) into `df`
train_path = os.path.join(project_root, "data", "interim", "train.parquet")
df = pd.read_parquet(train_path)

In [4]:
# Path to the YAML file that every get_features_by_property call below reads from
features_config_path = os.path.join(project_root, "src", "data", "config", "features.yaml")

# Criando Encoders

## Features Types

In [5]:
# Candidate columns for each declared type, as listed in the features YAML
features_dtypes = {
    dtype: get_features_by_property(features_config_path, property_name="type", property_value=dtype)
    for dtype in ("binary", "categorical", "numerical", "datetime")
}

# Features flagged `created: False` (presumably raw columns rather than
# engineered ones — confirm against the YAML). The lookup does not depend on
# the loop variables, so it is resolved once instead of once per feature.
not_created_features = set(get_features_by_property(features_config_path, "created", False))

# Keep only features that are not created and that actually exist in the frame
features_dtypes = {
    dtype: [
        feature for feature in features
        if feature in not_created_features and feature in df.columns
    ]
    for dtype, features in features_dtypes.items()
}

features_type = FeaturesType(dtypes=features_dtypes)
features_type.fit(df)
df = features_type.transform(df)

## Build Features

In [6]:
# Both lookups are independent of the loop variable, so resolve them once
# instead of re-querying the YAML for every candidate feature.
descriptive_features = get_features_by_property(features_config_path, property_name="role", property_value="descriptive")
numerical_features = set(get_features_by_property(features_config_path, property_name="type", property_value="numerical"))

ratio_features = [
    feat
    for feat in descriptive_features  # descriptive features
    if feat in numerical_features  # numerical features
    # NOTE(review): the original comment calls `monto` the target feature, but the
    # encoder cell fits against `fraude`; presumably `monto` is excluded because it
    # is the numerator of the ratio features — confirm.
    and feat != "monto"
    and feat in df.columns  # check if feature is in the dataframe
]

In [7]:
# Fit the feature builder on the current frame and replace `df` with its output.
# NOTE(review): `inference=False` presumably selects training-time behaviour — confirm in BuildFeatures.
build_features = BuildFeatures(inference=False, ratio_features=ratio_features)
build_features.fit(df)
df = build_features.transform(df)

## Fill Numeric Missing

In [8]:
# Columns flagged `fill_numeric_missing: True` in the features YAML
features_fill_numeric = get_features_by_property(
    yaml_path=features_config_path,
    property_name="fill_numeric_missing",
    property_value=True,
)

# Fit the missing-value filler on the current frame, then apply it
fill_numeric = NumericMissing(features_fill_numeric)
fill_numeric.fit(df)
df = fill_numeric.transform(df)

## OptBinningEncoding

In [9]:
# Columns flagged `encode: True` in the features YAML
features_to_encode = get_features_by_property(
    yaml_path=features_config_path,
    property_name="encode",
    property_value=True,
)

# The encoder is fitted against the `fraude` column of the same frame
fraud_target = df["fraude"]
string_encoder = OptBinningEncoder(features_to_encode)
string_encoder.fit(df, fraud_target)
df = string_encoder.transform(df)

## Feature Selector

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin

class Selector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only a fixed set of columns.

    Parameters
    ----------
    features : list
        Column labels to keep; the output columns follow this order.
    """

    def __init__(self, features):
        self.features = features

    def fit(self, X, y=None):
        """Learn nothing from the data and return the instance itself."""
        return self

    def transform(self, X):
        """Return a new frame containing only ``self.features``.

        An explicit copy is returned so that callers can add or overwrite
        columns on the result without triggering pandas chained-assignment
        problems or touching the input frame.

        Raises
        ------
        KeyError
            If any label in ``self.features`` is missing from ``X``
            (raised by the pandas column lookup).
        """
        return X[self.features].copy()

In [11]:
# Name of the selection method; it picks which `selected_by_*` flag is read from the YAML
seletor = 'boruta'
selection_flag = f"selected_by_{seletor}"
features_selected = get_features_by_property(
    yaml_path=features_config_path,
    property_name=selection_flag,
    property_value=True,
)

# Selector.fit returns the instance itself, so construction and fit can be chained
selector = Selector(features=features_selected).fit(df)
df = selector.transform(df)

# Executando os encoders em sequência para gerar a base codificada

Antes, vamos salvar as colunas auxiliares que serão utilizadas também.

In [12]:
# Reload the dataframe from disk, discarding the transformations applied to `df` above
train_path = os.path.join(project_root, "data", "interim", "train.parquet")
df = pd.read_parquet(train_path)

In [13]:
# Auxiliary-role features from the YAML, restricted to columns present in the frame
features_auxiliares = [
    feat
    for feat in get_features_by_property(
        yaml_path=features_config_path,
        property_name="role",
        property_value="auxiliary",
    )
    if feat in df.columns
]

# Independent snapshot of those columns
df_aux = df[features_auxiliares].copy()

In [25]:
# Fitted transformers keyed by step name; insertion order is the order in which
# they were fitted above.
transformers = dict(
    feature_type=features_type,
    build_features=build_features,
    fill_numeric=fill_numeric,
    string_encoder=string_encoder,
    selector=selector,
)

In [29]:
def apply_encoders_transformations(X, transformers, features_auxiliares):
    """Apply fitted transformers in sequence and re-attach auxiliary columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it must contain every column in ``features_auxiliares``.
    transformers : dict
        Mapping of step name to an already-fitted object exposing
        ``transform(X)``. Steps run in the dict's insertion order.
    features_auxiliares : list
        Columns of ``X`` to carry over, untouched, onto the transformed output.

    Returns
    -------
    pandas.DataFrame
        Output of the last transformer with the auxiliary columns taken from
        the original ``X`` added (or overwritten), aligned on the index.
    """
    # Snapshot the auxiliary columns from the frame that was passed in, not from
    # a notebook-level variable, so the function works for any input frame.
    aux_columns = X[features_auxiliares].copy()

    transformed = X
    for name, transformer in transformers.items():
        print(f"Applying {name} transformation")
        transformed = transformer.transform(transformed)

    # Work on a copy: the last transformer may hand back a slice of its input
    # (or the input itself), and assigning columns onto that would be chained
    # assignment or would mutate the caller's data.
    transformed = transformed.copy()
    if len(features_auxiliares) > 0:
        transformed[features_auxiliares] = aux_columns

    return transformed

In [30]:
# Run every fitted transformer over the reloaded frame and re-attach the auxiliary columns
df_transformed = apply_encoders_transformations(df, transformers, features_auxiliares)

Applying feature_type transformation
Applying build_features transformation
Applying fill_numeric transformation
Applying string_encoder transformation
Applying selector transformation


In [31]:
# (rows, columns) of the transformed frame
df_transformed.shape

(114506, 36)

In [32]:
# Preview the first rows of the transformed frame
df_transformed.head()

Unnamed: 0,monto,a,b,c,d,e,f,g,h,l,m,n,o,p,hour,dawn_operation,monto_div_a,monto_div_b,monto_div_c,monto_div_d,monto_div_e,monto_div_f,monto_div_h,monto_div_k,monto_div_l,monto_div_m,monto_div_hour,monto_div_weekday,N_op,f_lower,l_lower,m_lower,n_lower,fecha,index,week_of_the_year
0,5.64,4.0,0.7685,94436.24,20.0,0.444828,1.0,-0.093884,5.0,240.0,102.0,1.0,-100.0,0.0,11,0.0,1.41,7.338972,6e-05,0.282,12.67907,5.64,1.128,6.382994,0.0235,0.055294,0.512727,1.41,0.0,False,False,False,False,2020-03-27 11:51:16,0,13
1,339.32,4.0,0.7455,242549.09,3.0,0.0,19.0,0.361803,23.0,1779.0,77.0,1.0,-100.0,0.0,18,0.0,84.83,455.157612,0.001399,113.106667,-100.0,17.858947,14.753043,657.128086,0.190736,4.406753,18.851111,169.66,0.0,False,False,False,False,2020-03-25 18:13:38,2,13
2,3.53,2.0,0.7315,5728.68,15.0,0.0,1.0,-0.093884,2.0,1025.0,150.0,1.0,-100.0,0.0,10,0.0,1.765,4.825701,0.000616,0.235333,-100.0,3.53,1.765,4.124804,0.003444,0.023533,0.353,1.176667,0.0,False,False,False,False,2020-04-02 10:24:45,4,14
3,10.56,4.0,0.5962,7121.78,2.0,0.398,0.0,-0.093884,11.0,127.0,125.0,0.0,-100.0,0.0,19,0.0,2.64,17.712177,0.001483,5.28,26.532663,-100.0,0.96,51.514416,0.08315,0.08448,0.555789,1.76,0.0,True,True,False,True,2020-03-22 19:20:24,6,12
4,6.13,4.0,0.6806,1656.95,50.0,1.043077,0.0,-0.093884,11.0,363.0,224.0,0.0,-100.0,0.0,11,0.0,1.5325,9.006759,0.0037,0.1226,5.876844,-100.0,0.557273,10.768929,0.016887,0.027366,0.557273,1.021667,0.0,True,False,False,True,2020-04-12 11:49:54,7,15
