---
# Notebook para criar os encoders e gerar a base transformada usada no ajuste do modelo

---

# Imports

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import warnings

from src.utils.features_manager import get_features_by_property
from src.utils.transformers import (
    FeaturesType,
    NumericMissing,
    OptBinningEncoder,
    BuildFeatures
)

# Pandas display settings: show every column and allow up to 200 characters per cell.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", 200)

# Silence all warnings for the rest of the session.
warnings.filterwarnings(action="ignore")

In [2]:
# The project root is taken to be the parent of the current working directory.
working_dir = Path().resolve()
project_root = working_dir.parents[0]

In [3]:
# Load the interim training set from <project_root>/data/interim/train.parquet.
train_path = os.path.join(project_root, "data", "interim", "train.parquet")
df = pd.read_parquet(train_path)

In [4]:
# Path of the YAML file queried by get_features_by_property throughout the notebook.
config_parts = ("src", "data", "config", "features.yaml")
features_config_path = os.path.join(project_root, *config_parts)

# Criando Encoders

## Features Types

In [5]:
# Features whose "created" property is False in the config. Queried once and kept
# as a set so the config is not looked up again for every candidate feature.
not_created_features = set(get_features_by_property(features_config_path, "created", False))

# For each declared type, keep the features that are not created and that are
# actually present in the loaded frame. Key order matches the declared types below.
features_dtypes = {}
for dtype in ("binary", "categorical", "numerical", "datetime"):
    declared = get_features_by_property(features_config_path, property_name="type", property_value=dtype)
    features_dtypes[dtype] = [
        feature for feature in declared
        if feature in not_created_features
        and feature in df.columns
    ]

# Fit the type transformer on the training frame and replace `df` with its output.
features_type = FeaturesType(dtypes=features_dtypes)
features_type.fit(df)
df = features_type.transform(df)

## Build Features

In [6]:
# Candidate inputs for the ratio features: descriptive features that are also
# numerical, are present in the frame, and are not "monto".
descriptive_features = get_features_by_property(features_config_path, property_name="role", property_value="descriptive")
# Queried once up front so the config is not looked up again for every candidate.
numerical_features = set(get_features_by_property(features_config_path, property_name="type", property_value="numerical"))

ratio_features = [
    feat for feat in descriptive_features
    if feat in numerical_features  # numerical features only
    # NOTE(review): the original comment calls "monto" the target feature, but the
    # encoder further down is fit against "fraude" — confirm which is intended.
    and feat != "monto"
    and feat in df.columns  # feature must exist in the dataframe
]

In [7]:
# Fit the feature builder on the training frame, then replace `df` with its output.
# NOTE(review): `inference=False` presumably selects training-time behaviour —
# confirm in src.utils.transformers.BuildFeatures.
build_features = BuildFeatures(inference=False, ratio_features=ratio_features)
build_features.fit(df)
df = build_features.transform(df)

## Fill Numeric Missing

In [8]:
# Features whose "fill_numeric_missing" property is True in the config.
features_fill_numeric = get_features_by_property(features_config_path, "fill_numeric_missing", True)
# Fit the missing-value transformer on those features and replace `df` with its output.
# NOTE(review): the fill strategy lives in NumericMissing and is not visible here.
fill_numeric = NumericMissing(features_fill_numeric)
fill_numeric.fit(df)
df = fill_numeric.transform(df)

## OptBinningEncoding

In [9]:
# Features whose "encode" property is True in the config.
features_to_encode = get_features_by_property(yaml_path=features_config_path, property_name="encode", property_value=True)
string_encoder = OptBinningEncoder(features_to_encode)
# Supervised fit: the "fraude" column is passed as the second (y) argument.
string_encoder.fit(df, df["fraude"])
df = string_encoder.transform(df)

## Feature Selector

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin

class Selector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only a fixed list of columns.

    Parameters
    ----------
    features : list
        Column labels to keep, in the order they should appear in the output.
    """

    def __init__(self, features):
        self.features = features

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the instance itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured ``features``."""
        selected = X[self.features]
        return selected

In [12]:
# Name of the selection method; it picks the "selected_by_<method>" property in the config.
seletor = 'boruta'
selection_property = f"selected_by_{seletor}"
features_selected = get_features_by_property(
    yaml_path=features_config_path,
    property_name=selection_property,
    property_value=True,
)

# Keep only the selected columns and replace `df` with the reduced frame.
selector = Selector(features=features_selected)
selector.fit(df)
df = selector.transform(df)

# Executando encoders em sequência, para gerar base encodada

Antes, vamos recarregar a base original e salvar as colunas auxiliares, que serão anexadas novamente à base transformada ao final.

In [13]:
# Reload the training frame: `df` was overwritten by the fitting steps above.
train_path = os.path.join(project_root, "data", "interim", "train.parquet")
df = pd.read_parquet(train_path)

In [14]:
# Features whose role is "auxiliary" in the config, restricted to columns present in `df`.
auxiliary_candidates = get_features_by_property(
    yaml_path=features_config_path,
    property_name="role",
    property_value="auxiliary",
)
features_auxiliares = [feat for feat in auxiliary_candidates if feat in df.columns]

# Independent copy of the auxiliary columns.
df_aux = df[features_auxiliares].copy()

In [15]:
# Fitted transformers keyed by step name; insertion order is the order they are applied in.
transformers = dict([
    ("feature_type", features_type),
    ("build_features", build_features),
    ("fill_numeric", fill_numeric),
    ("string_encoder", string_encoder),
    ("selector", selector),
])

In [16]:
def apply_encoders_transformations(X, transformers, features_auxiliares):
    """Run the fitted transformers in order and re-attach the auxiliary columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; must contain every column named in ``features_auxiliares``.
    transformers : dict
        Mapping of step name -> object exposing ``transform(X)``. Steps are
        applied in the mapping's iteration order.
    features_auxiliares : list
        Column names saved from ``X`` before transforming and written back
        onto the result afterwards.

    Returns
    -------
    pandas.DataFrame
        Output of the last transformer with the auxiliary columns attached.
        The frame passed in by the caller is not modified by this function.

    Raises
    ------
    KeyError
        If an auxiliary column is missing from ``X``.
    """
    # Snapshot the auxiliary columns from the argument itself rather than from
    # a notebook-level `df`, so the function works for any frame passed in.
    aux_columns = X[features_auxiliares].copy()

    for name, transformer in transformers.items():
        print(f"Applying {name} transformation")
        X = transformer.transform(X)

    # A transformer may return a slice of its input (or, with no transformers,
    # X is still the caller's frame); copy before adding columns.
    X = X.copy()
    if len(features_auxiliares) > 0:
        # Assignment aligns on the index, as in the original implementation.
        X[features_auxiliares] = aux_columns

    return X

In [17]:
# Apply every fitted step to the reloaded frame and re-attach the auxiliary columns.
df_transformed = apply_encoders_transformations(
    X=df,
    transformers=transformers,
    features_auxiliares=features_auxiliares,
)

Applying feature_type transformation
Applying build_features transformation
Applying fill_numeric transformation
Applying string_encoder transformation
Applying selector transformation


In [18]:
# Sanity check: (rows, columns) of the transformed frame.
df_transformed.shape

(114506, 3)