---
# Notebook para preparar os dados e construir novas variáveis
---

# Imports

In [1]:
import pandas as pd
import numpy as np
import os
import datetime
from pathlib import Path
import warnings



# Pandas display settings: show every column and allow wide cell contents
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", 200)

# Silence all warnings for the rest of the session
warnings.filterwarnings("ignore")

In [2]:
from src.utils.features_manager import get_features_by_property

In [3]:
# Parent of the current working directory
# (assumes the notebook is run from a direct subfolder of the project root — TODO confirm)
project_root = Path.cwd().resolve().parents[0]

In [4]:
# Load the raw dataset from <project_root>/data/raw/dados.csv
df = pd.read_csv(project_root / "data" / "raw" / "dados.csv")

# Fix datatypes

In [5]:
# Location of the YAML file describing each feature (kept as a plain string)
features_config_path = str(project_root.joinpath("src", "data", "config", "features.yaml"))

In [6]:
# Features flagged created=False in the config. Looked up once here instead of
# once per candidate feature inside the filter below (each lookup goes through
# get_features_by_property again).
not_created_features = set(
    get_features_by_property(features_config_path, property_name="created", property_value=False)
)

# For every declared feature type, keep only the features that are flagged
# created=False (presumably the columns already present in the raw file — confirm
# against features.yaml).
dtypes = {
    kind: [
        feature
        for feature in get_features_by_property(features_config_path, property_name="type", property_value=kind)
        if feature in not_created_features
    ]
    for kind in ("binary", "categorical", "numerical", "datetime")
}

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeaturesType(BaseEstimator, TransformerMixin):
    """Cast DataFrame columns to the dtype associated with their feature kind.

    Parameters
    ----------
    dtypes : dict
        Mapping with the keys ``"numerical"``, ``"binary"``, ``"categorical"``
        and ``"datetime"``; each value is the list of column names of that kind.
    """

    def __init__(self, dtypes):
        # Stored under the constructor argument's own name so that
        # BaseEstimator.get_params() / clone() / repr() can find it.
        self.dtypes = dtypes

    @property
    def map_dtypes(self):
        """Target dtype -> columns to cast, derived from ``self.dtypes``."""
        return {
            np.float64: self.dtypes["numerical"],
            np.float32: self.dtypes["binary"],
            str: self.dtypes["categorical"],
            "datetime64[ns]": self.dtypes["datetime"],
        }

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the configured columns cast.

        The input frame is left untouched. A column is only cast when its
        current dtype differs from the target one.
        """
        X = X.copy()
        for dtype, features in self.map_dtypes.items():
            for feature in features:
                # NOTE(review): on pandas versions where astype(str) turns NaN
                # into the literal "nan", missing categorical values stop being
                # null after this cast — confirm this is intended.
                if X[feature].dtype != dtype:
                    X[feature] = X[feature].astype(dtype)

        return X

    def fit_transform(self, X, y=None):
        """Fit (no-op) and transform ``X`` in one call."""
        return self.fit(X, y).transform(X)

In [8]:
# Cast every configured column to its target dtype
encoder = FeaturesType(dtypes)
df = encoder.fit(df).transform(df)

In [9]:
# Inspect the column dtypes after the casts applied in the previous cell
df.dtypes

a                float64
b                float64
c                float64
d                float64
e                float64
f                float64
g                 object
h                float64
i                 object
j                 object
k                float64
l                float64
m                float64
n                float32
o                 object
p                 object
fecha     datetime64[ns]
monto            float64
score              int64
fraude             int64
dtype: object

# Missings

## Numerical

In [10]:
# Columns flagged fill_numeric_missing=True in the features config
features_num_missing = get_features_by_property(
    features_config_path,
    property_name="fill_numeric_missing",
    property_value=True,
)

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin

class NumericMissing(BaseEstimator, TransformerMixin):
    """Fill missing values in the selected columns with a sentinel value.

    Parameters
    ----------
    num_features : list
        Names of the columns whose missing values are filled.
    fill_value : float, default=-10.0
        Value written in place of missing entries. The default of -10.0 was
        chosen by the original author as a value outside every feature's
        domain (not verified here).
    """

    def __init__(self, num_features, fill_value=-10.0):
        # Stored under the constructor argument names so that
        # BaseEstimator.get_params() / clone() / repr() work.
        self.num_features = num_features
        self.fill_value = fill_value

    @property
    def features(self):
        """Read-only alias of ``num_features`` (the attribute name used before)."""
        return self.num_features

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values filled in the selected columns.

        The input frame is left untouched.
        """
        X = X.copy()
        for feature in self.num_features:
            X[feature] = X[feature].fillna(self.fill_value)

        return X

    def fit_transform(self, X, y=None):
        """Fit (no-op) and transform ``X`` in one call."""
        return self.fit(X, y).transform(X)

In [12]:
# Fill missing values in the configured columns
encoder = NumericMissing(features_num_missing)
df = encoder.fit(df).transform(df)

In [13]:
# Count of remaining nulls per filled column
df[features_num_missing].isna().sum()

b    0
c    0
d    0
e    0
f    0
g    0
l    0
m    0
o    0
dtype: int64

## Categorical

Vamos manter os missings por enquanto, uma vez que o próprio OptBinning já faz a definição dos missings em um bin.

# Criando semana do ano

In [15]:
def get_week_from_date(date: datetime.date) -> int:
    """
    Return the ISO week number of the year for the given date.

    Args:
        date (datetime.date): Date to extract the week from.

    Returns:
        int: ISO week of the year (1-53).
    """
    # isocalendar() yields (ISO year, ISO week, ISO weekday)
    _, iso_week, _ = date.isocalendar()
    return iso_week

# Calendar date (time of day dropped) taken from the 'fecha' column
df["date"] = pd.to_datetime(df["fecha"]).dt.date
# ISO week number for each date
df["week_of_the_year"] = df["date"].apply(get_week_from_date)