---
# Notebook para preparar os dados e construir novas variáveis
---

# Imports

In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import warnings
from sklearn.base import BaseEstimator, TransformerMixin

from src.utils.features_manager import get_features_by_property

# Pandas display settings: show every column and allow up to 200 characters per cell.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", 200)

# Silence every warning for the rest of the notebook session.
warnings.filterwarnings("ignore")

In [2]:
# Parent of the resolved current working directory.
# NOTE(review): presumably the notebook is run from a folder one level below the project root — confirm.
project_root = Path().resolve().parent

In [3]:
# Load the interim training table from <project_root>/data/interim/train.parquet.
df = pd.read_parquet(project_root / "data" / "interim" / "train.parquet")

In [4]:
# Location of the feature catalogue (kept as a plain string path).
features_config_path = os.fspath(project_root / "src" / "data" / "config" / "features.yaml")

# Novas Variáveis

## Tempo

In [5]:
# Calendar date and clock time are currently not extracted:
# df['date'] = df['fecha'].dt.date
# df['time'] = df['fecha'].dt.time

# Hour of the day and day of the week taken from the 'fecha' timestamp.
df['hour'] = df['fecha'].dt.hour
df['weekday'] = df['fecha'].dt.dayofweek

In [6]:
# 1.0 for operations whose hour falls in 22-24 or 0-5 (inclusive bounds), 0.0 otherwise.
df['dawn_operation'] = (
    df['hour'].between(0, 5) | df['hour'].between(22, 24)
).astype(float)

## Tamanho do título do anúncio

In [7]:
# Number of characters in column 'i' (the listing title); the column name keeps its original spelling.
df['lenght_i'] = df['i'].map(len)

## Razões de valor

In [8]:
def create_value_ratios(df, features_list):
    """
    Add one ``monto_div_<col>`` column per requested feature, holding ``monto / <col>``.

    Rows whose denominator equals zero receive NaN instead of the raw
    division result. The new columns are written onto ``df`` itself.

    Args:
        df (pd.DataFrame): Frame containing 'monto' and every column named in ``features_list``.
        features_list (list): Names of the columns used as denominators.

    Returns:
        pd.DataFrame: The same ``df`` object, now carrying the ratio columns.
    """
    amount = df["monto"]

    for denominator_name in features_list:
        denominator = df[denominator_name]
        ratio = amount.div(denominator)
        # Keep the ratio only where the denominator is non-zero.
        df[f"monto_div_{denominator_name}"] = np.where(denominator.ne(0), ratio, np.nan)

    return df

In [9]:
# Denominators for the ratio features: columns flagged as both descriptive and numerical.
# The numerical names are queried once so the lookup is not repeated for every candidate feature.
# NOTE(review): assumes get_features_by_property returns an iterable of hashable feature names — confirm.
numerical_features = set(
    get_features_by_property(features_config_path, property_name="type", property_value="numerical")
)

ratio_features = [
    feat
    for feat in get_features_by_property(features_config_path, property_name="role", property_value="descriptive")
    if feat in numerical_features
    and feat != "monto"  # 'monto' is the numerator of every ratio, so it is not used as a denominator
]

In [10]:
# Append one monto_div_<col> column for every selected denominator.
df = create_value_ratios(df=df, features_list=ratio_features)

In [11]:
df.shape

(114506, 35)

## Booleanas

In [12]:
# 1.0 when both 'o' and 'p' hold the raw flag 'N', 0.0 otherwise.
# Computed while the two columns still contain their original string values.
df['N_op'] = (df['o'].eq('N') & df['p'].eq('N')).astype(float)

In [13]:
# Encode the 'o' / 'p' flags as floats: 'N' -> 0.0, affirmative -> 1.0.
# Both 'S' and 'Y' are treated as the affirmative flag; any other value (including missing) becomes NaN.
df['o'] = df['o'].map({'N': 0., 'S': 1., 'Y': 1.})
df['p'] = df['p'].map({'N': 0., 'S': 1., 'Y': 1.})

In [14]:
# Boolean flags marking rows whose value lies strictly below a fixed cut-off.
df["f_lower"] = df['f'].lt(0.50)
df["l_lower"] = df['l'].lt(140.50)
df["m_lower"] = df['m'].lt(4.50)
df["n_lower"] = df['n'].lt(0.50)

In [15]:
df.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,fecha,monto,fraude,index,week_of_the_year,hour,weekday,dawn_operation,lenght_i,monto_div_a,monto_div_b,monto_div_c,monto_div_d,monto_div_e,monto_div_f,monto_div_h,monto_div_k,monto_div_l,monto_div_m,N_op,f_lower,l_lower,m_lower,n_lower
0,4.0,0.7685,94436.24,20.0,0.444828,1.0,BR,5.0,Máquininha Corta Barba Cabelo Peito Perna Pelos Pézinho Nova,cat_8d714cd,0.883598,240.0,102.0,1.0,,0.0,2020-03-27 11:51:16,5.64,0,0,13,11,4,0.0,60,1.41,7.338972,6e-05,0.282,12.67907,5.64,1.128,6.382994,0.0235,0.055294,0.0,False,False,False,False
1,4.0,0.7455,242549.09,3.0,0.0,19.0,AR,23.0,Bicicleta Mountain Fire Bird Rodado 29 Aluminio Shimano 21v,cat_e9110c5,0.516368,1779.0,77.0,1.0,,0.0,2020-03-25 18:13:38,339.32,0,2,13,18,2,0.0,59,84.83,455.157612,0.001399,113.106667,,17.858947,14.753043,657.128086,0.190736,4.406753,0.0,False,False,False,False
2,2.0,0.7315,5728.68,15.0,0.0,1.0,BR,2.0,Resident Evil Operation Raccoon City Ps3,cat_6c4cfdc,0.855798,1025.0,150.0,1.0,,0.0,2020-04-02 10:24:45,3.53,0,4,14,10,3,0.0,40,1.765,4.825701,0.000616,0.235333,,3.53,1.765,4.124804,0.003444,0.023533,0.0,False,False,False,False
3,4.0,0.5962,7121.78,2.0,0.398,0.0,BR,11.0,"Corpinho Avulso Joseph, Josepha Ou Placa Sem Sexo",cat_5d6059e,0.204991,127.0,125.0,0.0,,0.0,2020-03-22 19:20:24,10.56,0,6,12,19,6,0.0,49,2.64,17.712177,0.001483,5.28,26.532663,,0.96,51.514416,0.08315,0.08448,0.0,True,True,False,True
4,4.0,0.6806,1656.95,50.0,1.043077,0.0,BR,11.0,Tripa Para Fazer Linguiça - 45 Metros Long Short,cat_e686ce3,0.56923,363.0,224.0,0.0,,0.0,2020-04-12 11:49:54,6.13,0,7,15,11,6,0.0,48,1.5325,9.006759,0.0037,0.1226,5.876844,,0.557273,10.768929,0.016887,0.027366,0.0,True,False,False,True


In [16]:
df.shape

(114506, 40)

In [None]:
class BuildFeatures(BaseEstimator, TransformerMixin):
    """
    Transformer that derives the engineered columns built in this notebook.

    With ``inference=False`` it adds time-of-day columns, optional ``monto``
    ratio columns, numeric encodings of the 'o' / 'p' flags and boolean
    threshold flags. With ``inference=True`` the input is returned untouched.
    """

    # Cut-offs behind the boolean "<column>_lower" flags (True when value < cut-off).
    _LOWER_THRESHOLDS = {"f": 0.50, "l": 140.50, "m": 4.50, "n": 0.50}

    # Numeric encoding of the 'o' / 'p' flags. Both 'S' and 'Y' count as
    # affirmative; values missing from this mapping become NaN.
    _FLAG_ENCODING = {"N": 0., "S": 1., "Y": 1.}

    def __init__(self, inference=False, ratio_features: list = None):
        """
        Initializes the BuildFeatures transformer.

        Args:
            inference (bool): Flag indicating if the transformer is used for inference. Defaults to False.
            ratio_features (list): List of columns to create ratio features. Defaults to None,
                in which case no ratio column is created.
        """
        self.inference = inference
        self.ratio_features = ratio_features

    def fit(self, X, y=None):
        """
        Return the transformer unchanged; nothing is learned from the data.

        Args:
            X (pd.DataFrame): Ignored.
            y: Ignored.

        Returns:
            BuildFeatures: ``self``.
        """
        return self

    @staticmethod
    def create_value_ratios(df, features_list):
        """
        Creates columns of ratios between 'monto' and other numeric columns in the DataFrame.

        One ``monto_div_<col>`` column is written onto ``df`` for every name in
        ``features_list``; rows whose denominator equals zero receive NaN.

        Args:
            df (pd.DataFrame): Input DataFrame; it is modified in place.
            features_list (list): Names of the columns used as denominators.

        Returns:
            pd.DataFrame: The same DataFrame with the new ratio columns added.
        """
        for col in features_list:
            df[f"monto_div_{col}"] = np.where(df[col] != 0, df["monto"] / df[col], np.nan)

        return df

    def create_train_features(self, X):
        """
        Creates training features from the input DataFrame.

        The input is copied first, so the caller's DataFrame is left unchanged.

        Args:
            X (pd.DataFrame): Input DataFrame. Must contain 'fecha' (datetime),
                'o', 'p', 'f', 'l', 'm', 'n' and, when ratio features are
                requested, 'monto' plus every column in ``ratio_features``.

        Returns:
            pd.DataFrame: A new DataFrame with the training features added.
        """
        # Work on a copy so the transformer never mutates the caller's frame.
        X = X.copy()

        X['hour'] = X['fecha'].dt.hour
        X['weekday'] = X['fecha'].dt.weekday

        # 1.0 for operations whose hour falls in 22-24 or 0-5 (inclusive), else 0.0.
        X['dawn_operation'] = np.where(X['hour'].between(22, 24) | X['hour'].between(0, 5), 1., .0)

        # NOTE(review): the title-length feature ('lenght_i') from the exploratory
        # cells is not built here — confirm whether that is intended.

        if self.ratio_features is not None:
            X = BuildFeatures.create_value_ratios(X, self.ratio_features)

        # Must be computed while 'o' and 'p' still hold their raw string flags.
        X['N_op'] = np.where((X['o'] == 'N') & (X['p'] == 'N'), 1., .0)

        for flag_column in ('o', 'p'):
            X[flag_column] = X[flag_column].map(self._FLAG_ENCODING)

        for column, cutoff in self._LOWER_THRESHOLDS.items():
            X[f"{column}_lower"] = X[column] < cutoff

        return X

    def create_inference_features(self, X):
        """
        Creates inference features from the input DataFrame.

        Currently a pass-through: the input is returned as-is.

        Args:
            X (pd.DataFrame): Input DataFrame.

        Returns:
            pd.DataFrame: The same object that was passed in.
        """
        return X

    def transform(self, X):
        """
        Build the features for ``X`` according to the ``inference`` flag.

        Args:
            X (pd.DataFrame): Input DataFrame.

        Returns:
            pd.DataFrame: Output of ``create_inference_features`` when
            ``inference`` is truthy, otherwise of ``create_train_features``.
        """
        if self.inference:
            return self.create_inference_features(X)
        return self.create_train_features(X)

    def fit_transform(self, X, y=None):
        """
        Fit (a no-op) and transform ``X`` in one call.

        Args:
            X (pd.DataFrame): Input DataFrame.
            y: Ignored.

        Returns:
            pd.DataFrame: Result of ``transform(X)``.
        """
        return self.fit(X, y).transform(X)

In [18]:
# Reload the full interim dataset and run it through the transformer in training mode.
df = pd.read_parquet(project_root / "data" / "interim" / "dados.parquet")

build_features = BuildFeatures(ratio_features=ratio_features, inference=False)
build_features.fit_transform(df).shape

(150000, 40)