In [1]:
import sys
sys.path.append(r'C:\Users\gustavo\Documents\Data Science\08-GitHub\Portifolio/Classification/dsa_single_model')
import json
import pandas as pd
import yaml
import pickle
import os
from utils.feat_eng_pipeline import feat_eng_pipeline
from sklearn.model_selection import train_test_split


# Load the project settings from the YAML configuration file.
yaml_path = r"C:\Users\gustavo\Documents\Data Science\08-GitHub\Portifolio\Classification\dsa_single_model\src\config.yaml"
with open(yaml_path, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# The two config sections this step reads most of its values from.
fs_io = config['feat_selection']
fs_params = config['feat_selection_params']

# Dataset locations: each one is the shared base directory joined with the
# file name stored under the matching config entry.
params = {
    name: os.path.join(fs_io['path'], fs_io[entry])
    for name, entry in (
        ('input_data', 'input'),
        ('output_x_train', 'X_train'),
        ('output_x_val', 'X_val'),
        ('output_y_train', 'y_train'),
        ('output_y_val', 'y_val'),
    )
}

# Split settings and variable lists are copied over under the same key names.
params.update({
    key: fs_params[key]
    for key in (
        'random_state',
        'val_size',
        'cols_2_drop',
        'num_var',
        'num_var_1',
        'num_var_2',
        'cat_var',
        'target',
    )
})

# Remaining entries come from other sections or use a different key name.
params['pipe'] = config['pipe_feat_eng']['path']
params['reports'] = config['save_reports']['path_reports']
params['pipe_version'] = fs_params['pipe_version']
    

In [2]:
# Read the input dataset and discard the columns listed under 'cols_2_drop'.
df = pd.read_parquet(params['input_data']).drop(columns=params['cols_2_drop'])

In [3]:
# Display the loaded frame (the configured columns have already been dropped).
df

Unnamed: 0,num_gestacoes,glicose,pressao_sanguinea,grossura_pele,insulina,bmi,indice_historico,idade,classe
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
595,0,188.0,82.0,14.0,185.0,32.0,0.682,22,1
596,0,67.0,76.0,,,45.3,0.194,46,0
597,1,89.0,24.0,19.0,25.0,27.8,0.559,21,0
598,1,173.0,74.0,,,36.8,0.088,38,1


In [4]:
# Separate the predictors from the target column, then hold out a validation
# split using the size and seed taken from the config.
features = df.drop(columns=params['target'])
target = df[params['target']]

X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=params['val_size'],
    random_state=params['random_state'],
)

In [5]:
# Build the project's feature-engineering pipeline from the two configured
# variable lists, passed through under the same keyword names.
pipe_kwargs = {name: params[name] for name in ('num_var_1', 'num_var_2')}
pipe = feat_eng_pipeline(**pipe_kwargs)

In [6]:
# Render the pipeline to inspect its steps and their parameters.
pipe

0,1,2
,steps,"[('medianbyytransformer', ...), ('decisiontreediscretiser', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,feature_cols,"['insulina', 'glicose']"

0,1,2
,variables,'insulina'
,bin_output,'prediction'
,precision,
,cv,3
,scoring,'neg_mean_squared_error'
,param_grid,
,regression,False
,random_state,

0,1,2
,capping_method,'iqr'
,tail,'right'
,fold,'auto'
,add_indicators,False
,variables,"['num_gestacoes', 'indice_historico', ...]"
,missing_values,'raise'

0,1,2
,transformers,"[('inputer_pipe', ...), ('numerical_pipe', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,n_neighbors,5
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,imputation_method,'median'
,variables,"['num_gestacoes', 'indice_historico', ...]"


In [7]:
 
# Fit the feature-engineering pipeline on the training split only; the target
# is passed along because fit() receives y_train as its second argument.
pipe.fit(X_train, y_train)

0,1,2
,steps,"[('medianbyytransformer', ...), ('decisiontreediscretiser', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,feature_cols,"['insulina', 'glicose']"

0,1,2
,variables,'insulina'
,bin_output,'prediction'
,precision,
,cv,3
,scoring,'neg_mean_squared_error'
,param_grid,
,regression,False
,random_state,

0,1,2
,capping_method,'iqr'
,tail,'right'
,fold,'auto'
,add_indicators,False
,variables,"['num_gestacoes', 'indice_historico', ...]"
,missing_values,'raise'

0,1,2
,transformers,"[('inputer_pipe', ...), ('numerical_pipe', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,n_neighbors,5
,weights,'uniform'
,metric,'nan_euclidean'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,imputation_method,'median'
,variables,"['num_gestacoes', 'indice_historico', ...]"


In [7]:
# Apply the pipeline to the training split first, then to the validation split.
X_train_trans, X_val_trans = (pipe.transform(split) for split in (X_train, X_val))

In [8]:
# Display the transformed training features.
X_train_trans

Unnamed: 0,inputer_pipe__insulina,inputer_pipe__glicose,inputer_pipe__grossura_pele,inputer_pipe__bmi,numerical_pipe__num_gestacoes,numerical_pipe__pressao_sanguinea,numerical_pipe__indice_historico,numerical_pipe__idade
108,18.0,83.0,31.0,34.3,3.000000,58.0,0.336,25.0
352,136.0,61.0,28.0,34.4,3.000000,82.0,0.243,46.0
238,136.0,164.0,21.0,30.8,9.000000,84.0,0.831,32.0
298,184.0,100.0,25.0,36.6,12.755519,78.0,0.412,46.0
300,136.0,167.0,29.5,32.3,0.000000,0.0,0.839,30.0
...,...,...,...,...,...,...,...,...
237,136.0,179.0,27.0,44.1,0.000000,90.0,0.686,23.0
31,245.0,158.0,36.0,31.6,3.000000,76.0,0.851,28.0
488,136.0,99.0,17.0,25.6,4.000000,72.0,0.294,28.0
40,70.0,180.0,25.0,34.0,3.000000,64.0,0.271,26.0


In [None]:
# Pull the two variable lists out of params so they can be used as plain names.
num_var_1, num_var_2 = params['num_var_1'], params['num_var_2']

In [None]:
from sklearn.pipeline import make_pipeline
from feature_engine.imputation import MeanMedianImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from feature_engine.outliers import Winsorizer
from feature_engine.transformation import LogCpTransformer
from sklearn.preprocessing import MinMaxScaler
from feature_engine.discretisation import DecisionTreeDiscretiser
from utils.functions import MedianByYTransformer
from feature_engine.scaling import MeanNormalizationScaler
from sklearn.impute import KNNImputer


   
    # numerical var
# Median imputation for the variables listed in num_var_2.
median = make_pipeline(
    MeanMedianImputer(imputation_method='median', variables=num_var_2)
)

# KNN imputation with default settings, wrapped in its own one-step pipeline.
inputer = KNNImputer()
inputer_pipe = make_pipeline(inputer)

# Route num_var_1 through the KNN imputer and num_var_2 through the median
# imputer; the transformer names become prefixes of the output column names.
preprocessor_1 = ColumnTransformer(
    transformers=[
        ("inputer_pipe", inputer_pipe, num_var_1),
        ("numerical_pipe", median, num_var_2),
    ]
)

# set_output returns the transformer itself, so configuring it first and then
# wrapping it is the same as passing the call's result to make_pipeline.
preprocessor_1.set_output(transform="pandas")

# NOTE(review): this rebinds `pipe`, replacing the pipeline that was built
# earlier in the notebook with feat_eng_pipeline.
pipe = make_pipeline(preprocessor_1)
    


In [None]:
# Fit the current `pipe` on the training split and return the transformed
# training data in a single call.
pipe.fit_transform(X_train, y_train)