In [1]:
#!/usr/bin/env python -W ignore::DeprecationWarning
import warnings
warnings.filterwarnings("ignore")
import sys
sys.path.append('../src')
from sklearn.impute import SimpleImputer
from feature_selection import FeatureSelection
from sklearn.preprocessing import OneHotEncoder
from config import Configure
import pandas as pd
import numpy as np

def qcutting(df, col_names):
    """Replace the given columns with quartile labels.

    Each column in ``col_names`` is ranked (ties broken by order of
    appearance) and the ranks are cut into four equal-sized bins labelled
    ``"0"`` to ``"3"``. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to bin.
    col_names : iterable
        Names of the columns of ``df`` to replace by their quartile label.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every listed column is categorical.
    """
    quartile_labels = ["0", "1", "2", "3"]
    data = df.copy()
    for name in col_names:
        # Ranking with method='first' gives unique ranks, so qcut never
        # sees duplicate bin edges even when the raw values are tied.
        ranks = df[name].rank(method='first')
        data[name] = pd.qcut(ranks, 4, labels=quartile_labels)
    return data

def qt_socios_cut(x):
    """Return ``False`` when ``x`` is greater than 1, ``True`` otherwise.

    Values for which the comparison ``x > 1`` is false — including NaN —
    map to ``True``.
    """
    return not (x > 1)
    
    
def df_transform(df):
    """Impute, bin and one-hot encode a data frame.

    The steps are applied to a copy, so the caller's frame is left untouched:

    1. ``qt_socios`` is replaced by the boolean flag from ``qt_socios_cut``.
    2. Missing values are filled with the column mode for ``bool``/``object``
       columns and with the column median for every other column.
    3. ``float64``/``int64`` columns are replaced by quartile labels via
       ``qcutting``.
    4. Every column is one-hot encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain a ``qt_socios`` column.

    Returns
    -------
    numpy.ndarray
        Dense one-hot encoded matrix with one row per row of ``df``.
    """
    # Work on a copy. Mutating the caller's frame made the function
    # non-idempotent: on a second call `qt_socios` was already boolean, so
    # `qt_socios_cut` mapped every value to True and the encoding changed.
    data = df.copy()
    data['qt_socios'] = data['qt_socios'].apply(qt_socios_cut)
    columns = data.columns
    for col in columns:
        var_type = data[col].dtype
        if (var_type == 'bool') | (var_type == 'object'):
            data[col] = data[col].fillna(data[col].mode().values[0])
        else:
            data[col] = data[col].fillna(data[col].median())
    numeric_cols = columns[(data.dtypes == 'float64') | (data.dtypes == 'int64')]
    data = qcutting(data, numeric_cols)
    # `categorical_features='all'` was the default and the keyword was removed
    # in scikit-learn 0.22; `sparse` was renamed to `sparse_output` in 1.2 and
    # removed in 1.4. Using the default (sparse) encoder and densifying the
    # result yields the same dense matrix on every release.
    encoder = OneHotEncoder()
    X = encoder.fit_transform(data).toarray()
    return X

## Leitura e pré-processamento

In [2]:
# Announce the step on stdout (the message is user-facing and stays as is).
print('\n Aplicando algoritmo de seleção de parâmetros')
# Build the project configuration object and call its two setup methods
# before reading the parameter attributes back.
# NOTE(review): presumably the set_* calls populate the attributes read
# below — confirm in config.py.
settings = Configure()
settings.set_fs_params()
settings.set_pre_processing_params()
pp_params = settings.pre_processing_params
fs_params = settings.feature_selection_params
# Load the four CSV inputs from the locations held by the configuration.
# NOTE(review): despite the `_folder` suffix these attributes are passed
# straight to pd.read_csv, so they are presumably file paths — confirm.
df1 = pd.read_csv(settings.pf1_folder)
df2 = pd.read_csv(settings.pf2_folder)
df3 = pd.read_csv(settings.pf3_folder)
mkt = pd.read_csv(settings.mkt_folder)


 Aplicando algoritmo de seleção de parâmetros


## Seleção de parâmetros com LASSO

In [3]:
# Run the project's feature-selection routine with method 'LASSO' on the
# market frame and the three portfolio frames; the call returns a pair that
# is unpacked into `values` and `features`.
# NOTE(review): presumably `values` holds scores and `features` the selected
# column names — confirm in feature_selection.py.
fs = FeatureSelection(mkt,  df1, df2, df3, pp_params, fs_params)
values, features = fs.feature_selection_algorithm(m='LASSO')

In [4]:
# Drop the 'Unnamed: 0' entry from the selected features.
# NOTE(review): presumably the index column written by a previous to_csv —
# confirm against the input files.
columns = features[features != 'Unnamed: 0']
# Keep the 'id' column of the market frame aside (not used in this cell).
id_column = mkt['id']
# Restrict the market frame to the selected features.
mkt_lasso = mkt[columns]
# Report the maximum of `values` returned by the selection step.
print(" Resultado aproximado : ", values.max())
# Last expression of the cell: display the first three rows.
mkt_lasso.head(3)

 Resultado aproximado :  0.9295


Unnamed: 0,vl_faturamento_estimado_aux,de_saude_tributaria,fl_rm,fl_ltda,fl_telefone,vl_total_veiculos_leves_grupo,natureza_juridica_macro,fl_spa,dt_situacao,fl_veiculo,fl_optante_simei
0,3132172.8,VERDE,SIM,False,True,0.0,ENTIDADES EMPRESARIAIS,False,2005-03-25,False,False
1,210000.0,CINZA,SIM,False,True,0.0,OUTROS,False,2017-05-12,False,
2,50000.0,AMARELO,SIM,False,True,0.0,OUTROS,False,2011-09-26,False,True


## Seleção de parâmetros com RFECV

In [5]:
# Run the project's feature-selection routine again, this time with method
# 'RFECV'. A fresh FeatureSelection object is built from the same inputs and
# the returned pair overwrites `values` and `features` from the LASSO run.
# NOTE(review): presumably `values` holds scores and `features` the selected
# column names — confirm in feature_selection.py.
fs = FeatureSelection(mkt,  df1, df2, df3, pp_params, fs_params)
values, features = fs.feature_selection_algorithm(m='RFECV')

In [6]:
# Drop the 'Unnamed: 0' entry from the selected features.
# NOTE(review): presumably the index column written by a previous to_csv —
# confirm against the input files.
columns = features[features != 'Unnamed: 0']
# Keep the 'id' column of the market frame aside (not used in this cell).
id_column = mkt['id']
# Restrict the market frame to the features selected by RFECV.
mkt_rfe_cv = mkt[columns]
# Report the maximum of `values` returned by the selection step.
print(" Resultado aproximado : ", values.max())
# Last expression of the cell: display the first three rows.
mkt_rfe_cv.head(3)

 Resultado aproximado :  0.8317694307173836


Unnamed: 0,vl_faturamento_estimado_aux,de_saude_tributaria,fl_rm,vl_total_veiculos_leves_grupo,fl_optante_simples,qt_filiais,vl_total_veiculos_pesados_grupo,fl_ltda,natureza_juridica_macro,fl_antt,idade_media_socios,qt_socios,de_nivel_atividade,qt_socios_feminino,fl_sa,fl_passivel_iss,fl_telefone
0,3132172.8,VERDE,SIM,0.0,True,0,0.0,False,ENTIDADES EMPRESARIAIS,False,44.0,2.0,ALTA,,False,True,True
1,210000.0,CINZA,SIM,0.0,,0,0.0,False,OUTROS,False,27.0,1.0,BAIXA,,False,True,True
2,50000.0,AMARELO,SIM,0.0,True,0,0.0,False,OUTROS,False,32.0,1.0,MEDIA,,False,True,True
