# 0.0 IMPORTS

In [110]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import pickle
import pandera
from pandera import Check, Column, DataFrameSchema, errors, error_handlers
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser
from sklearn.base import BaseEstimator, TransformerMixin
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.pipeline import Pipeline


## 0.1 Utils

In [111]:
# Column names expected in the raw training data, in the same order as they
# appear in the loaded frame; handed to DataValidation as the reference list.
columns_to_use = ['target', 'TaxaDeUtilizacaoDeLinhasNaoGarantidas', 'Idade',
       'NumeroDeVezes30-59DiasAtrasoNaoPior', 'TaxaDeEndividamento',
       'RendaMensal', 'NumeroDeLinhasDeCreditoEEmprestimosAbertos',
       'NumeroDeVezes90DiasAtraso', 'NumeroDeEmprestimosOuLinhasImobiliarias',
       'NumeroDeVezes60-89DiasAtrasoNaoPior', 'NumeroDeDependentes']

In [112]:
def to_snake_case(column_name):
    """
    Convert a CamelCase column name to snake_case.

    An underscore is inserted in front of every ASCII uppercase letter except
    one at the very start of the string, then the whole name is lower-cased.
    Digits and other characters stay where they are, so
    ``'NumeroDeVezes90DiasAtraso'`` becomes ``'numero_de_vezes90_dias_atraso'``.

    Args:
        column_name (str): The column name to convert.

    Returns:
        str: The column name converted to snake_case.
    """
    # Zero-width match at every position that is followed by an uppercase
    # letter and is not the beginning of the string.
    uppercase_boundary = r'(?<!^)(?=[A-Z])'
    separated = re.sub(uppercase_boundary, '_', column_name)
    return separated.lower()

class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Replace each category with its relative frequency in the fitting data.

    Parameters:
        columns (list-like of str, optional): Columns to encode. When ``None``,
            every column of the frame passed to ``fit`` is encoded.

    Attributes:
        freq_map (dict): After ``fit``, maps each encoded column name to a
            pandas Series of relative frequencies indexed by category.
        columns_ (list): Columns actually encoded, resolved during ``fit``.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.freq_map = {}

    def fit(self, X, y=None):
        """Learn the relative frequency of every category in each column.

        Args:
            X (pd.DataFrame): Frame containing the columns to encode.
            y: Ignored; accepted so the encoder can be used inside a Pipeline.

        Returns:
            FrequencyEncoder: The fitted encoder itself.
        """
        # Resolve the columns into a fitted attribute instead of overwriting
        # the constructor parameter, so fitting again on a different frame
        # with columns=None picks up that frame's columns.
        columns = X.columns if self.columns is None else self.columns
        self.columns_ = list(columns)
        # Rebuild the map from scratch so a re-fit never keeps stale entries.
        self.freq_map = {
            column: X[column].value_counts(normalize=True)
            for column in self.columns_
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns frequency-encoded.

        Categories that were not present when the encoder was fitted (and
        missing values, which ``value_counts`` does not count) are encoded as
        ``0.0`` rather than NaN, so the result can be fed to estimators that
        reject NaN.

        Args:
            X (pd.DataFrame): Frame containing the fitted columns.

        Returns:
            pd.DataFrame: Encoded copy; ``X`` itself is not modified.
        """
        X_encoded = X.copy()
        for column in self.columns_:
            frequencies = self.freq_map[column]
            X_encoded[column] = X[column].map(frequencies).astype(float).fillna(0.0)
        return X_encoded

# 1.0 Data Load

In [113]:
class DataLoad:
    """Load the raw training data from a CSV file.

    Args:
        path: Location of the CSV file to read. Defaults to
            ``'../data/raw/train.csv'``, the file this notebook was written
            against, so existing ``DataLoad()`` calls keep working.
    """
    def __init__(self, path: str = '../data/raw/train.csv')->None:
        self.path = path

    def load_data(self)->pd.DataFrame:
        """Read the CSV at ``self.path``, using its first column as the index.

        Returns:
            pandas DataFrame with the contents of the file.
        """
        loaded_data = pd.read_csv(self.path, index_col=0)
        return loaded_data

In [114]:
dl = DataLoad()
df = dl.load_data()

In [115]:
df.head()

Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


# 2.0 Data validation

In [116]:
columns_to_use

['target',
 'TaxaDeUtilizacaoDeLinhasNaoGarantidas',
 'Idade',
 'NumeroDeVezes30-59DiasAtrasoNaoPior',
 'TaxaDeEndividamento',
 'RendaMensal',
 'NumeroDeLinhasDeCreditoEEmprestimosAbertos',
 'NumeroDeVezes90DiasAtraso',
 'NumeroDeEmprestimosOuLinhasImobiliarias',
 'NumeroDeVezes60-89DiasAtrasoNaoPior',
 'NumeroDeDependentes']

In [117]:
class DataValidation:
    """Validate that a raw dataframe has the expected columns and column types.

    Args:
        columns_to_use: Expected column names, in the expected order.
    """
    def __init__(self, columns_to_use)->None:
        self.columns_to_use = columns_to_use

    def check_shape_data(self, dataframe: pd.DataFrame) -> bool:
        """Check that the dataframe's columns are exactly ``columns_to_use``.

        The dataframe is only inspected; its column labels are never
        overwritten, so a frame with unexpected names is reported instead of
        being silently renamed.

        Args:
            dataframe: Frame to check.

        Returns:
            True when the column names and their order match, False otherwise
            (including when ``dataframe`` has no usable ``columns``).
        """
        print('Validação iniciou')
        try:
            expected_columns = list(self.columns_to_use)
            found_columns = list(dataframe.columns)
        except (AttributeError, TypeError) as e:
            print(f'Validação falhou: {e}')
            return False

        if found_columns != expected_columns:
            print(
                f'Validação falhou: colunas esperadas {expected_columns}, '
                f'colunas encontradas {found_columns}'
            )
            return False
        return True

    def check_columns(self, dataframe: pd.DataFrame) -> bool:
        """Validate column dtypes and allowed values with a pandera schema.

        Args:
            dataframe: Frame to validate.

        Returns:
            True when the schema validates, False when pandera reports
            failures (the failure cases are printed).
        """
        schema = DataFrameSchema(
            {
             # The target is binary, so membership in {0, 1} is the only value
             # check. A stray third positional argument (a `x > 0` check that
             # would reject every 0) was never applied as a check and is gone.
             'target': Column(int, Check.isin([0, 1]), coerce = True),
             'TaxaDeUtilizacaoDeLinhasNaoGarantidas': Column(float, nullable = True),
             'Idade': Column(int, nullable = True),
             'NumeroDeVezes30-59DiasAtrasoNaoPior': Column(int, nullable = True),
             'TaxaDeEndividamento': Column(float, nullable = True),
             'RendaMensal': Column(float, nullable = True),
             'NumeroDeLinhasDeCreditoEEmprestimosAbertos': Column(int, nullable = True),
             'NumeroDeVezes90DiasAtraso': Column(int, nullable = True),
             'NumeroDeEmprestimosOuLinhasImobiliarias': Column(int, nullable = True),
             'NumeroDeVezes60-89DiasAtrasoNaoPior': Column(int, nullable = True),
             'NumeroDeDependentes': Column(float, nullable = True)
            }
        )
        try:
            # lazy=True makes pandera collect every failure and raise
            # SchemaErrors, which is what the handler below catches; the
            # default eager mode raises SchemaError on the first failure.
            schema.validate(dataframe, lazy=True)
            print("Validação de colunas passou...")
            return True
        except pandera.errors.SchemaErrors as exc:
            print("Validação de colunas falhou...")
            # failure_cases is a dataframe describing each failed check.
            print(exc.failure_cases)
            return False
        
    def run(self, dataframe: pd.DataFrame) -> bool:
        """Run the column-name check followed by the schema check.

        The schema check is skipped when the column-name check fails.

        Returns:
            True only when both checks pass.
        """
        if self.check_shape_data(dataframe) and self.check_columns(dataframe):
            print("Validação com sucesso")
            return True
        else:
            print("Validação falhou")
            return False


In [118]:
dv = DataValidation(columns_to_use)
dv.run(df)

Validação iniciou
Validação de colunas passou...
Validação com sucesso


True

# 3.0 Data Transformation

In [119]:
class DataTransformation:
    """Rename columns, derive banded features and split into train/validation.

    Args:
        dataframe: Raw dataframe with CamelCase column names.
        target_name: Name of the target column after the snake_case renaming.
        test_size: Fraction of the rows held out for validation.
        random_state: Seed for the split. Fixed by default so that re-running
            the notebook yields the same split; pass ``None`` for a different
            split on every run.
    """

    # (exclusive upper bound, label) pairs; each band starts where the
    # previous one ends, and the first band starts at 0.
    _INCOME_BANDS = (
        (1000, '0-1k'),
        (5000, '1k-5k'),
        (20000, '5k-20k'),
        (100000, '20k-100k'),
        (1000000, '100k-1mi'),
    )
    _DEPENDENTS_BANDS = (
        (2, '0-2'),
        (5, '2-5'),
        (10, '5-10'),
    )

    def __init__(self, dataframe: pd.DataFrame,
                       target_name: str,
                       test_size: float = 0.2,
                       random_state=42):
        
        self.dataframe = dataframe
        self.target_name = target_name
        self.test_size = test_size
        self.random_state = random_state

    @staticmethod
    def _band_label(value, bands, overflow_label):
        """Return the label of the band that contains ``value``."""
        if pd.isna(value):
            return 'não especificado'
        lower_bound = 0
        for upper_bound, label in bands:
            if lower_bound <= value < upper_bound:
                return label
            lower_bound = upper_bound
        # Values at or above the last bound get the overflow label. So does
        # any negative value, because no band covers it.
        # NOTE(review): labelling negatives as overflow looks unintended - confirm.
        return overflow_label

    def column_name_change(self):
        """Return a copy of ``self.dataframe`` with snake_case column names."""
        dataframe_formatted_columns = self.dataframe.rename(columns=lambda x: to_snake_case(x))
        return dataframe_formatted_columns

    def feature_engineering(self, dataframe_imported: pd.DataFrame) -> pd.DataFrame:
        """Replace monthly income and number of dependents by banded labels.

        Adds ``faixa_renda_mensal`` and ``faixa_numero_dependentes`` and drops
        ``renda_mensal`` and ``numero_de_dependentes``. Missing values get the
        label ``'não especificado'``.

        Args:
            dataframe_imported: Frame with snake_case column names. It is not
                modified.

        Returns:
            A new dataframe with the two banded columns appended.
        """
        monthly_income = dataframe_imported['renda_mensal']
        dependents = dataframe_imported['numero_de_dependentes']

        # drop() returns a new frame, so the caller's frame stays untouched.
        featured = dataframe_imported.drop(columns=['renda_mensal', 'numero_de_dependentes'])
        featured['faixa_renda_mensal'] = monthly_income.apply(
            lambda value: self._band_label(value, self._INCOME_BANDS, '1mi+')
        )
        featured['faixa_numero_dependentes'] = dependents.apply(
            lambda value: self._band_label(value, self._DEPENDENTS_BANDS, '10+')
        )
        return featured
    
    def train_test_splitting(self, featured_dataframe: pd.DataFrame) -> tuple:
        """Split features and target into stratified train/validation sets.

        Returns:
            Tuple ``(X_train, X_valid, y_train, y_valid)``.
        """
        X = featured_dataframe.drop(self.target_name, axis = 1)
        y = featured_dataframe[self.target_name]

        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y,
            test_size = self.test_size,
            stratify = y,
            random_state = self.random_state,
        )

        return X_train, X_valid, y_train, y_valid

    def run(self):
        """Rename columns, build the banded features and split the data.

        Returns:
            Tuple ``(X_train, X_valid, y_train, y_valid)``.
        """
        dataframe_formatted_columns = self.column_name_change()
        featured_dataframe = self.feature_engineering(dataframe_formatted_columns)
        X_train, X_valid, y_train, y_valid = self.train_test_splitting(featured_dataframe)

        return X_train, X_valid, y_train, y_valid
        

In [120]:
dt = DataTransformation(df, 'target')

In [121]:
X_train, X_valid, y_train, y_valid = dt.run()

# 4.0 Data Preprocess

In [122]:
class DataPreprocess:
    """Fit a preprocessing pipeline on a dataframe and apply it to that frame.

    Args:
        dataframe: Frame the pipeline is fitted on and then transformed.
        pipe: Pipeline to fit; it is fitted in place.
    """

    def __init__(self, dataframe: pd.DataFrame, pipe: Pipeline):
        self.pipe = pipe
        self.dataframe = dataframe

    def pipeline(self):
        """Fit ``self.pipe`` on ``self.dataframe`` and return that same object.

        Every call fits the pipeline again.
        """
        self.pipe.fit(self.dataframe)
        return self.pipe

    def run(self):
        """Fit the pipeline and return ``self.dataframe`` transformed by it."""
        print('Preprocessador iniciou...')
        return self.pipeline().transform(self.dataframe)

In [123]:
# Single-step pipeline: frequency-encode the two banded columns.
pipe = Pipeline(steps = [('frequency_encoder',FrequencyEncoder(columns=['faixa_renda_mensal','faixa_numero_dependentes']))])

In [124]:
# Fit the pipeline on the training split and transform that same split.
dp = DataPreprocess(X_train, pipe)
X_train_processed = dp.run()

Preprocessador iniciou...


In [125]:
X_train_processed.head()

Unnamed: 0,taxa_de_utilizacao_de_linhas_nao_garantidas,idade,numero_de_vezes30-59_dias_atraso_nao_pior,taxa_de_endividamento,numero_de_linhas_de_credito_e_emprestimos_abertos,numero_de_vezes90_dias_atraso,numero_de_emprestimos_ou_linhas_imobiliarias,numero_de_vezes60-89_dias_atraso_nao_pior,faixa_renda_mensal,faixa_numero_dependentes
61767,0.228795,41,1,0.582055,31,0,1,0,0.430883,0.754192
49368,0.886187,63,1,0.405899,9,0,0,0,0.32555,0.754192
113763,0.671307,69,6,0.212591,11,2,0,0,0.32555,0.754192
86883,0.001917,44,0,0.953034,4,0,2,0,0.32555,0.754192
5072,0.027907,63,0,2001.0,5,0,2,0,0.198642,0.754192


In [126]:
# Persist the fitted preprocessing pipeline. The pipeline is obtained before
# the file is opened, and the context manager closes the file handle as soon
# as the dump finishes.
fitted_preprocessor = dp.pipeline()
with open('preprocessor.pkl', 'wb') as preprocessor_file:
    pickle.dump(fitted_preprocessor, preprocessor_file)

# 5.0 Train Models

In [152]:
class TrainModels:
    """Fit a model, persist it with pickle and predict from the persisted copy.

    Args:
        dados_x: Training features.
        dados_y: Training target.
        model_path: File the fitted model is pickled to by ``train`` and read
            back from by ``predict``. Defaults to ``'modelo.pkl'``.
    """

    def __init__(self, dados_x: pd.DataFrame,
                       dados_y: pd.DataFrame,
                       model_path: str = 'modelo.pkl'):
        
        self.dados_x = dados_x
        self.dados_y = dados_y
        self.model_path = model_path

    def train(self, model):
        """Fit ``model`` on the stored training data and pickle it.

        Args:
            model: Object exposing ``fit(X, y)``.

        Returns:
            The same ``model`` object, fitted.
        """
        model.fit(self.dados_x, self.dados_y)
        # The context manager guarantees the file is flushed and closed.
        with open(self.model_path, 'wb') as model_file:
            pickle.dump(model, model_file)
        return model
    
    def _load_model(self):
        """Load the model previously written by ``train``."""
        # NOTE: pickle.load can execute arbitrary code contained in the file;
        # only load model files produced by this notebook.
        with open(self.model_path, 'rb') as model_file:
            return pickle.load(model_file)
    
    def predict(self, dados_input: pd.DataFrame):
        """Return ``predict_proba`` of the persisted model for ``dados_input``.

        The model is re-loaded from ``model_path`` on every call.
        """
        model_fitted = self._load_model()
        return model_fitted.predict_proba(dados_input)


In [154]:
tm = TrainModels(dados_x=X_train_processed, 
                 dados_y=y_train)

In [155]:
# NOTE(review): with these default settings the solver stops at its iteration
# limit (see the warning printed below); raising max_iter or scaling the
# features, as that warning suggests, is worth trying.
tm.train(model=LogisticRegression())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [156]:
y_train_pred = tm.predict(X_train_processed)

In [157]:
y_train_pred

array([[0.9632003 , 0.0367997 ],
       [0.95560925, 0.04439075],
       [0.62179397, 0.37820603],
       ...,
       [0.94172965, 0.05827035],
       [0.97051579, 0.02948421],
       [0.97872693, 0.02127307]])

# 6.0 Models Evaluation

In [158]:
# pipeline() fits dp's pipeline on its training frame again and returns that
# same pipeline object.
preprocessor = dp.pipeline()

In [159]:
X_valid_processed = preprocessor.transform(X_valid)

In [160]:
y_valid_pred = tm.predict(X_valid_processed)

In [161]:
class ModelEvaluation:
    """Compute evaluation metrics for model predictions."""

    def __init__(self):
        """No state is kept by the evaluator."""

    def eval_metrics(self, dados_reais, dados_preditos):
        """Return the ROC AUC of ``dados_preditos`` against ``dados_reais``.

        Args:
            dados_reais: True labels.
            dados_preditos: Predicted scores for the positive class.
        """
        return roc_auc_score(dados_reais, dados_preditos)

In [150]:
me = ModelEvaluation()

In [162]:
me.eval_metrics(y_valid, y_valid_pred[:,1])

0.6867114674467608

# 7.0 Experiments

## 7.1 Experiment 1

In [163]:
# Step 1: fit the preprocessing pipeline on the training split
pipe = Pipeline(steps = [('frequency_encoder',FrequencyEncoder(columns=['faixa_renda_mensal','faixa_numero_dependentes']))])
dp = DataPreprocess(X_train, pipe)
X_train_processed = dp.run()

#-------------------#
# Step 2: train and persist the model
tm = TrainModels(dados_x=X_train_processed, 
                 dados_y=y_train)
tm.train(model=LogisticRegression())

#-------------------#
# Step 3: score the validation split with THIS experiment's preprocessor and
# model. Predictions are recomputed here; reusing the y_valid_pred left over
# from an earlier cell would just report the previous model's score again.
X_valid_processed = pipe.transform(X_valid)
y_valid_pred = tm.predict(X_valid_processed)
me = ModelEvaluation()
me.eval_metrics(y_valid, y_valid_pred[:,1])

Preprocessador iniciou...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6867114674467608