# 0.0. Imports

In [1]:
import joblib
import pandas as pd
import pandera
from sklearn.model_selection import train_test_split
from pandera import Check, Column, DataFrameSchema
from sklearn.pipeline import Pipeline
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

## 0.1. Utils

In [2]:
columns_to_use = ['target', 'TaxaDeUtilizacaoDeLinhasNaoGarantidas',
       'Idade', 'NumeroDeVezes30-59DiasAtrasoNaoPior', 'TaxaDeEndividamento',
       'RendaMensal', 'NumeroDeLinhasDeCreditoEEmprestimosAbertos',
       'NumeroDeVezes90DiasAtraso', 'NumeroDeEmprestimosOuLinhasImobiliarias',
       'NumeroDeVezes60-89DiasAtrasoNaoPior', 'NumeroDeDependentes']

# 1.0. Data load

In [3]:
class DataLoad:
    """Class data load"""
    
    def __init__(self) -> None:
        pass 
    
    def load_data(self) -> pd.DataFrame:
        """Funcao vai carregar os dados
        
        return:
            pandas DataFrame"""
            
        loaded_data = pd.read_csv('../data/raw/train.csv')
        return loaded_data

In [4]:
dl = DataLoad()

In [5]:
df = dl.load_data()[columns_to_use]

In [6]:
df.head()

Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


# 2.0. Data validation

In [7]:
class DataValidation:
    """"""
    def __init__(self, columns_to_use) -> None:
        self.columns_to_use = columns_to_use
        
    def check_shape_data(self, dataframe: pd.DataFrame) -> bool:
        try:
            print('Validacao iniciou')
            dataframe.columns = self.columns_to_use
            return True 
        except Exception as e:
            print(f'Validacao errou: {e}')
            return False
        
    def check_columns(self, dataframe: pd.DataFrame) -> bool:
        schema = DataFrameSchema(
                {
                    "target": Column(int, Check.isin([0, 1]), Check(lambda x: x > 0), coerce=True),
                    "TaxaDeUtilizacaoDeLinhasNaoGarantidas": Column(float, nullable=True),
                    "Idade": Column(int, nullable=True),
                    "NumeroDeVezes30-59DiasAtrasoNaoPior": Column(int, nullable=True),
                    "TaxaDeEndividamento": Column(float, nullable=True),
                    "RendaMensal": Column(float, nullable=True),
                    "NumeroDeLinhasDeCreditoEEmprestimosAbertos": Column(int, nullable=True),
                    "NumeroDeVezes90DiasAtraso": Column(int, nullable=True),
                    "NumeroDeEmprestimosOuLinhasImobiliarias": Column(int, nullable=True),
                    "NumeroDeVezes60-89DiasAtrasoNaoPior": Column(int, nullable=True),
                    "NumeroDeDependentes": Column(float, nullable=True)
                }
            )
        try:
            schema.validate(dataframe)
            print("Validation columns passed...")
            return True
        except pandera.errors.SchemaErrors as exc:
            print("Validation columns failed...")
            pandera.display(exc.failure_cases)
        return False
    
    def run(self, dataframe: pd.DataFrame) -> bool:
        if self.check_shape_data(dataframe) and self.check_columns(dataframe):
            print('Validacao com sucesso.')
            return True 
        else:
            print('Validacao falhou.')
            return False

In [8]:
dv = DataValidation(columns_to_use)

In [9]:
dv.run(df)

Validacao iniciou
Validation columns passed...
Validacao com sucesso.


True

# 3.0. Data Transformation

In [10]:
class DataTransformation:
    def __init__(self, dataframe: pd.DataFrame, 
                       target_name: str):
        self.dataframe = dataframe 
        self.target_name = target_name 
        
    def train_test_spliting(self):
        X = self.dataframe.drop(self.target_name, axis=1)
        y = self.dataframe[self.target_name]
        
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y)
        
        return X_train, X_valid, y_train, y_valid

In [11]:
dt = DataTransformation(df, 'target')

In [12]:
X_train, X_valid, y_train, y_valid = dt.train_test_spliting()

In [13]:
X_train.shape

(112500, 10)

In [14]:
X_valid.shape

(37500, 10)

# 4.0. Data preprocess

In [15]:
class DataPreprocess:
    def __init__(self, dataframe: pd.DataFrame,
                       pipe: Pipeline):
        self.dataframe = dataframe
        self.pipe = pipe 
        
    def pipeline(self):
        train_pipe = self.pipe
        train_pipe.fit(self.dataframe)
        return train_pipe 
    
    def run(self):
        print('Preprocessador iniciou...')
        trained_pipeline = self.pipeline()
        data_preprocessed = trained_pipeline.transform(self.dataframe)
        return data_preprocessed

In [16]:
pipe = Pipeline([('imputer', MeanMedianImputer(variables=['RendaMensal',
                                                          'NumeroDeDependentes'])),
                 ('discretizer', EqualFrequencyDiscretiser(variables=['TaxaDeUtilizacaoDeLinhasNaoGarantidas',
                                                                      'TaxaDeEndividamento',
                                                                      'RendaMensal'])),
                 ('scaler', SklearnTransformerWrapper(StandardScaler()))])

In [17]:
dp = DataPreprocess(X_train, pipe)

In [18]:
X_train_processed = dp.run()

Preprocessador iniciou...


In [19]:
X_train.head()

Unnamed: 0,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
137772,0.954288,57,2,0.618694,6600.0,12,0,2,0,4.0
80090,0.000191,49,0,142.0,,8,0,0,0,0.0
25464,0.339351,61,0,0.474672,6020.0,13,0,1,0,1.0
48576,0.0,43,0,0.738318,4600.0,9,0,2,0,1.0
107660,0.953977,49,0,0.879571,2050.0,5,0,0,0,0.0


In [20]:
X_train_processed.head()

Unnamed: 0,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
137772,1.218544,0.318857,0.392396,0.522642,0.408946,0.689098,-0.062915,0.857545,-0.056867,2.953335
80090,-1.566699,-0.222574,-0.101143,1.219147,0.000852,-0.087788,-0.062915,-0.89142,-0.056867,-0.664943
25464,0.522233,0.589572,-0.101143,0.522642,0.408946,0.883319,-0.062915,-0.016938,-0.056867,0.239626
48576,-1.566699,-0.628646,-0.101143,0.870894,-0.407242,0.106433,-0.062915,0.857545,-0.056867,0.239626
107660,1.218544,-0.222574,-0.101143,0.870894,-1.631524,-0.670453,-0.062915,-0.89142,-0.056867,-0.664943


In [21]:
joblib.dump(dp.pipeline(),
            'preprocessor.joblib')

['preprocessor.joblib']

# 5.0. Train models

In [22]:
class TrainModels:
    def __init__(self, dados_X: pd.DataFrame,
                       dados_y: pd.DataFrame):
        self.dados_X = dados_X 
        self.dados_y = dados_y 
        
    def train(self, model):
        model.fit(self.dados_X, self.dados_y)
        joblib.dump(model, 'modelo.joblib')
        return model 
    
    def predict(self, dados_para_prever: pd.DataFrame):
        model_fitted = self._load_model()
        dados_pred = model_fitted.predict_proba(dados_para_prever)
        return dados_pred
    
    def _load_model(self):
        model = joblib.load('modelo.joblib')
        return model

In [23]:
tm = TrainModels(dados_X=X_train_processed,
                 dados_y = y_train)

In [24]:
tm.train(model=LogisticRegression())

In [25]:
y_train_pred = tm.predict(X_train_processed)

In [26]:
y_train_pred

array([[0.75549557, 0.24450443],
       [0.99315458, 0.00684542],
       [0.94557916, 0.05442084],
       ...,
       [0.83396623, 0.16603377],
       [0.93760016, 0.06239984],
       [0.97725386, 0.02274614]])

# 6.0. Models evaluation

In [27]:
preprocessor = dp.pipeline()

In [28]:
preprocessor

In [29]:
X_valid_processed = preprocessor.transform(X_valid)

In [30]:
y_valid_pred = tm.predict(X_valid_processed)

In [31]:
class ModelEvaluation:
    def __init__(self):
        pass 
    
    def eval_metrics(self, dados_reais, dados_preditos):
        roc_auc = roc_auc_score(dados_reais, dados_preditos)
        return roc_auc

In [32]:
me = ModelEvaluation()

In [33]:
y_train_pred[:, 1]

array([0.24450443, 0.00684542, 0.05442084, ..., 0.16603377, 0.06239984,
       0.02274614])

In [34]:
me.eval_metrics(y_train, y_train_pred[:, 1])

0.7959115823717302

In [35]:
me.eval_metrics(y_valid, y_valid_pred[:, 1])

0.7913584597368502

# 7.0. Experiments

## 7.1. Experiment 1

In [36]:
# 1. etapa
pipe = Pipeline([('imputer', MeanMedianImputer(variables=['RendaMensal',
                                                          'NumeroDeDependentes'])),
                 ('discretizer', EqualFrequencyDiscretiser(variables=['TaxaDeUtilizacaoDeLinhasNaoGarantidas',
                                                                      'TaxaDeEndividamento',
                                                                      'RendaMensal'])),
                 ('scaler', SklearnTransformerWrapper(RobustScaler()))])
dp = DataPreprocess(X_train, pipe)
X_train_processed = dp.run()

#---------------------#
#2. etapa
tm = TrainModels(dados_X=X_train_processed,
                 dados_y = y_train)
tm.train(model=LogisticRegression(penalty='l2', max_iter=1500, solver='newton-cholesky'))
y_valid_pred = tm.predict(X_valid_processed)

#---------------------#
# 3.etapa
me = ModelEvaluation()
me.eval_metrics(y_valid, y_valid_pred[:, 1])

Preprocessador iniciou...


0.7852483141223379