# 0. Imports

In [1]:
import joblib
import pandas as pd

import pandera
from pandera import Check, Column, DataFrameSchema

from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser

from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split


## 0.1 Utils

In [2]:
columns_to_use = ['target', 'TaxaDeUtilizacaoDeLinhasNaoGarantidas',
       'Idade', 'NumeroDeVezes30-59DiasAtrasoNaoPior', 'TaxaDeEndividamento',
       'RendaMensal', 'NumeroDeLinhasDeCreditoEEmprestimosAbertos',
       'NumeroDeVezes90DiasAtraso', 'NumeroDeEmprestimosOuLinhasImobiliarias',
       'NumeroDeVezes60-89DiasAtrasoNaoPior', 'NumeroDeDependentes']

# 1. Data Load

In [3]:
class DataLoad:
    """Class data load"""
    def __init__(self) -> None:
        pass
    def load_data(self) -> pd.DataFrame:
        """Funcao vai carregar os dados
        return:
            pandas DataFrame"""
        loaded_data = pd.read_csv('../data/raw/train.csv')
        return loaded_data

In [4]:
dl = DataLoad()
df = dl.load_data()[columns_to_use]

# 2. Data Validation

In [5]:
class DataValidation:
    """Inserir docstring"""
    def __init__(self, columns_to_use) -> None:
        self.columns_to_use = columns_to_use

    def check_shape_data(self, dataframe: pd.DataFrame) -> bool:
        try:
            print('Validacao iniciou')
            dataframe.columns = self.columns_to_use
            return True
        except Exception as e:
            print(f'Validacao errou: {e}')
            return False
        
    def check_columns(self, dataframe: pd.DataFrame) -> bool:
        schema = DataFrameSchema(
            {
                'target': Column(int, [Check.isin([0, 1]), Check(lambda x: x>=0)], coerce=True),
                'TaxaDeUtilizacaoDeLinhasNaoGarantidas': Column(float, nullable=True),
                'Idade': Column(int, nullable=True),
                'NumeroDeVezes30-59DiasAtrasoNaoPior': Column(int, nullable=True),
                'TaxaDeEndividamento': Column(float, nullable=True),
                'RendaMensal': Column(float, nullable=True),
                'NumeroDeLinhasDeCreditoEEmprestimosAbertos': Column(int, nullable=True),
                'NumeroDeVezes90DiasAtraso': Column(int, nullable=True),
                'NumeroDeEmprestimosOuLinhasImobiliarias': Column(int, nullable=True),
                'NumeroDeVezes60-89DiasAtrasoNaoPior': Column(int, nullable=True),
                'NumeroDeDependentes': Column(float, nullable=True)
            }
        )
        try:
            schema.validate(dataframe)
            print("Validation columns passed...")
            return True
        except pandera.errors.SchemaErrors as exc:
            print("Validation columns failed...")
            pandera.display(exc.failure_cases)
            return False

    def run(self, dataframe: pd.DataFrame) -> bool:
        if self.check_shape_data(dataframe) and self.check_columns(dataframe):
            print('Validacao com sucesso.')
            return True
        else:
            print('Validacao falhou.')
            return False

In [6]:
dv = DataValidation(columns_to_use)


In [7]:
dv.run(df)

Validacao iniciou
Validation columns passed...
Validacao com sucesso.


True

# 3. Data Transformation

In [8]:
class DataTransformation:
    def __init__(self, dataframe: pd.DataFrame, target_name: str):
        self.dataframe = dataframe
        self.target_name = target_name

    def train_test_spliting(self):
        X = self.dataframe.drop(self.target_name, axis=1)
        y = self.dataframe[self.target_name]

        X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y)

        return X_train, X_valid, y_train, y_valid 

In [9]:
dt = DataTransformation(df, 'target')

In [10]:
X_train, X_valid, y_train, y_valid  = dt.train_test_spliting()

In [11]:
X_valid.shape

(37500, 10)

# 4. Data Preprocess

In [12]:
class DataPreprocess:
    def __init__(self, dataframe: pd.DataFrame, pipe: Pipeline):
        self.dataframe = dataframe
        self.pipe = pipe

    def pipeline(self):
        train_pipe = self.pipe
        train_pipe.fit(self.dataframe)
        return train_pipe
    
    def run(self):
        print('Preprocessador iniciou...')
        trained_pipeline = self.pipeline()
        data_preprocessed = trained_pipeline.transform(self.dataframe)
        return data_preprocessed

In [13]:
pipe = Pipeline([
    ('imputer', MeanMedianImputer(variables=['RendaMensal', 'NumeroDeDependentes'])),
    ('discretizer', EqualFrequencyDiscretiser(variables=['TaxaDeUtilizacaoDeLinhasNaoGarantidas', 'TaxaDeEndividamento', 'RendaMensal'])),
    ('scaler', SklearnTransformerWrapper(StandardScaler()))
])

In [14]:
dp = DataPreprocess(X_train, pipe)

In [15]:
X_train_processed = dp.run()

Preprocessador iniciou...


In [16]:
X_train_processed.head()

Unnamed: 0,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
138980,0.174081,-1.035241,-0.099813,-0.870371,1.225682,0.109403,0.171729,-0.015031,-0.058263,-0.667179
108642,-0.52223,-0.358093,-0.099813,0.522584,0.409132,0.304212,-0.064007,0.879697,-0.058263,-0.667179
77313,1.566704,1.131633,0.369156,1.5673,0.000856,-0.864643,-0.064007,-0.015031,-0.058263,-0.667179
22384,1.566704,-1.035241,-0.099813,1.219061,0.000856,-1.449071,0.171729,-0.90976,-0.058263,-0.667179
120200,-0.870386,-1.576959,-0.099813,1.219061,0.000856,-0.085406,-0.064007,-0.90976,-0.058263,-0.667179


In [17]:
joblib.dump(dp.pipeline(), 'preprocessor.joblib')

['preprocessor.joblib']

# 5. Train Models

In [18]:
class TrainModels:
    def __init__(self, dados_X: pd.DataFrame, dados_y: pd.DataFrame):
        self.dados_X = dados_X
        self.dados_y = dados_y

    def train(self, model):
        model.fit(self.dados_X, self.dados_y)
        joblib.dump(model, 'modelo.joblib')
        return model
    
    def predict(self, dados_para_prever: pd.DataFrame):
        model_fitted = self._load_model()
        dados_pred = model_fitted.predict_proba(dados_para_prever)
        return dados_pred
    
    def _load_model(self):
        model = joblib.load('modelo.joblib')
        return model

In [19]:
tm = TrainModels(dados_X=X_train_processed, dados_y=y_train)

In [20]:
tm.train(model=LogisticRegression())

In [21]:
y_train_pred = tm.predict(X_train_processed)

In [22]:
y_train_pred

array([[0.94671146, 0.05328854],
       [0.97755668, 0.02244332],
       [0.80433744, 0.19566256],
       ...,
       [0.99140277, 0.00859723],
       [0.99373376, 0.00626624],
       [0.90512493, 0.09487507]], shape=(112500, 2))

# 6. Model Evaluation

In [24]:
preprocessor = dp.pipeline()

In [25]:
X_valid_processed = preprocessor.transform(X_valid)

In [26]:
y_valid_pred = tm.predict(X_valid_processed)

In [27]:
y_valid_pred

array([[0.94990305, 0.05009695],
       [0.99048843, 0.00951157],
       [0.99532164, 0.00467836],
       ...,
       [0.95825218, 0.04174782],
       [0.71278599, 0.28721401],
       [0.97365387, 0.02634613]], shape=(37500, 2))

In [28]:
class ModelEvaluation:
    def __init__(self):
        pass

    def eval_metrics(self, dados_reais, dados_preditos):
        roc_auc = roc_auc_score(dados_reais, dados_preditos)
        return roc_auc

In [29]:
me = ModelEvaluation()

In [32]:
me.eval_metrics(y_train, y_train_pred[:, 1])

np.float64(0.7941589913142681)

In [33]:
me.eval_metrics(y_valid, y_valid_pred[:, 1])

np.float64(0.7964001199578911)

# 7. Experiments

## 7.1 Experiment #1

In [47]:
# 1a etapa
pipe = Pipeline([
    ('imputer', MeanMedianImputer(variables=['RendaMensal', 'NumeroDeDependentes'])),
    ('discretizer', EqualFrequencyDiscretiser(variables=['TaxaDeUtilizacaoDeLinhasNaoGarantidas', 'TaxaDeEndividamento', 'RendaMensal'])),
    ('scaler', SklearnTransformerWrapper(RobustScaler()))
])
dp = DataPreprocess(X_train, pipe)
X_train_processed = dp.run()

# 2a etapa
tm = TrainModels(dados_X=X_train_processed, dados_y=y_train)
tm.train(model=LogisticRegression(penalty='l2', max_iter=1500, solver='newton-cholesky'))
y_train_pred = tm.predict(X_train_processed)

# 3a etapa
me = ModelEvaluation()
me.eval_metrics(y_train, y_train_pred[:, 1])

Preprocessador iniciou...


np.float64(0.7940526788133527)

Continuar aula 23