# Settings

## Imports

In [2]:
import os, sys

# Repository root, two directory levels up.
# NOTE: '__file__' is a quoted string here (a notebook kernel defines no
# __file__), so dirname() yields '' and the relative path is resolved against
# the current working directory.
_relative_root = os.path.join(os.path.dirname('__file__'), os.pardir, os.pardir)
ROOT_DIR = os.path.abspath(_relative_root)

# Register the root on sys.path once, so re-running the cell adds no duplicates.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# Data Manipulation
import pandas as pd
import numpy as np

import joblib

# Data Testing
import pandera
from pandera import Check, Column, DataFrameSchema

# Models
from sklearn.linear_model import LogisticRegression

# Preprocessing
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper

# Metrics
from sklearn.metrics import roc_auc_score

# Pipeline
from sklearn.pipeline import Pipeline

## Definitions

In [73]:
# --- Project layout: everything lives under <ROOT_DIR>/second_step ---------
STEP_PATH = os.path.join(ROOT_DIR, 'second_step')
NOTEBOOK_PATH = os.path.join(STEP_PATH, 'notebooks')

# Data folders
RAW_DATA_PATH = os.path.join(STEP_PATH, 'data', 'raw')
PROCESSED_DATA_PATH = os.path.join(STEP_PATH, 'data', 'processed')

# Folders for serialized artefacts (preprocessing pipeline and model)
PIPE_PATH = os.path.join(STEP_PATH, 'src', 'pipelines')
MODEL_PATH = os.path.join(STEP_PATH, 'src', 'models')

# Raw training file
RAW_DATA_FILE = 'train.csv'
RAW_DATA_FILE_PATH = os.path.join(RAW_DATA_PATH, RAW_DATA_FILE)

# Seed NumPy's global random generator
np.random.seed(42)

# Columns the raw data is expected to contain: the label first, then the features
_TARGET_COLUMN = 'target'
_FEATURE_COLUMNS = [
    'TaxaDeUtilizacaoDeLinhasNaoGarantidas',
    'Idade',
    'NumeroDeVezes30-59DiasAtrasoNaoPior',
    'TaxaDeEndividamento',
    'RendaMensal',
    'NumeroDeLinhasDeCreditoEEmprestimosAbertos',
    'NumeroDeVezes90DiasAtraso',
    'NumeroDeEmprestimosOuLinhasImobiliarias',
    'NumeroDeVezes60-89DiasAtrasoNaoPior',
    'NumeroDeDependentes',
]
COLUMNS_TO_USE = [_TARGET_COLUMN, *_FEATURE_COLUMNS]

## Functions

# Data Load

In [26]:
class DataLoad:
    """Stateless loader that reads a CSV file into a pandas DataFrame."""

    def __init__(self) -> None:
        """Create the loader; there is no state to initialise."""

    def run(self, data_file: str, index_col: int = None) -> pd.DataFrame:
        """Read ``data_file`` with ``pandas.read_csv`` and return the result.

        Args:
            data_file: Location of the CSV file to read.
            index_col: Forwarded unchanged to ``pandas.read_csv`` as
                ``index_col``; defaults to ``None``.

        Returns:
            The DataFrame produced by ``pandas.read_csv``.
        """
        return pd.read_csv(data_file, index_col=index_col)

In [27]:
# Read the raw training CSV, using its first column as the DataFrame index.
data_load = DataLoad()
df = data_load.run(data_file=RAW_DATA_FILE_PATH, index_col=0)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


# Data Validation

In [41]:
class DataValidation:
    """Validate a DataFrame: first its column names, then a pandera schema."""

    def __init__(self, columns_to_use: list) -> None:
        """Store the column names the data is expected to contain.

        Args:
            columns_to_use: Expected column names of the DataFrame.
        """
        self.columns_to_use = columns_to_use

    def check_shape_data(self, df: pd.DataFrame) -> bool:
        """Check that ``df`` has exactly the expected columns (in any order).

        The frame is only inspected; its columns are never renamed.

        Args:
            df: DataFrame to check.

        Returns:
            True when the column names of ``df`` match ``columns_to_use``,
            False otherwise (the mismatch is printed).
        """
        print('Validation Started')
        expected = list(self.columns_to_use)
        actual = list(df.columns)
        missing = [name for name in expected if name not in actual]
        unexpected = [name for name in actual if name not in expected]

        # The length comparison additionally catches duplicated column names.
        if missing or unexpected or len(actual) != len(expected):
            print(
                f'Validation Failed: expected columns {expected}, got {actual} '
                f'(missing: {missing}, unexpected: {unexpected})'
            )
            return False
        return True

    def check_columns(self, df: pd.DataFrame) -> bool:
        """Validate the dtype and content of every column with pandera.

        ``target`` is coerced to int and must be 0 or 1; every other column
        is checked for its dtype only and may contain nulls.

        Args:
            df: DataFrame to validate.

        Returns:
            True when the schema validates, False when pandera reports
            failures (the failure cases are printed).
        """
        schema = DataFrameSchema(
            {
                'target': Column(int, checks=Check.isin([0, 1]), coerce=True),
                'TaxaDeUtilizacaoDeLinhasNaoGarantidas': Column(float, nullable=True),
                'Idade': Column(int, nullable=True),
                'NumeroDeVezes30-59DiasAtrasoNaoPior': Column(int, nullable=True),
                'TaxaDeEndividamento': Column(float, nullable=True),
                'RendaMensal': Column(float, nullable=True),
                'NumeroDeLinhasDeCreditoEEmprestimosAbertos': Column(int, nullable=True),
                'NumeroDeVezes90DiasAtraso': Column(int, nullable=True),
                'NumeroDeEmprestimosOuLinhasImobiliarias': Column(int, nullable=True),
                'NumeroDeVezes60-89DiasAtrasoNaoPior': Column(int, nullable=True),
                'NumeroDeDependentes': Column(float, nullable=True),
            }
        )

        try:
            # lazy=True collects every failure and raises SchemaErrors (plural),
            # the exception type that carries the ``failure_cases`` frame.
            schema.validate(df, lazy=True)
            print('Validation Passed')
            return True
        except pandera.errors.SchemaErrors as e:
            print(f'Validation Failed: {e}')
            print(e.failure_cases)
            return False

    def run(self, df: pd.DataFrame) -> bool:
        """Run the column-name check, then the schema check.

        Args:
            df: DataFrame to validate.

        Returns:
            True only when both checks pass; the schema check is skipped when
            the column-name check fails.
        """
        if not self.check_shape_data(df):
            return False
        if not self.check_columns(df):
            return False
        print('Validation Successeful')
        return True

In [45]:
# Validate the loaded frame (column names, then schema); the boolean result
# is displayed as the cell output.
dv = DataValidation(columns_to_use=COLUMNS_TO_USE)
dv.run(df=df)

Validation Started
Validation Passed
Validation Successeful


True

# Data Transformation

In [51]:
class DataTransformation():
    """Separate features from the target and split them into train/test sets."""

    def __init__(self, df: pd.DataFrame, target: str) -> None:
        """Keep the full DataFrame and the name of its target column."""
        self.target = target
        self.df = df

    def train_test_split(self, test_size: float = None, stratify: list = None) -> tuple:
        """Split the data into train and test partitions.

        Args:
            test_size: Forwarded to scikit-learn's ``train_test_split``.
            stratify: Forwarded to scikit-learn's ``train_test_split``.

        Returns:
            ``(x_train, x_test, y_train, y_test)``; the split uses a fixed
            ``random_state`` of 42.
        """
        labels = self.df[self.target]
        features = self.df.drop(columns=self.target)

        # Inside the method body this name resolves to the module-level
        # scikit-learn function, not to this method.
        features_train, features_test, labels_train, labels_test = train_test_split(
            features,
            labels,
            test_size=test_size,
            stratify=stratify,
            random_state=42,
        )
        return features_train, features_test, labels_train, labels_test
        

In [58]:
# Split into train and test partitions, stratifying on the target column.
dt = DataTransformation(df=df, target='target')
x_train, x_test, y_train, y_test = dt.train_test_split(stratify=df['target'])

# Data Preprocessing

In [65]:
class DataProcess:
    """Fit a preprocessing pipeline on a DataFrame and transform that data."""

    def __init__(self, df: pd.DataFrame, pipe: Pipeline) -> None:
        """Store the data and the (not yet fitted) pipeline.

        Args:
            df: Data the pipeline is fitted on and that ``run`` transforms.
            pipe: Pipeline object exposing ``fit`` and ``transform``.
        """
        self.df = df
        self.pipe = pipe
        # Becomes True once ``pipe`` has been fitted on ``df`` by ``pipeline()``.
        self._is_fitted = False

    def pipeline(self, refit: bool = False) -> Pipeline:
        """Return the pipeline fitted on ``self.df``.

        The pipeline is fitted on the first call only; later calls return the
        already-fitted object instead of fitting it again.

        Args:
            refit: Force a new fit, e.g. after ``self.df`` or ``self.pipe``
                was replaced.

        Returns:
            ``self.pipe``, fitted in place.
        """
        if refit or not getattr(self, '_is_fitted', False):
            self.pipe.fit(self.df)
            self._is_fitted = True
        return self.pipe

    def run(self) -> pd.DataFrame:
        """Fit the pipeline if needed and transform ``self.df`` with it.

        Returns:
            The output of the pipeline's ``transform`` on ``self.df``.
        """
        print('Preprocessing Started')
        trained_pipe = self.pipeline()
        data_preprocessed = trained_pipe.transform(self.df)
        print('Preprocessing Finished')
        return data_preprocessed

In [66]:
# Preprocessing steps, applied in this order:
#   1. imputer     - fills missing values in the two listed columns
#   2. discretizer - equal-frequency binning of the three listed columns
#   3. scaler      - StandardScaler applied through feature_engine's wrapper
preprocessing_steps = [
    (
        'imputer',
        MeanMedianImputer(variables=['RendaMensal', 'NumeroDeDependentes']),
    ),
    (
        'discretizer',
        EqualFrequencyDiscretiser(
            variables=[
                'TaxaDeUtilizacaoDeLinhasNaoGarantidas',
                'TaxaDeEndividamento',
                'RendaMensal',
            ]
        ),
    ),
    (
        'scaler',
        SklearnTransformerWrapper(StandardScaler()),
    ),
]

pipe = Pipeline(steps=preprocessing_steps)

In [68]:
# Fit the preprocessing pipeline on the training features and transform them.
dp = DataProcess(df=x_train, pipe=pipe)
x_train_processed = dp.run()

Preprocessing Started
Preprocessing Finished


In [71]:

# Persist the fitted preprocessing pipeline. The target folder is created
# first because joblib.dump does not create missing directories.
os.makedirs(PIPE_PATH, exist_ok=True)
joblib.dump(dp.pipeline(), os.path.join(PIPE_PATH, 'preprocessor.joblib'))

['/home/bem/repos/ml_flow/second_step/src/pipelines/preprocessor.joblib']

# Model Training

In [72]:
class ModelTraining:
    """Train a model, persist it with joblib and predict with the saved copy."""

    def __init__(self, x_data: pd.DataFrame, y_data: pd.Series, model_path: str, model_name: str = 'model.joblib') -> None:
        """Store the training data and work out where the model file lives.

        Args:
            x_data: Training features.
            y_data: Training labels.
            model_path: Directory the model file is written to.
            model_name: File name of the persisted model.
        """
        self.x_data = x_data
        self.y_data = y_data
        self.model_path = model_path
        self.model_name = model_name
        # Backward-compatible alias: the attribute was originally misspelled.
        self.mode_name = model_name
        self.model_file = os.path.join(self.model_path, self.model_name)

    def model_train(self, model):
        """Fit ``model`` on the stored training data and save it to disk.

        Args:
            model: Estimator exposing ``fit(x, y)``.

        Returns:
            The same ``model`` object, fitted.
        """
        model.fit(self.x_data, self.y_data)
        self._save_model(model)
        return model

    def model_predict(self, x: pd.DataFrame) -> np.ndarray:
        """Load the saved model and return its ``predict_proba`` output for ``x``.

        Args:
            x: Features to score.

        Returns:
            Whatever the loaded model's ``predict_proba`` returns for ``x``.
        """
        model = self._load_model()
        return model.predict_proba(x)

    def _save_model(self, model) -> None:
        """Write ``model`` to ``self.model_file``, creating its folder if needed."""
        target_dir = os.path.dirname(self.model_file)
        if target_dir:
            # joblib.dump does not create missing directories.
            os.makedirs(target_dir, exist_ok=True)
        joblib.dump(model, self.model_file)

    def _load_model(self):
        """Read the model back from ``self.model_file``."""
        # NOTE: joblib files are pickle-based; only load files this project wrote.
        return joblib.load(self.model_file)

In [74]:
# Train a logistic regression on the preprocessed training data (the fitted
# model is saved under MODEL_PATH), then score the training rows themselves.
mt = ModelTraining(x_data=x_train_processed, y_data=y_train, model_path=MODEL_PATH)
mt.model_train(model=LogisticRegression())
y_pred = mt.model_predict(x=x_train_processed)

# Model Evaluation

In [78]:
# Transform the held-out features with the pipeline fitted on the training
# data, then predict class probabilities for them with the saved model.
preprocessor = dp.pipeline()
x_test_processed = preprocessor.transform(x_test)
y_pred = mt.model_predict(x=x_test_processed)

In [87]:
class ModelEvaluation:
    """Stateless helper that scores predictions against true labels."""

    def __init__(self) -> None:
        """Create the evaluator; there is no state to initialise."""

    def eval_metrics(self, y_true: np.array, y_pred: np.array) -> float:
        """Return the ROC AUC computed by scikit-learn's ``roc_auc_score``.

        Args:
            y_true: True labels.
            y_pred: Predicted scores, passed to ``roc_auc_score`` as-is.

        Returns:
            The value returned by ``roc_auc_score(y_true, y_pred)``.
        """
        return roc_auc_score(y_true, y_pred)

In [88]:
# ROC AUC on the held-out set, scored with column 1 of the predict_proba output.
me = ModelEvaluation()
me.eval_metrics(y_true=y_test, y_pred=y_pred[:, 1])

0.8003929165191288

# Experiments

## Experiment #1 — Logistic Regression

In [93]:
# End-to-end run: preprocessing -> training -> prediction -> evaluation.
# The final expression (ROC AUC on the held-out set) is the cell output.

# 1 - Preprocessing ----------------------------------------------
# Steps run in order: imputation, equal-frequency binning, standard scaling.
pipe = Pipeline(
    steps=[
        (
            'imputer',
            MeanMedianImputer(variables=['RendaMensal', 'NumeroDeDependentes']),
        ),
        (
            'discretizer',
            EqualFrequencyDiscretiser(
                variables=[
                    'TaxaDeUtilizacaoDeLinhasNaoGarantidas',
                    'TaxaDeEndividamento',
                    'RendaMensal',
                ]
            ),
        ),
        (
            'scaler',
            SklearnTransformerWrapper(StandardScaler()),
        ),
    ]
)

# Fit on the training features only, then transform them.
dp = DataProcess(df=x_train, pipe=pipe)
x_train_processed = dp.run()

# 2 - Model Training ---------------------------------------------
# model_train also saves the fitted model under MODEL_PATH.
mt = ModelTraining(x_data=x_train_processed, y_data=y_train, model_path=MODEL_PATH)
mt.model_train(model=LogisticRegression())

# 3 - Model Prediction -------------------------------------------
# The held-out features go through the pipeline fitted on the training data.
preprocessor = dp.pipeline()
x_test_processed = preprocessor.transform(x_test)
y_pred = mt.model_predict(x=x_test_processed)

# 4 - Model Evaluation -------------------------------------------
# Score with column 1 of the predict_proba output.
me = ModelEvaluation()
me.eval_metrics(y_true=y_test, y_pred=y_pred[:, 1])

Preprocessing Started
Preprocessing Finished


0.8003929165191288