In [1]:
import os, sys

# `__file__` is not defined inside a notebook, so the project root is resolved
# explicitly from the kernel's working directory: two levels above it.
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Make packages that live under the project root (e.g. `level_2`) importable.
# The membership check keeps re-running this cell from adding duplicates.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from feature_engine.imputation import MeanMedianImputer
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.linear_model import LogisticRegression

from level_2.src.utils.utils import load_config
from level_2.src.data.data_load import DataLoad
from level_2.src.data.data_validation import DataValidation
from level_2.src.data.data_transform import DataTransform
from level_2.src.data.data_preprocess import DataPreprocess
from level_2.src.train.model_training import ModelTraining
from level_2.src.evaluation.classifier_eval import ClassifierEvaluation

In [3]:
# File name of the raw training set.
train_data_file = 'train.csv'

# Directory layout: <ROOT_DIR>/level_2/data/raw
LEVEL_DIR = os.path.join(ROOT_DIR, 'level_2')
RAW_DATA_DIR = os.path.join(LEVEL_DIR, 'data', 'raw')

# Full path of the raw training file inside the raw-data directory.
train_data_path = os.path.join(RAW_DATA_DIR, train_data_file)

In [4]:
# Load the raw training file through the project's DataLoad helper.
dl = DataLoad()

# index_col=0 is passed through to DataLoad.run — presumably forwarded to the
# CSV reader so the first column becomes the index; confirm in DataLoad.
df = dl.run(train_data_path, index_col=0)
# Preview the first rows as the cell's output.
df.head()

[2m2023-12-07 11:20:45[0m [[32m[1minfo     [0m] [1mReading data from CSV file...[0m
[2m2023-12-07 11:20:45[0m [[32m[1minfo     [0m] [1mData read successfully.[0m


Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [5]:
# Run the project's validation step on the loaded data.
dv = DataValidation()

# NOTE(review): `is_valid` is not read by any later cell visible in this
# notebook — confirm whether a failed validation should stop the run here.
is_valid = dv.run(df)

[2m2023-12-07 11:20:45[0m [[32m[1minfo     [0m] [1mValidation started[0m
[2m2023-12-07 11:20:45[0m [[32m[1minfo     [0m] [1mValidation passed[0m
[2m2023-12-07 11:20:45[0m [[32m[1minfo     [0m] [1mValidation successeful[0m


In [6]:
# Wrap the loaded data in the project's DataTransform helper.
dt = DataTransform(df)

# Unpack the four results of the split into train/test features and targets.
# The split ratio and seed are not visible here — presumably taken from the
# project config; confirm in DataTransform.
x_train, x_test, y_train, y_test = dt.train_test_split()

In [7]:
# Read the project configuration once and reuse it for every step, instead of
# calling load_config() again for each transformer.
pipeline_config = load_config()

# Preprocessing steps, applied in order: impute -> discretise -> scale.
pipe = Pipeline(
    [
        ('imputer', MeanMedianImputer(variables=pipeline_config.get('imputer_variables'))),
        ('discretizer', EqualFrequencyDiscretiser(variables=pipeline_config.get('discretizer_variables'))),
        # No `variables` argument is given to the wrapper, so the scaler's
        # column selection is left to the wrapper's default.
        ('scaler', SklearnTransformerWrapper(StandardScaler()))
    ]
)

preprocessor = DataPreprocess(pipe)

# Train the preprocessor on the training split only; the test split is only
# ever passed to transform().
preprocessor.train(x_train)

[2m2023-12-07 11:20:45[0m [[32m[1minfo     [0m] [1mPreprocessing started[0m


In [8]:
# Apply the already-trained preprocessor to each split, train first, then test.
x_train_processed, x_test_processed = (
    preprocessor.transform(split) for split in (x_train, x_test)
)

[2m2023-12-07 11:20:45[0m [[32m[1minfo     [0m] [1mTransforming data[0m
[2m2023-12-07 11:20:45[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m
[2m2023-12-07 11:20:45[0m [[32m[1minfo     [0m] [1mTransforming data[0m
[2m2023-12-07 11:20:45[0m [[32m[1minfo     [0m] [1mPreprocessing finished[0m


In [9]:
# Preview the first rows of the processed training features as the cell's output.
x_train_processed.head()

Unnamed: 0,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
146433,0.174078,-1.507791,-0.100473,-0.870389,0.558077,-0.672878,-0.06394,0.869889,-0.057852,0.23683
15597,0.870388,-0.830642,-0.100473,-0.870389,-1.184396,-1.25508,-0.06394,-0.902282,-0.057852,-0.667136
111605,1.566699,-1.236931,-0.100473,-1.218546,-0.487407,-1.449147,-0.06394,-0.902282,-0.057852,-0.667136
85418,-1.218544,2.081102,-0.100473,1.218553,-0.138912,-0.866945,-0.06394,-0.902282,-0.057852,-0.667136
9652,0.870388,0.591373,-0.100473,-0.870389,-0.487407,-1.061013,-0.06394,-0.902282,-0.057852,1.140796


In [12]:
# Trainer bound to the processed training features and their targets.
mt = ModelTraining(x_data=x_train_processed, y_data=y_train)

# Logistic regression whose random_state comes from the project configuration.
estimator = LogisticRegression(random_state=load_config().get('random_state'))
model = mt.train(model=estimator)

[2m2023-12-07 11:21:25[0m [[32m[1minfo     [0m] [1mTraining model LogisticRegression[0m
[2m2023-12-07 11:21:26[0m [[32m[1minfo     [0m] [1mModel trained successfully.[0m
[2m2023-12-07 11:21:26[0m [[32m[1minfo     [0m] [1mModel saved successfully in /home/bem/repos/ml_flow/level_2/models/model.joblib[0m


In [15]:
# Hand the model and the processed test split to the project's evaluator.
# NOTE(review): cross_val_eval() is given only the test split here; if it
# re-fits the estimator on folds (as scikit-learn cross-validation does), the
# reported scores would not describe the model trained above — confirm against
# ClassifierEvaluation.
ce = ClassifierEvaluation(model, x_test_processed, y_test)
roc_auc_scores = ce.cross_val_eval()

[2m2023-12-07 11:22:30[0m [[32m[1minfo     [0m] [1mCross validation evaluation for model LogisticRegression started.[0m


In [16]:
# Mean of the scores returned by cross_val_eval() (presumably one ROC AUC per
# fold — confirm), shown as the cell's output.
roc_auc_scores.mean()

0.7991691576684335