# Settings

## Imports

In [83]:
import os, sys

# NOTE: '__file__' is a quoted string here, so dirname() returns '' and the
# result is simply the parent of the current working directory.
ROOT_DIR = os.path.abspath(
    os.path.join(os.path.dirname('__file__'), '..')
)

# Make project-level packages importable from the notebook (added once only).
if sys.path.count(ROOT_DIR) == 0:
    sys.path.append(ROOT_DIR)

In [84]:
import pandas as pd
import numpy as np
import unicodedata
import re
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectPercentile

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score

## Definitions

In [85]:
# Project folders, all resolved relative to ROOT_DIR.
RAW_DATA_DIR, PROCESSED_DATA_DIR = (
    os.path.join(ROOT_DIR, 'data', stage) for stage in ('raw', 'processed')
)
MODEL_DIR = os.path.join(ROOT_DIR, 'model')

# CSV paths inside the raw data folder.
train_file, test_file = (
    os.path.join(RAW_DATA_DIR, name) for name in ('train.csv', 'test.csv')
)

# Path of the pickled final model.
model_file = os.path.join(MODEL_DIR, 'final_model.pkl')

# Seed NumPy's global random number generator.
np.random.seed(42)

## Functions

In [86]:
def jupyter_settings() -> None:
    """Configure seaborn, matplotlib and pandas display defaults for the notebook.

    The seaborn theme is applied first. ``sns.set()`` re-applies seaborn's
    default palette and font-related rcParams, so calling it last (as this
    function used to) discarded the 'colorblind' palette and the custom
    ``font.size`` set just before it.

    Returns:
        None. The function only mutates global plotting/display settings.
    """
    # Baseline theme and style sheet; everything below overrides parts of them.
    sns.set()
    plt.style.use('seaborn-v0_8')

    # Set after the theme and style sheet so that neither replaces it.
    sns.set_palette('colorblind')

    plt.rcParams.update(
        {
            'figure.figsize': [25, 8],
            'font.size': 18,
        }
    )

    # pandas display: show every column, cap rows at 50, keep wide frames on
    # one line, and render floats with two decimals.
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', 50)
    pd.set_option('display.expand_frame_repr', False)
    pd.set_option('display.float_format', lambda x: '%.2f' % x)
    
def replace_seps(strings) -> list:
    """Return each item as a string with spaces and hyphens replaced by underscores."""
    normalized = []
    for item in strings:
        text = str(item)
        for separator in (' ', '-'):
            text = text.replace(separator, '_')
        normalized.append(text)
    return normalized

def remove_special_chars(strings) -> list:
    """Strip accents and drop every character that is not a letter, digit or underscore.

    Each item must be a ``str``: it is NFKD-normalised, reduced to its ASCII
    characters, and then filtered down to ``[a-zA-Z0-9_]``.
    """
    def _clean(text):
        # Decompose accented characters, then discard the non-ASCII remainder.
        ascii_only = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
        return re.sub('[^a-zA-Z0-9_]', '', ascii_only)

    return [_clean(text) for text in strings]

def strings_to_lower(strings) -> list:
    """Return the string form of every item, lower-cased."""
    lowered = []
    for item in strings:
        lowered.append(str(item).lower())
    return lowered

def from_camel_to_snake_case(strings) -> list:
    """Insert an underscore before each capitalised word (case is not changed).

    A "word" is one uppercase letter followed by at least one lowercase
    letter. Leading and trailing underscores are stripped from the result.

    No underscore is inserted when the word is already preceded by one, so
    input such as 'Monthly_Income' is returned unchanged instead of becoming
    'Monthly__Income'.

    Known limitation: a lone capital letter (e.g. the 'E' in 'CreditoEEmprestimos')
    does not match the pattern and therefore stays attached to the previous word.

    Args:
        strings: iterable of ``str``.

    Returns:
        list of converted strings, in input order.
    """
    # (?<=[^_]) requires a preceding character that is not '_'. This skips the
    # first word of the string and words that already follow a separator.
    return [re.sub('(?<=[^_])([A-Z][a-z]+)', r'_\1', x).strip('_') for x in strings]

def to_snake_case(strings) -> list:
    """Convert a collection of labels to snake_case.

    ``strings`` must expose the pandas ``.str`` accessor, since surrounding
    whitespace is trimmed with ``strings.str.strip()``. The labels then pass,
    in order, through separator replacement, special-character removal,
    camel-case splitting and lower-casing.
    """
    pipeline = (
        replace_seps,
        remove_special_chars,
        from_camel_to_snake_case,
        strings_to_lower,
    )

    names = strings.str.strip()
    for step in pipeline:
        names = step(names)

    return names

def show_dimensions(df) -> None:
    """Print the number of rows and columns of ``df`` (expects a two-element ``shape``)."""
    n_rows, n_cols = df.shape
    print(f'Linhas: {n_rows}', f'Colunas: {n_cols}', sep='\n')
    
def nan_status(df, supress=True) -> pd.DataFrame:
    """Summarise missing values per column of ``df``.

    Args:
        df: DataFrame to inspect.
        supress: when truthy, keep only columns with at least one missing value.

    Returns:
        DataFrame indexed by column name with ``number_of_nan`` (count) and
        ``percentage_of_nan`` (share rounded to two decimals, as a string
        ending in '%').
    """
    missing = df.isna()

    report = pd.DataFrame(
        {
            'number_of_nan': missing.sum(),
            'percentage_of_nan': (missing.mean() * 100).round(2).astype(str) + '%',
        }
    )

    if not supress:
        return report

    return report.loc[report['number_of_nan'] > 0]
    
# Apply the plotting and pandas display defaults defined above.
jupyter_settings()

## Data Load

In [87]:
# Read the training CSV; the file's first column becomes the row index.
df = pd.read_csv(train_file, index_col=0)

## Data Split

In [88]:
# Separate the predictors from the label column.
x = df.drop('target', axis=1)
y = df['target']

# Stratified 80/20 hold-out split. The seed is pinned explicitly so that
# re-running this cell on its own yields the same partition, instead of
# depending on how much of NumPy's global RNG stream has already been used.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

# Data Description

In [89]:
# Re-attach the label to the training predictors, column-wise, so the
# description below covers the training split only.
df1 = pd.concat([x_train, y_train], axis=1)

## Rename Features

In [90]:
# Normalise column names to snake_case on both frames so their names stay aligned.
df1.columns = to_snake_case(df1.columns)
x_test.columns = to_snake_case(x_test.columns)

## Data Dimensions

In [91]:
# Row and column counts of the training frame.
show_dimensions(df1)

Linhas: 120000
Colunas: 11


## Data Types

In [92]:
# Column dtypes of the training frame.
df1.dtypes

taxa_de_utilizacao_de_linhas_nao_garantidas         float64
idade                                                 int64
numero_de_vezes30_59_dias_atraso_nao_pior             int64
taxa_de_endividamento                               float64
renda_mensal                                        float64
numero_de_linhas_de_creditoe_emprestimos_abertos      int64
numero_de_vezes90_dias_atraso                         int64
numero_de_emprestimos_ou_linhas_imobiliarias          int64
numero_de_vezes60_89_dias_atraso_nao_pior             int64
numero_de_dependentes                               float64
target                                                int64
dtype: object

## Missing Data

In [93]:
# Columns of the training frame that contain missing values.
nan_status(df1)

Unnamed: 0,number_of_nan,percentage_of_nan
renda_mensal,23675,19.73%
numero_de_dependentes,3128,2.61%


In [94]:
# Same missing-value report for the hold-out predictors.
nan_status(x_test)

Unnamed: 0,number_of_nan,percentage_of_nan
renda_mensal,6056,20.19%
numero_de_dependentes,796,2.65%


## Data Describe

In [95]:
# Summary statistics of the training frame.
df1.describe()

Unnamed: 0,taxa_de_utilizacao_de_linhas_nao_garantidas,idade,numero_de_vezes30_59_dias_atraso_nao_pior,taxa_de_endividamento,renda_mensal,numero_de_linhas_de_creditoe_emprestimos_abertos,numero_de_vezes90_dias_atraso,numero_de_emprestimos_ou_linhas_imobiliarias,numero_de_vezes60_89_dias_atraso_nao_pior,numero_de_dependentes,target
count,120000.0,120000.0,120000.0,120000.0,96325.0,120000.0,120000.0,120000.0,120000.0,116872.0,120000.0
mean,6.13,52.29,0.42,352.27,6651.51,8.47,0.26,1.02,0.24,0.76,0.07
std,253.36,14.77,4.18,2093.71,14541.18,5.16,4.16,1.13,4.14,1.12,0.25
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.03,41.0,0.0,0.18,3400.0,5.0,0.0,0.0,0.0,0.0,0.0
50%,0.15,52.0,0.0,0.37,5390.0,8.0,0.0,1.0,0.0,0.0,0.0
75%,0.56,63.0,0.0,0.86,8238.0,11.0,0.0,2.0,0.0,1.0,0.0
max,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0,1.0


## Data Treatment

In [96]:
# Split the renamed training frame back into predictors and label.
x_train, y_train = df1.drop(columns='target'), df1['target']

In [97]:
cols_to_impute = ['renda_mensal', 'numero_de_dependentes']

# Learn the column means from the training split only.
imputer_mean = SimpleImputer(strategy='mean').fit(x_train[cols_to_impute])

# Fill the gaps in both splits with those training means.
x_train[cols_to_impute] = imputer_mean.transform(x_train[cols_to_impute])
x_test[cols_to_impute] = imputer_mean.transform(x_test[cols_to_impute])

# Data Selection

In [98]:
# Keep the top 50% of features, scored with SelectPercentile's default
# scoring function against the training labels.
select = SelectPercentile(percentile=50).fit(x_train, y_train)
x_train_selected = select.transform(x_train)

print(f'x_train shape: {x_train.shape}')
print(f'x_train_selected shape: {x_train_selected.shape}')

# Names of the retained columns, taken from the selector's boolean support mask.
select_cols = x_train.columns[select.get_support()].tolist()
select_cols

x_train shape: (120000, 10)
x_train_selected shape: (120000, 5)


['idade',
 'numero_de_vezes30_59_dias_atraso_nao_pior',
 'numero_de_vezes90_dias_atraso',
 'numero_de_vezes60_89_dias_atraso_nao_pior',
 'numero_de_dependentes']

# Model Training

## Logistic Regression - Baseline

In [99]:
# Apply the selector fitted on the training split to the hold-out predictors.
x_test_selected = select.transform(x_test)

In [100]:
# model definition
lr_model = LogisticRegression(max_iter=1000)

# model training
lr_model.fit(x_train, y_train)

# model prediction
y_pred = lr_model.predict_proba(x_test)[:, 1]

# model performance
roc = roc_auc_score(y_test, y_pred)
print(f'ROC AUC with all features: {roc}')

ROC AUC with all features: 0.6647634975786111


In [101]:
# model training
lr_model.fit(x_train_selected, y_train)

# model prediction
y_pred = lr_model.predict_proba(x_test_selected)[:, 1]

# model performance
auc = roc_auc_score(y_test, y_pred)
print(f'ROC AUC with selected features: {auc}')

ROC AUC with selected features: 0.7116077906679987


# Hyperparameter Tuning

In [102]:
# Candidate models: unpenalised logistic regression with each solver.
clf = [
    LogisticRegression(solver='newton-cg', penalty=None, max_iter=1000),
    LogisticRegression(solver='lbfgs', penalty=None, max_iter=1000),
    LogisticRegression(solver='sag', penalty=None, max_iter=1000),
    LogisticRegression(solver='saga', penalty=None, max_iter=1000)
]

# Collect one record per candidate and build the comparison table once.
# Growing a DataFrame cell by cell through .loc is slow and coerces dtypes
# (max_iter used to be displayed as 1000.0).
records = []

for lr_model in clf:
    lr_model.fit(x_train_selected, y_train)
    y_pred = lr_model.predict_proba(x_test_selected)[:, 1]
    auc = roc_auc_score(y_test, y_pred)

    records.append(
        {
            'model': lr_model.__class__.__name__,
            'auc_score': auc,
            'solver': lr_model.solver,
            'penalty': lr_model.penalty,
            'max_iter': lr_model.max_iter,
        }
    )

# Best hold-out AUC first.
clf_compare = pd.DataFrame(records).sort_values(by=['auc_score'], ascending=False)
clf_compare

Unnamed: 0,model,auc_score,solver,penalty,max_iter
0,LogisticRegression,0.71,newton-cg,,1000.0
1,LogisticRegression,0.71,lbfgs,,1000.0
3,LogisticRegression,0.71,saga,,1000.0
2,LogisticRegression,0.71,sag,,1000.0


# Deploy

## Final Model

In [103]:
final_model = LogisticRegression(solver='sag', penalty=None, max_iter=1000)

# Refit on the training and hold-out splits combined.
X_train = pd.concat([x_train, x_test], axis=0)
Y_train = pd.concat([y_train, y_test], axis=0)

# NOTE(review): this fits on all columns, whereas the solver comparison above
# was run on x_train_selected. Confirm which feature set the deployed model
# is meant to consume.
final_model.fit(X_train, Y_train)

# Create the target folder if needed, and close the file handle
# deterministically (the bare open() call used to leave it open).
os.makedirs(os.path.dirname(model_file), exist_ok=True)
with open(model_file, 'wb') as model_fh:
    pickle.dump(final_model, model_fh)

