## Importações

In [1]:
# Star import: this notebook contains no other import statement, so the names
# used below (pd, np, joblib, roc_curve, LogisticRegressionPValues) are expected
# to be provided by funcoes.funcoes -- TODO confirm and prefer explicit imports.
from funcoes.funcoes import *

# Let pandas render up to 200 rows before truncating a displayed table.
pd.set_option('display.max_rows', 200)

In [2]:
# Read the train/test feature matrices and targets from the data folder.
X_train, X_test = pd.read_csv('dados/X_train.csv'), pd.read_csv('dados/X_test.csv')
y_train, y_test = pd.read_csv('dados/y_train.csv'), pd.read_csv('dados/y_test.csv')

In [3]:
# One shape per line, in the order: X_train, y_train, X_test, y_test.
print(
    *(frame.shape for frame in (X_train, y_train, X_test, y_test)),
    sep='\n',
)

(326399, 104)
(326399, 1)
(139886, 104)
(139886, 1)


---

## 1. Criando um scorecard

In [4]:
# Instantiate the project's logistic-regression class (its `coef_`,
# `intercept_` and `p_values` attributes are read further down).
reg = LogisticRegressionPValues()

# Fit on the training split; y_train is a one-column DataFrame here.
reg.fit(X_train, y_train)

In [5]:
# Column labels of the training matrix, reused as the feature names below.
nomes_features = X_train.columns

In [6]:
# Summary table: one row per feature name with its fitted coefficient.
tabela_sumario = pd.DataFrame(columns = ['nome_feature'], data = nomes_features)
tabela_sumario['coeficiente'] = np.transpose(reg.coef_)
# Shift the index by one so that label 0 is free for the intercept row,
# insert the intercept there, then sort so it becomes the first row.
tabela_sumario.index = tabela_sumario.index + 1
tabela_sumario.loc[0] = ['intercept', reg.intercept_[0]]
tabela_sumario = tabela_sumario.sort_index()

# Prepend NaN so the p-value vector lines up with the table: the intercept
# row gets no p-value.
p_values = reg.p_values
p_values = np.append(np.nan,np.array(p_values))
tabela_sumario['p_valor'] = p_values
tabela_sumario

Unnamed: 0,nome_feature,coeficiente,p_valor
0,intercept,-1.371397,
1,grade:A,1.120239,1.705517e-30
2,grade:B,0.891872,1.198637e-42
3,grade:C,0.706093,1.383027e-31
4,grade:D,0.529578,2.5174639999999998e-20
5,grade:E,0.344007,1.869595e-11
6,grade:F,0.181411,0.0006870495
7,home_ownership:OWN,0.080838,0.0001659498
8,home_ownership:MORTGAGE,0.118092,3.618968e-18
9,addr_state:NM_VA,0.038921,0.2552631


O score é calculado a partir dos coeficientes do modelo: quanto maior o coeficiente de uma categoria, melhor o perfil do mutuário a que ela corresponde.

In [7]:
# Placeholder frame for reference categories; it is left empty here, so the
# concat below adds no rows to the summary table.
# NOTE(review): presumably the reference (baseline) dummies were meant to be
# listed here with coefficient 0 -- confirm.
df_ref_categorias = pd.DataFrame()

# Stack both tables and turn the previous index into a regular 'index' column.
df_scorecard = pd.concat([tabela_sumario, df_ref_categorias]).reset_index()

# Keep the part of each feature name that precedes the first ':'.
df_scorecard['nome_original_feature'] = (
    df_scorecard['nome_feature'].str.split(':').str[0]
)
df_scorecard

Unnamed: 0,index,nome_feature,coeficiente,p_valor,nome_original_feature
0,0,intercept,-1.371397,,intercept
1,1,grade:A,1.120239,1.705517e-30,grade
2,2,grade:B,0.891872,1.198637e-42,grade
3,3,grade:C,0.706093,1.383027e-31,grade
4,4,grade:D,0.529578,2.5174639999999998e-20,grade
5,5,grade:E,0.344007,1.869595e-11,grade
6,6,grade:F,0.181411,0.0006870495,grade
7,7,home_ownership:OWN,0.080838,0.0001659498,home_ownership
8,8,home_ownership:MORTGAGE,0.118092,3.618968e-18,home_ownership
9,9,addr_state:NM_VA,0.038921,0.2552631,addr_state


Os valores de score do modelo vão de 300 a 850, padrão utilizado por muitas empresas.

In [8]:
# Lower and upper bounds of the score range used by the scorecard.
min_score, max_score = 300, 850

In [9]:
# Smallest coefficient among the rows of each original feature.
(
    df_scorecard
    .groupby(by='nome_original_feature')['coeficiente']
    .min()
)

nome_original_feature
acc_now_delinq                 0.168730
addr_state                     0.038921
annual_inc                    -0.063282
delinq_2yrs                    0.008231
dti                           -0.001991
emp_length                     0.058436
grade                          0.181411
home_ownership                 0.080838
initial_list_status            0.069243
inq_last_6mths                 0.386399
int_rate                       0.096453
intercept                     -1.371397
mths_since_earliest_cr_line    0.047317
mths_since_issue_d            -0.071832
mths_since_last_delinq         0.043164
mths_since_last_record         0.257033
open_acc                      -0.186017
pub_rec                        0.033668
purpose                        0.189525
term                           0.085792
total_acc                     -0.034813
total_rev_hi_lim              -0.011052
verification_status            0.005561
Name: coeficiente, dtype: float64

In [10]:
# Sum of the per-feature minimum coefficients (the intercept group included).
min_sum_coef = (
    df_scorecard
    .groupby(by='nome_original_feature')['coeficiente']
    .min()
    .sum()
)
min_sum_coef

0.01033829947715837

In [11]:
# Largest coefficient among the rows of each original feature.
(
    df_scorecard
    .groupby(by='nome_original_feature')['coeficiente']
    .max()
)

nome_original_feature
acc_now_delinq                 0.168730
addr_state                     0.448291
annual_inc                     0.545912
delinq_2yrs                    0.064171
dti                            0.349212
emp_length                     0.129819
grade                          1.120239
home_ownership                 0.118092
initial_list_status            0.069243
inq_last_6mths                 0.729960
int_rate                       0.909501
intercept                     -1.371397
mths_since_earliest_cr_line    0.136221
mths_since_issue_d             1.040987
mths_since_last_delinq         0.110025
mths_since_last_record         0.616042
open_acc                      -0.064582
pub_rec                        0.040164
purpose                        0.302932
term                           0.085792
total_acc                     -0.018672
total_rev_hi_lim               0.202200
verification_status            0.108887
Name: coeficiente, dtype: float64

In [12]:
# Sum of the per-feature maximum coefficients (the intercept group included).
max_sum_coef = (
    df_scorecard
    .groupby(by='nome_original_feature')['coeficiente']
    .max()
    .sum()
)
max_sum_coef

5.841769670503259

Reescalando coeficientes de cada variável para um score correspondente.

In [13]:
# Rescale every coefficient to score points: multiply by the width of the
# score range and divide by the width of the coefficient-sum range.
df_scorecard['calculo_score'] = (
    df_scorecard['coeficiente']
    * (max_score - min_score)
    / (max_sum_coef - min_sum_coef)
)
df_scorecard

Unnamed: 0,index,nome_feature,coeficiente,p_valor,nome_original_feature,calculo_score
0,0,intercept,-1.371397,,intercept,-129.345317
1,1,grade:A,1.120239,1.705517e-30,grade,105.657014
2,2,grade:B,0.891872,1.198637e-42,grade,84.118238
3,3,grade:C,0.706093,1.383027e-31,grade,66.596188
4,4,grade:D,0.529578,2.5174639999999998e-20,grade,49.947893
5,5,grade:E,0.344007,1.869595e-11,grade,32.445523
6,6,grade:F,0.181411,0.0006870495,grade,17.110003
7,7,home_ownership:OWN,0.080838,0.0001659498,home_ownership,7.624322
8,8,home_ownership:MORTGAGE,0.118092,3.618968e-18,home_ownership,11.138061
9,9,addr_state:NM_VA,0.038921,0.2552631,addr_state,3.67093


In [14]:
# The intercept row (label 0) gets its own formula: shift the coefficient by
# the minimum coefficient sum, divide by the coefficient-sum range, scale by
# the score range, then add the minimum score.
#
# A single .loc[row, column] assignment is used instead of the chained
# df[col][row] = ... form: the chained form writes through an intermediate
# Series, which triggers SettingWithCopy/chained-assignment warnings and does
# not update the frame when pandas Copy-on-Write is enabled.
df_scorecard.loc[0, 'calculo_score'] = (
    ((df_scorecard.loc[0, 'coeficiente'] - min_sum_coef) / (max_sum_coef - min_sum_coef))
    * (max_score - min_score)
    + min_score
)
df_scorecard

Unnamed: 0,index,nome_feature,coeficiente,p_valor,nome_original_feature,calculo_score
0,0,intercept,-1.371397,,intercept,169.679612
1,1,grade:A,1.120239,1.705517e-30,grade,105.657014
2,2,grade:B,0.891872,1.198637e-42,grade,84.118238
3,3,grade:C,0.706093,1.383027e-31,grade,66.596188
4,4,grade:D,0.529578,2.5174639999999998e-20,grade,49.947893
5,5,grade:E,0.344007,1.869595e-11,grade,32.445523
6,6,grade:F,0.181411,0.0006870495,grade,17.110003
7,7,home_ownership:OWN,0.080838,0.0001659498,home_ownership,7.624322
8,8,home_ownership:MORTGAGE,0.118092,3.618968e-18,home_ownership,11.138061
9,9,addr_state:NM_VA,0.038921,0.2552631,addr_state,3.67093


In [15]:
# Round each computed score to the nearest whole number of points.
df_scorecard['score_preliminar'] = df_scorecard['calculo_score'].round(decimals=0)
df_scorecard

Unnamed: 0,index,nome_feature,coeficiente,p_valor,nome_original_feature,calculo_score,score_preliminar
0,0,intercept,-1.371397,,intercept,169.679612,170.0
1,1,grade:A,1.120239,1.705517e-30,grade,105.657014,106.0
2,2,grade:B,0.891872,1.198637e-42,grade,84.118238,84.0
3,3,grade:C,0.706093,1.383027e-31,grade,66.596188,67.0
4,4,grade:D,0.529578,2.5174639999999998e-20,grade,49.947893,50.0
5,5,grade:E,0.344007,1.869595e-11,grade,32.445523,32.0
6,6,grade:F,0.181411,0.0006870495,grade,17.110003,17.0
7,7,home_ownership:OWN,0.080838,0.0001659498,home_ownership,7.624322,8.0
8,8,home_ownership:MORTGAGE,0.118092,3.618968e-18,home_ownership,11.138061,11.0
9,9,addr_state:NM_VA,0.038921,0.2552631,addr_state,3.67093,4.0


In [16]:
# Sum of the per-feature minimum values of the 'score_preliminar' column.
# After rounding, this sum does not land exactly on the chosen lower bound.
min_sum_score_prel = (
    df_scorecard
    .groupby(by='nome_original_feature')['score_preliminar']
    .min()
    .sum()
)
min_sum_score_prel

301.0

In [17]:
# Sum of the per-feature maximum values of the 'score_preliminar' column.
max_sum_score_prel = (
    df_scorecard
    .groupby(by='nome_original_feature')['score_preliminar']
    .max()
    .sum()
)
max_sum_score_prel

850.0

In [18]:
# Rounding error per row: rounded score minus the unrounded one.
df_scorecard['diferenca'] = (
    df_scorecard['score_preliminar'] - df_scorecard['calculo_score']
)
df_scorecard

Unnamed: 0,index,nome_feature,coeficiente,p_valor,nome_original_feature,calculo_score,score_preliminar,diferenca
0,0,intercept,-1.371397,,intercept,169.679612,170.0,0.320388
1,1,grade:A,1.120239,1.705517e-30,grade,105.657014,106.0,0.342986
2,2,grade:B,0.891872,1.198637e-42,grade,84.118238,84.0,-0.118238
3,3,grade:C,0.706093,1.383027e-31,grade,66.596188,67.0,0.403812
4,4,grade:D,0.529578,2.5174639999999998e-20,grade,49.947893,50.0,0.052107
5,5,grade:E,0.344007,1.869595e-11,grade,32.445523,32.0,-0.445523
6,6,grade:F,0.181411,0.0006870495,grade,17.110003,17.0,-0.110003
7,7,home_ownership:OWN,0.080838,0.0001659498,home_ownership,7.624322,8.0,0.375678
8,8,home_ownership:MORTGAGE,0.118092,3.618968e-18,home_ownership,11.138061,11.0,-0.138061
9,9,addr_state:NM_VA,0.038921,0.2552631,addr_state,3.67093,4.0,0.32907


In [19]:
# Start from the rounded scores and manually override one row so that the
# per-feature minimum sum matches the chosen lower bound.
# NOTE(review): row label 33 and the value 5 are hard-coded; confirm they
# still point at the intended row if the model or feature order changes.
df_scorecard['score_final'] = df_scorecard['score_preliminar']

# Single .loc[row, column] assignment instead of chained df[col][row] = ...,
# which raises chained-assignment warnings and is a no-op on the frame when
# pandas Copy-on-Write is enabled.
df_scorecard.loc[33, 'score_final'] = 5
df_scorecard

Unnamed: 0,index,nome_feature,coeficiente,p_valor,nome_original_feature,calculo_score,score_preliminar,diferenca,score_final
0,0,intercept,-1.371397,,intercept,169.679612,170.0,0.320388,170.0
1,1,grade:A,1.120239,1.705517e-30,grade,105.657014,106.0,0.342986,106.0
2,2,grade:B,0.891872,1.198637e-42,grade,84.118238,84.0,-0.118238,84.0
3,3,grade:C,0.706093,1.383027e-31,grade,66.596188,67.0,0.403812,67.0
4,4,grade:D,0.529578,2.5174639999999998e-20,grade,49.947893,50.0,0.052107,50.0
5,5,grade:E,0.344007,1.869595e-11,grade,32.445523,32.0,-0.445523,32.0
6,6,grade:F,0.181411,0.0006870495,grade,17.110003,17.0,-0.110003,17.0
7,7,home_ownership:OWN,0.080838,0.0001659498,home_ownership,7.624322,8.0,0.375678,8.0
8,8,home_ownership:MORTGAGE,0.118092,3.618968e-18,home_ownership,11.138061,11.0,-0.138061,11.0
9,9,addr_state:NM_VA,0.038921,0.2552631,addr_state,3.67093,4.0,0.32907,4.0


Agora o scorecard está correto dentro dos valores escolhidos.

In [20]:
# Sum of the per-feature minimum final scores.
min_sum_score_prel = (
    df_scorecard
    .groupby(by='nome_original_feature')['score_final']
    .min()
    .sum()
)
min_sum_score_prel

300.0

In [21]:
# Sum of the per-feature maximum final scores.
max_sum_score_prel = (
    df_scorecard
    .groupby(by='nome_original_feature')['score_final']
    .max()
    .sum()
)
max_sum_score_prel

850.0

### 1.1 Calculando score de crédito

Aqui foi calculado o score de cada registro do dataset de teste criado abaixo. 

In [22]:
# Build a frame from the test data, with its columns taken from nomes_features.

df_features = pd.DataFrame(columns = nomes_features, data = X_test)


# Add a constant 'intercept' column of ones as the first column, so it lines
# up with the intercept row of the scorecard.

df_features.insert(0, 'intercept', 1)

In [23]:
# Final score points per scorecard row (intercept first).
scorecard_scores = df_scorecard['score_final']

In [24]:
# Sanity check: the number of columns here must match the number of scores.
df_features.shape

(139886, 105)

In [25]:
# Sanity check: one score per column of df_features.
scorecard_scores.shape

(105,)

In [26]:
# Turn the scores into a column vector for the matrix product below.
# -1 lets NumPy infer the row count instead of hard-coding 105, so the cell
# keeps working if the number of scorecard rows changes.
scorecard_scores = scorecard_scores.values.reshape(-1, 1)

In [27]:
# Matrix product of the test features and the score column vector: one total
# score per test record.
y_scores = df_features @ scorecard_scores

In [28]:
# Display the per-record scores.
y_scores

Unnamed: 0,0
0,404.0
1,458.0
2,418.0
3,571.0
4,548.0
...,...
139881,558.0
139882,568.0
139883,618.0
139884,604.0


In [29]:
# Lowest score over the test records.
# NOTE(review): the recorded output (237) is below min_score (300) -- possibly
# because records with no active dummy for a feature contribute less than that
# feature's minimum coefficient; confirm.
y_scores.min()

0    237.0
dtype: float64

### 1.2 Calculando a probabilidade de inadimplência

In [30]:
# Invert the score scaling: map each score back to a coefficient sum.
#   1) position of the score inside the score range, as a fraction
#   2) stretched to the coefficient-sum range
#   3) shifted by the minimum coefficient sum
sum_coef_score = (
    ((y_scores - min_score) / (max_score - min_score))
    * (max_sum_coef - min_sum_coef)
    + min_sum_coef
)

In [31]:
# Logistic (sigmoid) transform of the rebuilt coefficient sums.
# Written as 1 / (1 + exp(-x)): algebraically equal to exp(x) / (exp(x) + 1),
# but it does not degrade to inf / inf (NaN) when x is large and positive.
y_proba_score = 1 / (1 + np.exp(-sum_coef_score))

y_proba_score.head()

Unnamed: 0,0
0,0.75269
1,0.843638
2,0.779275
3,0.947032
4,0.933379


### 1.3 Definindo pontos de corte

Agora as métricas calculadas serão utilizadas para estipular um ponto de corte que ajude a decidir se um empréstimo será concedido ou não.

In [32]:
# Load the previously saved model object from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load model files from a trusted source.
modelo_pi = joblib.load('modelos/modelo_pi.sav')

In [33]:
# Second column of predict_proba: presumably the probability of the positive
# class (label 1) -- confirm against the model's class order.
y_pred_proba = modelo_pi.model.predict_proba(X_test)[:, 1]

# False-positive rates, true-positive rates and the thresholds they refer to.
taxa_falso_pos, taxa_verdadeiro_pos, thresholds = roc_curve(y_test, y_pred_proba)

In [34]:
# One row per ROC threshold, side by side with its false/true positive rates.
df_cutoffs = pd.DataFrame({
    'thresholds': thresholds,
    'taxa_falso_pos': taxa_falso_pos,
    'taxa_verdadeiro_pos': taxa_verdadeiro_pos,
})

df_cutoffs.head()

Unnamed: 0,thresholds,taxa_falso_pos,taxa_verdadeiro_pos
0,1.992458,0.0,0.0
1,0.992458,0.0,8e-06
2,0.989886,0.0,0.000393
3,0.989883,6.5e-05,0.000393
4,0.989648,6.5e-05,0.000506


In [35]:
# The first ROC threshold is not a valid probability (it is above 1 in the
# table displayed above), so it is replaced by a value just below 1 to keep
# the log-odds below finite.
# .loc[row, column] is used instead of chained df[col][row] = ..., which
# raises chained-assignment warnings and does not update the frame when
# pandas Copy-on-Write is enabled.
df_cutoffs.loc[0, 'thresholds'] = 1 - 1 / np.power(10, 16)


# Score matching each threshold: log-odds of the threshold, shifted by the
# minimum coefficient sum, rescaled to the score range, then rounded.

df_cutoffs['score'] = (
    (np.log(df_cutoffs['thresholds'] / (1 - df_cutoffs['thresholds'])) - min_sum_coef)
    * ((max_score - min_score) / (max_sum_coef - min_sum_coef))
    + min_score
).round()
# The first row is pinned to the maximum score.
df_cutoffs.loc[0, 'score'] = max_score
df_cutoffs.head()

Unnamed: 0,thresholds,taxa_falso_pos,taxa_verdadeiro_pos,score
0,1.0,0.0,0.0,850.0
1,0.992458,0.0,8e-06,759.0
2,0.989886,0.0,0.000393,731.0
3,0.989883,6.5e-05,0.000393,731.0
4,0.989648,6.5e-05,0.000506,729.0


In [36]:
def n_aprovado(p):
    """Count the entries of the global ``y_pred_proba`` that are >= ``p``.

    Parameters
    ----------
    p : float
        Cut-off compared against every predicted probability.

    Returns
    -------
    Integer count of predictions at or above the cut-off.
    """
    acima_do_corte = y_pred_proba >= p
    return acima_do_corte.sum()

In [37]:
# Approved/rejected counts and rates at every threshold.
df_cutoffs['n_aprovado'] = df_cutoffs['thresholds'].apply(n_aprovado)
df_cutoffs['n_rejeitado'] = len(y_pred_proba) - df_cutoffs['n_aprovado']
df_cutoffs['taxa_aprovacao'] = df_cutoffs['n_aprovado'] / len(y_pred_proba)
df_cutoffs['taxa_rejeicao'] = 1 - df_cutoffs['taxa_aprovacao']

In [38]:
# Preview the first rows with the new approval/rejection columns.
df_cutoffs.head()

Unnamed: 0,thresholds,taxa_falso_pos,taxa_verdadeiro_pos,score,n_aprovado,n_rejeitado,taxa_aprovacao,taxa_rejeicao
0,1.0,0.0,0.0,850.0,0,139886,0.0,1.0
1,0.992458,0.0,8e-06,759.0,1,139885,7e-06,0.999993
2,0.989886,0.0,0.000393,731.0,49,139837,0.00035,0.99965
3,0.989883,6.5e-05,0.000393,731.0,50,139836,0.000357,0.999643
4,0.989648,6.5e-05,0.000506,729.0,64,139822,0.000458,0.999542


Considerando um threshold de aproximadamente 92% (taxa de falsos positivos de cerca de 17%), teríamos uma taxa de aprovação de 41% e de rejeição de 59%.

In [39]:
# Rows 5000-5199 by position.
df_cutoffs.iloc[5000:5200]

Unnamed: 0,thresholds,taxa_falso_pos,taxa_verdadeiro_pos,score,n_aprovado,n_rejeitado,taxa_aprovacao,taxa_rejeicao
5000,0.921254,0.172727,0.434821,531.0,56818,83068,0.406174,0.593826
5001,0.921252,0.172793,0.434821,531.0,56819,83067,0.406181,0.593819
5002,0.921249,0.172793,0.434845,531.0,56822,83064,0.406202,0.593798
5003,0.921244,0.172858,0.434845,531.0,56823,83063,0.406209,0.593791
5004,0.92124,0.172858,0.434861,531.0,56825,83061,0.406224,0.593776
5005,0.92124,0.172923,0.434861,531.0,56826,83060,0.406231,0.593769
5006,0.921239,0.172923,0.434894,531.0,56830,83056,0.406259,0.593741
5007,0.921238,0.172989,0.434894,531.0,56831,83055,0.406267,0.593733
5008,0.921226,0.172989,0.434974,531.0,56841,83045,0.406338,0.593662
5009,0.921226,0.173054,0.434974,531.0,56842,83044,0.406345,0.593655


O score também pode ser utilizado para selecionar os pontos de corte. Um score de 613 teria uma taxa de aprovação de 12% e uma taxa de rejeição de 88%

In [40]:
# Rows 700-899 by position.
df_cutoffs.iloc[700:900]

Unnamed: 0,thresholds,taxa_falso_pos,taxa_verdadeiro_pos,score,n_aprovado,n_rejeitado,taxa_aprovacao,taxa_rejeicao
700,0.965282,0.023283,0.126376,613.0,16102,123784,0.115108,0.884892
701,0.965281,0.023349,0.126376,613.0,16103,123783,0.115115,0.884885
702,0.965269,0.023349,0.126465,613.0,16114,123772,0.115194,0.884806
703,0.965268,0.023414,0.126465,613.0,16115,123771,0.115201,0.884799
704,0.965263,0.023414,0.126529,613.0,16123,123763,0.115258,0.884742
705,0.965263,0.023479,0.126529,613.0,16124,123762,0.115265,0.884735
706,0.96515,0.023479,0.127348,612.0,16226,123660,0.115994,0.884006
707,0.96515,0.023545,0.127348,612.0,16227,123659,0.116002,0.883998
708,0.965092,0.023545,0.127701,612.0,16271,123615,0.116316,0.883684
709,0.965091,0.02361,0.127701,612.0,16272,123614,0.116323,0.883677


In [41]:
# Persist the scorecard and the training population (features + target).

df_scorecard.to_csv('dados/dados_scorecard.csv', index = False)

dados_treino1 = pd.DataFrame(columns = nomes_features, data = X_train)
# Rename the single y_train column to 'target' by position.
# Building the frame with pd.DataFrame(columns=['target'], data=y_train)
# selects by label instead, which yields an all-NaN column whenever the
# column read from y_train.csv is not already called 'target'.
dados_treino2 = y_train.set_axis(['target'], axis = 1)
dados_treino  = pd.concat([dados_treino1, dados_treino2], axis = 1)
dados_treino.to_csv('dados/dados_populacao_de_treino.csv', index = False)

---