In [1]:
# Basic data manipulation and visualization
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# Metrics
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
)

# Functions
from utils import *

import warnings

warnings.simplefilter("ignore")

1. Identification of the cost of missclassification.

ASSUMPTIONS:
-  I treat 'bad loans' as a loan that in 95% is repaid on time. 5% of the final amount is not reachable. 
-  false negative is the loss occured by this 5% not repaid.
-  I am taking random 1000 samples with bad loans to make calculation.
-  I assume that current solution predict bad loans as true positive = 600 to false negative = 400.
-  i a not taking into acount the profits for lenders, only loss.

In [10]:
data = pd.read_csv("data_for_cost_calculation.csv", low_memory=False)
data.head()


Unnamed: 0.1,Unnamed: 0,TARGET,AMT_CREDIT,AMT_ANNUITY,CODE_GENDER,DAYS_EMPLOYED,NAME_EDUCATION_TYPE,DAYS_BIRTH,NAME_FAMILY_STATUS,NAME_INCOME_TYPE,FLAG_OWN_CAR,OWN_CAR_AGE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,PAYMENT_RATIO,TOTAL_NB_POS_CASH,TOTAL_PAYMENT_AGREEMENT
0,0,1,406597.5,24700.5,M,-637,Secondary / secondary special,-9461,Single / not married,Working,N,,0.083037,0.262949,0.139376,1.0,19.0,219625.695
1,1,0,1293502.5,35698.5,F,-1188,Higher education,-16765,Married,State servant,N,,0.311267,0.622246,,1.0,28.0,1618864.65
2,2,0,135000.0,6750.0,M,-225,Secondary / secondary special,-19046,Single / not married,Working,Y,26.0,,0.555912,0.729567,1.0,4.0,21288.465
3,3,0,312682.5,29686.5,F,-3039,Secondary / secondary special,-19005,Civil marriage,Working,N,,,0.650442,,1.0,21.0,1007153.415
4,4,0,513000.0,21865.5,M,-3038,Secondary / secondary special,-19932,Single / not married,Working,N,,,0.322738,,0.964285,66.0,835985.34


In [11]:
bad_loans = (
    data[data["TARGET"] == 1]
    .drop(columns="Unnamed: 0", axis=1)
    .sample(n=1000, random_state=0)
)

In [14]:
model = pickle.load(open("problem_loan_class.pkl", "rb"))


In [18]:
predictions = model.predict(bad_loans.drop(columns=["TARGET", "AMT_ANNUITY"], axis=1))
probabilities = model.predict_proba(bad_loans)

In [23]:
bad_loans["predictions"] = predictions
bad_loans["probability_class_0"] = probabilities[:, 0]
bad_loans["probability_class_1"] = probabilities[:, 1]

In [24]:
bad_loans.head()


Unnamed: 0,TARGET,AMT_CREDIT,AMT_ANNUITY,CODE_GENDER,DAYS_EMPLOYED,NAME_EDUCATION_TYPE,DAYS_BIRTH,NAME_FAMILY_STATUS,NAME_INCOME_TYPE,FLAG_OWN_CAR,OWN_CAR_AGE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,PAYMENT_RATIO,TOTAL_NB_POS_CASH,TOTAL_PAYMENT_AGREEMENT,predictions,probability_class_0,probability_class_1
128379,1,497520.0,33376.5,M,-511,Secondary / secondary special,-8545,Single / not married,Working,N,,,0.262241,0.328063,0.999779,44.0,21823.515,1,0.211315,0.788685
65204,1,284400.0,19134.0,M,-2942,Secondary / secondary special,-16065,Married,Working,Y,9.0,,0.034349,0.556727,1.0,12.0,173073.6,1,0.19906,0.80094
115547,1,755190.0,36459.0,F,-6183,Secondary / secondary special,-15003,Civil marriage,Working,N,,,0.551054,0.355639,0.748576,27.0,452046.69,1,0.475819,0.524181
73799,1,622413.0,31909.5,F,-1816,Secondary / secondary special,-12258,Civil marriage,Working,N,,,0.249802,0.48305,0.939113,25.0,507225.825,1,0.293103,0.706897
11423,1,302544.0,14233.5,F,-986,Lower secondary,-13046,Civil marriage,Working,N,,,0.646339,0.251239,1.0,10.0,70768.485,1,0.294932,0.705068


In [27]:
report_as_df(bad_loans["TARGET"], bad_loans["predictions"])

Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,0.0
1,1.0,0.683,0.811646,1000.0
accuracy,0.683,0.683,0.683,0.683
macro avg,0.5,0.3415,0.405823,1000.0
weighted avg,1.0,0.683,0.811646,1000.0


In [28]:
confusion_matrix(bad_loans["TARGET"], bad_loans["predictions"])

array([[  0,   0],
       [317, 683]], dtype=int64)

In case of 683 bad_loans predicted as true bad_loans, the Home Credit Group may take additional steps to assure the credit will be paid or can reject the application to avoid a loss.

In case of this 317 badly predicted loans, I am going to calculate the exptected loss.

In [30]:
loss = bad_loans.groupby("predictions").agg({"AMT_CREDIT": "sum"})

In [33]:
loss["loss"] = round((loss["AMT_CREDIT"] * 0.05), 2)
loss

Unnamed: 0_level_0,AMT_CREDIT,loss
predictions,Unnamed: 1_level_1,Unnamed: 2_level_1
0,202628857.5,10131442.88
1,360759573.0,18037978.65


With my model - If borrowers does not pay 5% of their loans and copany would not predict that, the loss is estimated to be **10,1mln**. 

In [35]:
replace_idxs = np.random.choice(
    bad_loans[bad_loans["predictions"] == 1].index, size=83, replace=False
)
bad_loans.loc[replace_idxs, "predictions"] = 0

In [36]:
confusion_matrix(bad_loans["TARGET"], bad_loans["predictions"])

array([[  0,   0],
       [400, 600]], dtype=int64)

In [37]:
loss = bad_loans.groupby("predictions").agg({"AMT_CREDIT": "sum"})
loss["loss"] = round((loss["AMT_CREDIT"] * 0.05), 2)
loss

Unnamed: 0_level_0,AMT_CREDIT,loss
predictions,Unnamed: 1_level_1,Unnamed: 2_level_1
0,247341226.5,12367061.32
1,316047204.0,15802360.2


With the current model - If borrowers does not pay 5% of their loans and copany would not predict that, the loss is estimated to be **12,4mln**. 

In [39]:
10.1 / 12.4

0.814516129032258

**If we replace the existing model with the model that I have built, a company can save up to 19% of current loss.**

----
----

In [42]:
class_0 = bad_loans[
    (bad_loans["predictions"] == 0) & (bad_loans["probability_class_0"] >= 0.5)
][["AMT_CREDIT", "probability_class_0", "probability_class_1"]]

In [45]:
class_0[class_0["probability_class_0"] <= 0.65]

Unnamed: 0,AMT_CREDIT,probability_class_0,probability_class_1
2352,539100.0,0.531196,0.468804
110396,284400.0,0.598519,0.401481
103074,808650.0,0.649565,0.350435
42214,270000.0,0.550609,0.449391
306367,1030680.0,0.550173,0.449827
...,...,...,...
80035,684657.0,0.597199,0.402801
172879,180000.0,0.508708,0.491292
232026,675000.0,0.509104,0.490896
10189,1143567.0,0.610138,0.389862


178 out of 317 bad predictions have prediction probability of class 0 equal or less than 0.65. If we add an alert for this kind of predictions, we could **minimize loss** of more than:

In [47]:
class_0[class_0["probability_class_0"] <= 0.65]["AMT_CREDIT"].sum() * 0.05

5356167.300000001

**5mln.**

My suggestion is to allert this prediction based on value of predict_proba and apply additional verification steps for these borrowers.

----
----