# Python Data Science & Analysis 
### Project: Credit Risk Assessment 


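# Simulation

The cells below generate a synthetic loan dataset of `N` customers (income, term, balance, debt, score and default flag) and save it to `loan_data.csv`.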

In [1]:
import pandas as pd
import numpy as np

# Number of synthetic customers to simulate.
N = 1000

# Seed NumPy's global generator so the np.random draws in this notebook give the
# same data on every fresh "Restart & Run All".
SEED = 42
np.random.seed(SEED)

# Annual income: five normal segments given as (mean, std, size).
# The sizes N//2, N//4, N//8, N//16 and N//16 + 1 add up to N for N = 1000.
# NOTE(review): they do not add up to N for every N (e.g. N = 1001 gives 1000),
# and the other per-segment cells use the same split -- change them together.
income = np.r_[
    np.random.normal(20_000, 2_500, N//2),
    np.random.normal(40_000, 5_000, N//4),
    np.random.normal(60_000, 6_000, N//8),
    np.random.normal(80_000, 7_000, N//16),
    np.random.normal(35_000, 10_000, N//16 + 1),
].round(-2)  # round to the nearest 100

# Sanity check: displays the total number of income values.
len(income)

1000

In [2]:
# The two possible loan terms.
terms = ['Short Term', 'Long Term']

# One entry per segment: (segment size, (P(short term), P(long term))).
term_segments = [
    (N//2, (0.8, 0.2)),
    (N//4, (0.6, 0.4)),
    (N//8, (0.5, 0.5)),
    (N//16, (0.4, 0.6)),
    (N//16 + 1, (0.2, 0.8)),
]

# Draw each segment in order and join them into one array.
term = np.concatenate([
    np.random.choice(terms, size, p=probs)
    for size, probs in term_segments
])

In [3]:
# Balance: one normal draw per segment, given as (mean, std, size).
balance_segments = [
    (1000, 250, N//2),
    (1250, 300, N//4),
    (1750, 400, N//8),
    (5000, 1000, N//16),
    (1500, 500, N//16 + 1),
]

balance_raw = np.concatenate([
    np.random.normal(mean, std, size)
    for mean, std, size in balance_segments
])

# Take the absolute value of every draw, then round to the nearest 10.
balance = np.abs(balance_raw).round(-1)


In [4]:
# Debt: zero-mean normal draws per segment, given as (std, size).
debt_segments = [
    (1000, N//2),
    (1500, N//4),
    (2500, N//8),
    (5000, N//16),
    (3000, N//16 + 1),
]

debt_raw = np.concatenate([
    np.random.normal(0, std, size)
    for std, size in debt_segments
])

# Negative draws are clipped to 0, then values are rounded to whole numbers.
debt = np.clip(debt_raw, 0, None).round()


In [5]:
from scipy.stats import zscore

# Base score: normal draws with a common std of 150, given as (mean, size).
score_segments = [
    (500, N//2),
    (600, N//4),
    (700, N//8),
    (800, N//16),
    (600, N//16 + 1),
]

score = np.concatenate([
    np.random.normal(mean, 150, size)
    for mean, size in score_segments
])

# Shift each score in proportion to itself: up with standardized income and
# balance, down with standardized debt.
adjustment = score*0.6*zscore(income) + score*0.1*zscore(balance) - score*0.3*zscore(debt)
score = score + adjustment

# Keep scores within [0, 1000] and round to whole numbers.
score = np.clip(score, 0, 1000).round()

In [6]:
defaults = [False, True]

# Rule-based defaults: debt is at least 85% of one month's income
# and the score is below 500.
monthly_income = income/12
default = (debt/monthly_income >= 0.85) & (score < 500)

# Share of rule-based defaults.
print(default.mean())

# Extra random defaults per segment: (size, (P(no default), P(default))).
default_segments = [
    (N//2, (0.95, 0.05)),
    (N//4, (0.96, 0.04)),
    (N//8, (0.97, 0.03)),
    (N//16, (0.98, 0.02)),
    (N//16 + 1, (0.93, 0.07)),
]
random_default = np.concatenate([
    np.random.choice(defaults, size, p=probs)
    for size, probs in default_segments
])

# A row defaults if either the rule or the random draw says so.
default = default | random_default

# Share of defaults after adding the random ones.
print(default.mean())


0.054
0.094


In [7]:
# Unique IDs: the integers 1..N drawn without replacement (a random ordering).
customer_ids = np.random.choice(np.arange(1, N + 1), N, replace=False)

# Assemble the simulated columns into one table, one row per customer.
columns = {
    'ID': customer_ids,
    'Income': income,
    'Term': term,
    'Balance': balance,
    'Debt': debt,
    'Score': score,
    'Default': default,
}
df = pd.DataFrame(columns)

In [8]:
# Display ten randomly chosen rows as a quick look at the simulated table.
df.sample(10)

Unnamed: 0,ID,Income,Term,Balance,Debt,Score,Default
122,287,19200.0,Long Term,1410.0,0.0,134.0,False
490,252,17000.0,Short Term,360.0,1537.0,49.0,True
600,191,41100.0,Short Term,1540.0,784.0,732.0,False
777,517,68500.0,Short Term,1740.0,0.0,1000.0,False
220,311,20200.0,Short Term,990.0,0.0,349.0,False
875,951,93100.0,Long Term,6230.0,3710.0,1000.0,False
290,752,19900.0,Short Term,1080.0,0.0,255.0,False
538,887,37400.0,Long Term,530.0,0.0,524.0,False
283,833,19500.0,Short Term,1360.0,367.0,233.0,False
71,767,20600.0,Short Term,630.0,758.0,265.0,False


In [9]:
# Number of rows whose score equals the upper cap of 1000.
int((df['Score'] == 1000).sum())

180

In [10]:
# Index labels of the rows whose score equals 1000.
capped = df[df['Score'] == 1000].index

# Choose 80% of them (rounded down), without repeats, and drop those rows.
n_drop = (8*len(capped))//10
indexes = np.random.choice(capped, size=n_drop, replace=False)
df = df.drop(indexes)

In [11]:
# Blank out the score of N//50 rows to simulate missing values.
# replace=False guarantees the chosen rows are distinct: np.random.choice samples
# with replacement by default, so repeated picks would leave fewer NaNs than asked.
# The size is capped at len(df) because sampling without replacement cannot pick
# more rows than exist.
missing_idx = np.random.choice(df.index, min(N//50, len(df)), replace=False)
df.loc[missing_idx, 'Score'] = np.nan

In [12]:
# Write the table to loan_data.csv in the working directory, without the row index.
df.to_csv('loan_data.csv', index=False)

In [13]:
# Number of rows left in the table.
df.shape[0]

856

In [14]:
# Fraction of rows whose score equals 1000.
df['Score'].eq(1000).mean()

0.04205607476635514

In [15]:
# Display ten randomly chosen rows of the table as it stands at this point.
df.sample(10)

Unnamed: 0,ID,Income,Term,Balance,Debt,Score,Default
933,226,79900.0,Short Term,3720.0,12891.0,0.0,True
149,885,17500.0,Short Term,1040.0,1695.0,89.0,True
227,667,19100.0,Short Term,840.0,728.0,235.0,False
350,318,16300.0,Short Term,570.0,0.0,357.0,False
42,841,20600.0,Short Term,1640.0,91.0,517.0,False
298,989,18500.0,Long Term,1040.0,0.0,246.0,False
424,894,21000.0,Short Term,530.0,0.0,384.0,True
491,678,21200.0,Short Term,360.0,575.0,250.0,False
237,884,21800.0,Short Term,1010.0,0.0,537.0,False
329,827,20600.0,Short Term,540.0,725.0,303.0,False


In [21]:
import pandas as pd 

# Read the exported file back from disk.
df = pd.read_csv('./loan_data.csv')

# Show ten random rows as a list of records, without the row index.
preview = df.sample(10)
list(preview.to_records(index=False))

[(215, 37900., 'Short Term', 1530., 0., 595., False),
 (442, 78700., 'Long Term', 4220., 2561., 1000., False),
 (22, 41900., 'Long Term', 1720., 1884., 372., False),
 (711, 24600., 'Short Term', 1200., 0., 385., True),
 (113, 33900., 'Short Term', 1010., 1404., 456., False),
 (91, 23200., 'Short Term', 1310., 2003., 264., True),
 (268, 17700., 'Short Term', 640., 0., 289., False),
 (735, 37100., 'Long Term', 1030., 0., 661., False),
 (971, 35300., 'Short Term', 790., 1479., nan, False),
 (858, 16700., 'Short Term', 610., 0., 201., False)]