In [1]:
import pandas as pd
import numpy as np

In [183]:
data = pd.read_csv('data/credit/application_record.csv')

record = pd.read_csv('data/credit/credit_record.csv')

In [184]:
print(f"Shape: {data.shape}")
print(data.columns)
print(f"Shape: {record.shape}")
print(record.columns)

Shape: (438557, 18)
Index(['ID', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE',
       'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS'],
      dtype='object')
Shape: (1048575, 3)
Index(['ID', 'MONTHS_BALANCE', 'STATUS'], dtype='object')


Kaggle Notebook: https://www.kaggle.com/code/rikdifos/credit-card-approval-prediction-using-ml/notebook

## Feature Engineering

**Target variable**

- 0: 1-29 days past due 
- 1: 30-59 days past due 
- 2: 60-89 days overdue 
- 3: 90-119 days overdue 
- 4: 120-149 days overdue 
- 5: Overdue or bad debts, write-offs for more than 150 days 
- C: paid off that month 
- X: No loan for the month


TARGET VARIABLE: User at risk (1/0). 1 if overdue for more than 60 days (label 2,3,4,5)

In [185]:
record.STATUS.value_counts()

STATUS
C    442031
0    383120
X    209230
1     11090
5      1693
2       868
3       320
4       223
Name: count, dtype: int64

In [186]:
# Create variable "begin_month" --> age of the account
begin_month = pd.DataFrame(record.groupby(["ID"])["MONTHS_BALANCE"].agg(min))
begin_month = begin_month.rename(columns={'MONTHS_BALANCE':'begin_month'}) 

new_data = pd.merge(data,begin_month,how="left",on="ID") #merge to record data

  begin_month = pd.DataFrame(record.groupby(["ID"])["MONTHS_BALANCE"].agg(min))


In [187]:
record['dep_value'] = np.where(record['STATUS'].isin(['2','3','4','5']), 'Yes', None)

# Flag customers who have been late 60+ days at least once
cpunt = record.groupby('ID').count()
cpunt['dep_value'] = np.where(cpunt['dep_value'] >0, 'Yes','No')
cpunt = cpunt[['dep_value']]

new_data = pd.merge(new_data, cpunt, how='inner', on='ID')

new_data['target'] = new_data['dep_value']
new_data['target'] = np.where(new_data['target']=='Yes', 1, 0)

new_data.target.value_counts()


target
0    35841
1      616
Name: count, dtype: int64

#### Binary Variables

**Gender**

In [188]:
print(new_data['CODE_GENDER'].value_counts())

new_data['CODE_GENDER'] = np.where(new_data['CODE_GENDER']=='M', 1, 0)

CODE_GENDER
F    24430
M    12027
Name: count, dtype: int64


In [189]:
new_data['CODE_GENDER'].isnull().sum()

0

**Own a car**

In [190]:
print(new_data['FLAG_OWN_CAR'].value_counts())

new_data['FLAG_OWN_CAR'] = np.where(new_data['FLAG_OWN_CAR']=='Y', 1, 0)

FLAG_OWN_CAR
N    22614
Y    13843
Name: count, dtype: int64


**Own Realty**

In [191]:
print(new_data['FLAG_OWN_REALTY'].value_counts())

new_data['FLAG_OWN_REALTY'] = np.where(new_data['FLAG_OWN_REALTY']=='Y', 1, 0)

FLAG_OWN_REALTY
Y    24506
N    11951
Name: count, dtype: int64


**Own Phone**

In [192]:
print(new_data['FLAG_PHONE'].value_counts())

FLAG_PHONE
0    25709
1    10748
Name: count, dtype: int64


**Own Work Phone**

In [193]:
new_data['FLAG_WORK_PHONE'] = np.where(new_data['FLAG_WORK_PHONE']=='Y', 1, 0)

**Own Email**

In [194]:
new_data['FLAG_EMAIL'].value_counts()

FLAG_EMAIL
0    33186
1     3271
Name: count, dtype: int64

#### Numerical Variables

**Children Count**

In [195]:
new_data['CNT_CHILDREN'].value_counts()

CNT_CHILDREN
0     25201
1      7492
2      3256
3       419
4        63
5        20
14        3
7         2
19        1
Name: count, dtype: int64

In [15]:
new_data['CNT_CHILDREN'] = np.where(new_data['CNT_CHILDREN']>=2, '2+ childrens',
                                    np.where(new_data['CNT_CHILDREN']==1, '1', '0'))

print(new_data['CNT_CHILDREN'].value_counts())
new_data = pd.get_dummies(new_data, columns=['CNT_CHILDREN'])

CNT_CHILDREN
0               25201
1                7492
2+ childrens     3764
Name: count, dtype: int64


**Income**

In [196]:
new_data['AMT_INCOME_TOTAL'].describe()

count    3.645700e+04
mean     1.866857e+05
std      1.017892e+05
min      2.700000e+04
25%      1.215000e+05
50%      1.575000e+05
75%      2.250000e+05
max      1.575000e+06
Name: AMT_INCOME_TOTAL, dtype: float64

In [197]:
new_data['AMT_INCOME_TOTAL'] = new_data['AMT_INCOME_TOTAL']#/10000

In [18]:
new_data['AMT_INCOME_TOTAL'] = pd.qcut(new_data['AMT_INCOME_TOTAL'], q = 3, labels = ["low","medium", "high"])
new_data['AMT_INCOME_TOTAL'].value_counts()

AMT_INCOME_TOTAL
low       14473
high      11282
medium    10702
Name: count, dtype: int64

In [19]:
new_data = pd.get_dummies(new_data, columns=['AMT_INCOME_TOTAL'])

**Age**

In [198]:
new_data['AGE'] = new_data['DAYS_BIRTH']/-365

# new_data['AGE'] = pd.qcut(new_data['AGE'], q = 3, labels = ["low","medium", "high"])

In [199]:
new_data['AGE'].value_counts()

AGE
34.728767    54
42.517808    54
46.290411    38
40.183562    37
41.479452    32
             ..
45.282192     1
63.958904     1
59.375342     1
38.893151     1
25.172603     1
Name: count, Length: 7183, dtype: int64

In [200]:
new_data = new_data.drop(columns=['DAYS_BIRTH'])

In [22]:
new_data = pd.get_dummies(new_data, columns=['AGE'])

**Working Years**

In [201]:
new_data['YEARS_EMPLOYED'] = - new_data['DAYS_EMPLOYED'] / 365	
new_data['YEARS_EMPLOYED'] = np.where(new_data['YEARS_EMPLOYED']<0, np.nan,new_data['YEARS_EMPLOYED'])


# new_data['YEARS_EMPLOYED'] = pd.qcut(new_data['YEARS_EMPLOYED'], q = 5, labels = ["lowest","low","medium","high","highest"])
# new_data['YEARS_EMPLOYED'].value_counts()

In [202]:
new_data['YEARS_EMPLOYED'] = new_data['YEARS_EMPLOYED'].fillna(0)

In [24]:
new_data = pd.get_dummies(new_data, columns=['YEARS_EMPLOYED'])

**Family Size**

In [204]:
new_data['CNT_FAM_MEMBERS'].value_counts()

CNT_FAM_MEMBERS
2.0     19463
1.0      6987
3.0      6421
4.0      3106
5.0       397
6.0        58
7.0        19
15.0        3
9.0         2
20.0        1
Name: count, dtype: int64

In [26]:
new_data['CNT_FAM_MEMBERS'] = np.where(new_data['CNT_FAM_MEMBERS']>=3, '3+ members', 
                                       np.where(new_data['CNT_FAM_MEMBERS']==2, '2',
                                                np.where(new_data['CNT_FAM_MEMBERS']==1, '1', '0')))

In [27]:
new_data = pd.get_dummies(new_data, columns=['CNT_FAM_MEMBERS'])

#### Categorical Variables

**Occupation**

In [205]:
# Define the occupation categories
laborwk_categories = ['Cleaning staff', 'Cooking staff', 'Drivers', 'Laborers', 'Low-skill Laborers', 'Security staff', 'Waiters/barmen staff']
officewk_categories = ['Accountants', 'Core staff', 'HR staff', 'Medicine staff', 'Private service staff', 'Realty agents', 'Sales staff', 'Secretaries']
hightecwk_categories = ['Managers', 'High skill tech staff', 'IT staff']

new_data['OCCUPATION_TYPE'] = np.where(new_data['OCCUPATION_TYPE'].isin(laborwk_categories), 'labor', 
                                       np.where(new_data['OCCUPATION_TYPE'].isin(officewk_categories), 'office',
                                                np.where(new_data['OCCUPATION_TYPE'].isin(hightecwk_categories), 'hightec', 'other')))

In [206]:
new_data.OCCUPATION_TYPE.value_counts()

OCCUPATION_TYPE
other      11323
labor      10496
office     10183
hightec     4455
Name: count, dtype: int64

In [207]:
new_data = pd.get_dummies(new_data, columns=['OCCUPATION_TYPE'])

**Income**

In [208]:
new_data['NAME_INCOME_TYPE'] = np.where(new_data['NAME_INCOME_TYPE'].isin(['Pensioner', 'Student']), 'State servant' ,new_data['NAME_INCOME_TYPE'])

In [209]:
new_data.NAME_INCOME_TYPE.value_counts()

NAME_INCOME_TYPE
Working                 18819
State servant            9148
Commercial associate     8490
Name: count, dtype: int64

In [210]:
## Create dummy variables - one hot encoding
new_data = pd.get_dummies(new_data, columns=['NAME_INCOME_TYPE'])

**House Type**

In [211]:
new_data['NAME_HOUSING_TYPE'].value_counts()

NAME_HOUSING_TYPE
House / apartment      32548
With parents            1776
Municipal apartment     1128
Rented apartment         575
Office apartment         262
Co-op apartment          168
Name: count, dtype: int64

In [212]:
new_data = pd.get_dummies(new_data, columns=['NAME_HOUSING_TYPE'])

**Acedemic level**

In [213]:
new_data['NAME_EDUCATION_TYPE'].value_counts()

NAME_EDUCATION_TYPE
Secondary / secondary special    24777
Higher education                  9864
Incomplete higher                 1410
Lower secondary                    374
Academic degree                     32
Name: count, dtype: int64

In [214]:
new_data['NAME_EDUCATION_TYPE'] = np.where(new_data['NAME_EDUCATION_TYPE']=='Academic degree', 'Higher education', new_data['NAME_EDUCATION_TYPE'])

# dummies
new_data = pd.get_dummies(new_data, columns=['NAME_EDUCATION_TYPE'])

**Marriage Condition**

In [215]:
new_data['NAME_FAMILY_STATUS'].value_counts()

NAME_FAMILY_STATUS
Married                 25048
Single / not married     4829
Civil marriage           2945
Separated                2103
Widow                    1532
Name: count, dtype: int64

In [216]:
new_data = pd.get_dummies(new_data, columns=['NAME_FAMILY_STATUS'])

## Models

In [217]:
new_data.columns

Index(['ID', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE',
       'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'begin_month', 'dep_value', 'target',
       'AGE', 'YEARS_EMPLOYED', 'OCCUPATION_TYPE_hightec',
       'OCCUPATION_TYPE_labor', 'OCCUPATION_TYPE_office',
       'OCCUPATION_TYPE_other', 'NAME_INCOME_TYPE_Commercial associate',
       'NAME_INCOME_TYPE_State servant', 'NAME_INCOME_TYPE_Working',
       'NAME_HOUSING_TYPE_Co-op apartment',
       'NAME_HOUSING_TYPE_House / apartment',
       'NAME_HOUSING_TYPE_Municipal apartment',
       'NAME_HOUSING_TYPE_Office apartment',
       'NAME_HOUSING_TYPE_Rented apartment', 'NAME_HOUSING_TYPE_With parents',
       'NAME_EDUCATION_TYPE_Higher education',
       'NAME_EDUCATION_TYPE_Incomplete higher',
       'NAME_EDUCATION_TYPE_Lower secondary',
       'NAME_EDUCATION_TYPE_Secondary / secondary special',
       'NAME_FAMILY_STATUS_Civil marriage', 'NA

In [218]:
X = new_data.drop(['ID','target','dep_value'], axis=1)
y = new_data['target']

In [219]:
from sklearn.model_selection import train_test_split

In [220]:
X_train, X_test, y_train, y_test = train_test_split(X,y, 
                                                    stratify=y, test_size=0.3,
                                                    random_state = 10086)

In [221]:
##SMOTE oversampling
from imblearn.over_sampling import SMOTE

# Apply SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

In [222]:
y_res.value_counts()

target
0    25088
1    25088
Name: count, dtype: int64

In [223]:
y_train.value_counts()

target
0    25088
1      431
Name: count, dtype: int64

In [225]:
numerical_columns = ['CNT_CHILDREN','AMT_INCOME_TOTAL','CNT_FAM_MEMBERS','begin_month', 'AGE', 'YEARS_EMPLOYED']

In [226]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_res[numerical_columns] = scaler.fit_transform(X_res[numerical_columns])

In [180]:
X_res.columns

Index(['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'FLAG_MOBIL',
       'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS',
       'begin_month', 'AGE', 'YEARS_EMPLOYED', 'OCCUPATION_TYPE_hightec',
       'OCCUPATION_TYPE_labor', 'OCCUPATION_TYPE_office',
       'OCCUPATION_TYPE_other', 'NAME_INCOME_TYPE_Commercial associate',
       'NAME_INCOME_TYPE_State servant', 'NAME_INCOME_TYPE_Working',
       'NAME_HOUSING_TYPE_Co-op apartment',
       'NAME_HOUSING_TYPE_House / apartment',
       'NAME_HOUSING_TYPE_Municipal apartment',
       'NAME_HOUSING_TYPE_Office apartment',
       'NAME_HOUSING_TYPE_Rented apartment', 'NAME_HOUSING_TYPE_With parents',
       'NAME_EDUCATION_TYPE_Higher education',
       'NAME_EDUCATION_TYPE_Incomplete higher',
       'NAME_EDUCATION_TYPE_Lower secondary',
       'NAME_EDUCATION_TYPE_Secondary / secondary special',
       'NAME_FAMILY_STATUS_Civil marriage', '

In [227]:
X_train, y_train = X_res, y_res

**Logistic Regression**

In [234]:
from sklearn.metrics import accuracy_score, confusion_matrix, auc, roc_curve, precision_score, recall_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
import seaborn as sns


model = LogisticRegression(C=0.8,
                           random_state=42,
                           solver='lbfgs')


model.fit(X_train, y_train)

X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])
y_predict = model.predict(X_test)

## METRICS
# Accuracy
log_accuracy = accuracy_score(y_test, y_predict)

# Precision
log_precision = precision_score(y_test, y_predict)

# Recall
log_recall = recall_score(y_test, y_predict)

# AUC
log_auc = roc_auc_score(y_test, y_predict)

pd.DataFrame(confusion_matrix(y_test,y_predict))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,0,1
0,10753,0
1,185,0


In [235]:
## Metrics dataframe
log_metrics = pd.DataFrame({'Logistic Regression': [log_accuracy, log_precision, log_recall, log_auc]}, columns = ['Logistic Regression'], index=['Accuracy', 'Precision', 'Recall', 'AUC'])
log_metrics

Unnamed: 0,Logistic Regression
Accuracy,0.983086
Precision,0.0
Recall,0.0
AUC,0.5


**CART**

In [236]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(max_depth=12,
                               min_samples_split=8,
                               random_state=1024)
model.fit(X_train, y_train)


X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])
y_predict = model.predict(X_test)


## METRICS
# Accuracy
cart_accuracy = accuracy_score(y_test, y_predict)

# Precision
cart_precision = precision_score(y_test, y_predict)

# Recall
cart_recall = recall_score(y_test, y_predict)

# AUC
cart_auc = roc_auc_score(y_test, y_predict)

pd.DataFrame(confusion_matrix(y_test,y_predict))

Unnamed: 0,0,1
0,7272,3481
1,128,57


In [237]:
cart_metrics = pd.DataFrame({'Decision Tree': [cart_accuracy, cart_precision, cart_recall, cart_auc]}, columns = ['Decision Tree'], index=['Accuracy', 'Precision', 'Recall', 'AUC'])
cart_metrics

Unnamed: 0,Decision Tree
Accuracy,0.670049
Precision,0.016111
Recall,0.308108
AUC,0.492192


**Random Forest**

In [238]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=250,
                              max_depth=12,
                              min_samples_leaf=16
                              )
model.fit(X_train, y_train)


X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])
y_predict = model.predict(X_test)

## METRICS
# Accuracy
rf_accuracy = accuracy_score(y_test, y_predict)

# Precision
rf_precision = precision_score(y_test, y_predict)

# Recall
rf_recall = recall_score(y_test, y_predict)

# AUC
rf_auc = roc_auc_score(y_test, y_predict)

pd.DataFrame(confusion_matrix(y_test,y_predict))

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,0,1
0,10753,0
1,185,0


In [239]:
rf_metrics = pd.DataFrame({'Random Forest': [rf_accuracy, rf_precision, rf_recall, rf_auc]}, columns = ['Random Forest'], index=['Accuracy', 'Precision', 'Recall', 'AUC'])
rf_metrics  

Unnamed: 0,Random Forest
Accuracy,0.983086
Precision,0.0
Recall,0.0
AUC,0.5


**LightGBM**

In [241]:
from lightgbm import LGBMClassifier


model = LGBMClassifier(num_leaves=31,
                       max_depth=8, 
                       learning_rate=0.02,
                       n_estimators=250,
                       subsample = 0.8,
                       colsample_bytree =0.8
                      )
model.fit(X_train, y_train)


X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])
y_predict = model.predict(X_test)

## METRICS
# Accuracy
lgm_accuracy = accuracy_score(y_test, y_predict)

# Precision
lgm_precision = precision_score(y_test, y_predict)

# Recall
lgm_recall = recall_score(y_test, y_predict)

# AUC
lgm_auc = roc_auc_score(y_test, y_predict)

pd.DataFrame(confusion_matrix(y_test,y_predict))

[LightGBM] [Info] Number of positive: 25088, number of negative: 25088
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006072 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1337
[LightGBM] [Info] Number of data points in the train set: 50176, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,0,1
0,10753,0
1,185,0


In [242]:
lgm_metrics = pd.DataFrame({'LightGBM': [lgm_accuracy, lgm_precision, lgm_recall, lgm_auc]}, columns = ['LightGBM'], index=['Accuracy', 'Precision', 'Recall', 'AUC'])
lgm_metrics

Unnamed: 0,LightGBM
Accuracy,0.983086
Precision,0.0
Recall,0.0
AUC,0.5


**XGBoost**

In [243]:
from xgboost import XGBClassifier

model = XGBClassifier(max_depth=12,
                      n_estimators=250,
                      min_child_weight=8, 
                      subsample=0.8, 
                      learning_rate =0.02,    
                      seed=42)

model.fit(X_train, y_train)


X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])
y_predict = model.predict(X_test)

## METRICS
# Accuracy
xgb_accuracy = accuracy_score(y_test, y_predict)
# Precision
xgb_precision = precision_score(y_test, y_predict)
# Recall
xgb_recall = recall_score(y_test, y_predict)
# AUC
xgb_auc = roc_auc_score(y_test, y_predict)

pd.DataFrame(confusion_matrix(y_test,y_predict))

Unnamed: 0,0,1
0,10315,438
1,176,9


In [244]:
xgb_metrics = pd.DataFrame({'XGBoost': [xgb_accuracy, xgb_precision, xgb_recall, xgb_auc]}, columns = ['XGBoost'], index=['Accuracy', 'Precision', 'Recall', 'AUC'])
xgb_metrics

Unnamed: 0,XGBoost
Accuracy,0.943865
Precision,0.020134
Recall,0.048649
AUC,0.503958


**Summary**

In [245]:
metrics = pd.concat([log_metrics, cart_metrics, rf_metrics, lgm_metrics, xgb_metrics], axis=1)
metrics['Best Model'] = metrics.idxmax(axis=1)
metrics

Unnamed: 0,Logistic Regression,Decision Tree,Random Forest,LightGBM,XGBoost,Best Model
Accuracy,0.983086,0.670049,0.983086,0.983086,0.943865,Logistic Regression
Precision,0.0,0.016111,0.0,0.0,0.020134,XGBoost
Recall,0.0,0.308108,0.0,0.0,0.048649,Decision Tree
AUC,0.5,0.492192,0.5,0.5,0.503958,XGBoost


In [246]:
metrics

Unnamed: 0,Logistic Regression,Decision Tree,Random Forest,LightGBM,XGBoost,Best Model
Accuracy,0.983086,0.670049,0.983086,0.983086,0.943865,Logistic Regression
Precision,0.0,0.016111,0.0,0.0,0.020134,XGBoost
Recall,0.0,0.308108,0.0,0.0,0.048649,Decision Tree
AUC,0.5,0.492192,0.5,0.5,0.503958,XGBoost
