In [1080]:
import pandas as pd
import numpy as np

In [1081]:
data = pd.read_csv('data/credit/application_record.csv')

record = pd.read_csv('data/credit/credit_record.csv')

In [1082]:
print(f"Shape: {data.shape}")
print(data.columns)
print(f"Shape: {record.shape}")
print(record.columns)

Shape: (438557, 18)
Index(['ID', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE',
       'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS'],
      dtype='object')
Shape: (1048575, 3)
Index(['ID', 'MONTHS_BALANCE', 'STATUS'], dtype='object')


Kaggle Notebook: https://www.kaggle.com/code/rikdifos/credit-card-approval-prediction-using-ml/notebook

## Feature Engineering

In [1083]:
record.ID.nunique()

45985

**Target variable**

- 0: 1-29 days past due 
- 1: 30-59 days past due 
- 2: 60-89 days overdue 
- 3: 90-119 days overdue 
- 4: 120-149 days overdue 
- 5: Overdue or bad debts, write-offs for more than 150 days 
- C: paid off that month 
- X: No loan for the month


TARGET VARIABLE: User at risk (1/0). 1 if overdue for more than 60 days (label 2,3,4,5)

**Approach 1**

Bad Customer: A customer is classified as bad if they have:
- More than 2 instances of a status between 1 and 4 (indicating being past due between 30 to 149 days) over their entire history, or
- Any instance of a status of 5 (indicating overdue or bad debts for more than 150 days), or
- A recent trend toward worsening status (e.g., moving from status 0 to 1 or higher in the last 6 months).

Good Customer: A customer is considered good if none of the above conditions are met.

In [1079]:
# def is_worsening(status_list):
#     for i in range(len(status_list) - 1):
#         # Check if the status is worsening
#         if status_list[i] < status_list[i + 1]:
#             return True
#     return False

# record_d = pd.get_dummies(data=record,columns=['STATUS'], prefix='', prefix_sep='').groupby('ID')[sorted(record['STATUS'].unique().tolist())].sum()
# record_d['bad_customer'] = np.where((record_d['1'] + record_d['2'] + record_d['3'] + record_d['4'] >= 2) | (record_d['5'] > 0), 1, 0)

# record_past_6_monhts = record.groupby('ID').head(6)
# record_past_6_monhts['STATUS'] = record_past_6_monhts['STATUS'].str.replace('C','-1')
# record_past_6_monhts['STATUS'] = record_past_6_monhts['STATUS'].str.replace('X','-2')

# # Group by Customer_ID and apply the function
# worsening_customers = record_past_6_monhts.groupby('ID')['STATUS'].apply(list).apply(is_worsening)

# worsening_customers_df = pd.DataFrame(worsening_customers).reset_index()
# worsening_customers_df['STATUS'] = np.where(worsening_customers_df['STATUS'] == True, 1, 0)
# worsening_customers_df.columns = ['ID','worsening']

# new_record = pd.merge(record_d, worsening_customers_df, on='ID', how='left')[['ID','bad_customer','worsening']]
# new_record['target'] = np.where(new_record['bad_customer'] == 1, 1, np.where(new_record['worsening'] == 1, 1, 0))
# new_record = new_record.drop(['bad_customer','worsening'], axis=1)

# new_data = pd.merge(data, record_d.reset_index()[['ID','bad_customer']], how='inner', on='ID')
# new_data.rename(columns={'bad_customer':'target'}, inplace=True)
# new_data

**Approach 2**

In [1084]:
# Create variable "begin_month" --> age of the account
begin_month = pd.DataFrame(record.groupby(["ID"])["MONTHS_BALANCE"].agg(min))
begin_month = begin_month.rename(columns={'MONTHS_BALANCE':'begin_month'}) 

new_data = pd.merge(data,begin_month,how="left",on="ID") #merge to record data


record['dep_value'] = np.where(record['STATUS'].isin(['2','3','4','5']), 'Yes', None)

# Flag customers who have been late 60+ days at least once
cpunt = record.groupby('ID').count()
cpunt['dep_value'] = np.where(cpunt['dep_value'] >0, 'Yes','No')
cpunt = cpunt[['dep_value']]

new_data = pd.merge(new_data, cpunt, how='inner', on='ID')

new_data['target'] = new_data['dep_value']
new_data['target'] = np.where(new_data['target']=='Yes', 1, 0)
new_data = new_data.drop(columns=['dep_value','begin_month'])
new_data.target.value_counts()

  begin_month = pd.DataFrame(record.groupby(["ID"])["MONTHS_BALANCE"].agg(min))


target
0    35841
1      616
Name: count, dtype: int64

#### Binary Variables

**Gender**

In [1085]:
print(new_data['CODE_GENDER'].value_counts())

new_data['CODE_GENDER'] = np.where(new_data['CODE_GENDER']=='M', 1, 0)

CODE_GENDER
F    24430
M    12027
Name: count, dtype: int64


**Own a car**

In [1086]:
print(new_data['FLAG_OWN_CAR'].value_counts())

new_data['FLAG_OWN_CAR'] = np.where(new_data['FLAG_OWN_CAR']=='Y', 1, 0)

FLAG_OWN_CAR
N    22614
Y    13843
Name: count, dtype: int64


**Own Realty**

In [1087]:
print(new_data['FLAG_OWN_REALTY'].value_counts())

new_data['FLAG_OWN_REALTY'] = np.where(new_data['FLAG_OWN_REALTY']=='Y', 1, 0)

FLAG_OWN_REALTY
Y    24506
N    11951
Name: count, dtype: int64


**Own Phone**

In [1088]:
print(new_data['FLAG_PHONE'].value_counts())

FLAG_PHONE
0    25709
1    10748
Name: count, dtype: int64


**Own Work Phone**

In [1089]:
new_data['FLAG_WORK_PHONE'] = np.where(new_data['FLAG_WORK_PHONE']=='Y', 1, 0)

**Own Email**

In [1090]:
new_data['FLAG_EMAIL'].value_counts()

FLAG_EMAIL
0    33186
1     3271
Name: count, dtype: int64

#### Numerical Variables

**Children Count**

In [1091]:
new_data['CNT_CHILDREN'].value_counts()

CNT_CHILDREN
0     25201
1      7492
2      3256
3       419
4        63
5        20
14        3
7         2
19        1
Name: count, dtype: int64

**Income**

In [1092]:
new_data['AMT_INCOME_TOTAL'].describe()

count    3.645700e+04
mean     1.866857e+05
std      1.017892e+05
min      2.700000e+04
25%      1.215000e+05
50%      1.575000e+05
75%      2.250000e+05
max      1.575000e+06
Name: AMT_INCOME_TOTAL, dtype: float64

In [1093]:
new_data['AMT_INCOME_TOTAL'] = new_data['AMT_INCOME_TOTAL']#/10000

**Age**

In [1094]:
new_data['AGE'] = new_data['DAYS_BIRTH']/-365

new_data = new_data.drop(columns=['DAYS_BIRTH'])

**Working Years**

In [1095]:
new_data['YEARS_EMPLOYED'] = - new_data['DAYS_EMPLOYED'] / 365	

new_data = new_data.drop(columns=['DAYS_EMPLOYED'])

**Family Size**

In [1097]:
new_data['CNT_FAM_MEMBERS'].value_counts()

CNT_FAM_MEMBERS
2.0     19463
1.0      6987
3.0      6421
4.0      3106
5.0       397
6.0        58
7.0        19
15.0        3
9.0         2
20.0        1
Name: count, dtype: int64

#### Categorical Variables

**Occupation**

In [1098]:
# Define the occupation categories
laborwk_categories = ['Cleaning staff', 'Cooking staff', 'Drivers', 'Laborers', 'Low-skill Laborers', 'Security staff', 'Waiters/barmen staff']
officewk_categories = ['Accountants', 'Core staff', 'HR staff', 'Medicine staff', 'Private service staff', 'Realty agents', 'Sales staff', 'Secretaries']
hightecwk_categories = ['Managers', 'High skill tech staff', 'IT staff']

new_data['OCCUPATION_TYPE'] = np.where(new_data['OCCUPATION_TYPE'].isin(laborwk_categories), 'labor', 
                                       np.where(new_data['OCCUPATION_TYPE'].isin(officewk_categories), 'office',
                                                np.where(new_data['OCCUPATION_TYPE'].isin(hightecwk_categories), 'hightec', 'other')))

In [1099]:
new_data.OCCUPATION_TYPE.value_counts()

OCCUPATION_TYPE
other      11323
labor      10496
office     10183
hightec     4455
Name: count, dtype: int64

In [1100]:
new_data = pd.get_dummies(new_data, columns=['OCCUPATION_TYPE'])

**Income**

In [1101]:
new_data['NAME_INCOME_TYPE'] = np.where(new_data['NAME_INCOME_TYPE'].isin(['Pensioner', 'Student']), 'State servant' ,new_data['NAME_INCOME_TYPE'])

In [1102]:
new_data.NAME_INCOME_TYPE.value_counts()

NAME_INCOME_TYPE
Working                 18819
State servant            9148
Commercial associate     8490
Name: count, dtype: int64

In [1103]:
## Create dummy variables - one hot encoding
new_data = pd.get_dummies(new_data, columns=['NAME_INCOME_TYPE'])

**House Type**

In [1104]:
new_data['NAME_HOUSING_TYPE'].value_counts()

NAME_HOUSING_TYPE
House / apartment      32548
With parents            1776
Municipal apartment     1128
Rented apartment         575
Office apartment         262
Co-op apartment          168
Name: count, dtype: int64

In [1105]:
new_data = pd.get_dummies(new_data, columns=['NAME_HOUSING_TYPE'])

**Acedemic level**

In [1106]:
new_data['NAME_EDUCATION_TYPE'].value_counts()

NAME_EDUCATION_TYPE
Secondary / secondary special    24777
Higher education                  9864
Incomplete higher                 1410
Lower secondary                    374
Academic degree                     32
Name: count, dtype: int64

In [1107]:
new_data['NAME_EDUCATION_TYPE'] = np.where(new_data['NAME_EDUCATION_TYPE']=='Academic degree', 'Higher education', new_data['NAME_EDUCATION_TYPE'])

# dummies
new_data = pd.get_dummies(new_data, columns=['NAME_EDUCATION_TYPE'])

**Marriage Condition**

In [1108]:
new_data['NAME_FAMILY_STATUS'].value_counts()

NAME_FAMILY_STATUS
Married                 25048
Single / not married     4829
Civil marriage           2945
Separated                2103
Widow                    1532
Name: count, dtype: int64

In [1109]:
new_data = pd.get_dummies(new_data, columns=['NAME_FAMILY_STATUS'])

## Models

In [1132]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [1113]:
df = new_data.copy().drop(['ID'], axis=1)

In [1114]:
train_df, test_df = train_test_split(df, stratify=df['target'], test_size=0.3, random_state = 42)

X_train, y_train = train_df.drop(['target'], axis=1), train_df['target']
X_test, y_test = test_df.drop(['target'], axis=1), test_df['target']

SMOTE Oversampling on Train set

In [1115]:
##SMOTE oversampling
from imblearn.over_sampling import SMOTE

# Apply SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

In [1116]:
y_res.value_counts()

target
0    25088
1    25088
Name: count, dtype: int64

In [1117]:
y_test.value_counts()

target
0    10753
1      185
Name: count, dtype: int64

Scale Numerical Features

In [1119]:
numerical_columns = ['AMT_INCOME_TOTAL','CNT_FAM_MEMBERS', 'AGE','YEARS_EMPLOYED','CNT_CHILDREN']
X_train, y_train = X_res, y_res

scaler = StandardScaler()

X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])

X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

**CART**

In [1121]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

y_predict = model.predict(X_test)

## METRICS
# Accuracy
cart_accuracy = accuracy_score(y_test, y_predict)

# Precision
cart_precision = precision_score(y_test, y_predict)

# Recall
cart_recall = recall_score(y_test, y_predict)

# AUC
cart_auc = roc_auc_score(y_test, y_predict)

pd.DataFrame(confusion_matrix(y_test,y_predict))

Unnamed: 0,0,1
0,10629,124
1,133,52


In [1122]:
cart_metrics = pd.DataFrame({'Decision Tree': [cart_accuracy, cart_precision, cart_recall, cart_auc]}, columns = ['Decision Tree'], index=['Accuracy', 'Precision', 'Recall', 'AUC'])
cart_metrics

Unnamed: 0,Decision Tree
Accuracy,0.976504
Precision,0.295455
Recall,0.281081
AUC,0.634775


**Random Forest**

In [1123]:
rf = RandomForestClassifier(class_weight ='balanced')  
rf.fit(X_train, y_train)
y_pred_rf= rf.predict(X_test) 

## METRICS
# Accuracy
rf_accuracy = accuracy_score(y_test, y_pred_rf)

# Precision
rf_precision = precision_score(y_test, y_pred_rf)

# Recall
rf_recall = recall_score(y_test, y_pred_rf)

# AUC
rf_auc = roc_auc_score(y_test, y_pred_rf)

pd.DataFrame(confusion_matrix(y_test,y_pred_rf))

Unnamed: 0,0,1
0,10667,86
1,142,43


In [1124]:
rf_metrics = pd.DataFrame({'Random Forest': [rf_accuracy, rf_precision, rf_recall, rf_auc]}, columns = ['Random Forest'], index=['Accuracy', 'Precision', 'Recall', 'AUC'])
rf_metrics  

Unnamed: 0,Random Forest
Accuracy,0.979155
Precision,0.333333
Recall,0.232432
AUC,0.612217


**LightGBM**

In [1127]:
model = LGBMClassifier()
model.fit(X_train, y_train)

y_predict = model.predict(X_test)

## METRICS
# Accuracy
lgm_accuracy = accuracy_score(y_test, y_predict)

# Precision
lgm_precision = precision_score(y_test, y_predict)

# Recall
lgm_recall = recall_score(y_test, y_predict)

# AUC
lgm_auc = roc_auc_score(y_test, y_predict)

pd.DataFrame(confusion_matrix(y_test,y_predict))

[LightGBM] [Info] Number of positive: 25088, number of negative: 25088
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004725 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1081
[LightGBM] [Info] Number of data points in the train set: 50176, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Unnamed: 0,0,1
0,10653,100
1,155,30


In [1128]:
lgm_metrics = pd.DataFrame({'LightGBM': [lgm_accuracy, lgm_precision, lgm_recall, lgm_auc]}, columns = ['LightGBM'], index=['Accuracy', 'Precision', 'Recall', 'AUC'])
lgm_metrics

Unnamed: 0,LightGBM
Accuracy,0.976687
Precision,0.230769
Recall,0.162162
AUC,0.576431


**XGBoost**

In [1129]:
model = XGBClassifier()

model.fit(X_train, y_train)

y_predict = model.predict(X_test)

## METRICS
# Accuracy
xgb_accuracy = accuracy_score(y_test, y_predict)
# Precision
xgb_precision = precision_score(y_test, y_predict)
# Recall
xgb_recall = recall_score(y_test, y_predict)
# AUC
xgb_auc = roc_auc_score(y_test, y_predict)

pd.DataFrame(confusion_matrix(y_test,y_predict))

Unnamed: 0,0,1
0,10651,102
1,146,39


In [1130]:
xgb_metrics = pd.DataFrame({'XGBoost': [xgb_accuracy, xgb_precision, xgb_recall, xgb_auc]}, columns = ['XGBoost'], index=['Accuracy', 'Precision', 'Recall', 'AUC'])
xgb_metrics

Unnamed: 0,XGBoost
Accuracy,0.977327
Precision,0.276596
Recall,0.210811
AUC,0.600663


**Summary**

In [1131]:
metrics = pd.concat([cart_metrics, rf_metrics, lgm_metrics, xgb_metrics], axis=1)
metrics['Best Model'] = metrics.idxmax(axis=1)
metrics

Unnamed: 0,Decision Tree,Random Forest,LightGBM,XGBoost,Best Model
Accuracy,0.976504,0.979155,0.976687,0.977327,Random Forest
Precision,0.295455,0.333333,0.230769,0.276596,Random Forest
Recall,0.281081,0.232432,0.162162,0.210811,Decision Tree
AUC,0.634775,0.612217,0.576431,0.600663,Decision Tree
