## IMPORT LIBRARIES AND DATA

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_val_predict

In [None]:
# Load the two raw CSV inputs: the application records go to `ar`, the
# credit records go to `cr`.
ar, cr = (
    pd.read_csv(csv_path)
    for csv_path in ("application_record.csv", "credit_record.csv")
)

## DATA PREPARATION


In [None]:
# Print a summary (columns, non-null counts, dtypes) of the application data.
ar.info()

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the credit records.
cr.info()

#### Add New Calculated Column

In [None]:
# AGE: DAYS_BIRTH with its sign flipped, floor-divided by 365.
ar["AGE"] = ar["DAYS_BIRTH"].mul(-1).floordiv(365)

In [None]:
# 365243 is zeroed out before WORKING_YEAR is derived from DAYS_EMPLOYED
# (presumably it is a "not employed" placeholder — TODO confirm against the
# data dictionary). The replacement is restricted to DAYS_EMPLOYED: a
# frame-wide replace would also rewrite any other cell (ID, income, ...) that
# happens to hold the value 365243.
ar["DAYS_EMPLOYED"] = ar["DAYS_EMPLOYED"].replace(365243, 0)

In [None]:
# WORKING_YEAR: rows whose DAYS_EMPLOYED lies strictly between -365 and 0 are
# counted as 1; all other rows get the sign-flipped DAYS_EMPLOYED
# floor-divided by 365.
ar["WORKING_YEAR"] = np.where(
    ar["DAYS_EMPLOYED"].gt(-365) & ar["DAYS_EMPLOYED"].lt(0),
    1,
    ar["DAYS_EMPLOYED"].mul(-1).floordiv(365),
)

#### Handling Missing Values

In [None]:
# Rows with no occupation recorded and zero working years are labelled
# 'Not working'; every other row keeps its current value.
ar['OCCUPATION_TYPE'] = np.where(
    ar['OCCUPATION_TYPE'].isna() & ar['WORKING_YEAR'].eq(0),
    'Not working',
    ar['OCCUPATION_TYPE'],
)

In [None]:
# Any occupation that is still missing is filled with the label 'Others'.
ar['OCCUPATION_TYPE'] = ar['OCCUPATION_TYPE'].fillna('Others')

#### Data Transformation

In [None]:
# Encode the two-valued text columns as 0/1. Values outside the mapping
# become NaN, as with any Series.map call.
ar["CODE_GENDER"] = ar["CODE_GENDER"].map({"F": 0, "M": 1})
for col in ("FLAG_OWN_CAR", "FLAG_OWN_REALTY"):
    ar[col] = ar[col].map({"N": 0, "Y": 1})

In [None]:
# Categorical text columns that are turned into integer codes.
features = [
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
]

# A fresh LabelEncoder is fitted per column; only the encoder of the last
# column stays bound to `le` after the loop.
for col in features:
    le = preprocessing.LabelEncoder().fit(ar[col].values)
    ar[col] = le.transform(ar[col].values)

In [None]:
# Collapse the raw status codes into a binary flag:
# "X", "C" and "0" -> 1 (good record); "1" through "5" -> 0 (bad record).
cr["STATUS"] = cr["STATUS"].map(
    {
        **dict.fromkeys(("X", "C", "0"), 1),
        **dict.fromkeys(("1", "2", "3", "4", "5"), 0),
    }
)

In [None]:
# NOTE(review): `grouped` is not referenced again in the visible notebook.
grouped = cr.groupby(by='ID')

# Reshape to one row per ID and one column per MONTHS_BALANCE value; each
# cell holds the binary STATUS for that month (NaN where there is no record).
pivot_tb = pd.pivot(cr, index='ID', columns='MONTHS_BALANCE', values='STATUS')

#### Status Classification

In [None]:
# Count, per ID, how many monthly records are good (1) and how many are bad (0).
# The month columns are picked by label (the MONTHS_BALANCE values are
# numeric, every helper column is a string) instead of by a fixed position
# range, so the counts stay correct however many months the data contains.
month_cols = [c for c in pivot_tb.columns if not isinstance(c, str)]
pivot_tb['GOOD'] = pivot_tb[month_cols].eq(1).sum(axis=1)
pivot_tb['BAD'] = pivot_tb[month_cols].eq(0).sum(axis=1)

In [None]:
# Turn the ID index back into a regular column so it can be used as a merge key.
pivot_tb = pivot_tb.reset_index()

In [None]:
# Final label per ID: 0 when bad records outnumber good ones, or when any of
# the month columns 0, -1 or -2 holds a bad record; 1 otherwise.
pivot_tb['STATUS'] = np.where(
    pivot_tb['GOOD'].lt(pivot_tb['BAD'])
    | pivot_tb[[0, -1, -2]].eq(0).any(axis=1),
    0,
    1,
)

#### Join Table

In [None]:
# Inner join on ID: only applicants present in both tables are kept.
datacredit = pd.merge(ar, pivot_tb, how='inner', on=['ID'])

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the merged table.
datacredit.info()

In [None]:
# The raw day counters are removed; AGE and WORKING_YEAR were derived from
# them earlier.
datacredit.drop(columns=["DAYS_BIRTH", "DAYS_EMPLOYED"], inplace=True)

In [None]:
# Remove the per-month status columns and the GOOD / BAD counters that came
# in with the merge, keeping the final STATUS label.
# The columns are selected by label (month columns carry numeric labels, all
# other columns are strings) instead of the positional slice [-64:-1], which
# is only right when the pivot holds exactly 61 month columns and would
# otherwise drop application features or leave month columns behind.
monthly_cols = [c for c in datacredit.columns if not isinstance(c, str)]
datacredit.drop(columns=monthly_cols + ["GOOD", "BAD"], inplace=True)

In [None]:
# Remove the identifier and the feature columns that are not used for
# modelling. (An earlier variant removed only 'ID' and 'FLAG_MOBIL'.)
datacredit = datacredit.drop(
    columns=[
        'ID',
        'FLAG_MOBIL',
        'FLAG_EMAIL',
        'CODE_GENDER',
        'CNT_CHILDREN',
        'NAME_HOUSING_TYPE',
        'FLAG_WORK_PHONE',
        'CNT_FAM_MEMBERS',
        'FLAG_OWN_CAR',
        'FLAG_OWN_REALTY',
        'FLAG_PHONE',
        'NAME_INCOME_TYPE',
    ]
)

#### Drop Duplicates

In [None]:
# Keep the first occurrence of every fully identical row and discard the rest.
datacredit = datacredit[~datacredit.duplicated()]

In [None]:
# Show how many rows fall into each STATUS class (class balance check).
datacredit.groupby('STATUS').size()

In [None]:
#plt.figure(figsize=(18, 8))
#heatmap = sns.heatmap(datacredit.corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')
#heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':10}, pad=12);
#plt.show()

In [None]:
# Show the first 10 rows of the prepared modelling table.
datacredit.head(10)

## MODELING

In [None]:
# Target vector: the STATUS label. Feature matrix: every remaining column.
y = datacredit["STATUS"]
X = datacredit.drop(columns=['STATUS'])

In [None]:
# Random oversampling for the imbalanced classes: a RandomOverSampler seeded
# with random_state=0 resamples X and y, and the resulting class counts are
# shown as the cell output.
# NOTE(review): X_res / y_res are resampled from the full dataset, so any
# later split of them (hold-out or k-fold) can presumably place copies of the
# same row on both sides — scores computed that way are likely optimistic.
ros = RandomOverSampler(random_state=0)
X_res, y_res = ros.fit_resample(X,y)
y_res.value_counts()

In [None]:
# Split first, then oversample. The test rows are held out from the original
# (imbalanced) data and only the training partition is balanced afterwards.
# Splitting data that has already been oversampled puts copies of the same
# minority rows into both partitions, which leaks test rows into training and
# inflates every test score.
# `stratify=y` keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, random_state=0, stratify=y
)
X_train, y_train = RandomOverSampler(random_state=0).fit_resample(X_train, y_train)

#### MODEL 1 -- LOGISTIC REGRESSION

In [None]:
# Fit a default LogisticRegression on the training split, predict the test
# split and print the test accuracy.
model1 = LogisticRegression().fit(X_train, y_train)
y_predict = model1.predict(X_test)
print('Accuracy Score is', accuracy_score(y_true=y_test, y_pred=y_predict))

In [None]:
# Accuracy on the training split itself, for comparison with the test accuracy.
y_predictl = model1.predict(X_train)
print('Accuracy Score is', accuracy_score(y_true=y_train, y_pred=y_predictl))

In [None]:
# Per-class precision / recall / F1 of the logistic regression on the test split.
print(classification_report(y_true=y_test, y_pred=y_predict))

In [None]:
# Confusion-matrix heatmap for the logistic regression test predictions
# (actual classes on the y-axis, predicted classes on the x-axis).
f, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(
    confusion_matrix(y_true=y_test, y_pred=y_predict),
    annot=True,
    fmt=".0f",
    ax=ax,
)
ax.set_xlabel("Predict")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Horizontal bar chart of the fitted logistic-regression coefficients, one
# bar per input column.
# These are linear-model weights, not impurity-based (MDI) importances, so
# the chart is titled and labelled accordingly.
feature_names = X_res.columns
importances = model1.coef_[0]
forest_importances = pd.Series(importances, index=feature_names)

fig, ax = plt.subplots()
bars = ax.barh(feature_names, forest_importances)
ax.bar_label(bars)
ax.set_title("Logistic regression coefficients")
# On a horizontal bar chart the plotted values run along the x-axis.
ax.set_xlabel("Coefficient")
ax.set_ylabel("Feature")
fig.tight_layout()

#### K-fold Cross-Validation on Logistic Regression

In [None]:
# Per-fold scores of the logistic regression under 6-fold cross-validation.
cross_val_score(estimator=model1, X=X_res, y=y_res, cv=6)

In [None]:
# Mean of the 6-fold cross-validation scores (the folds are evaluated again here).
np.mean(cross_val_score(estimator=model1, X=X_res, y=y_res, cv=6))

In [None]:
# Out-of-fold predictions for every row; note this uses 5 folds, while the
# scores above use 6.
y_pred = cross_val_predict(estimator=model1, X=X_res, y=y_res, cv=5)

In [None]:
# Confusion-matrix heatmap for the logistic regression out-of-fold predictions.
f, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(
    confusion_matrix(y_true=y_res, y_pred=y_pred),
    annot=True,
    fmt=".0f",
    ax=ax,
)
ax.set_xlabel("Predict")
ax.set_ylabel("Actual")
plt.show()

#### MODEL 2 -- DECISION TREE

In [None]:
# Fit a DecisionTreeClassifier (seeded with random_state=125) on the training
# split, predict the test split and print the test accuracy.
model2 = DecisionTreeClassifier(random_state=125).fit(X_train, y_train)
y_predict2 = model2.predict(X_test)
print('Accuracy Score is', accuracy_score(y_true=y_test, y_pred=y_predict2))

In [None]:
# Accuracy on the training split itself, for comparison with the test accuracy.
y_predict2d = model2.predict(X_train)
print('Accuracy Score is', accuracy_score(y_true=y_train, y_pred=y_predict2d))

In [None]:
# Per-class precision / recall / F1 of the decision tree on the test split.
print(classification_report(y_true=y_test, y_pred=y_predict2))

In [None]:
# Confusion-matrix heatmap for the decision tree test predictions
# (actual classes on the y-axis, predicted classes on the x-axis).
f, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(
    confusion_matrix(y_true=y_test, y_pred=y_predict2),
    annot=True,
    fmt=".0f",
    ax=ax,
)
ax.set_xlabel("Predict")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Horizontal bar chart of the decision tree's impurity-based feature
# importances (`feature_importances_`), one bar per input column.
feature_names = X_res.columns
importances = model2.feature_importances_
forest_importances = pd.Series(importances, index=feature_names)

fig, ax = plt.subplots()
bars = ax.barh(feature_names, forest_importances)
ax.bar_label(bars)
ax.set_title("Feature importances using MDI")
# On a horizontal bar chart the importance values run along the x-axis, so
# the value label belongs there; the y-axis lists the features.
ax.set_xlabel("Mean decrease in impurity")
ax.set_ylabel("Feature")
fig.tight_layout()

#### K-fold Cross-Validation on Decision Tree

In [None]:
# Per-fold scores of the decision tree under 6-fold cross-validation.
cross_val_score(estimator=model2, X=X_res, y=y_res, cv=6)

In [None]:
# Mean of the 6-fold cross-validation scores (the folds are evaluated again here).
np.mean(cross_val_score(estimator=model2, X=X_res, y=y_res, cv=6))

In [None]:
# Out-of-fold predictions for every row; note this uses 5 folds, while the
# scores above use 6.
y_pred2 = cross_val_predict(estimator=model2, X=X_res, y=y_res, cv=5)

In [None]:
# Confusion-matrix heatmap for the decision tree out-of-fold predictions.
f, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(
    confusion_matrix(y_true=y_res, y_pred=y_pred2),
    annot=True,
    fmt=".0f",
    ax=ax,
)
ax.set_xlabel("Predict")
ax.set_ylabel("Actual")
plt.show()

#### MODEL 3 -- RANDOM FOREST

In [None]:
# Fit a RandomForestClassifier (seeded with random_state=125) on the training
# split, predict the test split and print the test accuracy.
model3 = RandomForestClassifier(random_state=125).fit(X_train, y_train)
y_predict3 = model3.predict(X_test)
print('Accuracy Score is', accuracy_score(y_true=y_test, y_pred=y_predict3))

In [None]:
# Accuracy on the training split itself, for comparison with the test accuracy.
y_predict3r = model3.predict(X_train)
print('Accuracy Score is', accuracy_score(y_true=y_train, y_pred=y_predict3r))

In [None]:
# Per-class precision / recall / F1 of the random forest on the test split.
print(classification_report(y_true=y_test, y_pred=y_predict3))

In [None]:
# Confusion-matrix heatmap for the random forest test predictions
# (actual classes on the y-axis, predicted classes on the x-axis).
f, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(
    confusion_matrix(y_true=y_test, y_pred=y_predict3),
    annot=True,
    fmt=".0f",
    ax=ax,
)
ax.set_xlabel("Predict")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Horizontal bar chart of the random forest's impurity-based feature
# importances, sorted so the most important feature ends up at the top.
feature_names = X_res.columns
importances = model3.feature_importances_
# Spread of each importance across the individual trees; computed for
# inspection, not drawn on the chart.
std = np.std([tree.feature_importances_ for tree in model3.estimators_], axis=0)
forest_importances = pd.Series(importances, index=feature_names)
forest_importances_sorted = forest_importances.sort_values(ascending=True)

fig, ax = plt.subplots()
bars = ax.barh(forest_importances_sorted.index, forest_importances_sorted)
ax.bar_label(bars)
ax.set_title("Feature importances using MDI")
# On a horizontal bar chart the importance values run along the x-axis, so
# the value label belongs there; the y-axis lists the features.
ax.set_xlabel("Mean decrease in impurity")
ax.set_ylabel("Feature")
fig.tight_layout()

#### K-fold Cross-Validation on Random Forest

In [None]:
# Per-fold scores of the random forest under 6-fold cross-validation.
cross_val_score(estimator=model3, X=X_res, y=y_res, cv=6)

In [None]:
# Mean of the 6-fold cross-validation scores (the folds are evaluated again here).
np.mean(cross_val_score(estimator=model3, X=X_res, y=y_res, cv=6))

In [None]:
# Out-of-fold predictions for every row; note this uses 5 folds, while the
# scores above use 6.
y_pred3 = cross_val_predict(estimator=model3, X=X_res, y=y_res, cv=5)

In [None]:
# Confusion-matrix heatmap for the random forest out-of-fold predictions.
f, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(
    confusion_matrix(y_true=y_res, y_pred=y_pred3),
    annot=True,
    fmt=".0f",
    ax=ax,
)
ax.set_xlabel("Predict")
ax.set_ylabel("Actual")
plt.show()

### SAVE MODEL

In [None]:
# Persist the fitted random forest (model3) to disk with pickle.
# The file is opened in a `with` block so the handle is flushed and closed
# even if pickling fails; the bare open() call left it open.
filename = 'credit_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model3, model_file)