# Loan Prediction EDA

## Import libraries and data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split ,KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, Binarizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

import graphviz

In [None]:
# Extract archive.zip into the current working directory using an IPython
# shell escape; -o overwrites existing files without prompting, so the cell
# can be re-run safely.
!unzip -o archive.zip

In [None]:
# Load the three CSV files into DataFrames.
# NOTE(review): presumably these files come from archive.zip extracted in the
# previous cell -- confirm.
TRAIN_CSV = 'Training Data.csv'
TEST_CSV = 'Test Data.csv'
SAMPLE_CSV = 'Sample Prediction Dataset.csv'

data_train = pd.read_csv(TRAIN_CSV)
data_test = pd.read_csv(TEST_CSV)
data_sample = pd.read_csv(SAMPLE_CSV)

## Data Preprocessing

In [None]:
# Preview the first rows of the training data.
data_train.head()

In [None]:
# Show summary statistics for the training data.
data_train.describe()

In [None]:
# Print column dtypes, non-null counts and memory usage.
data_train.info()

In [None]:
# Count missing values per column.
data_train.isnull().sum()

In [None]:
# Remove the CITY and STATE columns from data_train in place, then display
# the resulting frame.
# Re-running this cell raises KeyError because the columns are already gone.
data_train.drop(columns=['CITY', 'STATE'], inplace=True)
data_train

## Evaluation

In [None]:
# Split the frame by position: every column except the last is a feature (X),
# and the last column is the label (y).
# NOTE(review): any identifier-like column still present in data_train
# (e.g. a row id) would be treated as a feature here -- confirm against the
# head() output above.
X, y = data_train.iloc[:, :-1], data_train.iloc[:, -1]

In [None]:
# ML algorithms cannot be fit on features that contain text,
# so those features must be encoded as numbers first.
# Because several different algorithms are applied below, one-hot encoding is used.
X = pd.get_dummies(X)
X

In [None]:
# Hold out 20% of the rows for testing. The split is seeded for
# reproducibility and stratified on y so both parts keep the label ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
    stratify=y,
)

In [None]:
# Utility Function
def get_clf_eval(y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and summary metrics for a binary classifier.

    Args:
        y_test: True labels.
        pred: Predicted class labels. Required; the ``None`` default is kept
            only so existing call signatures stay valid.
        pred_proba: Scores for the positive class, used for ROC AUC. When
            omitted, the AUC figure is left out of the report.

    Raises:
        ValueError: If ``pred`` is not supplied.
    """
    if pred is None:
        raise ValueError('pred (predicted labels) is required')

    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)

    summary = 'Accuracy: {0:.4f}, Precision: {1:.4f}, Recall {2:.4f}, F1: {3:.4f}'.format(
        accuracy, precision, recall, f1)
    # AUC needs scores rather than hard labels, so it is only computed when
    # the caller actually passed them.
    if pred_proba is not None:
        roc_auc = roc_auc_score(y_test, pred_proba)
        summary += ', AUC:{0:.4f}'.format(roc_auc)

    print('Confusion Matrix')
    print(confusion)
    print(summary)

In [None]:
# Fit, predict and evaluate three classifiers on the same train/test split:
# a decision tree, a logistic regression and a random forest.

# Seed the estimators so repeated runs of this cell give the same metrics.
# The value matches the random_state passed to train_test_split.
MODEL_SEED = 11

# Create the estimators
dt_clf = DecisionTreeClassifier(random_state=MODEL_SEED)
lr_clf = LogisticRegression(random_state=MODEL_SEED)
rf_clf = RandomForestClassifier(random_state=MODEL_SEED)

# Fitting
dt_clf.fit(X_train, y_train)
lr_clf.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)

# Predicted class labels
dt_pred = dt_clf.predict(X_test)
lr_pred = lr_clf.predict(X_test)
rf_pred = rf_clf.predict(X_test)

# Predicted probability of the second class (column 1 of predict_proba)
dt_pred_proba = dt_clf.predict_proba(X_test)[:, 1]
lr_pred_proba = lr_clf.predict_proba(X_test)[:, 1]
rf_pred_proba = rf_clf.predict_proba(X_test)[:, 1]

# Evaluation, printed in the order: decision tree, logistic regression,
# random forest
get_clf_eval(y_test, dt_pred, dt_pred_proba)
get_clf_eval(y_test, lr_pred, lr_pred_proba)
get_clf_eval(y_test, rf_pred, rf_pred_proba)

In [None]:
# Plot Function
def precision_recall_curve_plot(y_test, pred_proba_c1):
    """Plot precision and recall as functions of the decision threshold.

    Args:
        y_test: True labels.
        pred_proba_c1: Predicted scores for the positive class.
    """
    # Get the threshold array and the precision/recall arrays evaluated at
    # those thresholds.
    precision_vals, recall_vals, threshold_vals = precision_recall_curve(y_test, pred_proba_c1)

    # The precision/recall arrays are sliced to the number of thresholds so
    # they line up with the x values.
    n_thresholds = threshold_vals.shape[0]

    # Thresholds on the x-axis, precision and recall on the y-axis;
    # precision is drawn as a dashed line.
    plt.figure(figsize=(8, 6))
    plt.plot(threshold_vals, precision_vals[:n_thresholds], linestyle='--', label='precision')
    plt.plot(threshold_vals, recall_vals[:n_thresholds], label='recall')

    # Re-tick the threshold axis in steps of 0.1 over its current range.
    x_min, x_max = plt.xlim()
    tick_positions = np.round(np.arange(x_min, x_max, 0.1), 2)
    plt.xticks(tick_positions)

    # Axis labels, legend and grid.
    plt.xlabel('Threshold value')
    plt.ylabel('Precision and Recall value')
    plt.legend()
    plt.grid()
    plt.show()

# Draw the precision/recall-vs-threshold plot for each model, in the order
# DecisionTreeClassifier, LogisticRegression, RandomForestClassifier.
for model_proba in (dt_pred_proba, lr_pred_proba, rf_pred_proba):
    precision_recall_curve_plot(y_test, model_proba)

### Summary
1. RandomForestClassifier performed much better than the other algorithms.
2. As the last plot shows, a custom threshold value of 0.3 could be used instead of the default.

In [None]:
# Display the random forest's predicted probabilities (column 1 of
# predict_proba) for the test set.
rf_pred_proba

In [None]:
# Re-evaluate the random forest predictions with a custom decision
# threshold of 0.3.
custom_threshold = 0.3

# Binarizer works on 2-D input, so reshape the 1-D probability array into a
# single column.
custom_pred_proba = rf_pred_proba.reshape(-1, 1)

# Probabilities strictly greater than the threshold become 1, the rest 0.
binarizer = Binarizer(threshold=custom_threshold)
custom_predict = binarizer.fit_transform(custom_pred_proba)

get_clf_eval(y_test, custom_predict, custom_pred_proba)

In [None]:
# Plot precision and recall against the threshold for the random forest.
# custom_pred_proba holds the same values as rf_pred_proba (reshaped to one
# column), so the curve is computed from the same probabilities.
precision_recall_curve_plot(y_test, custom_pred_proba)