## Import all necessary libraries

In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')


from sklearn.model_selection import train_test_split

## Problem Statement
We intend to create an intelligent, accurate credit scoring system to assess the creditworthiness of credit applicants. Our goal is to build a model that can accurately predict whether an applicant is a good or a bad customer, i.e. whether the loan will be paid back.

## Load Data

We have three datasets:
1. customer information
2. credit bureau
3. external scoring

In [2]:
# Customer-level application data; the displayed columns include LOAN_WAS_PAID_BACK.
master = pd.read_csv(filepath_or_buffer="./data/raw/Customer_Information.csv")
master.head(n=5)

Unnamed: 0,customer_id,AGE,INCOME,GENDER,EDUCATION,LOAN_PURPOSE,APPLICATION_RECEIVE_TIME,HAS_APPLIED_BEFORE,HAS_INCOME_VERIFICATION,LOAN_WAS_PAID_BACK
0,36493,40,10000000.0,Female,Diploma,Working Capital,09:07:00,No,Yes,1
1,36494,39,15000000.0,Female,Bachelor Degree,Other,15:15:00,No,Yes,1
2,36495,28,15000000.0,Male,Bachelor Degree,Renovation,07:13:00,No,Yes,1
3,36496,28,4000000.0,Male,Bachelor Degree,Renovation,12:05:00,Yes,Yes,1
4,36497,38,10000000.0,Male,Diploma,Renovation,14:35:00,Yes,Yes,0


In [4]:
# Credit-bureau records, one row per customer_id in the preview.
bureau = pd.read_csv(filepath_or_buffer="./data/raw/Credit_Bureau_Data.csv")
bureau.head(n=5)

Unnamed: 0,customer_id,LOANS_WITHOUT_DELAYS,LOANS_WITH_DELAYS,KNOWN_ASSETS
0,36493,3,3,40000000.0
1,36494,3,0,20000000.0
2,36496,2,1,0.0
3,36497,1,0,40000000.0
4,36498,7,0,


In [5]:
# Externally supplied score per customer_id.
external = pd.read_csv(filepath_or_buffer="./data/raw/External_Score.csv")
external.head(n=5)

Unnamed: 0,customer_id,Score
0,36493,0.77
1,36494,0.93
2,36496,0.78
3,36497,0.81
4,36498,0.78


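Because we want to predict `LOAN_WAS_PAID_BACK`, we think that the score from `External_Score.csv` doesn't ... (**TODO:** finish this rationale). Only `master` and `bureau` are merged in the next step.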

### 1. Merge the Data

In [8]:
# Left-join the bureau features onto the customer table: every row of `master`
# is kept, and customers with no bureau record get NaN in the bureau columns.
# validate='many_to_one' makes pandas raise MergeError if `bureau` contains a
# repeated customer_id, instead of silently duplicating applicant rows (which
# the later exact-duplicate check would not catch if the bureau values differ).
data = pd.merge(
    master,
    bureau,
    on='customer_id',
    how='left',
    validate='many_to_one',
)
data.head()

Unnamed: 0,customer_id,AGE,INCOME,GENDER,EDUCATION,LOAN_PURPOSE,APPLICATION_RECEIVE_TIME,HAS_APPLIED_BEFORE,HAS_INCOME_VERIFICATION,LOAN_WAS_PAID_BACK,LOANS_WITHOUT_DELAYS,LOANS_WITH_DELAYS,KNOWN_ASSETS
0,36493,40,10000000.0,Female,Diploma,Working Capital,09:07:00,No,Yes,1,3.0,3.0,40000000.0
1,36494,39,15000000.0,Female,Bachelor Degree,Other,15:15:00,No,Yes,1,3.0,0.0,20000000.0
2,36495,28,15000000.0,Male,Bachelor Degree,Renovation,07:13:00,No,Yes,1,,,
3,36496,28,4000000.0,Male,Bachelor Degree,Renovation,12:05:00,Yes,Yes,1,2.0,1.0,0.0
4,36497,38,10000000.0,Male,Diploma,Renovation,14:35:00,Yes,Yes,0,1.0,0.0,40000000.0


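Before splitting, we check whether the data contains duplicated rows, to prevent data leakage (an identical record ending up in both the training and the test set).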

In [11]:
# Remove the identifier column so the duplicate check below compares the
# remaining applicant attributes only.
df = data.drop(columns=['customer_id'])
df

Unnamed: 0,AGE,INCOME,GENDER,EDUCATION,LOAN_PURPOSE,APPLICATION_RECEIVE_TIME,HAS_APPLIED_BEFORE,HAS_INCOME_VERIFICATION,LOAN_WAS_PAID_BACK,LOANS_WITHOUT_DELAYS,LOANS_WITH_DELAYS,KNOWN_ASSETS
0,40,10000000.0,Female,Diploma,Working Capital,09:07:00,No,Yes,1,3.0,3.0,40000000.0
1,39,15000000.0,Female,Bachelor Degree,Other,15:15:00,No,Yes,1,3.0,0.0,20000000.0
2,28,15000000.0,Male,Bachelor Degree,Renovation,07:13:00,No,Yes,1,,,
3,28,4000000.0,Male,Bachelor Degree,Renovation,12:05:00,Yes,Yes,1,2.0,1.0,0.0
4,38,10000000.0,Male,Diploma,Renovation,14:35:00,Yes,Yes,0,1.0,0.0,40000000.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6583,38,11000000.0,Female,Bachelor Degree,Working Capital,10:54:00,No,Yes,1,,,
6584,37,30000000.0,Female,Bachelor Degree,Education,07:04:00,No,Yes,1,1.0,0.0,30000000.0
6585,50,16000000.0,Female,Master's Degree/Post graduate,Renovation,10:47:00,No,Yes,1,1.0,0.0,
6586,36,5000000.0,Male,Bachelor Degree,Other,15:49:00,No,Yes,1,1.0,0.0,


In [17]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()

0

There are no duplicated rows in the data.

### 2. Split the Data