In [13]:
import numpy as np
import pandas as pd
import glob
import os
import pickle

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.io as pio

In [6]:
# Read the training set from the CSV in the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='train.csv', encoding='utf-8')
df.head(n=5)

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [7]:
# Summarize every column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26457 non-null  int64  
 1   gender         26457 non-null  object 
 2   car            26457 non-null  object 
 3   reality        26457 non-null  object 
 4   child_num      26457 non-null  int64  
 5   income_total   26457 non-null  float64
 6   income_type    26457 non-null  object 
 7   edu_type       26457 non-null  object 
 8   family_type    26457 non-null  object 
 9   house_type     26457 non-null  object 
 10  DAYS_BIRTH     26457 non-null  int64  
 11  DAYS_EMPLOYED  26457 non-null  int64  
 12  FLAG_MOBIL     26457 non-null  int64  
 13  work_phone     26457 non-null  int64  
 14  phone          26457 non-null  int64  
 15  email          26457 non-null  int64  
 16  occyp_type     18286 non-null  object 
 17  family_size    26457 non-null  float64
 18  begin_

# 2. Exploratory Data Analysis


## 2.1. `credit`

In [9]:
# Count how many rows fall into each value of the target column `credit`.
df['credit'].value_counts()

2.0    16968
1.0     6267
0.0     3222
Name: credit, dtype: int64

In [14]:
# Donut chart of the class balance of the target column `credit`.
credit_counts = df['credit'].value_counts()

# Derive the slice labels from the counted index so each slice is named after the value
# it actually counts, instead of relying on a hand-written list matching the sort order.
fig = go.Figure(data=[go.Pie(labels=[str(score) for score in credit_counts.index],
                             values=credit_counts, hole=0.3)])
fig.update_layout(title_text='<b>Target Ratio</b>', width=800, height=600)

# auto_open=False: write the HTML without launching a browser tab on every run,
# consistent with the other export cells in this notebook.
pio.write_html(fig, file='target_ratio.html', auto_open=False)
fig.show()

## 2.2. `gender`

In [15]:
# Count the rows per value of `gender`.
df['gender'].value_counts()

F    17697
M     8760
Name: gender, dtype: int64

In [42]:
# Donut chart of the overall gender split.
gender_counts = df['gender'].value_counts()

# Labels come from the counted index, so they stay paired with their counts even if the
# frequency order of the categories changes.
fig = go.Figure(data=[go.Pie(labels=gender_counts.index, values=gender_counts, hole=0.3)])
fig.update_layout(title_text='<b>Gender Ratio</b>', width=800, height=600)

pio.write_html(fig, file='gender_ratio.html', auto_open=False)
fig.show()

In [40]:
# Stack one donut per credit score to compare the gender mix within each score.
credit_levels = [2.0, 1.0, 0.0]
annotation_heights = [0.88, 0.5, 0.12]

fig = make_subplots(rows=len(credit_levels), cols=1,
                    specs=[[{'type': 'domain'}] for _ in credit_levels])

for row, level in enumerate(credit_levels, start=1):
    # Count once per score and reuse the result for both labels and values.
    level_counts = df.loc[df['credit'] == level, 'gender'].value_counts()
    fig.add_trace(go.Pie(labels=level_counts.index, values=level_counts, name=f'credit {level}'), row, 1)

fig.update_traces(hole=0.5, hoverinfo='label+percent+name')
fig.update_layout(
    title_text='<b>Gender Ratio for Each Credit Score</b>',
    width=800,
    height=1200,
    # One text annotation per donut, naming the credit score it shows.
    annotations=[dict(text=f'credit {level}', x=0.5, y=height, font_size=12, showarrow=False)
                 for level, height in zip(credit_levels, annotation_heights)],
)

pio.write_html(fig, file='gender_ratio_credit.html', auto_open=False)
fig.show()

## 2.3. `car`<br>
* 차량 소유 여부

In [41]:
# Count the rows per value of `car` (car ownership flag).
df['car'].value_counts()

N    16410
Y    10047
Name: car, dtype: int64

In [43]:
# Donut chart of car ownership.
car_counts = df['car'].value_counts()

# Translate the raw codes into readable labels by key, so a label can never be attached
# to the wrong count if the frequency order of 'N' and 'Y' changes.
car_labels = {'N': 'No', 'Y': 'Yes'}
fig = go.Figure(data=[go.Pie(labels=[car_labels.get(code, code) for code in car_counts.index],
                             values=car_counts, hole=0.3)])
fig.update_layout(title_text='<b>Car Ownership Ratio</b>', width=800, height=600)

pio.write_html(fig, file='car_ratio.html', auto_open=False)
fig.show()

In [44]:
# Stack one donut per credit score to compare car ownership within each score.
credit_levels = [2.0, 1.0, 0.0]
annotation_heights = [0.88, 0.5, 0.12]

fig = make_subplots(rows=len(credit_levels), cols=1,
                    specs=[[{'type': 'domain'}] for _ in credit_levels])

for row, level in enumerate(credit_levels, start=1):
    # Count once per score and reuse the result for both labels and values.
    level_counts = df.loc[df['credit'] == level, 'car'].value_counts()
    fig.add_trace(go.Pie(labels=level_counts.index, values=level_counts, name=f'credit {level}'), row, 1)

fig.update_traces(hole=0.5, hoverinfo='label+percent+name')
fig.update_layout(
    title_text='<b>Car Ownership Ratio for Each Credit Score</b>',
    width=800,
    height=1200,
    # One text annotation per donut, naming the credit score it shows.
    annotations=[dict(text=f'credit {level}', x=0.5, y=height, font_size=12, showarrow=False)
                 for level, height in zip(credit_levels, annotation_heights)],
)

pio.write_html(fig, file='car_ratio_credit.html', auto_open=False)
fig.show()

## 2.4. `reality`<br>
* 부동산 소유 여부

In [45]:
# Count the rows per value of `reality` (real-estate ownership flag).
df['reality'].value_counts()

Y    17830
N     8627
Name: reality, dtype: int64

In [50]:
# Donut chart of real-estate ownership.
reality_counts = df['reality'].value_counts()

# Translate the raw codes into readable labels by key, so a label can never be attached
# to the wrong count if the frequency order of 'Y' and 'N' changes.
reality_labels = {'Y': 'Yes', 'N': 'No'}
fig = go.Figure(data=[go.Pie(labels=[reality_labels.get(code, code) for code in reality_counts.index],
                             values=reality_counts, hole=0.3)])
fig.update_layout(title_text='<b>Real Estate Ownership Ratio</b>', width=800, height=600)

pio.write_html(fig, file='reality_ratio.html', auto_open=False)
fig.show()

In [51]:
# Stack one donut per credit score to compare real-estate ownership within each score.
credit_levels = [2.0, 1.0, 0.0]
annotation_heights = [0.88, 0.5, 0.12]

fig = make_subplots(rows=len(credit_levels), cols=1,
                    specs=[[{'type': 'domain'}] for _ in credit_levels])

for row, level in enumerate(credit_levels, start=1):
    # Count once per score and reuse the result for both labels and values.
    level_counts = df.loc[df['credit'] == level, 'reality'].value_counts()
    fig.add_trace(go.Pie(labels=level_counts.index, values=level_counts, name=f'credit {level}'), row, 1)

fig.update_traces(hole=0.5, hoverinfo='label+percent+name')
fig.update_layout(
    title_text='<b>Real Estate Ownership Ratio for Each Credit Score</b>',
    width=800,
    height=1200,
    # One text annotation per donut, naming the credit score it shows.
    annotations=[dict(text=f'credit {level}', x=0.5, y=height, font_size=12, showarrow=False)
                 for level, height in zip(credit_levels, annotation_heights)],
)

pio.write_html(fig, file='reality_ratio_credit.html', auto_open=False)
fig.show()

## 2.5. `child_num`<br>
* 자녀 수

In [57]:
# Bar chart of how many rows have each number of children.
child_counts = df['child_num'].value_counts()
fig = go.Figure([go.Bar(x=child_counts.index, y=child_counts)])
# Title spelling corrected ("Childern" -> "Children").
fig.update_layout(title_text='<b>The Number of Children</b>', width=800, height=600)

pio.write_html(fig, file='child_num.html', auto_open=False)
fig.show()

In [73]:
# Side-by-side donuts of the credit distribution: rows with no children vs. rows with at least one.
child_groups = [
    ('No Child', df['child_num'] == 0),
    ('Have Children', df['child_num'] != 0),
]

fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])
for col, (group_name, group_mask) in enumerate(child_groups, start=1):
    # Count once per group and reuse the result for both labels and values.
    group_counts = df.loc[group_mask, 'credit'].value_counts()
    fig.add_trace(go.Pie(labels=group_counts.index, values=group_counts, name=group_name), 1, col)

fig.update_traces(hole=0.5, hoverinfo='label+percent+name')

fig.update_layout(
    title_text='<b>Credit Ratio Difference Between Customer with children and One without them</b>',
    annotations=[
        dict(text='No Child', x=0.205, y=0.5, font_size=12, showarrow=False),
        dict(text='With Children', x=0.805, y=0.5, font_size=12, showarrow=False),
    ],
)

pio.write_html(fig, file='child_num_credit.html', auto_open=False)
fig.show()

## 2.6. `income_total`<br>
* 연간 소득

In [83]:
# Histogram of `income_total`, colored by credit score, with a marginal box plot.
fig = px.histogram(data_frame=df, x='income_total', color='credit', marginal='box')
fig.update_layout(
    title_text='<b>Distribution of Yearly Income</b>',
    width=1500,
    height=600,
)

pio.write_html(fig, file='income_total.html', auto_open=False)
fig.show()

## 2.7. `income_type`<br>
* 소득 분류

In [85]:
# Donut chart of the share of each income type.
income_type_counts = df['income_type'].value_counts()
fig = go.Figure(data=[go.Pie(labels=income_type_counts.index, values=income_type_counts, hole=0.3)])
fig.update_layout(
    title_text='<b>Income Type Ratio</b>',
    width=800,
    height=600,
)

pio.write_html(fig, file='income_type.html', auto_open=False)
fig.show()

In [87]:
# Stack one donut per credit score to compare the income-type mix within each score.
credit_levels = [2.0, 1.0, 0.0]
annotation_heights = [0.88, 0.5, 0.12]

fig = make_subplots(rows=len(credit_levels), cols=1,
                    specs=[[{'type': 'domain'}] for _ in credit_levels])

for row, level in enumerate(credit_levels, start=1):
    # Count once per score and reuse the result for both labels and values.
    level_counts = df.loc[df['credit'] == level, 'income_type'].value_counts()
    fig.add_trace(go.Pie(labels=level_counts.index, values=level_counts, name=f'credit {level}'), row, 1)

fig.update_traces(hole=0.5, hoverinfo='label+percent+name')
fig.update_layout(
    title_text='<b>Income Type Ratio for Each Credit Score</b>',
    width=800,
    height=1200,
    # One text annotation per donut, naming the credit score it shows.
    annotations=[dict(text=f'credit {level}', x=0.5, y=height, font_size=12, showarrow=False)
                 for level, height in zip(credit_levels, annotation_heights)],
)

pio.write_html(fig, file='income_type_credit.html', auto_open=False)
fig.show()

## 2.8. `edu_type`<br>
* 교육 수준

In [89]:
# Donut chart of the share of each education level.
edu_type_counts = df['edu_type'].value_counts()
fig = go.Figure(data=[go.Pie(labels=edu_type_counts.index, values=edu_type_counts, hole=0.3)])
fig.update_layout(
    title_text='<b>Education Level Ratio</b>',
    width=800,
    height=600,
)

pio.write_html(fig, file='edu_type.html', auto_open=False)
fig.show()

In [90]:
# Stack one donut per credit score to compare the education-level mix within each score.
credit_levels = [2.0, 1.0, 0.0]
annotation_heights = [0.88, 0.5, 0.12]

fig = make_subplots(rows=len(credit_levels), cols=1,
                    specs=[[{'type': 'domain'}] for _ in credit_levels])

for row, level in enumerate(credit_levels, start=1):
    # Count once per score and reuse the result for both labels and values.
    level_counts = df.loc[df['credit'] == level, 'edu_type'].value_counts()
    fig.add_trace(go.Pie(labels=level_counts.index, values=level_counts, name=f'credit {level}'), row, 1)

fig.update_traces(hole=0.5, hoverinfo='label+percent+name')
fig.update_layout(
    title_text='<b>Education Level Ratio for Each Credit Score</b>',
    width=800,
    height=1200,
    # One text annotation per donut, naming the credit score it shows.
    annotations=[dict(text=f'credit {level}', x=0.5, y=height, font_size=12, showarrow=False)
                 for level, height in zip(credit_levels, annotation_heights)],
)

pio.write_html(fig, file='edu_type_credit.html', auto_open=False)
fig.show()

## 2.9. `family_type`<br>
* 결혼 여부

In [91]:
# Donut chart of the share of each family type.
family_type_counts = df['family_type'].value_counts()
fig = go.Figure(data=[go.Pie(labels=family_type_counts.index, values=family_type_counts, hole=0.3)])
fig.update_layout(
    title_text='<b>Family Type Ratio</b>',
    width=800,
    height=600,
)

pio.write_html(fig, file='family_type.html', auto_open=False)
fig.show()

In [92]:
# Stack one donut per credit score to compare the family-type mix within each score.
credit_levels = [2.0, 1.0, 0.0]
annotation_heights = [0.88, 0.5, 0.12]

fig = make_subplots(rows=len(credit_levels), cols=1,
                    specs=[[{'type': 'domain'}] for _ in credit_levels])

for row, level in enumerate(credit_levels, start=1):
    # Count once per score and reuse the result for both labels and values.
    level_counts = df.loc[df['credit'] == level, 'family_type'].value_counts()
    fig.add_trace(go.Pie(labels=level_counts.index, values=level_counts, name=f'credit {level}'), row, 1)

fig.update_traces(hole=0.5, hoverinfo='label+percent+name')
fig.update_layout(
    title_text='<b>Family Type Ratio for Each Credit Score</b>',
    width=800,
    height=1200,
    # One text annotation per donut, naming the credit score it shows.
    annotations=[dict(text=f'credit {level}', x=0.5, y=height, font_size=12, showarrow=False)
                 for level, height in zip(credit_levels, annotation_heights)],
)

pio.write_html(fig, file='family_type_credit.html', auto_open=False)
fig.show()

## 2.10. `house_type`<br>
* 생활 방식

In [93]:
# Donut chart of the share of each house type.
house_type_counts = df['house_type'].value_counts()
fig = go.Figure(data=[go.Pie(labels=house_type_counts.index, values=house_type_counts, hole=0.3)])
fig.update_layout(
    title_text='<b>House Type Ratio</b>',
    width=800,
    height=600,
)

pio.write_html(fig, file='house_type.html', auto_open=False)
fig.show()

In [94]:
# Stack one donut per credit score to compare the house-type mix within each score.
credit_levels = [2.0, 1.0, 0.0]
annotation_heights = [0.88, 0.5, 0.12]

fig = make_subplots(rows=len(credit_levels), cols=1,
                    specs=[[{'type': 'domain'}] for _ in credit_levels])

for row, level in enumerate(credit_levels, start=1):
    # Count once per score and reuse the result for both labels and values.
    level_counts = df.loc[df['credit'] == level, 'house_type'].value_counts()
    fig.add_trace(go.Pie(labels=level_counts.index, values=level_counts, name=f'credit {level}'), row, 1)

fig.update_traces(hole=0.5, hoverinfo='label+percent+name')
fig.update_layout(
    title_text='<b>House Type Ratio for Each Credit Score</b>',
    width=800,
    height=1200,
    # One text annotation per donut, naming the credit score it shows.
    annotations=[dict(text=f'credit {level}', x=0.5, y=height, font_size=12, showarrow=False)
                 for level, height in zip(credit_levels, annotation_heights)],
)

pio.write_html(fig, file='house_type_credit.html', auto_open=False)
fig.show()

## 2.11. `DAYS_BIRTH`<br>
* 출생일

In [96]:
# Histogram of `DAYS_BIRTH`, colored by credit score, with a marginal box plot.
fig = px.histogram(data_frame=df, x='DAYS_BIRTH', color='credit', marginal='box')
fig.update_layout(
    title_text='<b>Distribution of Days Past Since Birth</b>',
    width=1500,
    height=600,
)

pio.write_html(fig, file='days_birth.html', auto_open=False)
fig.show()

## 2.11.1. `age`<br>
* 파생 변수로 `DAYS_BIRTH`에 절대값을 취한 후 365로 나눈 값, 즉 나이를 제안함

In [188]:
# Derived feature: age in years, i.e. the magnitude of `DAYS_BIRTH` divided by 365 and
# rounded to two decimals. Computed on the whole column at once rather than with a
# per-row Python lambda.
df['age'] = (df['DAYS_BIRTH'].abs() / 365).round(2)

In [189]:
# Histogram of the derived `age` column, colored by credit score, with a marginal box plot.
fig = px.histogram(data_frame=df, x='age', color='credit', marginal='box')
fig.update_layout(
    title_text='<b>Distribution of Age</b>',
    width=1500,
    height=600,
)

pio.write_html(fig, file='deriv_age.html', auto_open=False)
fig.show()

## 2.12. `DAYS_EMPLOYED`<br>
* 업무 시작일<br>
* 양수 값은 고용되지 <u>않은</u> 상태를 의미함

In [109]:
# Tally the distinct positive values of DAYS_EMPLOYED; the output shows a single
# value, 365243, accounting for all of them.
is_positive = df['DAYS_EMPLOYED'] > 0
df.loc[is_positive, 'DAYS_EMPLOYED'].value_counts()

365243    4438
Name: DAYS_EMPLOYED, dtype: int64

In [119]:
# Side-by-side donuts of the credit distribution: positive DAYS_EMPLOYED (unemployed)
# vs. negative DAYS_EMPLOYED (employed).
employment_groups = [
    ('Unemployed', df['DAYS_EMPLOYED'] > 0),
    ('Employed', df['DAYS_EMPLOYED'] < 0),
]

fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])
for col, (group_name, group_mask) in enumerate(employment_groups, start=1):
    # Count once per group and reuse the result for both labels and values.
    group_counts = df.loc[group_mask, 'credit'].value_counts()
    fig.add_trace(go.Pie(labels=group_counts.index, values=group_counts, name=group_name), 1, col)

fig.update_traces(hole=0.5, hoverinfo='label+percent+name')

fig.update_layout(
    title_text='<b>Credit Ratio Difference Between the Unemployed and the Employed</b>',
    annotations=[
        dict(text='Unemployed', x=0.195, y=0.5, font_size=12, showarrow=False),
        dict(text='Employed', x=0.7975, y=0.5, font_size=12, showarrow=False),
    ],
)

pio.write_html(fig, file='employed_credit.html', auto_open=False)
fig.show()

In [121]:
# Histogram of DAYS_EMPLOYED restricted to rows with a negative value (the employed).
employed_rows = df[df['DAYS_EMPLOYED'] < 0]
fig = px.histogram(data_frame=employed_rows, x='DAYS_EMPLOYED', color='credit', marginal='box')
fig.update_layout(
    title_text='<b>Distribution of Employed Days</b>',
    width=1500,
    height=600,
)

pio.write_html(fig, file='employed_days.html', auto_open=False)
fig.show()

## 2.13. `FLAG_MOBIL`<br>
* 휴대전화 소유 여부

In [122]:
# Donut chart of the `FLAG_MOBIL` values.
flag_mobil_counts = df['FLAG_MOBIL'].value_counts()
fig = go.Figure(data=[go.Pie(labels=flag_mobil_counts.index, values=flag_mobil_counts, hole=0.3)])
fig.update_layout(title_text='<b>Cell Phone Ownership Ratio</b>', width=800, height=600)

# This chart is displayed only and not exported. The previous commented-out export was
# removed because it targeted 'house_type.html' and would have overwritten the
# house-type chart if re-enabled.
fig.show()

## 2.14. `work_phone`<br>
* 업무용 휴대전화 소유 여부

In [125]:
# Donut chart of the `work_phone` flag values.
work_phone_counts = df['work_phone'].value_counts()
fig = go.Figure(data=[go.Pie(labels=work_phone_counts.index, values=work_phone_counts, hole=0.3)])
fig.update_layout(
    title_text='<b>Cell Phone for Business Purpose Ownership Ratio</b>',
    width=800,
    height=600,
)

pio.write_html(fig, file='work_phone.html', auto_open=False)
fig.show()

In [129]:
# Side-by-side donuts of the credit distribution: rows without a work phone vs. rows with one.
work_phone_groups = [
    ('No Work Phone', df['work_phone'] == 0),
    ('Owned Work Phone', df['work_phone'] != 0),
]

fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'domain'}, {'type': 'domain'}]])
for col, (group_name, group_mask) in enumerate(work_phone_groups, start=1):
    # Count once per group and reuse the result for both labels and values.
    group_counts = df.loc[group_mask, 'credit'].value_counts()
    fig.add_trace(go.Pie(labels=group_counts.index, values=group_counts, name=group_name), 1, col)

fig.update_traces(hole=0.5, hoverinfo='label+percent+name')

# The center labels previously read 'Unemployed' / 'Employed' (copied from the
# DAYS_EMPLOYED chart); they now name the groups actually plotted.
# x values are the horizontal centers of the two subplot domains, assuming the default
# make_subplots spacing -- TODO confirm if spacing is customized. xanchor='center' keeps
# the text centered on x regardless of label length.
fig.update_layout(title_text='<b>Credit Ratio Difference Between One Owned Cell Phone for Business Purpose and One Doesn\'t</b>',
                  annotations=[dict(text='No Work Phone', x=0.225, y=0.5, xanchor='center', font_size=12, showarrow=False),
                               dict(text='Owned Work Phone', x=0.775, y=0.5, xanchor='center', font_size=12, showarrow=False)])

pio.write_html(fig, file='work_phone_credit.html', auto_open=False)
fig.show()

## 2.15. `phone`<br>
* 유선전화 소유 여부

In [131]:
# Donut chart of the `phone` flag values.
phone_counts = df['phone'].value_counts()
fig = go.Figure(data=[go.Pie(labels=phone_counts.index, values=phone_counts, hole=0.3)])
fig.update_layout(
    title_text='<b>Phone Ownership Ratio</b>',
    width=800,
    height=600,
)

pio.write_html(fig, file='phone.html', auto_open=False)
fig.show()

In [133]:
# Stack one donut per credit score to compare the `phone` flag within each score.
credit_levels = [2.0, 1.0, 0.0]
annotation_heights = [0.88, 0.5, 0.12]

fig = make_subplots(rows=len(credit_levels), cols=1,
                    specs=[[{'type': 'domain'}] for _ in credit_levels])

for row, level in enumerate(credit_levels, start=1):
    # Count once per score and reuse the result for both labels and values.
    level_counts = df.loc[df['credit'] == level, 'phone'].value_counts()
    fig.add_trace(go.Pie(labels=level_counts.index, values=level_counts, name=f'credit {level}'), row, 1)

fig.update_traces(hole=0.5, hoverinfo='label+percent+name')
fig.update_layout(
    title_text='<b>Phone Ownership Ratio for Each Credit Score</b>',
    width=800,
    height=1200,
    # One text annotation per donut, naming the credit score it shows.
    annotations=[dict(text=f'credit {level}', x=0.5, y=height, font_size=12, showarrow=False)
                 for level, height in zip(credit_levels, annotation_heights)],
)

pio.write_html(fig, file='phone_credit.html', auto_open=False)
fig.show()

## 2.16. `email`<br>
* 이메일 소유 여부

In [136]:
# Donut chart of the `email` flag values.
email_counts = df['email'].value_counts()
fig = go.Figure(data=[go.Pie(labels=email_counts.index, values=email_counts, hole=0.3)])
fig.update_layout(
    title_text='<b>E-mail Ownership Ratio</b>',
    width=800,
    height=600,
)

pio.write_html(fig, file='email.html', auto_open=False)
fig.show()

In [137]:
# Stack one donut per credit score to compare the `email` flag within each score.
credit_levels = [2.0, 1.0, 0.0]
annotation_heights = [0.88, 0.5, 0.12]

fig = make_subplots(rows=len(credit_levels), cols=1,
                    specs=[[{'type': 'domain'}] for _ in credit_levels])

for row, level in enumerate(credit_levels, start=1):
    # Count once per score and reuse the result for both labels and values.
    level_counts = df.loc[df['credit'] == level, 'email'].value_counts()
    fig.add_trace(go.Pie(labels=level_counts.index, values=level_counts, name=f'credit {level}'), row, 1)

fig.update_traces(hole=0.5, hoverinfo='label+percent+name')
fig.update_layout(
    title_text='<b>E-mail Ownership Ratio for Each Credit Score</b>',
    width=800,
    height=1200,
    # One text annotation per donut, naming the credit score it shows.
    annotations=[dict(text=f'credit {level}', x=0.5, y=height, font_size=12, showarrow=False)
                 for level, height in zip(credit_levels, annotation_heights)],
)

pio.write_html(fig, file='email_credit.html', auto_open=False)
fig.show()

## 2.17. `occyp_type`<br>
* 직업 유형

In [139]:
# Donut chart of the share of each occupation type (value_counts skips missing values).
occyp_type_counts = df['occyp_type'].value_counts()
fig = go.Figure(data=[go.Pie(labels=occyp_type_counts.index, values=occyp_type_counts, hole=0.3)])
fig.update_layout(
    title_text='<b>Occupation Type Ratio</b>',
    width=800,
    height=600,
)

pio.write_html(fig, file='occyp.html', auto_open=False)
fig.show()

In [141]:
# Stack one donut per credit score to compare the occupation-type mix within each score.
credit_levels = [2.0, 1.0, 0.0]
annotation_heights = [0.88, 0.5, 0.12]

fig = make_subplots(rows=len(credit_levels), cols=1,
                    specs=[[{'type': 'domain'}] for _ in credit_levels])

for row, level in enumerate(credit_levels, start=1):
    # Count once per score and reuse the result for both labels and values.
    level_counts = df.loc[df['credit'] == level, 'occyp_type'].value_counts()
    fig.add_trace(go.Pie(labels=level_counts.index, values=level_counts, name=f'credit {level}'), row, 1)

fig.update_traces(hole=0.5, hoverinfo='label+percent+name')
fig.update_layout(
    title_text='<b>Occupation Type Ratio for Each Credit Score</b>',
    width=800,
    height=1200,
    # One text annotation per donut, naming the credit score it shows.
    annotations=[dict(text=f'credit {level}', x=0.5, y=height, font_size=12, showarrow=False)
                 for level, height in zip(credit_levels, annotation_heights)],
)

pio.write_html(fig, file='occyp_credit.html', auto_open=False)
fig.show()

## 2.18. `family_size`<br>
* 가족 규모

In [143]:
# Bar chart of how many rows have each family size.
family_size_counts = df['family_size'].value_counts()
fig = go.Figure([go.Bar(x=family_size_counts.index, y=family_size_counts)])
fig.update_layout(
    title_text='<b>Family Size</b>',
    width=800,
    height=600,
)

pio.write_html(fig, file='family_size.html', auto_open=False)
fig.show()

In [146]:
# Histogram of `family_size`, colored by credit score, with a marginal box plot.
fig = px.histogram(data_frame=df, x='family_size', color='credit', marginal='box')
fig.update_layout(
    title_text='<b>Distribution of Family Size</b>',
    width=800,
    height=600,
)

pio.write_html(fig, file='family_credit.html', auto_open=False)
fig.show()

## 2.19. `begin_month`<br>
* 신용카드 발급 월

In [149]:
# Histogram of `begin_month`, colored by credit score, with a marginal box plot.
fig = px.histogram(data_frame=df, x='begin_month', color='credit', marginal='box')
fig.update_layout(
    title_text='<b>Distribution of Begin Month</b>',
    width=1500,
    height=600,
)

pio.write_html(fig, file='begin_month.html', auto_open=False)
fig.show()

## 2.19.1. `holding_period`<br>
* 파생 변수로 `begin_month`에 절대값을 취한 후 12로 나눈 값, 즉 보유 기간(년)을 제안함

In [158]:
# Derived feature: card holding period in years, i.e. the magnitude of `begin_month`
# divided by 12 and rounded to two decimals. Computed on the whole column at once
# rather than with a per-row Python lambda.
df['holding_period'] = (df['begin_month'].abs() / 12).round(2)

In [160]:
# Histogram of the derived `holding_period` column, colored by credit score, with a marginal box plot.
fig = px.histogram(data_frame=df, x='holding_period', color='credit', marginal='box')
fig.update_layout(
    title_text='<b>Distribution of Holding Period</b>',
    width=1500,
    height=600,
)

pio.write_html(fig, file='deriv_holding_period.html', auto_open=False)
fig.show()

# 3. Feature Generation

## 3.1. `family_expense`<br>
* 파생 변수로 `income_total`을 `family_size`로 나눈 값, 즉 1인당 가족 부양비를 제안함

In [162]:
# Derived feature: total income divided by family size (income per family member).
df['family_expense'] = df['income_total'].div(df['family_size'])

In [165]:
# Histogram of the derived `family_expense` column, colored by credit score, with a marginal box plot.
fig = px.histogram(data_frame=df, x='family_expense', color='credit', marginal='box')
fig.update_layout(
    title_text='<b>Distribution of Family Expense</b>',
    width=1200,
    height=600,
)

pio.write_html(fig, file='deriv_family_expense.html', auto_open=False)
fig.show()

## 3.2. `employed_age`<br>
* 파생변수로 `age`에서 `DAYS_EMPLOYED`를 연 단위로 변환한 값을 뺀 값, 즉 현재 직장에 취업한 나이를 제안함<br>
* 미취업 상태인 경우 0 반환

In [191]:
# Years worked in the current job: where DAYS_EMPLOYED is negative, take its magnitude in
# years (two decimals); every other row gets 0.
# A single vectorized np.where over the column replaces the per-row np.where call, which
# built a throwaway 0-d array for every row.
days_employed = df['DAYS_EMPLOYED']
df['work_experience'] = np.where(days_employed < 0, (days_employed.abs() / 365).round(2), 0)

# Age at which the current job started: current age minus years worked.
df['employed_age'] = df['age'] - df['work_experience']

In [199]:
# Rows whose DAYS_EMPLOYED equals the sentinel 365243 get an employed_age of 0;
# all other rows keep the employed_age already computed.
d_unemp = {365243: 0}
sentinel_zeroes = df['DAYS_EMPLOYED'].map(d_unemp)  # 0 for sentinel rows, NaN elsewhere
df['employed_age'] = sentinel_zeroes.fillna(df['employed_age'])

In [203]:
# Histogram of `employed_age`, leaving out rows where it is exactly 0.
nonzero_employed_age = df[df['employed_age'] != 0]
fig = px.histogram(data_frame=nonzero_employed_age, x='employed_age', color='credit', marginal='box')
fig.update_layout(
    title_text='<b>Distribution of Employed Age</b>',
    width=1200,
    height=600,
)

pio.write_html(fig, file='deriv_employed_age.html', auto_open=False)
fig.show()