## 요약 정리
### 3가지 데이터의 null 값을 조사함
1. User_spec
    * 14개 컬럼에 null이 존재 
2. loan_result
    * 3개 컬럼에 null이 존재
3. log_data
    * 2개 컬럼에 null이 존재

### null 값이 존재하는 컬럼간의 관계

표기: `A == B`는 두 컬럼에서 null인 행의 집합이 서로 같음을, `A ⊂ B`는 A가 null인 행이 모두 B에서도 null임을 뜻한다.

1. User_spec
    * birth_year == gender
    * income_type == employment_type == houseown_type == desired_amount == purpose
    * personal_rehabilitation_yn ⊂ personal_rehabilitation_complete_yn
    * existing_loan_cnt ⊂ existing_loan_amt
    * income_type ⊂ yearly_income
    * desired_amount ⊂ company_enter_month
    * desired_amount ⊂ personal_rehabilitation_complete_yn
    * desired_amount ⊂ personal_rehabilitation_yn
2. loan_result
    * loan_limit == loan_rate
3. log_data
    * mp_os ⊂ mp_app_version

In [3]:
import pandas as pd
import os
import numpy as np

In [2]:
# Read the three raw tables from ./data, in the order user_spec, loan_result, log_data.
user_df, loan_df, log_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/user_spec.csv',
        './data/loan_result.csv',
        './data/log_data.csv',
    )
)

# null 값 조사

In [4]:
# Toy 3x3 frame with exactly one NaN per row, used to try out isnull() below.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
a = pd.DataFrame([
    [1, np.nan, 0],
    [np.nan, 3, 1],
    [0, 2, np.nan],
])

In [7]:
# Element-wise boolean mask of the toy frame: True marks a NaN cell.
a.isnull()

Unnamed: 0,0,1,2
0,False,True,False
1,True,False,False
2,False,False,True


In [9]:
# Total number of null cells in the toy frame.
# The builtin sum() iterates a DataFrame's column labels (0 + 1 + 2 here), so it
# matched the null count only by coincidence; sum the boolean mask per column and
# then across columns instead. int() keeps the displayed value a plain 3.
int(a.isnull().sum().sum())

3

In [12]:
# Number of null cells in each column of user_df.
user_df.isnull().sum()

application_id                               0
user_id                                      0
birth_year                               12961
gender                                   12961
insert_time                                  0
credit_score                            105115
yearly_income                               90
income_type                                 85
company_enter_month                     171760
employment_type                             85
houseown_type                               85
desired_amount                              85
purpose                                     85
personal_rehabilitation_yn              587461
personal_rehabilitation_complete_yn    1203354
existing_loan_cnt                       198556
existing_loan_amt                       313774
dtype: int64

In [13]:
# Number of null cells in each column of loan_df.
loan_df.isnull().sum()

application_id                 0
loanapply_insert_time          0
bank_id                        0
product_id                     0
loan_limit                  7495
loan_rate                   7495
is_applied               3257239
dtype: int64

In [14]:
# Number of null cells in each column of log_df.
log_df.isnull().sum()

user_id                0
event                  0
timestamp              0
mp_os                980
mp_app_version    660597
date_cd                0
dtype: int64

# null 값인 column 분석

## user_spec

In [55]:
# Column pairs to analyse: every user_df column that has nulls, paired with each
# column that sorts after it by (null count, column name).
null_counts = user_df.isnull().sum()  # computed once; index holds the column names
a = sorted((count, column) for column, count in null_counts.items())
for i, first in enumerate(a):
    if first[0] == 0:
        continue  # a column without nulls has nothing to compare
    # j restarts at 0 for every i: it is the offset inside a[i + 1:], not an index into a.
    for j, second in enumerate(a[i + 1:]):
        print(f'{i}-{j}', first, second)

3-0 (85, 'desired_amount') (85, 'employment_type')
3-1 (85, 'desired_amount') (85, 'houseown_type')
3-2 (85, 'desired_amount') (85, 'income_type')
3-3 (85, 'desired_amount') (85, 'purpose')
3-4 (85, 'desired_amount') (90, 'yearly_income')
3-5 (85, 'desired_amount') (12961, 'birth_year')
3-6 (85, 'desired_amount') (12961, 'gender')
3-7 (85, 'desired_amount') (105115, 'credit_score')
3-8 (85, 'desired_amount') (171760, 'company_enter_month')
3-9 (85, 'desired_amount') (198556, 'existing_loan_cnt')
3-10 (85, 'desired_amount') (313774, 'existing_loan_amt')
3-11 (85, 'desired_amount') (587461, 'personal_rehabilitation_yn')
3-12 (85, 'desired_amount') (1203354, 'personal_rehabilitation_complete_yn')
4-0 (85, 'employment_type') (85, 'houseown_type')
4-1 (85, 'employment_type') (85, 'income_type')
4-2 (85, 'employment_type') (85, 'purpose')
4-3 (85, 'employment_type') (90, 'yearly_income')
4-4 (85, 'employment_type') (12961, 'birth_year')
4-5 (85, 'employment_type') (12961, 'gender')
4-6 (85, 

### null인 속성들 중 수가 같은 경우, 서로 같은 개체인가?

birth_year == gender

income_type == employment_type == houseown_type == desired_amount == purpose

In [19]:
# Equal null counts do not prove the same rows are affected; compare the row labels.
birth_year_null_rows = set(user_df.index[user_df['birth_year'].isnull()])
gender_null_rows = set(user_df.index[user_df['gender'].isnull()])
birth_year_null_rows == gender_null_rows

True

In [36]:
# Do income_type and employment_type have nulls on exactly the same rows?
income_type_null_rows = set(user_df.index[user_df['income_type'].isnull()])
employment_type_null_rows = set(user_df.index[user_df['employment_type'].isnull()])
income_type_null_rows == employment_type_null_rows

True

In [37]:
# Do employment_type and houseown_type have nulls on exactly the same rows?
employment_type_null_rows = set(user_df.index[user_df['employment_type'].isnull()])
houseown_type_null_rows = set(user_df.index[user_df['houseown_type'].isnull()])
employment_type_null_rows == houseown_type_null_rows

True

In [38]:
# Do houseown_type and desired_amount have nulls on exactly the same rows?
houseown_type_null_rows = set(user_df.index[user_df['houseown_type'].isnull()])
desired_amount_null_rows = set(user_df.index[user_df['desired_amount'].isnull()])
houseown_type_null_rows == desired_amount_null_rows

True

In [39]:
# Do desired_amount and purpose have nulls on exactly the same rows?
desired_amount_null_rows = set(user_df.index[user_df['desired_amount'].isnull()])
purpose_null_rows = set(user_df.index[user_df['purpose'].isnull()])
desired_amount_null_rows == purpose_null_rows

True

### 수가 다르다면, 포함관계인가?

personal_rehabilitation_yn ⊂ personal_rehabilitation_complete_yn

existing_loan_cnt ⊂ existing_loan_amt

income_type ⊂ yearly_income

desired_amount ⊂ company_enter_month

desired_amount ⊂ personal_rehabilitation_complete_yn

desired_amount ⊂ personal_rehabilitation_yn

In [40]:
# Three equivalent ways to express that one set is contained in another
small, large = {1, 2}, {1, 2, 3}
print(
    small & large == small,   # True: intersecting with the superset returns the subset
    large.issuperset(small),  # True
    small.issubset(large),    # True
)

True True True


In [97]:
# How to express that two sets are disjoint (share no element)
base = {1, 2, 3}
print(
    base.isdisjoint({1, 2}),  # False: 1 and 2 are shared
    base.isdisjoint({4, 5}),  # True: nothing in common
)

False True


특정 컬럼

In [34]:
# Is every row with a null personal_rehabilitation_yn also null in
# personal_rehabilitation_complete_yn?
complete_yn_null_rows = set(user_df.index[user_df['personal_rehabilitation_complete_yn'].isnull()])
yn_null_rows = set(user_df.index[user_df['personal_rehabilitation_yn'].isnull()])
complete_yn_null_rows >= yn_null_rows

True

In [35]:
# Is every row with a null existing_loan_cnt also null in existing_loan_amt?
loan_amt_null_rows = set(user_df.index[user_df['existing_loan_amt'].isnull()])
loan_cnt_null_rows = set(user_df.index[user_df['existing_loan_cnt'].isnull()])
loan_amt_null_rows >= loan_cnt_null_rows

True

In [43]:
# Is every row with a null income_type also null in yearly_income?
yearly_income_null_rows = set(user_df.index[user_df['yearly_income'].isnull()])
income_type_null_rows = set(user_df.index[user_df['income_type'].isnull()])
yearly_income_null_rows >= income_type_null_rows

True

85 null(desired_amount, employment_type, houseown_type, income_type, purpose)

In [61]:
# Null rows of desired_amount vs. null rows of birth_year; each set is built once.
candidate_subset = set(user_df.index[user_df['desired_amount'].isnull()])
candidate_superset = set(user_df.index[user_df['birth_year'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [62]:
# Null rows of desired_amount vs. null rows of credit_score; each set is built once.
candidate_subset = set(user_df.index[user_df['desired_amount'].isnull()])
candidate_superset = set(user_df.index[user_df['credit_score'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [63]:
# Null rows of desired_amount vs. null rows of company_enter_month; each set is built once.
candidate_subset = set(user_df.index[user_df['desired_amount'].isnull()])
candidate_superset = set(user_df.index[user_df['company_enter_month'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

True False


In [64]:
# Null rows of desired_amount vs. null rows of existing_loan_cnt; each set is built once.
candidate_subset = set(user_df.index[user_df['desired_amount'].isnull()])
candidate_superset = set(user_df.index[user_df['existing_loan_cnt'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [65]:
# Null rows of desired_amount vs. null rows of existing_loan_amt; each set is built once.
candidate_subset = set(user_df.index[user_df['desired_amount'].isnull()])
candidate_superset = set(user_df.index[user_df['existing_loan_amt'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [66]:
# Null rows of desired_amount vs. null rows of personal_rehabilitation_yn; each set is built once.
candidate_subset = set(user_df.index[user_df['desired_amount'].isnull()])
candidate_superset = set(user_df.index[user_df['personal_rehabilitation_yn'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

True False


In [67]:
# Null rows of desired_amount vs. null rows of personal_rehabilitation_complete_yn;
# each set is built once.
candidate_subset = set(user_df.index[user_df['desired_amount'].isnull()])
candidate_superset = set(user_df.index[user_df['personal_rehabilitation_complete_yn'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

True False


90 null (yearly_income )

In [68]:
# Null rows of yearly_income vs. null rows of birth_year; each set is built once.
candidate_subset = set(user_df.index[user_df['yearly_income'].isnull()])
candidate_superset = set(user_df.index[user_df['birth_year'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [69]:
# Null rows of yearly_income vs. null rows of credit_score; each set is built once.
candidate_subset = set(user_df.index[user_df['yearly_income'].isnull()])
candidate_superset = set(user_df.index[user_df['credit_score'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [70]:
# Null rows of yearly_income vs. null rows of company_enter_month; each set is built once.
candidate_subset = set(user_df.index[user_df['yearly_income'].isnull()])
candidate_superset = set(user_df.index[user_df['company_enter_month'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [72]:
# Null rows of yearly_income vs. null rows of existing_loan_cnt; each set is built once.
candidate_subset = set(user_df.index[user_df['yearly_income'].isnull()])
candidate_superset = set(user_df.index[user_df['existing_loan_cnt'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [73]:
# Null rows of yearly_income vs. null rows of existing_loan_amt; each set is built once.
candidate_subset = set(user_df.index[user_df['yearly_income'].isnull()])
candidate_superset = set(user_df.index[user_df['existing_loan_amt'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [74]:
# Null rows of yearly_income vs. null rows of personal_rehabilitation_yn; each set is built once.
candidate_subset = set(user_df.index[user_df['yearly_income'].isnull()])
candidate_superset = set(user_df.index[user_df['personal_rehabilitation_yn'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [75]:
# Null rows of yearly_income vs. null rows of personal_rehabilitation_complete_yn;
# each set is built once.
candidate_subset = set(user_df.index[user_df['yearly_income'].isnull()])
candidate_superset = set(user_df.index[user_df['personal_rehabilitation_complete_yn'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


12961 null (birth_year, gender)

In [76]:
# Null rows of birth_year vs. null rows of credit_score; each set is built once.
candidate_subset = set(user_df.index[user_df['birth_year'].isnull()])
candidate_superset = set(user_df.index[user_df['credit_score'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [77]:
# Null rows of birth_year vs. null rows of company_enter_month; each set is built once.
candidate_subset = set(user_df.index[user_df['birth_year'].isnull()])
candidate_superset = set(user_df.index[user_df['company_enter_month'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [78]:
# Null rows of birth_year vs. null rows of existing_loan_cnt; each set is built once.
candidate_subset = set(user_df.index[user_df['birth_year'].isnull()])
candidate_superset = set(user_df.index[user_df['existing_loan_cnt'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [79]:
# Null rows of birth_year vs. null rows of existing_loan_amt; each set is built once.
candidate_subset = set(user_df.index[user_df['birth_year'].isnull()])
candidate_superset = set(user_df.index[user_df['existing_loan_amt'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [80]:
# Null rows of birth_year vs. null rows of personal_rehabilitation_yn; each set is built once.
candidate_subset = set(user_df.index[user_df['birth_year'].isnull()])
candidate_superset = set(user_df.index[user_df['personal_rehabilitation_yn'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [81]:
# Null rows of birth_year vs. null rows of personal_rehabilitation_complete_yn;
# each set is built once.
candidate_subset = set(user_df.index[user_df['birth_year'].isnull()])
candidate_superset = set(user_df.index[user_df['personal_rehabilitation_complete_yn'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


105115 null(credit_score)

In [82]:
# Null rows of credit_score vs. null rows of company_enter_month; each set is built once.
candidate_subset = set(user_df.index[user_df['credit_score'].isnull()])
candidate_superset = set(user_df.index[user_df['company_enter_month'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [83]:
# Null rows of credit_score vs. null rows of existing_loan_cnt; each set is built once.
candidate_subset = set(user_df.index[user_df['credit_score'].isnull()])
candidate_superset = set(user_df.index[user_df['existing_loan_cnt'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [84]:
# Null rows of credit_score vs. null rows of existing_loan_amt; each set is built once.
candidate_subset = set(user_df.index[user_df['credit_score'].isnull()])
candidate_superset = set(user_df.index[user_df['existing_loan_amt'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [85]:
# Null rows of credit_score vs. null rows of personal_rehabilitation_yn; each set is built once.
candidate_subset = set(user_df.index[user_df['credit_score'].isnull()])
candidate_superset = set(user_df.index[user_df['personal_rehabilitation_yn'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [86]:
# Null rows of credit_score vs. null rows of personal_rehabilitation_complete_yn;
# each set is built once.
candidate_subset = set(user_df.index[user_df['credit_score'].isnull()])
candidate_superset = set(user_df.index[user_df['personal_rehabilitation_complete_yn'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


171760 null(company_enter_month)

In [87]:
# Null rows of company_enter_month vs. null rows of existing_loan_cnt; each set is built once.
candidate_subset = set(user_df.index[user_df['company_enter_month'].isnull()])
candidate_superset = set(user_df.index[user_df['existing_loan_cnt'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [88]:
# Null rows of company_enter_month vs. null rows of existing_loan_amt; each set is built once.
candidate_subset = set(user_df.index[user_df['company_enter_month'].isnull()])
candidate_superset = set(user_df.index[user_df['existing_loan_amt'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [89]:
# Null rows of company_enter_month vs. null rows of personal_rehabilitation_yn;
# each set is built once.
candidate_subset = set(user_df.index[user_df['company_enter_month'].isnull()])
candidate_superset = set(user_df.index[user_df['personal_rehabilitation_yn'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [90]:
# Null rows of company_enter_month vs. null rows of personal_rehabilitation_complete_yn;
# each set is built once.
candidate_subset = set(user_df.index[user_df['company_enter_month'].isnull()])
candidate_superset = set(user_df.index[user_df['personal_rehabilitation_complete_yn'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


198556 null (existing_loan_cnt)

In [91]:
# Null rows of existing_loan_cnt vs. null rows of personal_rehabilitation_yn;
# each set is built once.
candidate_subset = set(user_df.index[user_df['existing_loan_cnt'].isnull()])
candidate_superset = set(user_df.index[user_df['personal_rehabilitation_yn'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [92]:
# Null rows of existing_loan_cnt vs. null rows of personal_rehabilitation_complete_yn;
# each set is built once.
candidate_subset = set(user_df.index[user_df['existing_loan_cnt'].isnull()])
candidate_superset = set(user_df.index[user_df['personal_rehabilitation_complete_yn'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


313774 null (existing_loan_amt)

In [94]:
# Null rows of existing_loan_amt vs. null rows of personal_rehabilitation_yn;
# each set is built once.
candidate_subset = set(user_df.index[user_df['existing_loan_amt'].isnull()])
candidate_superset = set(user_df.index[user_df['personal_rehabilitation_yn'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


In [93]:
# Null rows of existing_loan_amt vs. null rows of personal_rehabilitation_complete_yn;
# each set is built once.
candidate_subset = set(user_df.index[user_df['existing_loan_amt'].isnull()])
candidate_superset = set(user_df.index[user_df['personal_rehabilitation_complete_yn'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


## loan_result

In [98]:
# Column pairs to analyse: every loan_df column that has nulls, paired with each
# column that sorts after it by (null count, column name).
null_counts = loan_df.isnull().sum()  # computed once; index holds the column names
a = sorted((count, column) for column, count in null_counts.items())
for i, first in enumerate(a):
    if first[0] == 0:
        continue  # a column without nulls has nothing to compare
    # j restarts at 0 for every i: it is the offset inside a[i + 1:], not an index into a.
    for j, second in enumerate(a[i + 1:]):
        print(f'{i}-{j}', first, second)

4-0 (7495, 'loan_limit') (7495, 'loan_rate')
4-1 (7495, 'loan_limit') (3257239, 'is_applied')
5-0 (7495, 'loan_rate') (3257239, 'is_applied')


### null 수가 같은 경우, 같은 개체일까?

loan_limit == loan_rate

In [99]:
# Equal null counts do not prove the same rows are affected; compare the row labels.
loan_limit_null_rows = set(loan_df.index[loan_df['loan_limit'].isnull()])
loan_rate_null_rows = set(loan_df.index[loan_df['loan_rate'].isnull()])
loan_limit_null_rows == loan_rate_null_rows

True

### null 수가 다르다면, 포함관계일까?

In [100]:
# Null rows of loan_limit vs. null rows of is_applied in loan_df; each set is built once.
candidate_subset = set(loan_df.index[loan_df['loan_limit'].isnull()])
candidate_superset = set(loan_df.index[loan_df['is_applied'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

False False


## log_data

### null 수가 다르다면, 포함관계일까?


mp_os ⊂ mp_app_version

In [102]:
# Null rows of mp_os vs. null rows of mp_app_version in log_df; each set is built once.
candidate_subset = set(log_df.index[log_df['mp_os'].isnull()])
candidate_superset = set(log_df.index[log_df['mp_app_version'].isnull()])
print(
    candidate_subset <= candidate_superset,           # containment check
    candidate_subset.isdisjoint(candidate_superset),  # disjointness check
)

True False
