# CUST_DATA

In [1]:
# Import packages
import pandas as pd
import numpy as np

In [2]:
# Load data
customer_df = pd.read_csv('./Data/BGCON_CUST_DATA.csv', encoding='utf-16')

## Nan 처리

In [3]:
# 데이터 탐색
len(customer_df) - customer_df.count()

CUST_ID                 0
DIVIDED_SET             0
SIU_CUST_YN          1793
SEX                     0
AGE                     0
RESI_COST               0
RESI_TYPE_CODE       1254
FP_CAREER               0
CUST_RGST             456
CTPR                  621
OCCP_GRP_1            595
OCCP_GRP_2            595
TOTALPREM            5791
MINCRDT              9476
MAXCRDT              9476
WEDD_YN               473
MATE_OCCP_GRP_1     11827
MATE_OCCP_GRP_2     11827
CHLD_CNT              473
LTBN_CHLD_AGE         473
MAX_PAYM_YM          6486
MAX_PRM              6486
CUST_INCM            5263
RCBASE_HSHD_INCM        0
JPBASE_HSHD_INCM      680
dtype: int64

### RESI_TYPE_CODE ( 주택유형 ) ==> 10 (주택없음)

In [4]:
customer_df['RESI_TYPE_CODE'].unique()

array([ 20.,  nan,  40.,  99.,  30.,  50.,  12.,  11.,  13.,  60.,  70.])

In [5]:
customer_df['RESI_TYPE_CODE'].fillna(value=10, inplace=True)

### CUST_RGST ( 고객등록일 ) ==> 999999 (yyyymm)

In [6]:
customer_df['CUST_RGST'].fillna(value=999999, inplace=True)

### CTPR ( 시도구분 ) ==> 알수없음

In [7]:
customer_df['CTPR'].fillna(value='알수없음', inplace=True)

### OCCP_GRP_1 / OCCP_GRP_2 ( 직업분류코드 ) ==> 9. 알수없음 / 알수없음

In [8]:
customer_df['OCCP_GRP_1'].fillna(value='9.알수없음', inplace=True)

In [9]:
customer_df['OCCP_GRP_2'].fillna(value='알수없음', inplace=True)

### ***완료안됨 TOTALPREM ( 납입총보험료 ) ==> 합계보험료 x 계약 누적달

In [4]:
contract_df = pd.read_csv('./Data/BGCON_CNTT_DATA.csv', encoding='utf-16')

merged_df = customer_df.merge(contract_df, on='CUST_ID')

In [5]:
# get preprocessed ym
merged_df['preprocessed_ym'] = merged_df['CNTT_YM'].apply(lambda x: str(x)[0:4] + "-" + str(x)[4:6])

In [6]:
# define monthdelta function
from calendar import monthrange
from datetime import datetime, timedelta

def monthdelta(d1, d2 = datetime.strptime(str('2016-08'), '%Y-%m')):
    delta = 0
    while True:
        mdays = monthrange(d1.year, d1.month)[1]
        d1 += timedelta(days=mdays)
        if d1 <= d2:
            delta += 1
        else:
            break
    return delta

In [7]:
merged_df['TOTAL_MONTHS'] = merged_df['preprocessed_ym'].apply(lambda x: monthdelta(d1 = datetime.strptime(x, '%Y-%m')))

In [8]:
merged_df['expecting_total_premium'] = merged_df['TOTAL_MONTHS'] * merged_df['SUM_ORIG_PREM']

In [24]:
merged_df[['SUM_ORIG_PREM', 'expecting_total_premium']].loc[6]

SUM_ORIG_PREM                20000000
expecting_total_premium    3880000000
Name: 6, dtype: float64

In [15]:
customer_df['TOTALPREM'].fillna(value=0, inplace=True)

### MINCRDT / MAXCRDT ( 신용등급 ) ==> 6

In [16]:
customer_df['MINCRDT'].fillna(value=6, inplace=True)
customer_df['MINCRDT'].replace(28, 6, inplace=True)
customer_df['MINCRDT'].replace(99, 6, inplace=True)

In [17]:
customer_df['MAXCRDT'].fillna(value=6, inplace=True)
customer_df['MAXCRDT'].replace(28, 6, inplace=True)
customer_df['MAXCRDT'].replace(99, 6, inplace=True)

### WEDD_YN ( 결혼여부 ) ==> 알수없음

In [18]:
customer_df['WEDD_YN'].fillna(value='알수없음', inplace=True)

### MATE_OCCP_GRP_1 / MATE_OCCP_GRP_2 ( 배우자직업코드 )
* 결혼여부가 Y or 알수없음 인 경우, ==> 9.알수없음 / 알수없음
* 결혼여부가 N 인 경우, ==> 10.배우자없음 / 배우자없음

In [19]:
is_null = np.array(customer_df['MATE_OCCP_GRP_1'].isnull())
is_wedd = np.array(customer_df['WEDD_YN'] == 'Y')


wedd = np.bitwise_and(is_null, is_wedd)
not_wedd = np.bitwise_and(is_null, ~is_wedd)

customer_df['MATE_OCCP_GRP_1'][wedd] = "9.알수없음"
customer_df['MATE_OCCP_GRP_1'][not_wedd] = "10.배우자없음"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [20]:
is_unknown_wedd = np.array(customer_df['WEDD_YN'] == '알수없음')
is_not_having_wife = np.array(customer_df['MATE_OCCP_GRP_1'] == '10.배우자없음')

unknown_wedd = np.bitwise_and(is_unknown_wedd, is_not_having_wife)

customer_df['MATE_OCCP_GRP_1'][unknown_wedd] = '9.알수없음'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [21]:
is_null = np.array(customer_df['MATE_OCCP_GRP_2'].isnull())
is_wedd = np.array(customer_df['WEDD_YN'] == 'Y')

wedd = np.bitwise_and(is_null, is_wedd)
not_wedd = np.bitwise_and(is_null, ~is_wedd)

customer_df['MATE_OCCP_GRP_2'][wedd] = "알수없음"
customer_df['MATE_OCCP_GRP_2'][not_wedd] = "배우자없음"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [22]:
is_unknown_wedd = np.array(customer_df['WEDD_YN'] == '알수없음')
is_not_having_wife = np.array(customer_df['MATE_OCCP_GRP_2'] == '배우자없음')

unknown_wedd = np.bitwise_and(is_unknown_wedd, is_not_having_wife)

customer_df['MATE_OCCP_GRP_2'][unknown_wedd] = '알수없음'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### CHLD_CNT / LTBN_CHLD_AGE ( 자녀수 / 막내자녀연령 ) ==> 0

In [23]:
is_null = np.array(customer_df['CHLD_CNT'].isnull())
is_wedd = np.array(customer_df['WEDD_YN'] == 'Y')

unknown_wedd_and_not_wedd = np.bitwise_and(is_null, ~is_wedd)

customer_df['CHLD_CNT'][unknown_wedd_and_not_wedd] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [24]:
is_null = np.array(customer_df['LTBN_CHLD_AGE'].isnull())
is_wedd = np.array(customer_df['WEDD_YN'] == 'Y')

unknown_wedd_and_not_wedd = np.bitwise_and(is_null, ~is_wedd)

customer_df['LTBN_CHLD_AGE'][unknown_wedd_and_not_wedd] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### MAX_PAYM_YM ( 최대보험료연월 ) ==> 999999

In [25]:
customer_df['MAX_PAYM_YM'].fillna(value=999999, inplace=True)

### MAX_PRM ( 최대보험료 ) ==> 0

In [26]:
customer_df['MAX_PRM'].fillna(value=0, inplace=True)

### CUST_INCM ( 고객추정소득 ) / JPBASE_HSHD_INCM ( 가구추정소득2 )

In [27]:
dropna_customer_df = customer_df.dropna(subset=['CUST_INCM', 'JPBASE_HSHD_INCM'])

avg_income_df = dropna_customer_df.groupby(['OCCP_GRP_1'])[['CUST_INCM', 'JPBASE_HSHD_INCM']].mean()
avg_income_df

Unnamed: 0_level_0,CUST_INCM,JPBASE_HSHD_INCM
OCCP_GRP_1,Unnamed: 1_level_1,Unnamed: 2_level_1
1.주부,0.0,4944.122149
2.자영업,4621.081266,6865.929815
3.사무직,4319.376505,5532.882851
4.전문직,4173.168564,6029.196748
5.서비스,3949.647207,5684.830743
6.제조업,4181.931034,5473.449138
7.1차산업,4025.830601,5239.360656
8.기타,415.870774,4689.529677
9.알수없음,4169.121951,5094.439024


In [28]:
def fill_cust_incm():
    occupation_group_list = customer_df['OCCP_GRP_1'].unique()
    occupation_group_list.sort()
    
    cust_incm_list = avg_income_df['CUST_INCM']
    
    is_null = np.array(customer_df['CUST_INCM'].isnull())
    
    for number in range(0, len(occupation_group_list)):
        is_group = np.array(customer_df['OCCP_GRP_1'] == occupation_group_list[number])
        group_and_null = np.bitwise_and(is_null, is_group)
        customer_df['CUST_INCM'][group_and_null] = cust_incm_list[number]

In [29]:
fill_cust_incm()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [30]:
def fill_jpbase_incm():
    occupation_group_list = customer_df['OCCP_GRP_1'].unique()
    occupation_group_list.sort()
    
    jpbase_incm_list = avg_income_df['JPBASE_HSHD_INCM']
    
    is_null = np.array(customer_df['JPBASE_HSHD_INCM'].isnull())
    
    for number in range(0, len(occupation_group_list)):
        is_group = np.array(customer_df['OCCP_GRP_1'] == occupation_group_list[number])
        group_and_null = np.bitwise_and(is_null, is_group)
        customer_df['JPBASE_HSHD_INCM'][group_and_null] = jpbase_incm_list[number]

In [31]:
fill_jpbase_incm()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Save it as new file

In [32]:
len(customer_df) - customer_df.count()

CUST_ID                0
DIVIDED_SET            0
SIU_CUST_YN         1793
SEX                    0
AGE                    0
RESI_COST              0
RESI_TYPE_CODE         0
FP_CAREER              0
CUST_RGST              0
CTPR                   0
OCCP_GRP_1             0
OCCP_GRP_2             0
TOTALPREM              0
MINCRDT                0
MAXCRDT                0
WEDD_YN                0
MATE_OCCP_GRP_1        0
MATE_OCCP_GRP_2        0
CHLD_CNT               0
LTBN_CHLD_AGE          0
MAX_PAYM_YM            0
MAX_PRM                0
CUST_INCM              0
RCBASE_HSHD_INCM       0
JPBASE_HSHD_INCM       0
dtype: int64

In [33]:
customer_df.to_csv('./Data/CUST_DATA_PREPROCESSED.csv', encoding='utf-16')