In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, date
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer

In [2]:
X = pd.read_csv('bank-churn-prediction/data/preprocess/X_train.csv')
y = pd.read_csv('bank-churn-prediction/data/preprocess/y_train.csv')
X_true = pd.read_csv('bank-churn-prediction/data/preprocess/X_test.csv')

### y (Label for Validation)

In [3]:
y['label'].value_counts() / len(y) * 100

 1    63.882006
 0    20.810621
-1    15.307373
Name: label, dtype: float64

In [4]:
lb = LabelBinarizer()
lb.fit(y['label'])

LabelBinarizer()

In [5]:
lb.classes_

array([-1,  0,  1], dtype=int64)

In [6]:
y_label = lb.transform(y['label'])

In [21]:
y

Unnamed: 0,cust_no,label
0,0xb2d640a6,0
1,0xb2d4dd44,-1
2,0xb2d69fef,1
3,0xb2d42c89,0
4,0xb2d90b58,-1
...,...,...
145291,0xb2d174f1,1
145292,0x3b9af14f,0
145293,0xb2d1bb5d,0
145294,0xb2d9ed26,1


In [43]:
y.to_csv('bank-churn-prediction/data/y_train.csv', index=False)

### X (Feature for Training)

1. 열별 결측치 비율 확인  
결측치 50% 이상이면 의미 없지 않을까...?  

**E4**(58.15%) : first online banking login date  
**E7**(98%) : first time deposit date (최초 입금 일)   
**E8**(87.67%) : first loan date (최초 대출 일)  
**E9**(99.95%) : first overdue date (최초 연체 날짜)  
**E11**(100%) : first bank-securities transfer date (최초 은행 - 증권 이체 날짜)  
**E12**(83.5%) : first transfer at counter date (최초 카운터에서 이체 날짜)  
**E13**(87.75%) : first transfer via online banking date (온라인 뱅킹을 이용한 최초 이체 날짜)  
**E14**(61.95%) : first transfer via mobile banking date (모바일 뱅킹을 이용한 최초 이체 날짜)   
**I9**(100%) : contribution (기부금)  
**I10**(88.43%) : education level (교육 수준)  
**I13**(98.49%) : marriage description
**I14**(89.23%) : occupation description

In [27]:
X.isnull().sum()/len(X) * 100

cust_no     0.000000
X1          0.000000
X2          0.000000
X3          0.000000
X4          0.000000
X5          0.000000
X6          0.000000
X7          0.000000
X8          0.000000
B1          0.000000
B2          0.000000
B3          0.000000
B4          0.000000
B5          0.000000
B6          6.110285
B7          0.000000
E1          0.000000
E2          4.384154
E3          4.384154
E5         37.942545
E6          5.188030
E10         0.561612
E15         0.000000
E16        47.165786
E17         0.000000
E18        42.772685
C1          0.000000
C2          0.000000
I1          0.044048
I2          0.000000
I3          0.000000
I4          0.000000
I5          0.000000
I6          0.000000
I11         0.000000
I15         0.000000
I16         0.000000
I17         0.000000
I18         0.000000
I19         0.000000
I20         0.000000
dtype: float64

In [8]:
# Drop columns with large portion of missing values
col_to_drop = ['E4', 'E7', 'E8', 'E9', 'E11', 'E12', 'E13', 'E14', 'I9', 'I10', 'I13', 'I14']
X = X.drop(col_to_drop, axis=1)

In [9]:
X.isnull().sum()/len(X) * 100

cust_no     0.000000
X1          0.000000
X2          0.000000
X3          0.000000
X4          0.000000
X5          0.000000
X6          0.000000
X7          0.000000
X8          0.000000
B1          0.000000
B2          0.000000
B3          0.000000
B4          0.000000
B5          0.000000
B6          6.110285
B7          0.000000
E1          0.000000
E2          4.384154
E3          4.384154
E5         37.942545
E6          5.188030
E10         0.561612
E15         0.000000
E16        47.165786
E17         0.000000
E18        42.772685
C1          0.004818
C2          0.004818
I1          0.044048
I2          0.000000
I3          0.000000
I4          0.000000
I5          7.986455
I6          0.000000
I7          0.000000
I8          0.000000
I11         0.000000
I12         0.000000
I15         0.000000
I16         0.000000
I17         0.000000
I18         0.000000
I19         0.000000
I20         0.000000
dtype: float64

**결측치 존재: ['B6', 'E2', 'E3', 'E5', 'E6', 'E10', 'E16', 'E18', 'C1', 'C2', 'I1', 'I5']**

#### customer's asset at the end of month Y.

- X1 : structured deposit balance(구조화 예금 잔액)
- X2 : time deposit balance (정기 예금 잔액)
- X3 : demand deposit balance (입출금 통장 잔액)
- X4 : financial products balance (금융 상품 잔액)
- X5 : fund balance (펀드 잔액)
- X6 : asset management balance (자산 관리 잔액)
- X7 : loan balance (대출 잔액)
- X8 : large deposit certificate balance (거액 예금 증서 잔액)

#### customers' behaviors in month Y.  
  
Column B6 and B7 only have data if month == [3, 6, 9, 12].

- B1 : mobile banking login times (로그인 횟수)
- B2 : transfer-in times (입금 횟수)
- B3 : transfer-in money amount (입금 금액)
- B4 : transfer-out times (출금 횟수)
- B5 : transfer-out money amount (출금 금액)
- **B6(6.11%) : latest transfer time (마지막 거래 시간)**
    - min값으로 대체?
- B7 : number of transfers in a season (거래 횟수?)

#### customers' important behaviors in the season Z.

- E1 : account opening date 
- **E2(4.38%) : online banking opening date** 
    - max값으로 대체?
- **E3(4.38%) : mobile banking opening date** 
    - max값으로 대체?
- ~~E4 : first online banking login date~~ 
- **E5(37.94%) : first mobile banking login date**
    - max값으로 대체?
- **E6(5.19%) : first demand deposit date**
    - 평균값으로 대체?
- ~~E7 : first time deposit date (최초 입금 일)~~
- ~~E8 : first loan date (최초 대출 일)~~
- ~~E9 : first overdue date (최초 연체 날짜)~~
- **E10(0.56%) : first cash transaction date (최초 현금 거래일)** 
    - 평균값으로 대체?
- ~~E11 : first bank-securities transfer date (최초 은행 - 증권 이체 날짜)~~
- ~~E12 : first transfer at counter date (최초 카운터에서 이체 날짜)~~
- ~~E13 : first transfer via online banking date (온라인 뱅킹을 이용한 최초 이체 날짜)~~
- ~~E14 : first transfer via mobile banking date (모바일 뱅킹을 이용한 최초 이체 날짜)~~
- E15 : maximum amount transferred out of another bank (다른 은행에서 이체된 최대 금액)
- **E16(47.17%) : maximum amount transferred out of another bank date (다른 은행에서 이체된 최대 금액이 이체된 날짜)** 
    - 평균값으로 대체?
- E17 : Maximum transfer amount from other bank (타 은행 최대 이체 금액)
- **E18(42.77%) : Maximum transfer amount from other bank date (타 은행 최대 이체 금액 이체 날짜)** 
    - 평균값으로 대체?  
    
#### customers' deposits in month Y.

- **C1(0.005%) : deposit products value (예금 상품 가치)**  
    - 0으로 대체
- **C2(0.005%) : number of deposit products (예금 상품 수)**
    - 0으로 대체  
    
#### valid customer IDs in the season Z.

####  customer information in the season Z.

- **l1(0.044%) : gender (성별)**
    - '여자'로 대체?
- l2 : age (나이)
- l3 : class
- l4 : tag
- **I5(7.99%) : occupation (직업)**
    - '무직'으로 대체?
- I6 : deposit customer tag (예금 고객 태그)
- I7 : number of products owning (보유 제품 수)
- I8 : constellation (별자리) -> 없앰
- ~~I9 : contribution (기부금)~~
- I10 : education level (교육 수준)
- I11 : family annual income (가구 연간 소득)
- I12 : field description
- ~~I13 : marriage description~~
- ~~I14 : occupation description~~
- I15 : QR code recipient
- I16 : VIP (vip 여부)
- I17 : online banking client
- I18 : mobile banking client
- I19 : SMS client
- I20 : WeChat Pay client

In [10]:
X = X.drop(['I8'], axis=1) # 별자리가 이탈에 영향을 미치지 않을 것이라고 판단

In [11]:
X['I7'].value_counts()

0    145296
Name: I7, dtype: int64

In [12]:
X['I12'].value_counts()

个人    145295
农业         1
Name: I12, dtype: int64

In [13]:
X = X.drop(['I7', 'I12'], axis=1) 

In [15]:
# C1
X['C1'] = X['C1'].fillna(0)
# C2
X['C2'] = X['C2'].fillna(0)
# I5
X['I5'] = X['I5'].replace(np.NaN, '未知') # '알 수 없음'으로 대체

In [32]:
# # E16, E18 은 결측치가 많고, 영향을 크게 미치지 않을 것으로 판단.. 금액이 더 중요하지 않을까?
# X = X.drop(['E16', 'E18'], axis=1) 

In [34]:
# # B6: Latest transfer time
# fmt = '%Y-%m-%d %H:%M:%S'
# X['B6'] = pd.to_datetime(X['B6'], format=fmt, errors='ignore')

In [35]:
# # E category
# fmt = '%Y-%m-%d'
# col_names = ['E2', 'E3', 'E5', 'E6', 'E10']
# for col_name in col_names:
#     X[col_name] = pd.to_datetime(X[col_name], format=fmt, errors='ignore')

In [36]:
# # 2, 3, 5, 6, 10 
# # 날짜 같은 경우는 min, max 값보다 mean 값이 더 적절할 것이라고 판단함,,
# X['B6'] = X['B6'].replace(np.NaN, X['B6'].mean())
# X['E2'] = X['E2'].replace(np.NaN, X['E2'].mean())
# X['E3'] = X['E3'].replace(np.NaN, X['E3'].mean())
# X['E5'] = X['E5'].replace(np.NaN, X['E5'].mean())
# X['E6'] = X['E6'].replace(np.NaN, X['E6'].mean())
# X['E10'] = X['E10'].replace(np.NaN, X['E10'].mean())

In [17]:
X['I5'].replace('不便分类的其他从业人员', '기타', inplace=True)
X['I5'].replace('商业工作人员', '직장인', inplace=True)
X['I5'].replace('服务性工作人员', '노동자', inplace=True)
X['I5'].replace('办事人员和有关人员', '사무원, 관계자', inplace=True)
X['I5'].replace('专业技术人员', '전문 기술자', inplace=True)
X['I5'].replace('未知', '알 수 없음', inplace=True)
X['I5'].replace('国家机关、党群组织、企业、事业单位负责人', '국가기관, 공공기관의 책임자', inplace=True)
X['I5'].replace('生产、运输设备操作人员及有关人员', '생산, 운송 설비 운영자', inplace=True)
X['I5'].replace('农、林、牧、渔、水利业生产人员', '농업, 임업, 목축업, 어업, 수리업 생산인력', inplace=True)
X['I5'].replace('军人', '군인', inplace=True)
X['I5'].replace('退休', '퇴직', inplace=True)

In [18]:
X['I1'].replace('女性', '여자', inplace=True)
X['I1'].replace('男性', '남자', inplace=True)

In [25]:
X['I3'].replace('普通客户', '일반', inplace=True)
X['I3'].replace('黄金', '황금', inplace=True)
X['I3'].replace('白金', '백금', inplace=True)
X['I3'].replace('钻石', '다이아', inplace=True)

In [26]:
X.to_csv('bank-churn-prediction/data/X_train.csv', index=False)

### X_true (Features for Testing)

In [28]:
X_true.isnull().sum()/len(X) * 100

cust_no     0.000000
X1          0.000000
X2          0.000000
X3          0.000000
X4          0.000000
X5          0.000000
X6          0.000000
X7          0.000000
X8          0.000000
B1          0.000000
B2          0.000000
B3          0.000000
B4          0.000000
B5          0.000000
B6          9.242512
B7          0.000000
E1          0.000000
E2          1.988355
E3          1.988355
E4         30.755836
E5         19.354284
E6          1.238162
E7         51.129419
E8         46.297902
E9         52.733042
E10         0.405379
E11        52.803931
E12        43.008755
E13        46.595226
E14        31.274777
E15         0.000000
E16        24.003414
E17         0.000000
E18        22.198822
C1          0.002065
C2          0.002065
I1          0.022024
I2          0.000000
I3          0.000000
I4          0.000000
I5          2.611909
I6          0.000000
I7          0.000000
I8          0.000000
I9         52.803931
I10        46.661298
I11         0.000000
I12         0

In [30]:
col_to_drop = ['E4', 'E7', 'E8', 'E9', 'E11', 'E12', 'E13', 'E14', 'I9', 'I10', 'I13', 'I14']
X_true = X_true.drop(col_to_drop, axis=1) # X에서 없앤 열들

In [29]:
X_true = X_true.drop(['I7', 'I8', 'I12'], axis=1) # X에서 없앤 열들

In [32]:
# C1
X_true['C1'] = X_true['C1'].fillna(0)
# C2
X_true['C2'] = X_true['C2'].fillna(0)
# I5
X_true['I5'] = X_true['I5'].replace(np.NaN, '未知') # '알 수 없음'으로 대체

In [33]:
X_true

Unnamed: 0,cust_no,X1,X2,X3,X4,X5,X6,X7,X8,B1,...,I4,I5,I6,I11,I15,I16,I17,I18,I19,I20
0,0x3b9b4615,200000,0.00,3.35,0,1.21,0.0,0.0,0,1,...,0.0,服务性工作人员,0,0.0,0,1.0,1,1,1,1
1,0x3b9ae61b,100000,0.00,282259.81,0,22310.47,0.0,0.0,0,11,...,1.0,未知,1,0.0,0,1.0,1,1,1,1
2,0x3b9add69,0,66712.43,6.61,0,0.00,0.0,0.0,0,0,...,0.0,未知,0,0.0,0,1.0,1,1,1,0
3,0x3b9b3601,80000,0.00,1505.15,0,0.00,0.0,0.0,350000,1,...,0.0,未知,0,0.0,0,1.0,1,1,1,0
4,0x3b9b2599,0,0.00,78.71,0,0.00,0.0,0.0,400000,0,...,0.0,不便分类的其他从业人员,0,0.0,0,1.0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76717,0xb2d69017,0,9900.00,368.67,0,0.00,0.0,0.0,0,6,...,0.0,商业工作人员,0,0.0,0,0.0,1,1,1,1
76718,0xb2d68153,0,0.00,5044.07,0,0.00,0.0,0.0,200000,0,...,0.0,服务性工作人员,0,0.0,0,1.0,0,0,0,0
76719,0xb2d5bba1,3700000,0.00,152.08,0,0.00,0.0,0.0,0,13,...,0.0,不便分类的其他从业人员,0,0.0,0,1.0,1,1,1,0
76720,0xb2d61b9b,50000,0.00,1118.68,0,0.00,0.0,0.0,0,0,...,0.0,农、林、牧、渔、水利业生产人员,0,0.0,0,0.0,1,1,1,0


In [37]:
X_true['I5'].value_counts()

不便分类的其他从业人员             24129
服务性工作人员                 14482
商业工作人员                  13274
办事人员和有关人员                9545
未知                       5332
专业技术人员                   5160
生产、运输设备操作人员及有关人员         2249
国家机关、党群组织、企业、事业单位负责人     1958
农、林、牧、渔、水利业生产人员           547
军人                         41
退休                          5
Name: I5, dtype: int64

In [38]:
X_true['I5'].replace('不便分类的其他从业人员', '기타', inplace=True)
X_true['I5'].replace('商业工作人员', '직장인', inplace=True)
X_true['I5'].replace('服务性工作人员', '노동자', inplace=True)
X_true['I5'].replace('办事人员和有关人员', '사무원, 관계자', inplace=True)
X_true['I5'].replace('专业技术人员', '전문 기술자', inplace=True)
X_true['I5'].replace('未知', '알 수 없음', inplace=True)
X_true['I5'].replace('国家机关、党群组织、企业、事业单位负责人', '국가기관, 공공기관의 책임자', inplace=True)
X_true['I5'].replace('生产、运输设备操作人员及有关人员', '생산, 운송 설비 운영자', inplace=True)
X_true['I5'].replace('农、林、牧、渔、水利业生产人员', '농업, 임업, 목축업, 어업, 수리업 생산인력', inplace=True)
X_true['I5'].replace('军人', '군인', inplace=True)
X_true['I5'].replace('退休', '퇴직', inplace=True)

In [39]:
X_true['I1'].replace('女性', '여자', inplace=True)
X_true['I1'].replace('男性', '남자', inplace=True)

In [40]:
X_true['I3'].replace('普通客户', '일반', inplace=True)
X_true['I3'].replace('黄金', '황금', inplace=True)
X_true['I3'].replace('白金', '백금', inplace=True)
X_true['I3'].replace('钻石', '다이아', inplace=True)

In [41]:
X_true

Unnamed: 0,cust_no,X1,X2,X3,X4,X5,X6,X7,X8,B1,...,I4,I5,I6,I11,I15,I16,I17,I18,I19,I20
0,0x3b9b4615,200000,0.00,3.35,0,1.21,0.0,0.0,0,1,...,0.0,노동자,0,0.0,0,1.0,1,1,1,1
1,0x3b9ae61b,100000,0.00,282259.81,0,22310.47,0.0,0.0,0,11,...,1.0,알 수 없음,1,0.0,0,1.0,1,1,1,1
2,0x3b9add69,0,66712.43,6.61,0,0.00,0.0,0.0,0,0,...,0.0,알 수 없음,0,0.0,0,1.0,1,1,1,0
3,0x3b9b3601,80000,0.00,1505.15,0,0.00,0.0,0.0,350000,1,...,0.0,알 수 없음,0,0.0,0,1.0,1,1,1,0
4,0x3b9b2599,0,0.00,78.71,0,0.00,0.0,0.0,400000,0,...,0.0,기타,0,0.0,0,1.0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76717,0xb2d69017,0,9900.00,368.67,0,0.00,0.0,0.0,0,6,...,0.0,직장인,0,0.0,0,0.0,1,1,1,1
76718,0xb2d68153,0,0.00,5044.07,0,0.00,0.0,0.0,200000,0,...,0.0,노동자,0,0.0,0,1.0,0,0,0,0
76719,0xb2d5bba1,3700000,0.00,152.08,0,0.00,0.0,0.0,0,13,...,0.0,기타,0,0.0,0,1.0,1,1,1,0
76720,0xb2d61b9b,50000,0.00,1118.68,0,0.00,0.0,0.0,0,0,...,0.0,"농업, 임업, 목축업, 어업, 수리업 생산인력",0,0.0,0,0.0,1,1,1,0


In [42]:
X_true.to_csv('bank-churn-prediction/data/X_test.csv', index=False)