In [1]:
import pandas as pd
import os
import numpy as np

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from catboost import CatBoostClassifier, Pool
from bayes_opt import BayesianOptimization

import json

# Data root and the global seed (the seed is passed to the CatBoost parameters).
data_path = '../../data'
random_seed = 20

# JSON file that stores the named file lists / column lists of each experiment.
path = '../code/para_dict/data_list.json'
with open(path, mode='r', encoding='utf-8') as f:
    para = json.load(f)

# Feature files to load, and the columns dropped before the model sees the data.
data_list, delete_list = para['data_list_FE_AN7'], para['delete_list_overfit1']

def load_data(data_list, data_dir='../data/preprocess'):
    """Read every CSV named in ``data_list`` and join them column-wise.

    For each file the shape and the columns containing nulls are printed.
    Two file-specific adjustments are then applied:

    * ``FE_data1.csv`` and ``FE_data2.csv``: missing values are filled with -1.
    * Files whose name starts with ``FE_data9`` (except ``FE_data9_raw.csv``):
      the ``bacno_shift1`` and ``bacno_shiftm1`` columns are dropped.

    Parameters
    ----------
    data_list : iterable of str
        CSV file names, relative to ``data_dir``.
    data_dir : str, optional
        Directory holding the CSV files. The default is the path that was
        previously hard-coded, so existing calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The frames concatenated with ``pd.concat(..., axis=1)``.
    """
    frames = []
    for d in data_list:
        x = pd.read_csv(os.path.join(data_dir, d))
        x_null = x.isnull().sum()

        print('\n',d,x.shape)
        print("Null columns:\n",x_null[x_null>0])

        # Only these two feature files get their gaps filled with a sentinel.
        if d in ('FE_data1.csv', 'FE_data2.csv'):
            x = x.fillna(value=-1)

        # Every FE_data9 variant except the raw one loses the two shift columns.
        if d.startswith('FE_data9') and d != 'FE_data9_raw.csv':
            x = x.drop(columns=['bacno_shift1','bacno_shiftm1'])
        frames.append(x)

    all_data = pd.concat(frames, axis=1)
    all_data_numsum = all_data.isnull().sum()
    print('ALL data shape:',all_data.shape)
    print('ALL data null:')
    print(all_data_numsum[all_data_numsum>0])
    return all_data


    
    
# Load and column-wise join every feature file named in data_list.
all_data = load_data(data_list)

# Columns that are converted to the pandas 'category' dtype and passed to
# CatBoost as categorical features.
category_list = [
    'csmcu', 'hcefg', 'stscd', 'scity', 'stocn', 'mcc', 'acqic',
    'mchno', 'etymd', 'contp', 'locdt_week',
]
# Further candidates, currently not treated as categorical:
#   'ovrlt', 'insfg', 'ecfg',
#   'cano_only_consecutive_stscd2', 'bacno_consecutive_and_only_ecfg', 'bacno_consecutive_and_only_ecfg',
#   'cano_lastday_use_twokind', 'cano_lastlocdt2', 'bacno_stscd_equal2', 'bacno_ecfg_equal1'
## mode

# Print every column name, one per line, followed by the dtype of each column.
for column_name in all_data.columns:
    print(column_name)
print(all_data.dtypes)

# Categorical features: float columns are cast to int first, then every
# listed column becomes a pandas 'category'.
for c in category_list:
    if all_data[c].dtypes == 'float64':
        all_data[c] = all_data[c].astype('int')
    all_data[c]=all_data[c].astype('category')

# Columns with a real bool dtype are encoded as 1 / 0.
for c in all_data.columns[all_data.dtypes==bool]:
    all_data[c]=all_data[c].map({True:1,False:0})
    print(all_data[c].value_counts())

# Flag columns stored as the strings 'True' / 'False' / '-1' are encoded as 1 / 0 / -1.
bool_list= ['cano_lastlocdt2_shift1','cano_lastlocdt2_shiftm1','bacno_stscd_equal2_shift1','bacno_stscd_equal2_shiftm1',\
            'bacno_ecfg_equal1_shift1','bacno_ecfg_equal1_shiftm1']
for c in bool_list:
    # Skip columns that are already numeric: mapping integers through the
    # string keys would turn every value into NaN, which is what happened
    # whenever this cell was executed a second time on the same kernel.
    if not pd.api.types.is_numeric_dtype(all_data[c]):
        all_data[c]=all_data[c].map({'True':1,'False':0,'-1':-1})
    print(c)
    print(all_data[c].value_counts(dropna=False))
    
# for c in all_data.columns:
#     print(all_data[c].value_counts().head())
    
## Split the data into the different train / validation sets by transaction day (locdt)
locdt = all_data['locdt']
train1_mask = locdt <= 60
test1_mask = (locdt > 60) & (locdt <= 90)
train_all_mask = locdt <= 90
test_all_mask = locdt > 90

# Split 1: train on locdt <= 60, validate on 60 < locdt <= 90.
X_train1 = all_data[train1_mask].drop(columns=delete_list)
y_train1 = all_data.loc[train1_mask, 'fraud_ind']
X_test1 = all_data[test1_mask].drop(columns=delete_list)
y_test1 = all_data.loc[test1_mask, 'fraud_ind']

# Split 2: everything up to locdt 90 versus everything after it.
X_train_all = all_data[train_all_mask].drop(columns=delete_list)
y_train_all = all_data.loc[train_all_mask, 'fraud_ind']
X_test_all = all_data[test_all_mask].drop(columns=delete_list)
y_test_all = all_data.loc[test_all_mask, 'fraud_ind']

# Positional indices of the categorical columns inside the feature frame.
categorical_features_indices = np.where(X_train1.columns.isin(category_list))[0]
# .iloc is required here: the indices are positions, while dtypes is labelled
# by column name (plain [] with integer positions is deprecated in pandas).
print(X_train1.dtypes.iloc[categorical_features_indices])

# Keyword arguments later unpacked into CatBoostClassifier(**param_cat).
param_cat = dict(
    # Objective and the metric reported during training.
    loss_function='Logloss',
    eval_metric='F1',

    # Run configuration.
    iterations=3000,
    scale_pos_weight=1,
    target_border=0.5,
    random_seed=random_seed,
    thread_count=1,
    task_type="GPU",
    devices='0:1',
    verbose=20,

    # min_data_in_leaf=1,
    # has_time=True,

    # Tree / regularisation settings.
    learning_rate=0.1,
    l2_leaf_reg=1.12,  #20
    depth=15,
    max_leaves=38,
    bagging_temperature=0.32,  #10
    random_strength=10,
    # rsm=0.8,

    # fold_permutation_block=1,
    # feature_border_type='MinEntropy',
    # boosting_type='Ordered',
    # leaf_estimation_backtracking='Armijo',

    one_hot_max_size=200,
    grow_policy='Lossguide',
    # grow_policy='Depthwise',
)


 raw_data.csv (1943452, 23)
Null columns:
 fraud_ind    421665
dtype: int64

 FE_data1.csv (1943452, 56)
Null columns:
 cano_conam_skew      92612
cano_conam_kurt     155720
cano_conam_var       38678
bacno_locdt_skew     58303
bacno_locdt_kurt    101191
cano_locdt_skew      92612
cano_locdt_kurt     155720
dtype: int64

 FE_data2.csv (1943452, 30)
Null columns:
 Series([], dtype: int64)

 FE_data2_2.csv (1943452, 31)
Null columns:
 Series([], dtype: int64)

 FE_data3.csv (1943452, 9)
Null columns:
 Series([], dtype: int64)

 FE_data4.csv (1943452, 4)
Null columns:
 Series([], dtype: int64)

 FE_data4_2.csv (1943452, 3)
Null columns:
 Series([], dtype: int64)

 FE_data5.csv (1943452, 4)
Null columns:
 Series([], dtype: int64)

 FE_data6.csv (1943452, 17)
Null columns:
 Series([], dtype: int64)

 FE_data8.csv (1943452, 11)
Null columns:
 Series([], dtype: int64)

 FE_data9_3.csv (1943452, 20)
Null columns:
 Series([], dtype: int64)

 FE_data9_4.csv (1943452, 10)
Null columns:
 Series([

0    1750491
1     192961
Name: bacno_ismax_conam, dtype: int64
0    1687135
1     256317
Name: bacno_ismin_conam, dtype: int64
1    1750733
0     192719
Name: bacno_stocn_ismode, dtype: int64
1    1209667
0     733785
Name: bacno_scity_ismode, dtype: int64
1    1682930
0     260522
Name: bacno_csmcu_ismode, dtype: int64
1    1773728
0     169724
Name: cano_stocn_ismode, dtype: int64
1    1270111
0     673341
Name: cano_scity_ismode, dtype: int64
1    1719562
0     223890
Name: cano_csmcu_ismode, dtype: int64
0    1874608
1      68844
Name: cano_lastlocdt2, dtype: int64
0    1941042
1       2410
Name: bacno_stscd_equal2, dtype: int64
0    1922205
1      21247
Name: bacno_ecfg_equal1, dtype: int64
1    1258954
0     684498
Name: bacno_cano_monoincrease, dtype: int64
cano_lastlocdt2_shift1
 0    1711423
-1     163185
 1      68844
Name: cano_lastlocdt2_shift1, dtype: int64
cano_lastlocdt2_shiftm1
 0    1715834
-1     163185
 1      64433
Name: cano_lastlocdt2_shiftm1, dtype: int64
bacno_

In [2]:
# Train on the first split; the second split is supplied as the evaluation set,
# which also drives early stopping. verbose=500 is passed to fit() directly.
model = CatBoostClassifier(**param_cat)
model.fit(
    X=X_train1,
    y=y_train1,
    cat_features=categorical_features_indices,
    eval_set=(X_test1, y_test1),
    early_stopping_rounds=800,
    verbose=500,
)

0:	learn: 0.7017376	test: 0.4784265	best: 0.4784265 (0)	total: 42ms	remaining: 2m 6s
500:	learn: 0.9748994	test: 0.6291080	best: 0.6311081 (468)	total: 18.7s	remaining: 1m 33s
1000:	learn: 0.9996711	test: 0.6353180	best: 0.6360153 (987)	total: 37.1s	remaining: 1m 14s
1500:	learn: 1.0000000	test: 0.6416290	best: 0.6419194 (1474)	total: 55.6s	remaining: 55.5s
2000:	learn: 1.0000000	test: 0.6409791	best: 0.6419669 (1981)	total: 1m 14s	remaining: 37.2s
2500:	learn: 1.0000000	test: 0.6453949	best: 0.6457085 (2445)	total: 1m 34s	remaining: 18.8s
2999:	learn: 1.0000000	test: 0.6487342	best: 0.6502708 (2836)	total: 1m 53s	remaining: 0us
bestTest = 0.6502707581
bestIteration = 2836
Shrink model to first 2837 iterations.


<catboost.core.CatBoostClassifier at 0x7f59b8c2e550>

In [9]:
# Probability of the positive class for every row with locdt > 90.
y_test_pred_cat = model.predict_proba(X_test_all)[:,1]
print(y_test_pred_cat.sum(),y_test_pred_cat.shape[0])

# Rows whose predicted probability is at or below the threshold are the ones
# selected here (p_id); all remaining rows are n_id.
th=0.000001
p_id = y_test_pred_cat<=(th)
n_id = y_test_pred_cat>(th)
y_test_pred_cat2 = y_test_pred_cat.copy()
y_test_pred_cat2[p_id]=1
y_test_pred_cat2[n_id]=0
# Number of selected rows and their share of all rows with locdt > 90.
print(y_test_pred_cat2.sum(),y_test_pred_cat2.sum()/y_test_pred_cat2.shape[0])

# Take an explicit copy of the selected rows before adding the label, so the
# assignment below does not write into a slice of all_data (chained assignment).
X_test_all2 = all_data[all_data['locdt']>90].loc[p_id].copy()
X_test_all2['fraud_ind']=0
X_test_all2.to_csv('../data/preprocess/X_test_select_th0000001_AN7.csv',index=False)

5363.498202035436 421665
300503.0 0.7126581527990229


In [7]:
# y_test_pred_cat = model.predict_proba(X_test_all)[:,1]
# print(y_test_pred_cat.sum(),y_test_pred_cat.shape[0])

# th=0.6
# p_id = y_test_pred_cat>(th)
# n_id = y_test_pred_cat<=(th)
# y_test_pred_cat2 = y_test_pred_cat.copy()
# y_test_pred_cat2[p_id]=1
# y_test_pred_cat2[n_id]=0
# print(y_test_pred_cat2.sum(),y_test_pred_cat2.sum()/y_test_pred_cat2.shape[0])
# X_test_all2 = all_data[all_data['locdt']>90]
# X_test_all2 = X_test_all2.loc[p_id]
# X_test_all2['fraud_ind']=1
# X_test_all2.to_csv('../data/preprocess/X_test_select_th06_AN7.csv',index=False)

# th=0.8
# p_id = y_test_pred_cat>(th)
# n_id = y_test_pred_cat<=(th)
# y_test_pred_cat2 = y_test_pred_cat.copy()
# y_test_pred_cat2[p_id]=1
# y_test_pred_cat2[n_id]=0
# print(y_test_pred_cat2.sum(),y_test_pred_cat2.sum()/y_test_pred_cat2.shape[0])
# X_test_all2 = all_data[all_data['locdt']>90]
# X_test_all2 = X_test_all2.loc[p_id]
# X_test_all2['fraud_ind']=1
# X_test_all2.to_csv('../data/preprocess/X_test_select_th08_AN7.csv',index=False)

# th=0.9
# p_id = y_test_pred_cat>(th)
# n_id = y_test_pred_cat<=(th)
# y_test_pred_cat2 = y_test_pred_cat.copy()
# y_test_pred_cat2[p_id]=1
# y_test_pred_cat2[n_id]=0
# print(y_test_pred_cat2.sum(),y_test_pred_cat2.sum()/y_test_pred_cat2.shape[0])
# X_test_all2 = all_data[all_data['locdt']>90]
# X_test_all2 = X_test_all2.loc[p_id]
# X_test_all2['fraud_ind']=1
# X_test_all2.to_csv('../data/preprocess/X_test_select_th09_AN7.csv',index=False)


5225.908192910114 421665
4598.0 0.01090439092644635
3958.0 0.009386598366001447
3551.0 0.008421377159593516


In [29]:
def find_new_category(x,target_name):
    """List the values of a column that occur only after day 90.

    The frame is split on its ``locdt`` column: rows with ``locdt <= 90`` are
    the training part, rows with ``locdt > 90`` the testing part. The number
    of distinct values on each side and the number of new ones are printed.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain ``locdt`` and the ``target_name`` column.
    target_name : str
        Column whose distinct values are compared.

    Returns
    -------
    list
        Distinct testing-part values that are absent from the training part,
        in order of first appearance in the testing part.
    """
    x_train = x[x['locdt']<=90][target_name].unique()
    x_test = x[x['locdt']>90][target_name].unique()

    print(target_name)
    print('{} categories in Training data:'.format(x_train.shape[0]))
    print('{} categories in Testing data:'.format(x_test.shape[0]))

    # Hash-based membership: testing each value against the array directly is
    # a linear scan per value, which is quadratic on high-cardinality columns.
    seen_in_train = set(x_train)
    x_new_test = [b for b in x_test if b not in seen_in_train]

    print('{} new categories'.format(len(x_new_test)))
    return x_new_test

In [None]:
# all_data2 = all_data.copy()
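# # Almost every one of these columns has too many new categories that appear only in the test data; a transformation like this is needed before they can be used for training (TODO)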

# for c in ['mchno','acqic','mcc','stocn','scity','csmcu']:
#     new_category = find_new_category(all_data2,c)
#     tmp_df = pd.DataFrame(new_category,columns=[c])
#     tmp_df['new_{}'.format(c)]=1
#     all_data2 = pd.merge(all_data2,tmp_df,on=c,how='left')
#     all_data2['new_{}'.format(c)].fillna(value=0,inplace=True)
# print(all_data2)

bacno
95214 categories in Training data:
71099 categories in Testing data:
67971 new categories
mchno
89316 categories in Training data:
45501 categories in Testing data:
13467 new categories
acqic
6051 categories in Training data:
3748 categories in Testing data:
815 new categories
mcc
434 categories in Training data:
372 categories in Testing data:
26 new categories
stocn
103 categories in Training data:
87 categories in Testing data:
6 new categories
scity
5698 categories in Training data:
2857 categories in Testing data:
949 new categories


In [44]:
# for c in ['mchno','acqic','mcc','stocn','scity','csmcu']:
#     print(all_data2['new_{}'.format(c)].sum())
    
# all_data2['new_category']=0

# for c in ['mchno','acqic','mcc','stocn','scity','csmcu']:
#     all_data2['new_category']+=all_data2['new_{}'.format(c)]
# print(all_data2['new_category'])
# print((all_data2['new_category']>0).sum())

0          0.0
1          0.0
2          0.0
3          0.0
4          0.0
5          0.0
6          0.0
7          0.0
8          0.0
9          0.0
10         0.0
11         0.0
12         0.0
13         0.0
14         0.0
15         0.0
16         0.0
17         0.0
18         0.0
19         0.0
20         0.0
21         0.0
22         0.0
23         0.0
24         0.0
25         0.0
26         0.0
27         0.0
28         0.0
29         0.0
          ... 
1943422    0.0
1943423    0.0
1943424    0.0
1943425    0.0
1943426    0.0
1943427    0.0
1943428    0.0
1943429    0.0
1943430    0.0
1943431    0.0
1943432    0.0
1943433    0.0
1943434    0.0
1943435    0.0
1943436    0.0
1943437    0.0
1943438    0.0
1943439    0.0
1943440    0.0
1943441    0.0
1943442    0.0
1943443    0.0
1943444    0.0
1943445    0.0
1943446    0.0
1943447    0.0
1943448    0.0
1943449    0.0
1943450    0.0
1943451    0.0
Name: new_category, Length: 1943452, dtype: float64
21541


In [45]:
# test_all_data = all_data2[all_data2['locdt']>90]
# test_all_data_good = test_all_data['new_category']<1
# test_all_data_bad = test_all_data['new_category']>=1
# np.save('../data/preprocess/test_data_good_index.npy',test_all_data['new_category']<1)
# np.save('../data/preprocess/test_data_bad_index.npy',test_all_data['new_category']>=1)