In [None]:
import pandas as pd
import os
import numpy as np

from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from catboost import CatBoostClassifier, Pool
from bayes_opt import BayesianOptimization

# Seed handed to CatBoost through param_cat['random_seed'].
random_seed = 20
import json

# The JSON file holds named lists; two of them are picked below:
# the feature files to load and the columns to drop before training.
path = '../../code/para_dict/data_list.json'
with open(path, mode='r', encoding='utf-8') as config_file:
    para = json.load(config_file)

data_list = para['data_list_FE_AN3']
delete_list = para['delete_list_overfit1']

def load_data(data_list):
    """Read the listed CSV files and join them side by side.

    Each name in ``data_list`` is read from ``../../data/preprocess/``.
    For every file the shape and the columns containing nulls are printed.

    Per-file handling:
      * ``FE_data1.csv`` / ``FE_data2.csv``: nulls are replaced with -1.
      * files whose name starts with ``FE_data9`` (except
        ``FE_data9_raw.csv``): the columns ``bacno_shift1`` and
        ``bacno_shiftm1`` are dropped.

    Returns:
        A single DataFrame made by concatenating all files along the
        column axis (rows are aligned on the index).
    """
    frames = []
    for file_name in data_list:
        frame = pd.read_csv('../../data/preprocess/{}'.format(file_name))
        null_counts = frame.isnull().sum()

        print('\n',file_name,frame.shape)
        print("Null columns:\n",null_counts[null_counts>0])

        if file_name in ('FE_data1.csv', 'FE_data2.csv'):
            frame = frame.fillna(value=-1)

        is_data9_variant = file_name.startswith('FE_data9') and file_name != 'FE_data9_raw.csv'
        if is_data9_variant:
            frame = frame.drop(columns=['bacno_shift1','bacno_shiftm1'])

        frames.append(frame)

    all_data = pd.concat(frames, axis=1)

    # Report whatever nulls survive in the combined frame.
    remaining_nulls = all_data.isnull().sum()
    print('ALL data shape:',all_data.shape)
    print('ALL data null:')
    print(remaining_nulls[remaining_nulls>0])
    return all_data


    
    
# Read every configured feature file and join them column-wise.
all_data = load_data(data_list)

# Columns that are cast to the 'category' dtype further down and handed to
# CatBoost as categorical features.
category_list = [
    'csmcu', 'hcefg', 'stscd', 'scity', 'stocn', 'mcc', 'acqic',
    'mchno', 'etymd', 'contp', 'locdt_week',
]
# Further candidate columns, currently left out of category_list:
#                 'ovrlt','insfg','ecfg',\
# 'cano_only_consecutive_stscd2','bacno_consecutive_and_only_ecfg','bacno_consecutive_and_only_ecfg',\
# 'cano_lastday_use_twokind','cano_lastlocdt2','bacno_stscd_equal2','bacno_ecfg_equal1']
## mode
# Print the column names one per line, then the dtype of every column.
for column_name in all_data.columns:
    print(column_name)
print(all_data.dtypes)

# Cast the categorical columns to the pandas 'category' dtype. Float columns
# are converted to int first (presumably so the labels read 3 instead of 3.0).
# NOTE: the int cast raises if such a float column still contains NaN.
for c in category_list:
    if all_data[c].dtypes == 'float64':
        all_data[c] = all_data[c].astype('int')
    all_data[c]=all_data[c].astype('category')

# Genuine boolean columns become 1/0 integers.
for c in all_data.columns[all_data.dtypes==bool]:
    all_data[c]=all_data[c].map({True:1,False:0})
    print(all_data[c].value_counts())

# Shifted flag columns that may hold the text values 'True' / 'False' / '-1'.
bool_list= ['cano_lastlocdt2_shift1','cano_lastlocdt2_shiftm1','bacno_stscd_equal2_shift1','bacno_stscd_equal2_shiftm1',\
            'bacno_ecfg_equal1_shift1','bacno_ecfg_equal1_shiftm1']
for c in bool_list:
    if c in all_data.columns:
        # Apply the string-keyed map only to non-numeric columns. A column that
        # is already numeric (parsed as bool and converted by the loop above,
        # or converted by a previous run of this cell) has no matching keys
        # and would otherwise be overwritten with NaN in every row.
        if not pd.api.types.is_numeric_dtype(all_data[c]):
            all_data[c]=all_data[c].map({'True':1,'False':0,'-1':-1})
        print(c)
        # dropna=False exposes values that matched none of the keys above.
        print(all_data[c].value_counts(dropna=False))
        print(all_data[c].value_counts().head())
    
## Split into different training / validation sets by transaction day
## (original note says three splits; two are built here).
locdt_day = all_data['locdt']
train1_rows = locdt_day <= 60
test1_rows = (locdt_day > 60) & (locdt_day <= 90)
train_all_rows = locdt_day <= 90
test_all_rows = locdt_day > 90

# Split 1: fit on days <= 60, validate on days 61-90.
X_train1 = all_data[train1_rows].drop(columns=delete_list)
y_train1 = all_data.loc[train1_rows, 'fraud_ind']
X_test1 = all_data[test1_rows].drop(columns=delete_list)
y_test1 = all_data.loc[test1_rows, 'fraud_ind']

# Split 2: everything up to day 90 versus everything after it.
X_train_all = all_data[train_all_rows].drop(columns=delete_list)
y_train_all = all_data.loc[train_all_rows, 'fraud_ind']
X_test_all = all_data[test_all_rows].drop(columns=delete_list)
y_test_all = all_data.loc[test_all_rows, 'fraud_ind']

# Integer positions (within the training columns) of the categorical features.
categorical_features_indices = np.where(X_train1.columns.isin(category_list))[0]
# The indices are positions, not labels, so select with .iloc; plain [] on a
# label-indexed Series relies on a deprecated positional fallback.
print(X_train1.dtypes.iloc[categorical_features_indices])

# Keyword arguments unpacked into CatBoostClassifier(**param_cat).
# Commented-out entries are alternatives that are currently disabled.
param_cat = dict(
    # objective and reported metric
    loss_function='Logloss',
    eval_metric='F1',

    # run configuration
    iterations=6000,
    scale_pos_weight=1,
    target_border=0.5,
    random_seed=random_seed,
    thread_count=1,
    task_type="GPU",
    devices='0:1',
    verbose=20,

    # min_data_in_leaf=1,
    # has_time=True,

    # learning / regularisation
    learning_rate=0.03,
    l2_leaf_reg=1.12,  #20
    depth=15,
    max_leaves=38,
    bagging_temperature=0.32,  #10
    random_strength=10,
    # rsm=0.8,

    # fold_permutation_block=1,
    # feature_border_type='MinEntropy',
    # boosting_type='Ordered',
    # leaf_estimation_backtracking='Armijo',

    # categorical handling and tree growth
    one_hot_max_size=200,
    grow_policy='Lossguide',
    # grow_policy='Depthwise',
)

In [None]:
# Fit on the days <= 60 split and evaluate on the days 61-90 split.
# NOTE(review): param_cat also carries verbose=20; confirm that the verbose=500
# given to fit() is the one that takes effect.
model = CatBoostClassifier(**param_cat)
model.fit(
    X_train1,
    y_train1,
    cat_features=categorical_features_indices,
    eval_set=(X_test1, y_test1),
    early_stopping_rounds=800,
    verbose=500,
)

In [None]:
# y_test_pred_cat = model.predict_proba(X_test_all)[:,1]
# print(y_test_pred_cat.sum(),y_test_pred_cat.shape[0])

# th=0.000001
# p_id = y_test_pred_cat<=(th)
# n_id = y_test_pred_cat>(th)
# y_test_pred_cat2 = y_test_pred_cat.copy()
# y_test_pred_cat2[p_id]=1
# y_test_pred_cat2[n_id]=0
# print(y_test_pred_cat2.sum(),y_test_pred_cat2.sum()/y_test_pred_cat2.shape[0])
# X_test_all2 = all_data[all_data['locdt']>90]
# X_test_all2 = X_test_all2.loc[p_id]
# X_test_all2['fraud_ind']=0
# X_test_all2.to_csv('../data/preprocess/X_test_select_th0000001_AN7.csv',index=False)

In [None]:
# Second column of predict_proba for every row after day 90
# (presumably the probability of fraud_ind == 1 -- confirm the class order).
y_test_pred_cat = model.predict_proba(X_test_all)[:, 1]
# Sum of the scores next to the number of scored rows.
print(y_test_pred_cat.sum(), len(y_test_pred_cat))

# th=0.6
# p_id = y_test_pred_cat>(th)
# n_id = y_test_pred_cat<=(th)
# y_test_pred_cat2 = y_test_pred_cat.copy()
# y_test_pred_cat2[p_id]=1
# y_test_pred_cat2[n_id]=0
# print(y_test_pred_cat2.sum(),y_test_pred_cat2.sum()/y_test_pred_cat2.shape[0])
# X_test_all2 = all_data[all_data['locdt']>90]
# X_test_all2 = X_test_all2.loc[p_id]
# X_test_all2['fraud_ind']=1
# X_test_all2.to_csv('../data/preprocess/X_test_select_th06_AN7.csv',index=False)

# th=0.8
# p_id = y_test_pred_cat>(th)
# n_id = y_test_pred_cat<=(th)
# y_test_pred_cat2 = y_test_pred_cat.copy()
# y_test_pred_cat2[p_id]=1
# y_test_pred_cat2[n_id]=0
# print(y_test_pred_cat2.sum(),y_test_pred_cat2.sum()/y_test_pred_cat2.shape[0])
# X_test_all2 = all_data[all_data['locdt']>90]
# X_test_all2 = X_test_all2.loc[p_id]
# X_test_all2['fraud_ind']=1
# X_test_all2.to_csv('../data/preprocess/X_test_select_th08_AN7.csv',index=False)

# Select the rows after day 90 whose score exceeds the threshold, label them
# as fraud and write them to a CSV file.
th = 0.9
p_id = y_test_pred_cat > th
n_id = y_test_pred_cat <= th

# Hard 0/1 version of the scores, used only for the summary printed below.
y_test_pred_cat2 = y_test_pred_cat.copy()
y_test_pred_cat2[p_id] = 1
y_test_pred_cat2[n_id] = 0
# Number of selected rows and their share of all scored rows.
print(y_test_pred_cat2.sum(),y_test_pred_cat2.sum()/y_test_pred_cat2.shape[0])

# p_id is a positional boolean array over the locdt > 90 rows (the same rows,
# in the same order, that were scored). Take an explicit copy so the column
# assignment below writes to an independent frame and does not raise
# SettingWithCopyWarning.
X_test_all2 = all_data[all_data['locdt']>90].loc[p_id].copy()
X_test_all2['fraud_ind'] = 1
X_test_all2.to_csv('../../data/preprocess/X_test_select_th09_AN3.csv',index=False)


In [None]:
def find_new_category(x,target_name):
    """List the values of a column that occur only after day 90.

    Rows with ``locdt <= 90`` are treated as training data and rows with
    ``locdt > 90`` as testing data. The number of distinct values in each
    part, and the number of new values, are printed.

    Args:
        x: DataFrame containing a ``locdt`` column and ``target_name``.
        target_name: name of the column to inspect.

    Returns:
        A list of the distinct testing values that never occur in the
        training rows, in order of first appearance. A missing value (NaN)
        present in both parts is not reported as new.
    """
    x_train = x[x['locdt']<=90][target_name].unique()
    x_test = x[x['locdt']>90][target_name].unique()

    print(target_name)
    print('{} categories in Training data:'.format(x_train.shape[0]))
    print('{} categories in Testing data:'.format(x_test.shape[0]))

    # One vectorised membership test instead of a linear scan of x_train for
    # every test value; isin also matches NaN against NaN.
    is_new = ~pd.Series(x_test).isin(x_train).to_numpy()
    x_new_test = list(x_test[is_new])

    print('{} new categories'.format(len(x_new_test)))
    return x_new_test

In [None]:
# all_data2 = all_data.copy()
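# # Almost every one of these columns has too many new categories that appear only in the test data; a conversion like this is needed before they can be used for training (TODO)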

# for c in ['mchno','acqic','mcc','stocn','scity','csmcu']:
#     new_category = find_new_category(all_data2,c)
#     tmp_df = pd.DataFrame(new_category,columns=[c])
#     tmp_df['new_{}'.format(c)]=1
#     all_data2 = pd.merge(all_data2,tmp_df,on=c,how='left')
#     all_data2['new_{}'.format(c)].fillna(value=0,inplace=True)
# print(all_data2)

In [None]:
# for c in ['mchno','acqic','mcc','stocn','scity','csmcu']:
#     print(all_data2['new_{}'.format(c)].sum())
    
# all_data2['new_category']=0

# for c in ['mchno','acqic','mcc','stocn','scity','csmcu']:
#     all_data2['new_category']+=all_data2['new_{}'.format(c)]
# print(all_data2['new_category'])
# print((all_data2['new_category']>0).sum())

In [None]:
# test_all_data = all_data2[all_data2['locdt']>90]
# test_all_data_good = test_all_data['new_category']<1
# test_all_data_bad = test_all_data['new_category']>=1
# np.save('../data/preprocess/test_data_good_index.npy',test_all_data['new_category']<1)
# np.save('../data/preprocess/test_data_bad_index.npy',test_all_data['new_category']>=1)