In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import math

%matplotlib inline
# Directory, relative to this notebook, that the train.zip / test.zip archives are read from.
data_path = '../data'

In [None]:
# Resolve both archive locations up front.
train_data_path = os.path.join(data_path, 'train.zip')
test_data_path = os.path.join(data_path, 'test.zip')

# Both archives are read with the Big5 codec.
train_data, test_data = (
    pd.read_csv(csv_path, encoding="big5")
    for csv_path in (train_data_path, test_data_path)
)

# Keep the number of training rows and the test keys before the frames are combined.
train_data_num = len(train_data)
test_data_txkey = test_data['txkey'].copy()

# Order training rows by account, then day, then time; labels are taken after sorting
# so they stay aligned with the re-indexed rows.
train_data = (
    train_data
    .sort_values(by=['bacno', 'locdt', 'loctm'])
    .reset_index(drop=True)
)
label_data = train_data['fraud_ind'].copy()

# Stack train on top of test so feature engineering sees both at once.
all_data = pd.concat([train_data, test_data], axis=0).reset_index(drop=True)
for shown in (all_data.index, train_data.shape, test_data.shape, all_data.shape):
    print(shown)

In [None]:
# Preview the first five rows of the sorted training frame.
train_data.head(n=5)

In [None]:
# Encode the five Y/N flag columns as 1/0; values outside the mapping become NaN.
yn_to_int = {'N': 0, 'Y': 1}
for flag_col in ('ecfg', 'ovrlt', 'insfg', 'flbmk', 'flg_3dsmk'):
    all_data[flag_col] = all_data[flag_col].map(yn_to_int)
all_data['loctm'] = all_data['loctm'].astype(int)
all_data = all_data.infer_objects()

# print(all_data.dtypes)
# Report only the columns that actually contain missing values.
for title, frame in (('Missing value training data:\n', train_data),
                     ('Missing value testing data:\n', test_data)):
    na_counts = frame.isna().sum()
    print(title, na_counts[na_counts > 0])

## Filling nulls is not strictly necessary for the LightGBM model, but the
## IsolationForest fitted later on the same features is given NaN-free columns here.
## The two flags are filled with their column mean (so filled rows are non-binary).
for flag_col in ('flbmk', 'flg_3dsmk'):
    all_data[flag_col] = all_data[flag_col].fillna(
        value=all_data[flag_col].mean(skipna=True)
    )

## Anomaly detection
* one class svm
* isolation tree
* replicator NN
* KNN (takes too much time)

In [None]:
# Rows where insfg is 1 while iterm is 0 -- a combination the author flags as "weird".
weird1 = all_data['insfg'].eq(1) & all_data['iterm'].eq(0)
print(weird1.value_counts())

## Feature engineering
* train & valid only

In [None]:
# Independent copy of the rows labelled as fraud (fraud_ind == 1).
positive_data = all_data.loc[all_data['fraud_ind'].eq(1)].copy()
# print(positive_data)

In [None]:
## transform large type category features
## 將太多種category的feature bin cut

# th=100
# category_list = all_data['mchno'].value_counts()[:th].index
# all_data2 = all_data.copy()
# all_data2[~all_data['mchno'].isin(category_list)]=-1
# print(all_data2['mchno'].value_counts()[:th])

# th=100
# category_list = all_data['acqic'].value_counts()[:th].index
# all_data2 = all_data.copy()
# all_data2[~all_data['acqic'].isin(category_list)]=-1
# print(all_data2['acqic'].value_counts()[:th])

# th=100
# category_list = all_data['mcc'].value_counts()[:th].index
# all_data2 = all_data.copy()
# all_data2[~all_data['mcc'].isin(category_list)]=-1
# print(all_data2['mcc'].value_counts()[:th])

def collapse_rare_categories(frame, column, top_n, other=-1):
    """Return a copy of ``frame`` where rare values of ``column`` are replaced by ``other``.

    Only the ``top_n`` most frequent values of ``column`` are kept. Every other
    column is left untouched: the previous implementation assigned through a
    row mask (``frame[mask] = -1``), which overwrote *all* columns of the
    masked rows instead of just ``column``.
    """
    frequent_values = frame[column].value_counts().index[:top_n]
    binned = frame.copy()
    # .loc with an explicit column restricts the assignment to that column.
    binned.loc[~binned[column].isin(frequent_values), column] = other
    return binned


# (column, number of categories to keep) for the low-cardinality-enough features.
for column, th in (('stocn', 15), ('scity', 20), ('csmcu', 10)):
    all_data2 = collapse_rare_categories(all_data, column, th)
    print(all_data2[column].value_counts()[:th])

In [None]:
def _merge_bacno_stat(frame, value_col, stat, suffix, fill=None):
    """Left-merge a per-``bacno`` aggregate of ``value_col`` back onto ``frame``.

    The new column is named ``value_col + suffix``. When ``fill`` is given,
    missing aggregate values are replaced by it before merging.
    """
    stat_df = frame.groupby(['bacno'])[value_col].agg(stat).reset_index()
    stat_df.columns = ['bacno', value_col + suffix]
    if fill is not None:
        stat_df = stat_df.fillna(value=fill)
    return pd.merge(frame, stat_df, on='bacno', how='left')


# Log-scale the amount and reduce iterm to a 0/1 "greater than zero" indicator.
all_data['conam'] = np.log(all_data['conam'] + 1)
all_data['iterm'] = (all_data['iterm'] > 0).astype(int)

# Cyclic position of the day index within 7- and 30-day windows (1-based).
all_data['locdt_week'] = all_data['locdt'] % 7 + 1
all_data['locdt_month'] = all_data['locdt'] % 30 + 1

# Coarse time buckets from the integer loctm (presumably HHMMSS -- TODO confirm).
all_data['loctm_hr'] = (all_data['loctm'] // 10000).astype(int)
all_data['loctm_hr2'] = (all_data['loctm'] // 1000).astype(int)
# all_data['loctm_hr_sin'] = all_data['loctm_hr'].apply(lambda s:math.sin(s/24*math.pi)).astype(int)
# all_data['loctm_hr2_sin'] = all_data['loctm_hr2'].apply(lambda s:math.sin(s/240*math.pi)).astype(int)

# Per-account aggregates: distinct cards, distinct transactions, mean/variance of hour.
# The variance is NaN for accounts with a single row, hence the -1 fill.
for value_col, stat, suffix, fill in (
    ('cano', 'nunique', '_count', None),
    ('txkey', 'nunique', '_count', None),
    ('loctm_hr', 'mean', '_mean', None),
    ('loctm_hr', 'var', '_var', -1),
):
    all_data = _merge_bacno_stat(all_data, value_col, stat, suffix, fill)


In [None]:

# for i in range(500):
#     print(i,all_data.groupby(['bacno']).get_group(i)[['ecfg','fraud_ind']])
# mean_df = all_data.groupby(['bacno'])['fraud_ind'].mean().reset_index()
# mean_df.columns = ['bacno', 'loctm_hr'+'_mean']
# all_data = pd.merge(all_data, mean_df, on='bacno', how='left')

# print(all_data[['bacno','locdt','loctm']])

# 該交易的歸戶帳號是否曾經被盜刷 0->沒 1->有 -1->無紀錄


# 該交易的歸戶帳號是否曾經被盜刷卻又復原
# 該交易的歸戶帳號是否第一次刷卡
# 該交易的歸戶帳號第幾次刷卡

# 該交易的卡號是否曾經被盜刷
# 該交易的卡號是否曾經被盜刷卻又復原
# 該交易的卡號是否第一次刷卡
# 該交易的卡號第幾次刷卡

def _first_mode(values):
    """Return the most frequent value of a Series (smallest on ties), or NaN if none."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Most frequent stocn per account. The earlier version applied DataFrame.mode()
# to every column of every group, so the two-name column assignment below could
# not match the resulting frame; aggregating only 'stocn' yields exactly
# (bacno, stocn_mode).
mean_df = all_data.groupby('bacno')['stocn'].agg(_first_mode).reset_index()
mean_df.columns = ['bacno', 'stocn' + '_mode']
mean_df = mean_df.fillna(-1)
print(mean_df.stocn_mode.value_counts())
# all_data = pd.merge(all_data, mean_df, on='bacno', how='left')

# 消費國別是否跟自己所有消費的眾數不一樣
# 消費城市是否跟自己所有消費的眾數不一樣
# 消費地幣別是否跟自己所有消費的眾數不一樣
# 支付型態是否跟自己所有消費的眾數不一樣
# 分期期數是否跟自己所有消費的眾數不一樣

# 是否第一次網路消費且過去有非網路消費的經驗



In [None]:
# data = pd.concat([df[:train_num], train_Y], axis=1)
# for c in df.columns:
#     mean_df = data.groupby([c])['SalePrice'].mean().reset_index()
#     mean_df.columns = [c, f'{c}_mean']
#     data = pd.merge(data, mean_df, on=c, how='left')
#     data = data.drop([c] , axis=1)


# all_data['howmany_cano'] = 
# all_data['howmany_txkey'] = 

## bacno刷卡頻率分佈

# all_data['fraud_before'] =
# all_data['fraud_last_time'] =

# 印出某個被盜刷的人的刷卡使用時間分佈


In [None]:
# Columns removed before modelling (identifiers, raw day/time, and the label itself).
delete_list = [
    'bacno', 'txkey', 'locdt', 'loctm', 'cano', 'fraud_ind',
]
# Flag columns that were mapped from Y/N to 1/0.
binary_list = [
    'ecfg', 'insfg', 'ovrlt', 'flbmk', 'flg_3dsmk',
]
# Columns treated as categorical codes.
category_list = [
    'contp', 'etymd', 'hcefg', 'stocn', 'scity', 'stscd', 'csmcu',
]

## Parse train,valid

In [None]:
# Time-based split on the day index: days <= 60 train, days 61..90 validate.
# Note that the validation part is stored under the X_test / y_test names.
in_train_window = all_data['locdt'] <= 60
in_valid_window = (all_data['locdt'] > 60) & (all_data['locdt'] <= 90)

train_rows = all_data[in_train_window]
valid_rows = all_data[in_valid_window]

X_train = train_rows.drop(columns=delete_list)
y_train = train_rows['fraud_ind']
X_test = valid_rows.drop(columns=delete_list)
y_test = valid_rows['fraud_ind']

# all_train = all_data[all_data['locdt']<=90]
# all_test = all_data[all_data['locdt']<=90]['fraud_ind']
# test_data = all_data[all_data['locdt']>90]

In [None]:
from sklearn.ensemble import IsolationForest

# Fraction of fraud rows in the training window, passed as the contamination rate.
c_ratio = y_train.sum() / len(y_train)

# NOTE(review): `behaviour` only exists in older scikit-learn releases, and
# max_features=1 (an int) means one feature per tree -- confirm both are intended.
iso_params = dict(
    behaviour='new',
    max_samples=0.8,
    max_features=1,
    random_state=1,
    contamination=c_ratio,
)
clf = IsolationForest(**iso_params)
clf.fit(X_train)

In [None]:
def _outliers_as_positive(iso_pred):
    """Convert IsolationForest labels (-1 outlier / 1 inlier) to 1 outlier / 0 inlier."""
    return (iso_pred == -1).astype(int)


y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

y_pred_test2 = _outliers_as_positive(y_pred_test)
y_pred_train2 = _outliers_as_positive(y_pred_train)

# These counts used to be bare mid-cell expressions, whose values are never
# displayed; print them so the number of flagged rows is actually visible.
print('flagged as outlier (valid):', y_pred_test2.sum())
print('flagged as outlier (train):', y_pred_train2.sum())

from sklearn.metrics import f1_score
print(f1_score(y_train, y_pred_train2))
print(f1_score(y_test, y_pred_test2))

In [None]:
random_seed = 33

# Feature-matrix size, then the fraud rate of the training and validation windows.
print(X_train.shape)
for target in (y_train, y_test):
    print(target.sum() / target.shape[0])


import lightgbm as lgb
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import f1_score
def lgb_f1_score(y_true, y_pred):
    """Custom evaluation metric: F1 score of the rounded predictions.

    Returns the ``(metric_name, value, is_higher_better)`` triple.
    """
    # f1_score needs hard labels, so the predicted scores are rounded first.
    hard_labels = np.round(y_pred)
    score = f1_score(y_true, hard_labels)
    return 'f1', score, True

# Knobs currently left at their defaults (values tried earlier):
#   num_leaves=25, max_depth=5, subsample=1, colsample_bytree=0.5,
#   lambda_l1=0.1, lambda_l2=0, min_child_weight=1
param_dist_lgb = dict(
    learning_rate=0.1,
    n_estimators=3500,
    objective='binary',
    random_state=random_seed,
)
# Filled by the record_evaluation callback with the per-iteration metric history.
evals_result = {}

lgb_clf = LGBMClassifier(**param_dist_lgb)
eval_sets = [(X_train, y_train), (X_test, y_test)]
lgb_clf.fit(
    X_train, y_train,
    eval_set=eval_sets,
    eval_metric=lgb_f1_score,
    early_stopping_rounds=600,
    verbose=True,
    callbacks=[lgb.record_evaluation(evals_result)],
)

valid_pred = lgb_clf.predict(X_test)
print('F1', f1_score(y_test, valid_pred))

In [None]:
# Zero-based index of the tree to draw, i.e. the 4th tree.
tree_index = 3

print('Plotting metrics recorded during training...')
ax = lgb.plot_metric(evals_result, metric='f1')
plt.show()

print('Plotting feature importances...')
ax = lgb.plot_importance(lgb_clf, max_num_features=10)
plt.show()

print('Plotting 4th tree...')  # one tree uses a categorical feature to split
ax = lgb.plot_tree(
    lgb_clf,
    tree_index=tree_index,
    figsize=(15, 15),
    show_info=['split_gain'],
)
plt.show()

print('Plotting 4th tree with graphviz...')
graph = lgb.create_tree_digraph(lgb_clf, tree_index=tree_index, name='Tree4')
# view=True asks graphviz to open the rendered file after writing it.
graph.render(view=True)

## write csv

In [None]:
# lgb_clf = LGBMClassifier(**param_dist_lgb)
# lgb_clf.fit(train_data,label_data)

# result = lgb_clf.predict(test_data)
# print(result.sum())
# print(result.sum()/result.shape[0])
# print(label_data.sum()/label_data.shape[0])

# test_data_txkey = test_data['txkey'].copy()

# import csv
# with open('../prediction/submit_lgb.csv','w') as f:
#     writer = csv.writer(f)
#     writer.writerow(['txkey','fraud_ind'])
#     for i in range(result.shape[0]):
#         writer.writerow([test_data_txkey[i], result[i]])

## 觀測 train, validation, test的distribution
* 跟fraud_ind相關的，檢查train和validation就好（假設test data跟validation類似）
* 每筆feature的distribution，檢查train,validation和test


In [None]:
def _target_mean_and_counts(frame, target_col, feature):
    """Return (mean of target per feature value, count per feature value incl. NaN)."""
    means = frame.groupby(feature)[target_col].mean()
    counts = frame[feature].value_counts(dropna=False)
    return means, counts


def _draw_distribution_row(mean_ax, count_ax, means, counts, target_col, feature):
    """Draw one row: target mean as a line plot (left), value counts as bars (right)."""
    mean_ax.plot(means.index, means.values, marker='o')
    mean_ax.set_title('Average {} wrt {}'.format(target_col, feature))
    mean_ax.set_ylabel('mean of {}'.format(target_col))
    mean_ax.set_xlabel(feature)

    count_ax.bar(counts.index, counts.values, alpha=0.5)
    count_ax.set_title('distribution of {}'.format(feature))
    count_ax.set_ylabel('count of {}'.format(feature))
    count_ax.set_xlabel(feature)


def analze_distribution(data, target_col, feature, data_test):
    """Compare one feature between two frames on a 2x2 grid of plots.

    The top row is drawn from ``data`` and the bottom row from ``data_test``.
    In each row the left panel shows the mean of ``target_col`` per value of
    ``feature`` and the right panel shows how often each value occurs.

    Parameters
    ----------
    data, data_test : DataFrame
        Frames that both contain ``feature`` and ``target_col``.
    target_col : str
        Column averaged per feature value.
    feature : str
        Column whose values define the x axis.
    """
#     if data[feature].nunique()!=data_test[feature].nunique():
#         print('data nunique not the same')

    # Aggregate first so a failure here does not leave an empty figure behind.
    train_stats = _target_mean_and_counts(data, target_col, feature)
    test_stats = _target_mean_and_counts(data_test, target_col, feature)

    fig, axs = plt.subplots(2, 2, figsize=(10, 10))
    _draw_distribution_row(axs[0, 0], axs[0, 1], *train_stats,
                           target_col=target_col, feature=feature)
    _draw_distribution_row(axs[1, 0], axs[1, 1], *test_stats,
                           target_col=target_col, feature=feature)

    # Row captions, positioned in figure coordinates.
    fig.text(0.5, 1,   'Train data plot', ha='center', va='center', fontsize=15)
    fig.text(0.5, 0.5, 'Test data plot', ha='center', va='center', fontsize=15)
    fig.tight_layout(pad=3)


In [None]:
# Re-attach the label so each split carries its own fraud_ind column.
train_d = X_train.assign(fraud_ind=y_train)
valid_d = X_test.assign(fraud_ind=y_test)
# print(train_d.csmcu.value_counts())
# print(valid_d.csmcu.value_counts())

# Home-made chart helper: bar chart of each column's distribution plus its
# relation to fraud_ind, for the training and validation windows.
for feature_name in list(train_d):
    analze_distribution(train_d, 'fraud_ind', feature_name, valid_d)
# analze_distribution(train_d,'fraud_ind','hcefg',valid_d)
# analze_distribution(train_d,'fraud_ind','csmcu',valid_d)
# analze_distribution(train_d,'fraud_ind','mchno',valid_d)


from featexp import get_univariate_plots
# get_univariate_plots(data=train_d, target_col='fraud_ind', features_list=['hcefg'], bins=100, data_test=valid_d)
# get_univariate_plots(data=train_d, target_col='fraud_ind', features_list=['loctm_hr'], bins=10, data_test=valid_d)
# get_univariate_plots(data=train_d, target_col='fraud_ind', features_list=['flg_3dsmk'], bins=10, data_test=valid_d)
# get_univariate_plots(data=train_d, target_col='fraud_ind', features_list=['csmcu'], bins=100, data_test=valid_d)

## 檢查train和test data的分佈有哪裡不一樣

In [None]:
def describe(train_data, test_data, target_name):
    """Print basic statistics of one column and plot its histogram for two frames.

    Prints the column name, then the number of distinct values and the maximum
    for ``train_data`` and ``test_data``; afterwards shows the two histograms
    side by side (left: ``train_data``, right: ``test_data``).
    """
    train_col = train_data[target_name]
    test_col = test_data[target_name]

    print(target_name)
    print('nunique train', train_col.nunique())
    print('nunique test', test_col.nunique())
    print('max train', train_col.max())
    print('max test', test_col.max())

    fig, axs = plt.subplots(1, 2, figsize=(10, 3))
    for axis, column in zip(axs, (train_col, test_col)):
        # One bin per distinct value, capped at 100 bins.
        column.hist(bins=min(column.nunique(), 100), ax=axis)
    plt.show()


# Build the two frames once instead of re-dropping and re-filtering all_data
# for every column: days <= 90 versus days > 90.
feature_frame = all_data.drop(columns='fraud_ind')
early_part = feature_frame[feature_frame['locdt'] <= 90]
late_part = feature_frame[feature_frame['locdt'] > 90]

for c in feature_frame:
    describe(early_part, late_part, c)

## fraud資料和normal資料的各個特徵差異

In [None]:
normal_data = all_data[all_data['fraud_ind']==0]
fraud_data = all_data[all_data['fraud_ind']==1]

# One figure per column: histogram over normal rows (left) and fraud rows (right).
# The former plt.clf() call was dropped: with no open figure it created an
# extra empty figure on every iteration.
for c in normal_data.columns:
    print(c)
    fig, axs = plt.subplots(1, 2, figsize=(10, 3))
    for axis, (title, subset) in zip(axs, (('Normal data', normal_data),
                                           ('Fraud data', fraud_data))):
        values = subset[c]
        # One bin per distinct value, capped at 100 bins.
        values.hist(bins=min(values.nunique(), 100), ax=axis)
        axis.set_title(title)
    plt.show()
    # Release the figure so a long column loop does not accumulate open figures.
    plt.close(fig)


## PCA on fraud data in some person

In [None]:
from sklearn.decomposition import PCA
def PCA_plot(x):
    """Scatter-plot the first two principal components of one account's rows.

    ``x`` must contain ``fraud_ind``, the categorical columns listed below and
    the ``mchno`` / ``acqic`` columns. Fraud rows (``fraud_ind == 1``) are drawn
    as large red points, normal rows (``fraud_ind == 0``) as small blue points.
    The caller's frame is not modified.
    """
    ## Intended pipeline: one-hot encode, standardise, then PCA.
    ## TODO: the standardisation step is not implemented yet.
    dummy_list=['contp','etymd','stscd','hcefg']
    dummy_list2=['stocn','scity','csmcu']#'mchno','acqic','mcc',
    categorical_cols = dummy_list + dummy_list2

    # Work on a copy: the column assignment below used to write into the
    # DataFrame object handed in by the caller.
    x = x.copy()
    # Cast to object so get_dummies one-hot encodes these numeric codes.
    x[categorical_cols] = x[categorical_cols].astype(object)
    x = pd.get_dummies(x).drop(columns=['mchno','acqic'])

    print(x)
    label = x.fraud_ind
    print(x.shape,label.sum())

    PCA_model = PCA(n_components=2)
    train_data_pca = PCA_model.fit_transform(x.drop(columns='fraud_ind'))
    train_data_pca1 = train_data_pca[(label == 1).to_numpy()]
    train_data_pca0 = train_data_pca[(label == 0).to_numpy()]

    # No plt.clf() here: with no open figure it only produced an extra empty one.
    plt.figure(figsize=(10,10))
    plt.scatter(train_data_pca1[:, 0], train_data_pca1[:, 1], c='r',label='fraud transaction',s=100)
    plt.scatter(train_data_pca0[:, 0], train_data_pca0[:, 1], c='b',label='normal transaction',s=3)
    plt.legend()
    plt.show()
    
# Accounts with at least one fraud row, and the total number of fraud rows.
fraud_rows = all_data[all_data['fraud_ind']==1]
bacno_hasfraud = fraud_rows['bacno'].unique()
print(bacno_hasfraud.shape[0])
print(fraud_rows.shape[0])

# Only accounts with more than this many rows are plotted.
MIN_ROWS_FOR_PCA = 300

# Count rows per account once instead of scanning all_data for every account.
rows_per_bacno = all_data['bacno'].value_counts()

for bacno in bacno_hasfraud:
    # .get(..., 0) keeps the old behaviour for keys absent from the counts
    # (value_counts drops NaN): such accounts are simply skipped.
    if rows_per_bacno.get(bacno, 0) > MIN_ROWS_FOR_PCA:
        print('Ploting PCA on bacno-{}'.format(bacno))
        PCA_plot(all_data[all_data['bacno']==bacno].drop(columns=delete_list))

## TSNE, Kmeans作圖?

## 觀察large category的feature是否有很多種bacno的fraud

In [None]:
def _fraud_bacno_nunique(group):
    """Number of distinct bacno among the fraud rows (fraud_ind == 1) of one group."""
    return group.loc[group['fraud_ind'] == 1, 'bacno'].nunique()


# For each value of the three high-cardinality columns: how many different
# accounts had a fraud row there (0 for values with no fraud rows).
mchno_bacno_count, acqic_bacno_count, mcc_bacno_count = (
    all_data.groupby(key).apply(_fraud_bacno_nunique)
    for key in ('mchno', 'acqic', 'mcc')
)

In [None]:
# print(mchno_bacno_count.sum())
# print(mchno_bacno_count[mchno_bacno_count>1].sort_values())

# print(acqic_bacno_count.sum())
# print(acqic_bacno_count[acqic_bacno_count>1].sort_values())

# print(mcc_bacno_count.sum())
# print(mcc_bacno_count[mcc_bacno_count>1].sort_values())

## 以每個bacno來看fraud情況

In [None]:
# Number of fraud rows (fraud_ind == 1) per account, including accounts with zero.
bacno_fraud_count = (
    all_data['fraud_ind'].eq(1)
    .groupby(all_data['bacno'])
    .sum()
    .rename(None)  # the result is left unnamed, as before
)

In [None]:
fraud_total = all_data.fraud_ind.sum(skipna=True)
print('All fraud instance', fraud_total)

has_fraud = bacno_fraud_count > 0
print('{} different bacno'.format(bacno_fraud_count[has_fraud].shape[0]))

# First histogram: accounts with 1..9 fraud rows, one bin per count.
bacno_fraud_count[has_fraud & (bacno_fraud_count < 10)].hist(bins=9)
plt.show()
plt.clf()
# Second histogram: every account with at least one fraud row.
bacno_fraud_count[has_fraud].hist(bins=100)


In [None]:
## important!

# Accounts with strictly more than 10 fraud rows.
bacno_10fraud_more_list = bacno_fraud_count[bacno_fraud_count > 10].index

for b in bacno_10fraud_more_list:
    account_rows = all_data[all_data['bacno'] == b]
    print()
    print(b, account_rows['fraud_ind'].sum())
    # Day-ordered view of where the account was used and which rows were fraud.
    print(account_rows[['locdt', 'mchno', 'fraud_ind']].sort_values(by=['locdt']))