# Import packages

In [1]:
import math, os, collections
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from evaluation import Evaluation
from db_connection.utils import get_conn
from utils_load import load_w103, load_w106
from mlaas_tools.config_build import config_set

In [2]:
## Configure env
if not os.path.isfile('config.ini'):
    config_set()

In [3]:
## Load db connection
rawdata_conn = get_conn('edu')

read key file
login as edu-cytsao


# fund feature preprocess (w106)

In [4]:
sql = """SELECT * FROM sinica.witwo106"""
fund = pd.read_sql(sql, rawdata_conn)
fund.head()

Unnamed: 0,channel_callcenter_ind,channel_frontdesk_ind,channel_mobile_ind,channel_web_ind,counterparty_code,etl_dt,fee_rate,fee_type_code,high_yield_bond_ind,invest_limited_code,invest_type,isin_code,mkt_rbot_ctg_ic,prod_ccy,prod_detail_type_code,prod_name,prod_risk_code,prod_type_code,wm_prod_code
0,YYYYY,YYYYY,YYYYY,YYYYY,AA,2019-10-17,0.03,A,N,AA,F01,LU1005136848,F0201,AUD,FNDF,富達全球入息基金Ｈ月配澳幣避險,RR3,4,AAC4
1,NNYYN,NYYYN,NYYYN,NYYYN,AD,2019-10-17,0.0,B,Y,AD,F02,LU1013768343,F1301,USD,FNDF,聯博歐洲收益基金ＢＡ穩定月配美元避險,RR3,4,AD70
2,NNNNN,NNNNN,NNNNN,NNNNN,AF,2019-10-17,0.03,A,N,AF,F01,LU0271656307,F0102,EUR,FNDF,鋒匯理Ⅱ－歐洲潛力Ａ２歐元,RR4,4,AF46
3,YYYYY,YYYYY,YYYYY,YYYYY,AO,2019-10-17,0.02,A,N,AO,F02,IE00B0M2Y900,F1201,USD,FNDF,ＰＩＭＣＯ總回報債券基金Ｅ級類別收息股份,RR2,4,AO14
4,NNNNN,NNNNN,NNNNN,NNNNN,CC,2019-10-17,0.03,A,N,CC,F01,,F0105,JPY,FNDF,Ｆ日本槓桿基金,RR2,4,CC13


In [5]:
fund = load_w106(rawdata_conn)
fund.head()

Unnamed: 0,wm_prod_code,can_rcmd_ind,high_yield_bond_ind,counterparty_code,invest_limited_code,invest_type,mkt_rbot_ctg_ic,prod_ccy,prod_detail_type_code,prod_risk_code
0,AAC4,1,N,AA,AA,F01,F0201,AUD,FNDF,RR3
1,AD70,0,Y,AD,AD,F02,F1301,USD,FNDF,RR3
2,AF46,0,N,AF,AF,F01,F0102,EUR,FNDF,RR4
3,AO14,1,N,AO,AO,F02,F1201,USD,FNDF,RR2
4,CC13,0,N,CC,CC,F01,F0105,JPY,FNDF,RR2


In [6]:
len(fund)

4169

In [7]:
fund.describe(include='object')

Unnamed: 0,wm_prod_code,high_yield_bond_ind,counterparty_code,invest_limited_code,invest_type,mkt_rbot_ctg_ic,prod_ccy,prod_detail_type_code,prod_risk_code
count,4169,4169,4169,4169,4169,4169,4169,4169,4169
unique,4169,2,162,156,6,53,14,2,5
top,3716,N,CC,CC,F01,F0801,USD,FNDF,RR3
freq,1,2837,155,155,2103,552,1825,2608,1541


In [8]:
fund.isna().sum()
# no NA value

wm_prod_code             0
can_rcmd_ind             0
high_yield_bond_ind      0
counterparty_code        0
invest_limited_code      0
invest_type              0
mkt_rbot_ctg_ic          0
prod_ccy                 0
prod_detail_type_code    0
prod_risk_code           0
dtype: int64

In [9]:
fund[fund['counterparty_code']!=fund['invest_limited_code']]

Unnamed: 0,wm_prod_code,can_rcmd_ind,high_yield_bond_ind,counterparty_code,invest_limited_code,invest_type,mkt_rbot_ctg_ic,prod_ccy,prod_detail_type_code,prod_risk_code
29,WL11,0,Y,WL,BB,F02,F1502,USD,FNDF,RR3
42,II11,0,N,II,EE,F01,F0301,USD,FNDF,RR5
43,II13,0,N,II,EE,F01,F0101,USD,FNDF,RR1
44,II25,0,N,II,EE,F01,F0404,USD,FNDF,RR5
82,MM12,0,N,MM,BB,F01,F0102,EUR,FNDF,RR3
...,...,...,...,...,...,...,...,...,...,...
4047,MM18,0,N,MM,BB,F01,F0903,EUR,FNDF,RR5
4048,MM19,0,N,MM,BB,F01,F0903,EUR,FNDF,RR5
4075,KK05,0,N,KK,JJ,F01,F0102,USD,FNDF,RR4
4105,AI16,0,N,AI,AJ,F01,F0401,USD,FNDF,RR5


In [10]:
fund[fund['high_yield_bond_ind']=='Y']['invest_type'].value_counts()
# 高收益這個指標跟風險收益等級看起來不重複 -> 保留

F02    1000
F03     250
F05      63
F01      19
Name: invest_type, dtype: int64

In [11]:
#fund.to_csv('fund_w106.csv')

## features with number of categories > 10

In [12]:
fund['counterparty_code'].value_counts()[:10]
# discard count < 100
# ['CC', 'AA', 'AF', 'AG', 'NN', 'KK', 'AD', 'PP']

CC    155
AA    146
AF    142
AG    142
NN    126
KK    112
AD    101
PP    100
UF     88
BF     69
Name: counterparty_code, dtype: int64

In [13]:
fund.loc[fund['counterparty_code'].value_counts()[fund['counterparty_code']].values<100, 'counterparty_code'] = 'other'
fund['counterparty_code'].value_counts()

other    3145
CC        155
AA        146
AG        142
AF        142
NN        126
KK        112
AD        101
PP        100
Name: counterparty_code, dtype: int64

In [14]:
fund['mkt_rbot_ctg_ic'].value_counts()[:10]

F0801    552
F0201    331
F0101    308
F1501    265
F0102    245
F1301    179
F1404    172
F1201    171
F1402    169
F0409    162
Name: mkt_rbot_ctg_ic, dtype: int64

In [15]:
fund['prod_ccy'].value_counts()[:10]

USD    1825
TWD     919
EUR     546
CNY     280
AUD     276
ZAR     170
JPY      44
GBP      43
NZD      16
SGD      14
Name: prod_ccy, dtype: int64

In [16]:
def w106_process(df):
    # discard categorization
    discard_condition = {'counterparty_code': 100, 'mkt_rbot_ctg_ic': 200, 'prod_ccy': 500}
    for col, n in discard_condition.items(): 
        df.loc[df[col].value_counts()[df[col]].values<n, col] = col+'_other'
    # convert int to categorical
    df['high_yield_bond_ind'] = df['high_yield_bond_ind'].map({'Y': 'high_yield', 'N': 'not_high_yield'})
    df['can_rcmd_ind'] = df['can_rcmd_ind'].map({1:'can_rcmd', 0: 'can_rcmd_N'})
    del df['invest_limited_code']
    return df

In [17]:
w106_process(fund)

Unnamed: 0,wm_prod_code,can_rcmd_ind,high_yield_bond_ind,counterparty_code,invest_type,mkt_rbot_ctg_ic,prod_ccy,prod_detail_type_code,prod_risk_code
0,AAC4,can_rcmd,not_high_yield,AA,F01,F0201,prod_ccy_other,FNDF,RR3
1,AD70,can_rcmd_N,high_yield,AD,F02,mkt_rbot_ctg_ic_other,USD,FNDF,RR3
2,AF46,can_rcmd_N,not_high_yield,AF,F01,F0102,EUR,FNDF,RR4
3,AO14,can_rcmd,not_high_yield,other,F02,mkt_rbot_ctg_ic_other,USD,FNDF,RR2
4,CC13,can_rcmd_N,not_high_yield,CC,F01,mkt_rbot_ctg_ic_other,prod_ccy_other,FNDF,RR2
...,...,...,...,...,...,...,...,...,...
4164,4826,can_rcmd,high_yield,other,F02,F1501,prod_ccy_other,FNDD,RR3
4165,WD12,can_rcmd_N,not_high_yield,other,F02,mkt_rbot_ctg_ic_other,prod_ccy_other,FNDF,RR3
4166,5929,can_rcmd,not_high_yield,other,F03,F0801,prod_ccy_other,FNDD,RR3
4167,AV04,can_rcmd_N,high_yield,other,F02,F1501,USD,FNDF,RR3


# save data table to csv

In [23]:
import os
## Load data
exp_dates = ['2018-12-31', '2019-01-31', '2019-02-28', '2019-03-31', '2019-04-30', '2019-05-31', '2019-06-30']
span = [18, 6]
evaluation_span = '1m'
def load_tables(date, span):
    print('\n', date, span)
    rawdata_conn = get_conn('edu')
    # interaction train w103
    w103_df = load_w103(date, rawdata_conn, span)
    purchase_hist = w103_df.groupby("cust_no")["wm_prod_code"].apply(lambda x: list(set(x.values.tolist()))).to_dict()
    # evaluation w103
    evaluation = Evaluation(date, None, evaluation_span, purchase_hist)
    evaluate_w103 = evaluation.read(date, rawdata_conn, evaluation_span)
    # warm_users, cold_users = evaluation.warm_cold_list()
    # print('warm-users:', len(warm_users), 'cold-users:', len(cold_users))
    # fund
    w106_df = load_w106(rawdata_conn)
    w106_df = w106_process(w106_df)
    _filter = w106_df.wm_prod_code.isin(w103_df['wm_prod_code'].tolist())
    w106_df_filter = w106_df[_filter]
    # user
    
    # save data to csv
    path = '../../raw_datasets/'+date+'_'+str(span)
    if not os.path.exists(path):
        os.makedirs(path)
    w103_df.to_csv(path+'/train_w103.csv')
    evaluate_w103.to_csv(path+'/evaluate_w103.csv')
    w106_df_filter.to_csv(path+'/item_features.csv')
    print('data saved!')
    

In [24]:
print("Loading Data...")
for d in exp_dates:
    for s in span:
        load_tables(d, s)

Loading Data...

 2018-12-31 18
read key file
login as edu-cytsao
read key file
login as edu-cytsao
data saved!

 2018-12-31 6
read key file
login as edu-cytsao
read key file
login as edu-cytsao
data saved!

 2019-01-31 18
read key file
login as edu-cytsao
read key file
login as edu-cytsao
data saved!

 2019-01-31 6
read key file
login as edu-cytsao
read key file
login as edu-cytsao
data saved!

 2019-02-28 18
read key file
login as edu-cytsao
read key file
login as edu-cytsao
data saved!

 2019-02-28 6
read key file
login as edu-cytsao
read key file
login as edu-cytsao
data saved!

 2019-03-31 18
read key file
login as edu-cytsao
read key file
login as edu-cytsao
data saved!

 2019-03-31 6
read key file
login as edu-cytsao
read key file
login as edu-cytsao
data saved!

 2019-04-30 18
read key file
login as edu-cytsao
read key file
login as edu-cytsao
data saved!

 2019-04-30 6
read key file
login as edu-cytsao
read key file
login as edu-cytsao
data saved!

 2019-05-31 18
read key file