In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from fea_utils import *
from sklearn import preprocessing

In [3]:
# Locations of the four raw input tables (all under the dataset directory)
# and of the feature file associated with this notebook.
user_fn, click_fn, order_fn, loan_fn = (
    '../../dataset/t_%s.csv' % table
    for table in ('user', 'click', 'order', 'loan')
)
fea_fn = '../../fea/fea_gender_date.csv'

In [4]:
# Read the raw user, click, order and loan tables, in that order.
user_df, clk_df, ord_df, loan_df = (
    pd.read_csv(path) for path in (user_fn, click_fn, order_fn, loan_fn)
)
# to_real_loan comes from fea_utils (star import); it is applied element-wise
# to the stored loan_amount. NOTE(review): presumably it undoes the dataset's
# amount scaling (the sample output shows 4.544373 -> 1500.0) -- confirm.
loan_df['real_loan_amount'] = loan_df['loan_amount'].map(to_real_loan)

In [5]:
# Preview the first five rows of the user table.
user_df.head(5)

Unnamed: 0,uid,age,sex,active_date,limit
0,26308,30,1,2016-02-16,5.974677
1,78209,40,1,2016-02-21,5.292154
2,51930,35,1,2016-04-19,6.292055
3,10113,25,1,2016-03-12,6.292055
4,17067,35,1,2016-02-16,5.974677


In [6]:
# Distinct values of the user table's `sex` column, as a one-column frame
# sorted ascending; this is one axis of the sex x date grid built later.
sexes = set(user_df['sex'])
sex_df = pd.DataFrame({'sex': list(sexes)}).sort_values(by=['sex'])

sex_df.head(5)

Unnamed: 0,sex
0,1
1,2


In [7]:
# Build the calendar used by every feature below: one row per day between the
# earliest and the latest loan date (inclusive), formatted as 'YYYY-MM-DD'
# strings so it joins directly against the string `date` columns.
date_all = loan_df['loan_time'].str.split(' ', expand=True)[0]
start_date = pd.to_datetime(date_all.min(), format='%Y-%m-%d')
end_date = pd.to_datetime(date_all.max(), format='%Y-%m-%d')
# Materialise the days as a real list. On Python 3 a bare map() is a lazy
# iterator without a length, which the DataFrame constructor cannot size.
date_list = pd.date_range(start_date, end_date).strftime('%Y-%m-%d').tolist()
date_df = pd.DataFrame(data={'date': date_list})

date_df.head(5)

Unnamed: 0,date
0,2016-08-03
1,2016-08-04
2,2016-08-05
3,2016-08-06
4,2016-08-07


In [8]:
# Cross join sex x date: give both frames the same constant join key so that
# every sex value is paired with every calendar day, then discard the key
# from the result. The key column is left on sex_df and date_df themselves.
sex_df['key'] = 1
date_df['key'] = 1

sex_date_df = sex_df.merge(date_df, how='left', on=['key']).drop('key', axis=1)

sex_date_df.head(5)

Unnamed: 0,sex,date
0,1,2016-08-03
1,1,2016-08-04
2,1,2016-08-05
3,1,2016-08-06
4,1,2016-08-07


In [9]:
# Cross join user x date through a constant join key: one row per user per
# calendar day. date_df is expected to already carry its `key` column; the
# key is dropped from the result but stays on user_df.
user_df['key'] = 1
user_date_df = user_df.merge(date_df, how='left', on=['key']).drop('key', axis=1)

user_date_df.head(5)

Unnamed: 0,uid,age,sex,active_date,limit,date
0,26308,30,1,2016-02-16,5.974677,2016-08-03
1,26308,30,1,2016-02-16,5.974677,2016-08-04
2,26308,30,1,2016-02-16,5.974677,2016-08-05
3,26308,30,1,2016-02-16,5.974677,2016-08-06
4,26308,30,1,2016-02-16,5.974677,2016-08-07


## 生成点击特征数据

In [10]:
# Day part of the click timestamp ('YYYY-MM-DD HH:MM:SS' -> 'YYYY-MM-DD').
# Uses the vectorised .str accessor (same approach as the loan calendar cell)
# rather than a per-row Python lambda; a missing timestamp stays NaN instead
# of raising AttributeError.
clk_df['date'] = clk_df['click_time'].str.split(' ').str[0]

In [11]:
# Tag every click row with the clicking user's sex. Inner join on uid, so
# clicks whose uid is absent from the user table are not kept.
user_sex_clk_df = user_df[['uid', 'sex']].merge(clk_df, how='inner', on=['uid'])
user_sex_clk_df.head(5)

Unnamed: 0,uid,sex,click_time,pid,param,date
0,26308,1,2016-08-29 00:16:52,10,28,2016-08-29
1,26308,1,2016-11-14 11:14:34,8,1,2016-11-14
2,26308,1,2016-11-14 11:14:46,8,1,2016-11-14
3,26308,1,2016-09-06 07:51:03,7,9,2016-09-06
4,26308,1,2016-09-07 21:49:43,10,28,2016-09-07


In [12]:
# Number of click rows per (sex, date), flattened into the columns
# sex / date / sd_clk_cnt.
clicks_per_sex_day = user_sex_clk_df.groupby(['sex', 'date']).size()
sex_date_clk_df = clicks_per_sex_day.reset_index(name='sd_clk_cnt')
sex_date_clk_df.head(5)

Unnamed: 0,sex,date,sd_clk_cnt
0,1,2016-08-03,134970
1,1,2016-08-04,107731
2,1,2016-08-05,95511
3,1,2016-08-06,92332
4,1,2016-08-07,85006


In [13]:
# Start the feature table from the full sex x date grid and attach the daily
# click counts; the left join keeps grid rows that have no clicks (as NaN).
sex_date_fea_df = sex_date_df.merge(sex_date_clk_df, how='left', on=['sex', 'date'])
sex_date_fea_df.head(5)

Unnamed: 0,sex,date,sd_clk_cnt
0,1,2016-08-03,134970
1,1,2016-08-04,107731
2,1,2016-08-05,95511
3,1,2016-08-06,92332
4,1,2016-08-07,85006


In [14]:
# Grid rows without any click come out of the left join as NaN: count them as 0.
sex_date_fea_df['sd_clk_cnt'] = sex_date_fea_df['sd_clk_cnt'].fillna(0)

In [15]:
# Trailing click totals per sex over several window lengths.
#
# rolling(N) counts rows, not days; it equals an N-day window only because the
# frame holds one row per (sex, calendar day) in date order, as produced by the
# sex x date grid it was built from. Rows with fewer than N days of history
# have no full window and get the sentinel -1.
#
# transform() is used instead of apply(): it always returns a Series aligned
# with the original rows. groupby.apply can prepend the group key to the index
# on recent pandas, and such a result cannot be assigned back as a column.
gc = sex_date_fea_df.groupby(['sex']).sd_clk_cnt

for window in (3, 7, 14, 21, 30, 60, 90):
    sex_date_fea_df['sd_clk_cnt_%dd' % window] = gc.transform(
        lambda daily, window=window: daily.rolling(window).sum()
    ).fillna(value=-1)

In [17]:
# Inspect the first rows with the rolling click columns; -1 marks rows that
# do not yet have a full window of history.
sex_date_fea_df.head(15)

Unnamed: 0,sex,date,sd_clk_cnt,sd_clk_cnt_3d,sd_clk_cnt_7d,sd_clk_cnt_14d,sd_clk_cnt_21d,sd_clk_cnt_30d,sd_clk_cnt_60d,sd_clk_cnt_90d
0,1,2016-08-03,134970,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,1,2016-08-04,107731,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,1,2016-08-05,95511,338212.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,1,2016-08-06,92332,295574.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,1,2016-08-07,85006,272849.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,1,2016-08-08,92544,269882.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
6,1,2016-08-09,118609,296159.0,726703.0,-1.0,-1.0,-1.0,-1.0,-1.0
7,1,2016-08-10,109115,320268.0,700848.0,-1.0,-1.0,-1.0,-1.0,-1.0
8,1,2016-08-11,104249,331973.0,697366.0,-1.0,-1.0,-1.0,-1.0,-1.0
9,1,2016-08-12,111364,324728.0,713219.0,-1.0,-1.0,-1.0,-1.0,-1.0


## 生成订单特征数据

In [18]:
# buy_time is copied as-is into the common `date` join column (the sample
# output shows it already holds date-only strings such as 2016-11-23).
ord_df['date'] = ord_df.buy_time
ord_df.head(5)

Unnamed: 0,uid,buy_time,price,qty,cate_id,discount,date
0,45370,2016-11-23,3.995009,1,22,0.0,2016-11-23
1,66975,2016-11-23,3.26941,1,26,0.0,2016-11-23
2,75358,2016-11-23,2.255235,1,14,0.0,2016-11-23
3,40597,2016-11-23,1.635284,1,20,0.0,2016-11-23
4,83886,2016-11-23,1.920573,2,22,0.0,2016-11-23


In [19]:
# Tag every order row with the buyer's sex. Inner join on uid, so orders whose
# uid is absent from the user table are not kept.
user_sex_ord_df = user_df[['uid', 'sex']].merge(ord_df, how='inner', on=['uid'])
user_sex_ord_df.head(5)

Unnamed: 0,uid,sex,buy_time,price,qty,cate_id,discount,date
0,26308,1,2016-08-09,4.070415,1,22,0.0,2016-08-09
1,26308,1,2016-10-06,3.462174,1,33,0.0,2016-10-06
2,26308,1,2016-10-13,3.722706,1,36,2.730425,2016-10-13
3,78209,1,2016-10-07,3.070415,2,3,3.150268,2016-10-07
4,78209,1,2016-09-15,2.603179,1,33,0.0,2016-09-15


In [20]:
# Number of order rows per (sex, date) -- a row count, `qty` is not summed --
# flattened into the columns sex / date / sd_ord_cnt.
orders_per_sex_day = user_sex_ord_df.groupby(['sex', 'date']).size()
sex_date_ord_df = orders_per_sex_day.reset_index(name='sd_ord_cnt')
sex_date_ord_df.head(5)

Unnamed: 0,sex,date,sd_ord_cnt
0,1,2016-08-03,103521
1,1,2016-08-04,75694
2,1,2016-08-05,54316
3,1,2016-08-06,51884
4,1,2016-08-07,47507


In [21]:
# Attach the daily order counts to the feature table. The left join keeps
# every (sex, date) row; rows without orders become NaN and are counted as 0.
sex_date_fea_df = sex_date_fea_df.merge(sex_date_ord_df, how='left', on=['sex', 'date'])
sex_date_fea_df['sd_ord_cnt'] = sex_date_fea_df['sd_ord_cnt'].fillna(0)

sex_date_fea_df.head(5)

Unnamed: 0,sex,date,sd_clk_cnt,sd_clk_cnt_3d,sd_clk_cnt_7d,sd_clk_cnt_14d,sd_clk_cnt_21d,sd_clk_cnt_30d,sd_clk_cnt_60d,sd_clk_cnt_90d,sd_ord_cnt
0,1,2016-08-03,134970,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,103521
1,1,2016-08-04,107731,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,75694
2,1,2016-08-05,95511,338212.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,54316
3,1,2016-08-06,92332,295574.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,51884
4,1,2016-08-07,85006,272849.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,47507


In [22]:
# Trailing order totals per sex over several window lengths.
#
# rolling(N) counts rows, not days; it equals an N-day window only because the
# frame holds one row per (sex, calendar day) in date order. Rows with fewer
# than N days of history have no full window and get the sentinel -1.
#
# transform() is used instead of apply(): it always returns a Series aligned
# with the original rows. groupby.apply can prepend the group key to the index
# on recent pandas, and such a result cannot be assigned back as a column.
gc = sex_date_fea_df.groupby('sex').sd_ord_cnt

for window in (3, 7, 14, 21, 30, 60, 90):
    sex_date_fea_df['sd_ord_cnt_%dd' % window] = gc.transform(
        lambda daily, window=window: daily.rolling(window).sum()
    ).fillna(value=-1)

In [23]:
# Inspect the feature table after adding the rolling order columns.
sex_date_fea_df.head(5)

Unnamed: 0,sex,date,sd_clk_cnt,sd_clk_cnt_3d,sd_clk_cnt_7d,sd_clk_cnt_14d,sd_clk_cnt_21d,sd_clk_cnt_30d,sd_clk_cnt_60d,sd_clk_cnt_90d,sd_ord_cnt,sd_ord_cnt_3d,sd_ord_cnt_7d,sd_ord_cnt_14d,sd_ord_cnt_21d,sd_ord_cnt_30d,sd_ord_cnt_60d,sd_ord_cnt_90d
0,1,2016-08-03,134970,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,103521,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,1,2016-08-04,107731,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,75694,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,1,2016-08-05,95511,338212.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,54316,233531.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,1,2016-08-06,92332,295574.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,51884,181894.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,1,2016-08-07,85006,272849.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,47507,153707.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


## 生成点击率特征数据(订单数 / 点击数,即点击到下单的转化率)

In [24]:
# Orders-per-click ratio for the daily counts and for every rolling window.
# The +0.1 / +0.5 offsets keep the ratio defined when a count is 0.
# Rows where a rolling count is the -1 sentinel produce a meaningless ratio
# here; they need to be reset to -1 afterwards.
for suffix in ('', '_3d', '_7d', '_14d', '_21d', '_30d', '_60d', '_90d'):
    orders = sex_date_fea_df['sd_ord_cnt' + suffix]
    clicks = sex_date_fea_df['sd_clk_cnt' + suffix]
    sex_date_fea_df['sd_ctr' + suffix] = (orders + 0.1) / (clicks + 0.5)

In [26]:
# Reset the ratio to the -1 sentinel wherever either of its inputs is negative,
# i.e. is itself the -1 "not enough history" sentinel. Done with one boolean
# mask per column instead of a Python-level pass over every row; the resulting
# values are the same.
for suffix in ('', '_3d', '_7d', '_14d', '_21d', '_30d', '_60d', '_90d'):
    lacks_history = (
        (sex_date_fea_df['sd_ord_cnt' + suffix] < 0)
        | (sex_date_fea_df['sd_clk_cnt' + suffix] < 0)
    )
    sex_date_fea_df['sd_ctr' + suffix] = sex_date_fea_df['sd_ctr' + suffix].mask(lacks_history, -1)

## 生成贷款特征数据

In [29]:
# Day part of the loan timestamp ('YYYY-MM-DD HH:MM:SS' -> 'YYYY-MM-DD').
# Same vectorised .str extraction that the calendar cell applies to loan_time,
# instead of a per-row Python lambda; a missing timestamp stays NaN rather
# than raising AttributeError.
loan_df['date'] = loan_df['loan_time'].str.split(' ').str[0]

In [30]:
# Tag every loan row with the borrower's sex. Inner join on uid, so loans whose
# uid is absent from the user table are not kept.
user_sex_loan_df = user_df[['uid', 'sex']].merge(loan_df, how='inner', on=['uid'])
user_sex_loan_df.head(5)

Unnamed: 0,uid,sex,loan_time,loan_amount,plannum,real_loan_amount,date
0,26308,1,2016-09-04 22:36:57,4.544373,1,1500.0,2016-09-04
1,26308,1,2016-10-18 09:09:12,4.723017,1,2000.0,2016-10-18
2,26308,1,2016-10-05 17:08:05,4.292651,1,1000.0,2016-10-05
3,26308,1,2016-10-25 21:32:56,4.723017,3,2000.0,2016-10-25
4,26308,1,2016-10-31 15:38:07,4.292651,1,1000.0,2016-10-31


In [31]:
# Total real_loan_amount per (sex, date), flattened into the columns
# sex / date / sd_loan.
loan_per_sex_day = user_sex_loan_df.groupby(['sex', 'date'])['real_loan_amount'].sum()
sex_date_loan_df = loan_per_sex_day.reset_index(name='sd_loan')

In [32]:
# Preview the daily loan totals per sex.
sex_date_loan_df.head(5)

Unnamed: 0,sex,date,sd_loan
0,1,2016-08-03,8580100.0
1,1,2016-08-04,7217800.0
2,1,2016-08-05,7415800.0
3,1,2016-08-06,7471300.0
4,1,2016-08-07,5679300.0


In [None]:
# Attach the daily loan totals; (sex, date) rows without any loan count as 0.
sex_date_fea_df = pd.merge(sex_date_fea_df, sex_date_loan_df, on=['sex', 'date'], how='left')
sex_date_fea_df['sd_loan'] = sex_date_fea_df['sd_loan'].fillna(value=0)

gc = sex_date_fea_df.groupby('sex').sd_loan

# Trailing loan totals per sex, normalised with to_norm_loan (from fea_utils).
#
# * transform() instead of apply(): the result is always aligned with the
#   original rows, so it can be assigned back as a column (groupby.apply can
#   prepend the group key to the index on recent pandas).
# * The normaliser is applied only to real window sums (na_action='ignore');
#   rows without a full window are filled with the -1 sentinel afterwards.
#   Filling first would push the sentinel itself through to_norm_loan.
#   NOTE(review): to_norm_loan is not visible here -- if it takes a logarithm,
#   a -1 input could also fail or yield -inf; confirm.
for window in (3, 7, 14, 21, 30, 60, 90):
    rolled_loan = gc.transform(lambda daily, window=window: daily.rolling(window).sum())
    sex_date_fea_df['sd_loan_norm_%dd' % window] = (
        rolled_loan.map(to_norm_loan, na_action='ignore').fillna(value=-1)
    )
