# Feature Engineering 4
***
在这个 FE 中，我希望对所有分类特征做 one-hot 编码，并按 card_id 取均值，得到每张卡在各类别上的交易占比。

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import time
import pandas as pd
import numpy as np
from datetime import datetime
from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

sys.path.append('/root/code/elo/Tools/')
from tools import *

In [2]:
# Card-level tables: the training set and the test set.
train = pd.read_csv('/root/data/train.csv')
test = pd.read_csv('/root/data/test.csv')

# Transaction-level tables: historical and new-merchant transactions.
his = pd.read_csv('/root/data/historical_transactions.csv')
new = pd.read_csv('/root/data/new_merchant_transactions.csv')

In [3]:
# Parse the transaction timestamps of `his` and `new`, then derive
# `days_to_start`: the whole number of days between each purchase and the
# earliest purchase found in the historical table.
for frame in (his, new):
    frame['purchase_date'] = pd.to_datetime(frame['purchase_date'])

startime = his['purchase_date'].min()

for frame in (his, new):
    frame['days_to_start'] = (frame['purchase_date'] - startime).dt.days

In [4]:
# Preview the first five historical transactions.
his.head(n=5)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,days_to_start
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37,175
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16,195
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37,220
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34,244
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37,68


In [5]:
# Categorical transaction columns that are one-hot encoded later on.
cate_col = ['category_3', 'category_2', 'state_id', 'subsector_id']


In [6]:
# Fill missing categorical values with explicit placeholder levels.
# The filled column is assigned back to the frame: `frame.col.fillna(...,
# inplace=True)` works on a temporary Series and does not reliably update the
# frame (it is a silent no-op under pandas Copy-on-Write).
for frame in (his, new):
    frame['category_2'] = frame['category_2'].fillna(0)
    frame['category_3'] = frame['category_3'].fillna('D')
    frame['merchant_id'] = frame['merchant_id'].fillna('NaN')

# Encode authorized_flag as 1 ('Y') / 0 ('N') in both transaction tables.
new['authorized_flag'] = new['authorized_flag'].map({'N': 0, 'Y': 1})
his['authorized_flag'] = his['authorized_flag'].map({'N': 0, 'Y': 1})

# Split the historical transactions by authorization outcome.
auth = his[his['authorized_flag'] == 1]
unauth = his[his['authorized_flag'] == 0]

# Per-card statistics of the 0/1 flag over all historical transactions:
# count (number of transactions), mean (share with flag 1), sum (number with flag 1).
agg = {
    'authorized_flag': ['count', 'mean', 'sum']
}
auth_flag = his.groupby('card_id').agg(agg)
# Flatten the (column, statistic) MultiIndex into names like authorized_flag_mean.
auth_flag.columns = ['_'.join(col).strip() for col in auth_flag.columns.values]
auth_flag = auth_flag.reset_index()


In [7]:
# Per-card, per-month statistics of the 0/1 authorized flag: its mean and the
# number of transactions (count) for every (card_id, month_lag) pair.
agg = {'authorized_flag': ['mean', 'count']}
flag_month = his.groupby(['card_id', 'month_lag']).agg(agg)
flag_month.columns = ['_'.join(col).strip() for col in flag_month.columns.values]
flag_month = flag_month.reset_index()

col_idx = ['authorized_flag_mean', 'authorized_flag_count']
per_card = flag_month.groupby('card_id')

# For each card: flag mean and transaction count in its latest month
# (largest month_lag). The merge result is assigned back to auth_flag --
# previously it was discarded, so these features never reached train/test --
# and it is keyed on card_id instead of relying on positional index alignment.
row_idx = per_card['month_lag'].idxmax().values
final = flag_month.loc[row_idx, ['card_id'] + col_idx].reset_index(drop=True)
final.columns = ['card_id', 'final_authorized_flag_mean', 'final_authorized_flag_count']
auth_flag = pd.merge(auth_flag, final, on='card_id', how='left')

# For each card: flag mean and transaction count in its earliest month
# (smallest month_lag). Merges `start` (the original merged `final` again).
row_idx = per_card['month_lag'].idxmin().values
start = flag_month.loc[row_idx, ['card_id'] + col_idx].reset_index(drop=True)
start.columns = ['card_id', 'start_authorized_flag_mean', 'start_authorized_flag_count']
auth_flag = pd.merge(auth_flag, start, on='card_id', how='left')

# For each card: the largest and the smallest monthly transaction count,
# joined on card_id rather than assigned positionally.
month_count = per_card['authorized_flag_count'].agg(['max', 'min']).reset_index()
month_count.columns = ['card_id', 'max_authorized_flag_per_month', 'min_authorized_flag_per_month']
auth_flag = pd.merge(auth_flag, month_count, on='card_id', how='left')

# Release the large intermediates; `per_card` holds a reference to flag_month,
# so it has to go as well for the memory to be freed.
del per_card, month_count, flag_month, his

In [9]:
def cal_mode(serise):
    """Return the most frequent value of a pandas Series.

    When several values are tied, the first entry of ``Series.mode()`` is
    returned. ``mode()`` ignores missing values, so an empty or all-NaN Series
    has no mode; NaN is returned in that case instead of raising.

    Parameters
    ----------
    serise : pandas.Series
        Values to summarise.

    Returns
    -------
    The modal value, or ``numpy.nan`` if the Series has no non-missing values.
    """
    modes = serise.mode()
    if modes.empty:
        return np.nan
    return modes.iloc[0]


def agg_dataframe(df, ref_date=None):
    """Aggregate a transaction table into one feature row per card.

    The input frame is not modified: the derived columns are built on a copy
    of the columns that are needed, so the function can be called repeatedly
    on the same frame (mapping ``category_1`` in place would turn it into NaN
    on a second call).

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions with the columns ``card_id``, ``category_1`` ('N'/'Y'),
        ``city_id``, ``merchant_category_id``, ``merchant_id``,
        ``installments``, ``month_lag``, ``days_to_start``,
        ``purchase_amount`` and ``purchase_date`` (datetime).
    ref_date : datetime-like, optional
        Reference date for ``month_diff``. Defaults to ``datetime.today()``,
        which makes that feature depend on the day the notebook is run; pass
        a fixed date for reproducible output.

    Returns
    -------
    pandas.DataFrame
        One row per ``card_id`` with columns named ``<column>_<statistic>``.
    """
    if ref_date is None:
        ref_date = datetime.today()

    source_cols = [
        'card_id', 'category_1', 'city_id', 'merchant_category_id',
        'merchant_id', 'installments', 'month_lag', 'days_to_start',
        'purchase_amount', 'purchase_date',
    ]
    work = df[source_cols].copy()

    work['category_1'] = work['category_1'].map({'N': 0, 'Y': 1})
    # Offset of 0.75 before the log, as in the original feature definition.
    work['log_amount'] = np.log(work['purchase_amount'] + 0.75)
    # Elapsed 30-day periods between the purchase and ref_date, plus month_lag.
    work['month_diff'] = (ref_date - work['purchase_date']).dt.days // 30
    work['month_diff'] += work['month_lag']
    # Vectorised datetime accessors instead of a Python-level apply per row.
    work['dayofweek'] = work['purchase_date'].dt.dayofweek
    work['month'] = work['purchase_date'].dt.month

    agg = {
        'category_1': ['mean', 'count'],
        'city_id': ['nunique', cal_mode],
        'merchant_category_id': ['nunique', cal_mode],
        'merchant_id': ['nunique', cal_mode],
        'installments': ['sum', 'mean', 'max', 'min', 'std', np.median],
        'month_lag': ['min', 'max', np.ptp, 'std'],
        'days_to_start': [np.ptp, 'min', 'max', 'std', np.median],
        'purchase_amount': ['sum', 'mean', 'max', 'min', 'std', np.median],
        'log_amount': ['sum', 'mean', 'max', 'min', 'std', np.median],
        'month_diff': ['mean'],
        'dayofweek': [cal_mode, 'std'],
        'month': ['std']
    }
    agg_df = work.groupby(['card_id']).agg(agg)
    # Flatten the (column, statistic) MultiIndex into single-level names.
    agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]
    agg_df = agg_df.reset_index()

    return agg_df

In [10]:
# Aggregate each transaction subset per card and prefix the feature columns
# with the subset name; the card_id join key keeps its name.
agg_auth = agg_dataframe(auth).rename(
    columns=lambda c: c if c == 'card_id' else 'auth_' + c)
agg_unauth = agg_dataframe(unauth).rename(
    columns=lambda c: c if c == 'card_id' else 'unauth_' + c)
agg_new = agg_dataframe(new).rename(
    columns=lambda c: c if c == 'card_id' else 'new_' + c)

In [11]:
def cate(df, cate_col):
    """Per-card share of transactions falling into each level of the given columns.

    Every column in ``cate_col`` is one-hot encoded and the indicators are
    averaged per ``card_id``, producing one ``<column>_<level>`` column per
    level. The result has one row per distinct ``card_id`` of ``df``, in order
    of first appearance.
    """
    shares = pd.DataFrame({'card_id': df.card_id.unique()})
    for column in cate_col:
        indicators = pd.get_dummies(df[column]).astype('bool')
        # Mean of a boolean indicator per card = fraction of that card's rows.
        level_share = indicators.groupby(df['card_id']).mean()
        level_share = level_share.add_prefix(column + '_').reset_index()
        shares = pd.merge(shares, level_share, on='card_id', how='left')
    return shares

In [12]:
# One-hot share features for each transaction subset, prefixed with the
# subset name; the card_id join key keeps its name.
cate_auth = cate(auth, cate_col).rename(
    columns=lambda c: c if c == 'card_id' else 'auth_' + c)
cate_unauth = cate(unauth, cate_col).rename(
    columns=lambda c: c if c == 'card_id' else 'unauth_' + c)
cate_new = cate(new, cate_col).rename(
    columns=lambda c: c if c == 'card_id' else 'new_' + c)

In [13]:
def per_month(df):
    """Summarise each card's monthly transaction statistics.

    Step 1 aggregates ``purchase_amount``, ``installments`` and ``log_amount``
    per (card_id, month_lag). Step 2 aggregates every column of that monthly
    table (``month_lag`` included) per card with mean/std/max/min/median.
    Column names are joined with ``'._'`` at both levels.

    Side effect: adds a ``log_amount`` column to ``df``.
    """
    df['log_amount'] = np.log(df['purchase_amount'] + 0.75)

    base_stats = ['sum', 'mean', 'max', 'min', 'std', np.median]
    monthly_spec = {
        'purchase_amount': ['count'] + base_stats,
        'installments': ['count'] + base_stats,
        'log_amount': list(base_stats),
    }
    monthly = df.groupby(['card_id', 'month_lag']).agg(monthly_spec)
    monthly.columns = ['._'.join(parts).strip() for parts in monthly.columns.values]
    monthly = monthly.reset_index()

    card_stats = ['mean', 'std', 'max', 'min', np.median]
    summary = monthly.groupby('card_id').agg(card_stats)
    summary.columns = ['._'.join(parts).strip() for parts in summary.columns.values]
    return summary.reset_index()

In [14]:
# Monthly summary features of the authorized transactions, prefixed with
# 'final_auth_'; the card_id join key keeps its name.
auth_final = per_month(auth).rename(
    columns=lambda c: c if c == 'card_id' else 'final_auth_' + c)

In [15]:
def successive_aggregates(df, field1, field2):
    """Two-stage aggregation of ``field2`` per card.

    First the mean of ``field2`` is taken for every (card_id, field1) group;
    then those group means are summarised per card with mean/min/max/std/median.
    The output columns are named ``<field1>_<field2>_<statistic>`` and
    ``card_id`` is returned as a regular column.
    """
    group_means = df.groupby(['card_id', field1])[field2].mean().reset_index()
    per_card = group_means.groupby('card_id')[field2].agg(
        ['mean', 'min', 'max', 'std', np.median])
    per_card.columns = [field1 + '_' + field2 + '_' + stat for stat in per_card.columns.values]
    return per_card.reset_index()

In [16]:
# Two-stage aggregates on the new-merchant transactions, one block of columns
# per (grouping field, averaged field) pair, joined on card_id.
field_pairs = [
    ('category_1', 'purchase_amount'),
    ('installments', 'purchase_amount'),
    ('city_id', 'purchase_amount'),
    ('category_1', 'installments'),
]
additional_fields = None
for field1, field2 in field_pairs:
    block = successive_aggregates(new, field1, field2)
    if additional_fields is None:
        additional_fields = block
    else:
        additional_fields = additional_fields.merge(block, on='card_id', how='left')

In [18]:
# Left-join every card-level feature table onto the training set, in order.
for feature_table in (auth_flag, agg_auth, agg_unauth, auth_final, agg_new,
                      cate_unauth, cate_auth, cate_new, additional_fields):
    train = train.merge(feature_table, on='card_id', how='left')

In [19]:
# Left-join every card-level feature table onto the test set, in order.
for feature_table in (auth_flag, agg_auth, agg_unauth, auth_final, agg_new,
                      cate_unauth, cate_auth, cate_new, additional_fields):
    test = test.merge(feature_table, on='card_id', how='left')

In [20]:
# List the columns that still have object dtype and need encoding.
for name, dtype in train.dtypes.items():
    if dtype == object:
        print(name)

first_active_month
card_id
auth_merchant_id_cal_mode
unauth_merchant_id_cal_mode
new_merchant_id_cal_mode


In [21]:
# Express first_active_month as a number of days relative to `startime`.
for frame in (train, test):
    frame['first_active_month'] = (
        pd.to_datetime(frame['first_active_month']) - startime
    ).dt.days

In [22]:
# String-valued features (most frequent merchant id per card) to label-encode.
str_label = [
    'auth_merchant_id_cal_mode',
    'unauth_merchant_id_cal_mode',
    'new_merchant_id_cal_mode'
]
for col in str_label:
    # Cards that got no match in the left merges have a missing value here.
    # Make it the explicit string 'nan' so every value handed to the encoder
    # is a string ('nan' stays distinct from the 'NaN' merchant_id filler).
    train_values = train[col].fillna('nan').astype(str)
    test_values = test[col].fillna('nan').astype(str)

    # Fit on train and test together so both share one label space.
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values]).unique().tolist())
    train[col] = le.transform(train_values.tolist())
    test[col] = le.transform(test_values.tolist())

In [25]:
# Keep only training rows whose target is above -30
# (presumably drops the outlier targets near -33, per the variable name -- confirm).
train_drop33 = train.loc[train['target'] > -30]

In [28]:
# Write the final feature tables to disk without the index column.
train_drop33.to_csv(path_or_buf='/root/tempfile/train_final121701.csv', index=False)
test.to_csv(path_or_buf='/root/tempfile/test_final121701.csv', index=False)