# Feature Engineering 7
***
我们这回使用一个更加精巧的方法：按 card_id 分别聚合历史交易（his）与新商户交易（new），构造授权率、金额、时间等统计特征，再合并到 train / test 中。

In [18]:
%load_ext autoreload
%autoreload 2

import gc
import sys
import time
import pandas as pd
import numpy as np
from datetime import datetime
from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

sys.path.append('/root/code/elo/Tools/')
from tools import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the four input CSV files: the two transaction tables and the train/test tables.
new = pd.read_csv(filepath_or_buffer='/root/data/new_merchant_transactions.csv')
his = pd.read_csv(filepath_or_buffer='/root/data/historical_transactions.csv')
train = pd.read_csv(filepath_or_buffer='/root/data/train.csv')
test = pd.read_csv(filepath_or_buffer='/root/data/test.csv')

In [3]:
def cal_mode(serise):
    """Return the most frequent value of a pandas Series.

    Used in this notebook as a custom groupby aggregator.

    Parameters
    ----------
    serise : pandas.Series
        The values to summarise.

    Returns
    -------
    scalar
        The first value reported by ``serise.mode()``, or ``numpy.nan`` when
        the series has no mode (empty input, or every value missing).
    """
    modes = serise.mode()
    # mode() ignores missing values, so an empty or all-NaN group produces an
    # empty result; indexing that with [0] would raise instead of yielding NaN.
    if modes.empty:
        return np.nan
    # Positional access: independent of how the result happens to be indexed.
    return modes.iloc[0]

In [5]:
# Parse the purchase timestamps of his and new, then add the feature
# days_to_start: the number of whole days between each transaction and the
# earliest purchase found in his.
his['purchase_date'] = pd.to_datetime(his['purchase_date'])
new['purchase_date'] = pd.to_datetime(new['purchase_date'])

startime = his['purchase_date'].min()

his['days_to_start'] = his['purchase_date'].sub(startime).dt.days
new['days_to_start'] = new['purchase_date'].sub(startime).dt.days


In [6]:
# authorized_flag
#
# Replace missing categorical values with explicit placeholder levels, encode
# authorized_flag as 0/1 in both transaction tables, and build the per-card
# authorization statistics from his.

# The filled column is assigned back to the frame instead of calling
# `frame.column.fillna(..., inplace=True)`: that form modifies an intermediate
# Series, which is not guaranteed to update the parent frame (under pandas
# Copy-on-Write it never does).
his['category_2'] = his['category_2'].fillna(0)
his['category_3'] = his['category_3'].fillna('D')
his['merchant_id'] = his['merchant_id'].fillna('NaN')

new['category_2'] = new['category_2'].fillna(0)
new['category_3'] = new['category_3'].fillna('D')
new['merchant_id'] = new['merchant_id'].fillna('NaN')

# 0 and 1 map to themselves so that re-running this cell keeps the encoded
# column intact instead of turning every value into NaN.
new['authorized_flag'] = new['authorized_flag'].map({'N': 0, 'Y': 1, 0: 0, 1: 1})
his['authorized_flag'] = his['authorized_flag'].map({'N': 0, 'Y': 1, 0: 0, 1: 1})

# Per card: number of transactions, share authorized, and number authorized.
agg = {
    'authorized_flag': ['count', 'mean', 'sum']
}
auth_flag = his.groupby('card_id').agg(agg)
# Flatten the (column, aggregation) MultiIndex into "<column>_<aggregation>".
auth_flag.columns = ['_'.join(col).strip() for col in auth_flag.columns.values]
auth_flag = auth_flag.reset_index()


In [21]:
# Authorization rate and transaction count per (card, month_lag).
agg = {'authorized_flag': ['mean', 'count']}
flag_month = his.groupby(['card_id', 'month_lag']).agg(agg)
flag_month.columns = ['_'.join(col).strip() for col in flag_month.columns.values]
flag_month.reset_index(inplace=True)

col_idx = ['authorized_flag_mean', 'authorized_flag_count']
lag_by_card = flag_month.groupby('card_id').month_lag

# For each card: transaction count and authorization rate in its last month
# (largest month_lag, i.e. the month closest to the evaluation date).
# The built-in idxmax replaces a Python-level lambda per group, and keeping
# card_id as the index lets the features be attached by key further down.
final = flag_month.loc[lag_by_card.idxmax().values, ['card_id'] + col_idx].set_index('card_id')

# For each card: transaction count and authorization rate in its first month
# (smallest month_lag).
start = flag_month.loc[lag_by_card.idxmin().values, ['card_id'] + col_idx].set_index('card_id')

# Both operands are indexed by card_id, so the arithmetic aligns per card.
count_ratio = final.authorized_flag_count / start.authorized_flag_count
mean_diff = final.authorized_flag_mean - start.authorized_flag_mean

# Attach by card_id rather than by row position, so the result does not depend
# on auth_flag and flag_month happening to share the same row order.
auth_flag['final_start_authorized_flag_count_percent'] = auth_flag.card_id.map(count_ratio)
auth_flag['final_start_start_authorized_flag_mean_minues'] = auth_flag.card_id.map(mean_diff)
del col_idx, lag_by_card, flag_month, final, start, count_ratio, mean_diff; gc.collect()

17597

In [8]:
def agg_dataframe(df, reference_date=None):
    """Aggregate a transaction table into one row of features per ``card_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction table. The code reads the columns card_id, category_1
        ('N'/'Y'), category_2, category_3, purchase_amount, purchase_date
        (datetime), month_lag, days_to_start, installments, city_id,
        state_id, subsector_id, merchant_id and merchant_category_id.
        The frame passed in is not modified.
    reference_date : datetime-like, optional
        Date from which ``month_diff`` is measured. Defaults to
        ``datetime.today()``, which makes the month_diff features depend on
        the day the notebook is run; pass a fixed date for reproducible output.

    Returns
    -------
    pandas.DataFrame
        One row per card_id, with a ``card_id`` column followed by feature
        columns named ``<column>_<aggregation>``.
    """
    if reference_date is None:
        reference_date = datetime.today()
    reference_date = pd.Timestamp(reference_date)

    # get_dummies returns a new frame, so doing it first means every column
    # written below lands on that new frame and never on the caller's table.
    # (Overwriting category_1 on the caller's frame would make a second call
    # map already-encoded values to NaN.)
    df = pd.get_dummies(df, columns=['category_2', 'category_3'])
    df['category_1'] = df['category_1'].map({'N': 0, 'Y': 1})
    # NOTE(review): the 0.75 offset presumably keeps the log argument positive;
    # this assumes purchase_amount > -0.75 -- confirm against the data.
    df['log_amount'] = np.log(df['purchase_amount'] + 0.75)

    # Whole 30-day periods between the purchase and the reference date,
    # shifted by the transaction's month_lag.
    df['month_diff'] = (reference_date - df['purchase_date']).dt.days // 30
    df['month_diff'] += df['month_lag']
    # Vectorised datetime accessors instead of a Python call per row.
    df['dayofweek'] = df['purchase_date'].dt.dayofweek
    df['month'] = df['purchase_date'].dt.month

    agg = {
        'city_id': ['nunique', cal_mode],
        'category_1': ['mean', 'count'],
        'merchant_category_id': ['nunique', cal_mode],
        'merchant_id': ['nunique', cal_mode],
        'state_id': ['nunique', cal_mode],
        'subsector_id': ['nunique', cal_mode],
        'installments': ['mean', 'max', 'min', 'std'],
        'category_2_0.0': ['mean'],
        'category_2_1.0': ['mean'],
        'category_2_2.0': ['mean'],
        'category_2_3.0': ['mean'],
        'category_2_4.0': ['mean'],
        'category_2_5.0': ['mean'],
        'category_3_A': ['mean'],
        'category_3_B': ['mean'],
        'category_3_C': ['mean'],
        'category_3_D': ['mean'],
        'month_lag': ['min', 'max', np.ptp, 'std'],
        'days_to_start': [np.ptp, 'min', 'max'],
        'purchase_amount': ['sum', 'mean', 'max', 'min', 'std'],
        'log_amount': ['mean', 'max', 'min', 'std'],
        'month_diff': ['mean', 'max', 'min', 'std'],
        'dayofweek': [cal_mode, 'nunique', 'std'],
        'month': ['std']
    }

    # A category level that never occurs in this table produces no dummy
    # column; add it as all zeros so the aggregation does not fail on a
    # missing column and the output schema stays the same for every table.
    for dummy_col in agg:
        if dummy_col.startswith(('category_2_', 'category_3_')) and dummy_col not in df.columns:
            df[dummy_col] = 0

    agg_df = df.groupby(['card_id']).agg(agg)
    # Flatten the (column, aggregation) MultiIndex into "<column>_<aggregation>".
    agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns.values]
    agg_df.reset_index(inplace=True)

    return agg_df

In [9]:
# Build the per-card aggregates for each transaction table and prefix every
# feature column (everything except the card_id key) with its source table.
agg_his = agg_dataframe(his).rename(columns=lambda c: c if c == 'card_id' else 'his_' + c)
agg_new = agg_dataframe(new).rename(columns=lambda c: c if c == 'card_id' else 'new_' + c)

In [22]:
# Left-join the three per-card feature tables onto train and test, so every
# original row is kept even when a card has no matching features.
train = (
    train
    .merge(auth_flag, on='card_id', how='left')
    .merge(agg_his, on='card_id', how='left')
    .merge(agg_new, on='card_id', how='left')
)

test = (
    test
    .merge(auth_flag, on='card_id', how='left')
    .merge(agg_his, on='card_id', how='left')
    .merge(agg_new, on='card_id', how='left')
)

In [23]:
# Express first_active_month as a number of days relative to startime
# (the earliest purchase date in his), overwriting the original column.
train['first_active_month'] = pd.to_datetime(train['first_active_month']).sub(startime).dt.days
test['first_active_month'] = pd.to_datetime(test['first_active_month']).sub(startime).dt.days

In [24]:
# Replace every remaining missing value (e.g. features of cards without a
# match in one of the joined tables) with 0.
train = train.fillna(0)
test = test.fillna(0)

In [25]:
# Integer-encode the two merchant-id mode features. One encoder per column is
# fitted on the values of train and test together, so both frames share the
# same codes and no test value is unseen at transform time.
str_label = [
    'his_merchant_id_cal_mode',
    'new_merchant_id_cal_mode'
]
for i in str_label:
    le = LabelEncoder()
    # pd.concat replaces Series.append, which no longer exists in pandas 2.0+.
    # NOTE(review): values are handed over as plain lists, as before; these
    # columns can hold the integer 0 written by the earlier fillna(0) next to
    # merchant-id strings, and the list form presumably lets the encoder see
    # them as one uniform type -- confirm before changing to arrays/Series.
    unique = pd.concat([train[i], test[i]]).unique().tolist()
    le.fit(unique)
    train[i] = le.transform(train[i].tolist())
    test[i] = le.transform(test[i].tolist())

In [26]:
# Write the final feature tables to disk, without the row index.
train.to_csv(path_or_buf='/root/tempfile/train_final121107.csv', index=False)
test.to_csv(path_or_buf='/root/tempfile/test_final121107.csv', index=False)