In [1]:
import os
import pickle
import gc
import pandas as pd
import numpy as np
from tqdm import tqdm
from utils import (
    load_pickle, dump_pickle, get_nominal_dfal, feats_root, mem_usage, reduce_mem_usage,
    nominal_cate_cols, ordinal_cate_cols, identity_cols, continual_cols, 
)

# Raise pandas' display limit to 1000 columns for DataFrames rendered in this notebook.
pd.set_option('display.max_columns', 1000)

In [2]:
def gen_target_agg_features(data, last_da, win_das, col):
    """Attach windowed ``is_trade`` statistics of ``col`` to the rows of one day.

    Rows whose ``da`` lies in ``[last_da - win_das, last_da)`` are grouped by
    ``col`` and the mean, sum, count, variance, standard error and kurtosis of
    ``is_trade`` are computed per group.  Those statistics are then written
    onto the rows where ``da == last_da`` by looking up each row's ``col``
    value; rows of other days are not assigned by this call.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``da``, ``is_trade`` and ``col``.  It is
        copied first, so the caller's frame is never modified.
    last_da : scalar
        Value of ``da`` whose rows receive the features.
    win_das : scalar
        Window length, subtracted from ``last_da`` to get the window start.
    col : str
        Name of the column to group by.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with six columns named
        ``agg_target_<stat>_<col>_wd_<win_das>`` for ``<stat>`` in
        ``mean, sum, count, var, sem, kurt``.  Values of ``col`` that do not
        occur inside the window map to NaN.
    """
    data = data.copy()

    in_window = (data.da < last_da) & (data.da >= last_da - win_das)
    gp = data.loc[in_window, [col, 'is_trade']].groupby(col)['is_trade']

    # Each group handed to ``apply`` is a Series, so the Series method is the
    # correct callable for kurtosis (the DataFrame method only works on a
    # Series by accident of how older pandas versions defined it).
    # No skew feature is emitted, so skewness is deliberately not computed.
    stats = (
        ('mean', gp.mean()),
        ('sum', gp.sum()),
        ('count', gp.size()),
        ('var', gp.var()),
        ('sem', gp.sem()),
        ('kurt', gp.apply(pd.Series.kurt)),
    )

    on_target_day = data.da == last_da
    keys = data.loc[on_target_day, col]
    for stat_name, per_key in stats:
        feat_name = 'agg_target_{}_{}_wd_{}'.format(stat_name, col, win_das)
        # ``map`` looks every key up in the per-group statistic; the result
        # keeps the row index, so the assignment aligns row by row.
        data.loc[on_target_day, feat_name] = keys.map(per_key)

    return data

In [3]:
def gen_target_aggs(col, updata=False):
    """Build and cache the windowed target-aggregate features for ``col``.

    The features are written to ``<feats_root>/target_aggs_<col>.pkl``.  When
    that file already exists and ``updata`` is falsy, nothing is recomputed.

    Otherwise the ``col``, ``da`` and ``is_trade`` columns of
    ``get_nominal_dfal()`` are loaded and, for every day after the first and
    every window length in ``[1, 2, 3]`` that does not reach back before the
    earliest day, ``gen_target_agg_features`` is applied.  The result is then
    restricted to ``da > 17``, reduced to one row per ``(col, da)`` pair,
    NaN-filled with 0, passed through ``reduce_mem_usage`` and pickled.

    Parameters
    ----------
    col : str
        Column whose target aggregates are generated.
    updata : bool, optional
        Force regeneration even if the cached file exists (the name is kept
        as-is for backward compatibility).

    Returns
    -------
    None
    """
    feat_path = os.path.join(feats_root, 'target_aggs_{}.pkl'.format(col))
    if os.path.exists(feat_path) and not updata:
        print('Found ' + feat_path)
        return

    print('Generating ' + feat_path)
    dfal = get_nominal_dfal()[[col, 'da', 'is_trade']]
    dmin = dfal.da.min()
    # The first day has no history at all, so it is skipped outright.
    for da in sorted(dfal.da.unique())[1:]:
        for win_das in [1, 2, 3]:
            # Skip windows that would start before the earliest available day.
            if da - win_das < dmin:
                continue
            dfal = gen_target_agg_features(dfal, da, win_das, col)

    # Build the final table without in-place mutation: the frame returned by
    # the ``.loc`` filter is a derived object, and mutating it in place can
    # trigger pandas' SettingWithCopyWarning.
    # NOTE(review): 17 is a hard-coded day cutoff -- presumably the first day
    # used downstream; confirm against the training split.
    dfal = (
        dfal.loc[dfal.da > 17, :]
        .drop(['is_trade'], axis=1)
        # Every row of the same (col, da) pair carries identical features,
        # so one representative row per pair is enough for a later merge.
        .drop_duplicates([col, 'da'])
        .fillna(0)
    )
    dfal, _ = reduce_mem_usage(dfal)
    dump_pickle(dfal, feat_path)

In [4]:
def gen_target_features():
    """Run ``gen_target_aggs`` for every column that gets target aggregates.

    The columns are the concatenation of ``nominal_cate_cols``,
    ``ordinal_cate_cols`` and ``identity_cols`` followed by ``'hm'``, ``'mi'``
    and ``'ho'``; progress is reported through ``tqdm``.
    """
    target_cols = nominal_cate_cols + ordinal_cate_cols + identity_cols + ['hm', 'mi', 'ho']
    for target_col in tqdm(target_cols):
        gen_target_aggs(target_col)

In [5]:
def add_target_features(data, col):
    """Left-join the cached target-aggregate features of ``col`` onto ``data``.

    The feature file ``<feats_root>/target_aggs_<col>.pkl`` is generated via
    ``gen_target_aggs`` first if it does not exist yet.  The join keys are
    ``col`` and ``da``; all rows of ``data`` are kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to enrich; must contain ``col`` and ``da``.
    col : str
        Column whose cached aggregates are merged in.

    Returns
    -------
    pandas.DataFrame
        Result of the left merge.
    """
    feat_file = 'target_aggs_{}.pkl'.format(col)
    feat_path = os.path.join(feats_root, feat_file)

    already_cached = os.path.exists(feat_path)
    if not already_cached:
        gen_target_aggs(col)

    target_aggs = load_pickle(feat_path)
    merged = pd.merge(data, target_aggs, how='left', on=[col, 'da'])
    return merged

In [6]:
# When run as the main module (which includes executing this cell in a
# notebook kernel), generate the target-aggregate feature files for every
# configured column. Files that already exist are reported and skipped.
if __name__ == '__main__':
    gen_target_features()

  0%|          | 0/20 [00:00<?, ?it/s]

Generating ./feats/target_aggs_item_category_list.pkl


  5%|▌         | 1/20 [00:05<01:52,  5.90s/it]

Memory usage of dataframe is : 0.014392852783203125  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.004486083984375  MB
This is  31.16883116883117 % of the initial size
Generating ./feats/target_aggs_item_brand_id.pkl


 10%|█         | 2/20 [00:21<03:09, 10.55s/it]

Memory usage of dataframe is : 1.4493703842163086  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.4394865036010742  MB
This is  30.322580645161292 % of the initial size
Generating ./feats/target_aggs_item_city_id.pkl


 15%|█▌        | 3/20 [00:27<02:33,  9.05s/it]

Memory usage of dataframe is : 0.11587715148925781  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.0361175537109375  MB
This is  31.16883116883117 % of the initial size
Generating ./feats/target_aggs_user_gender_id.pkl


 20%|██        | 4/20 [00:32<02:08,  8.06s/it]

Memory usage of dataframe is : 0.00411224365234375  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.00133514404296875  MB
This is  32.467532467532465 % of the initial size
Generating ./feats/target_aggs_user_occupation_id.pkl


 25%|██▌       | 5/20 [00:37<01:53,  7.54s/it]

Memory usage of dataframe is : 0.0051403045654296875  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.0016689300537109375  MB
This is  32.467532467532465 % of the initial size
Generating ./feats/target_aggs_item_price_level.pkl


 30%|███       | 6/20 [00:42<01:40,  7.17s/it]

Memory usage of dataframe is : 0.012042999267578125  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.003753662109375  MB
This is  31.16883116883117 % of the initial size
Generating ./feats/target_aggs_item_sales_level.pkl


 35%|███▌      | 7/20 [00:48<01:30,  6.94s/it]

Memory usage of dataframe is : 0.018505096435546875  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.005527496337890625  MB
This is  29.87012987012987 % of the initial size
Generating ./feats/target_aggs_item_collected_level.pkl


 40%|████      | 8/20 [00:53<01:20,  6.67s/it]

Memory usage of dataframe is : 0.018505096435546875  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.005527496337890625  MB
This is  29.87012987012987 % of the initial size
Generating ./feats/target_aggs_item_pv_level.pkl


 45%|████▌     | 9/20 [00:58<01:11,  6.47s/it]

Memory usage of dataframe is : 0.021001815795898438  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.0062732696533203125  MB
This is  29.87012987012987 % of the initial size
Generating ./feats/target_aggs_user_age_level.pkl


 50%|█████     | 10/20 [01:03<01:03,  6.30s/it]

Memory usage of dataframe is : 0.009252548217773438  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.0028839111328125  MB
This is  31.16883116883117 % of the initial size
Generating ./feats/target_aggs_user_star_level.pkl


 55%|█████▌    | 11/20 [01:07<00:55,  6.16s/it]

Memory usage of dataframe is : 0.01233673095703125  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.00368499755859375  MB
This is  29.87012987012987 % of the initial size
Generating ./feats/target_aggs_context_page_id.pkl


 60%|██████    | 12/20 [01:12<00:48,  6.06s/it]

Memory usage of dataframe is : 0.02056121826171875  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.00667572021484375  MB
This is  32.467532467532465 % of the initial size
Generating ./feats/target_aggs_shop_review_num_level.pkl


 65%|██████▌   | 13/20 [01:17<00:41,  5.96s/it]

Memory usage of dataframe is : 0.024379730224609375  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.007282257080078125  MB
This is  29.87012987012987 % of the initial size
Generating ./feats/target_aggs_shop_star_level.pkl


 70%|███████   | 14/20 [01:22<00:35,  5.88s/it]

Memory usage of dataframe is : 0.0223236083984375  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.0066680908203125  MB
This is  29.87012987012987 % of the initial size
Generating ./feats/target_aggs_item_id.pkl


 75%|███████▌  | 15/20 [02:00<00:40,  8.02s/it]

Memory usage of dataframe is : 5.758457183837891  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  1.6346588134765625  MB
This is  28.387096774193548 % of the initial size
Generating ./feats/target_aggs_shop_id.pkl


 80%|████████  | 16/20 [02:21<00:35,  8.81s/it]

Memory usage of dataframe is : 2.744565010070801  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.7791023254394531  MB
This is  28.387096774193548 % of the initial size
Generating ./feats/target_aggs_user_id.pkl
Memory usage of dataframe is : 32.3357629776001  MB


 85%|████████▌ | 17/20 [07:46<01:22, 27.43s/it]

___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  8.856291770935059  MB
This is  27.388535031847134 % of the initial size
Generating ./feats/target_aggs_hm.pkl


 90%|█████████ | 18/20 [07:59<00:53, 26.65s/it]

Memory usage of dataframe is : 1.4798212051391602  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.41053104400634766  MB
This is  27.741935483870968 % of the initial size
Generating ./feats/target_aggs_mi.pkl


 95%|█████████▌| 19/20 [08:04<00:25, 25.52s/it]

Memory usage of dataframe is : 0.06168365478515625  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.017223358154296875  MB
This is  27.92207792207792 % of the initial size
Generating ./feats/target_aggs_ho.pkl


100%|██████████| 20/20 [08:09<00:00, 24.49s/it]

Memory usage of dataframe is : 0.0246734619140625  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.00720977783203125  MB
This is  29.22077922077922 % of the initial size





In [7]:
# dfal = get_nominal_dfal()

In [8]:
# dfal.shape

In [9]:
# dfal = dfal.loc[dfal.da>20,:]

In [10]:
# for c in tqdm_notebook(['hm']):
#     dfal = add_target_features(dfal, c)

In [11]:
# del dfal['dt']
# for c in dfal.columns:
#     if c.endswith('_wd_6'):
#         del dfal[c]

In [12]:
# dfal.head()

In [13]:
# dfal.groupby(['ho'])['is_trade'].mad()

In [14]:
# dfal, _ = reduce_mem_usage(dfal)

In [15]:
# dfal.columns.values

In [16]:
# X_tr = dfal.loc[dfal.da<=22,:].drop(['da', 'hm', 'instance_id', 'is_trade'] + identity_cols, axis=1)
# y_tr = dfal.loc[dfal.da<=22,'is_trade']
# X_va = dfal.loc[dfal.da==23,:].drop(['da', 'hm', 'instance_id', 'is_trade'] + identity_cols, axis=1)
# y_va = dfal.loc[dfal.da==23,'is_trade']

In [17]:
# %matplotlib inline
# import matplotlib.pyplot as plt
# import catboost as cb
# import xgboost as xg
# import lightgbm as lg

In [18]:
# def print_feature_importance_lgb(gbm):
#     print(80 * '*')
#     print(31 * '*' + 'Feature Importance' + 31 * '*')
#     print(80 * '.')
#     print("\n".join((".%50s => %9.5f" % x) for x in sorted(
#         zip(gbm.feature_name(), gbm.feature_importance("gain")),
#         key=lambda x: x[1],
#         reverse=True)))
#     print(80 * '.')

# def fit_lgb(X_tr, y_tr, X_va, y_va, cates_cols):
#     params = {
#         'max_depth': 8,
#         'num_leaves': 128,
#         'objective':'binary',
#         'min_data_in_leaf': 20,
#         'learning_rate': 0.01,
#         'feature_fraction': 0.9,
#         'bagging_fraction': 0.8,
#         'subsample':0.85,
#         'bagging_freq': 1,
#         'random_state':2018,
#         'metric': ['binary_logloss'],
#         'num_threads': 16,
#         #'is_unbalance': True
#     }

#     MAX_ROUNDS = 10000
#     dtr = lg.Dataset(X_tr, label=y_tr, categorical_feature=cates_cols)
#     dva = lg.Dataset(X_va, label=y_va, categorical_feature=cates_cols, reference=dtr)
    
#     cls = lg.train(
#         params,
#         dtr,
#         num_boost_round=MAX_ROUNDS,
#         valid_sets=(dva, dtr),
#         valid_names=['valid', 'train'],
#         early_stopping_rounds=125,
#         verbose_eval=50)
#     print_feature_importance_lgb(cls)
#     lg.plot_importance(cls, importance_type='gain', figsize=(11,12), max_num_features=50, grid=False)
#     return cls

In [19]:
# gbm = fit_lgb(X_tr, y_tr, X_va, y_va, nominal_cate_cols)

## CatBoostClassifier

In [20]:
# cates_idx = [X_tr.columns.values.tolist().index(c) for c in nominal_cate_cols]

In [21]:
# import operator
# def verbose_feature_importance_cat(cls, X_tr):
#     cat_feature_importance = {
#         X_tr.columns.values.tolist()[idx]: score
#         for idx, score in enumerate(cls.feature_importances_)
#     }
    
#     cat_feature_importance = sorted(cat_feature_importance.items(), 
#                                     key=operator.itemgetter(1), 
#                                     reverse=False)
    
#     print(80 * '*')
#     print(31 * '*' + 'Feature Importance' + 31 * '*')
#     print(80 * '.')
#     for feature, score in reversed(cat_feature_importance):
#         print(".%50s => %9.5f" % (feature, score))
#     print(80 * '.')
    
#     feature_score = pd.DataFrame(cat_feature_importance, columns=['Feature','Score'])
    
#     plt.rcParams["figure.figsize"] = (11, 12)
#     ax = feature_score.tail(50).plot('Feature', 'Score', kind='barh', color='b')
#     ax.set_title("Catboost Feature Importance Ranking", fontsize=8)
#     ax.set_xlabel('')
#     rects = ax.patches
#     # get feature score as labels round to 2 decimal
#     labels = feature_score.tail(50)['Score'].round(2)
#     for rect, label in zip(rects, labels):
#         width = rect.get_width()
#         ax.text(width + 0.2,rect.get_y()+0.02, label, ha='center', va='bottom')
#     plt.show()


# def fit_cat(X_tr, y_tr, X_va, y_va, cates_idx):
#     print('Fitting CatBoostClassifier ...')
#     cls = cb.CatBoostClassifier(
#         iterations=2000,
#         od_type='Iter',
#         od_wait=120,
#         max_depth=8,
#         learning_rate=0.02,
#         l2_leaf_reg=9,
#         random_seed=2018,
#         metric_period=50,
#         fold_len_multiplier=1.1,
#         loss_function='Logloss',
#         logging_level='Verbose')
#     fine_model = cls.fit(X_tr, y_tr, eval_set=(X_va, y_va), cat_features=cates_idx)
#     verbose_feature_importance_cat(fine_model, X_tr)
#     return fine_model

In [22]:
# cat = fit_cat(X_tr, y_tr, X_va, y_va, cates_idx)