In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
from tqdm.notebook import tqdm
import random

import os
import gc

import json

In [2]:
# ---------------------------------------------------------------------------
# Configuration: input/output locations and run parameters.
# Every path table is keyed by the dataset split name; only "train1" is
# populated here.
# ---------------------------------------------------------------------------
path_input = {}              # candidate_{split}.jsonl.gz
path_input_label = {}        # label_{split}.parquet
path_input_aidsum = {}       # aid_summary_{split}.pickle
path_input_imf = {}          # [split][event type][artefact name] -> pickle path
path_input_bpr = {}          # [split][type pair][artefact name] -> pickle path
path_input_bpr_simitem = {}  # [split][type pair] -> sim_items pickle path
path_input_count = {}        # count_by_time_{split}.pickle
path_input_rank = {}         # rank_by_time_{split}.pickle

path_output = {}
path_output_label = {}       # templates with a "{}" placeholder for the chunk number
path_output_n_truth = {}
for i in ["train1"]:
    path_input[i] = f"/kaggle/input/otto-make-data-candidate/candidate_{i}.jsonl.gz"
    path_input_label[i] = f"/kaggle/input/otto-make-data-label/label_{i}.parquet"
    path_input_aidsum[i] = f"/kaggle/input/otto-make-data-aid-summary/aid_summary_{i}.pickle"
    path_input_count[i] = f"/kaggle/input/otto-feat-count-by-time/count_by_time_{i}.pickle"
    path_input_rank[i] = f"/kaggle/input/otto-feat-count-by-time/rank_by_time_{i}.pickle"
    path_input_imf[i] = {}
    path_input_bpr[i] = {}
    path_input_bpr_simitem[i] = {}
    for j in ["clicksxcarts", "cartsxorders", "ordersxclicks"]:
        path_input_bpr[i][j] = {}
        path_input_bpr_simitem[i][j] = f"/kaggle/input/otto-feat-bprsim-{j}/sim_items_bpr_{j}_{i}.pickle"
        for k in ["session2idx", "user_items", "model", "list_aid"]:
            path_input_bpr[i][j][k] = f"/kaggle/input/otto-cf-bpr-{j}/{k}_{j}_{i}.pickle"
    path_output[i] = f"feature_{i}.csv"
    path_output_label[i] = {}
    path_output_n_truth[i] = {}
    for j in ["clicks", "carts", "orders"]:
        path_input_imf[i][j] = {}
        for k in ["imfsession2idx", "imfuser_items", "imfmodel", "imflist_aid"]:
            path_input_imf[i][j][k] = f"/kaggle/input/otto-cf-imf-{j}/{k}_{j}_{i}.pickle"
        # "{{}}" leaves a literal "{}" in the template; it is filled with the chunk number on output.
        path_output_label[i][j] = f"feature_label_{i}_{j}_{{}}.pickle"
        path_output_n_truth[i][j] = f"n_ground_truth_{i}_{j}.csv"
    path_output_label[i]["feat"] = f"feature_label_{i}_feat_{{}}.pickle"

sample_size = 50000  # number of candidate-file lines read per chunk
sampling_rate = 0.2  # probability of keeping each negative candidate
# sampling_rate = 6  # alternative setting: negatives kept per positive

# Seed the stdlib RNG that drives negative sampling so that a fresh
# "Restart & Run All" reproduces the same sampled training set.
seed = 42
random.seed(seed)

debug = False
n_session = {"train1": 2237925, "valid": 1735763}
time_min = {"train1":1660514400152, "valid": 1661119200060}
time_max = {"train1": 1661119199547, "valid": 1661723998043}

In [3]:
# Feature catalogue.
# Repetitive name families are generated instead of being typed out one by
# one; the resulting lists have exactly the same contents and order.
_EVENT_TYPES = ("clicks", "carts", "orders")
# "<earlier type>2<later type>" for every ordered pair of event types.
_TYPE_PAIRS = [f"{src}2{dst}" for src in _EVENT_TYPES for dst in _EVENT_TYPES]

# Identifier columns: session id, candidate aid, and the last aid seen for
# each event type.
list_id = ["session", "aid"] + [f"last_{t}_aid" for t in _EVENT_TYPES]

# Candidate-generation rules currently in use (past activity per type, IMF on
# clicks, BPR on clicks x carts, popular carts after the session).
# Disabled families: popularity_{type}, association_{type}, imf_carts,
# imf_orders, bpr_{type}, bpr_cartsxorders, bpr_ordersxclicks, prone_{type},
# popularity_clicks_after, popularity_orders_after.
list_candidate_rule = [f"past_{t}" for t in _EVENT_TYPES] + [
    "imf_clicks",
    "bpr_clicksxcarts",
    "popularity_carts_after",
]

# Session x candidate features.
# Disabled families: bpr_score_cartsxorders, bpr_score_ordersxclicks,
# n_{type}_simitem{1,2,3}_bpr_{pair} (BPR similar aids), n_{type}_after_ses.
list_feat = (
    # ground-truth label per event type
    [f"label_{t}" for t in _EVENT_TYPES]
    # number of events in the session: total, then per type
    + ["n_type_all"]
    + [f"n_type_{t}" for t in _EVENT_TYPES]
    # share of each event type in the session
    + [f"rate_type_{t}" for t in _EVENT_TYPES]
    # number of events on the candidate aid: total, then per type
    + ["n_type_all_aid"]
    + [f"n_type_{t}_aid" for t in _EVENT_TYPES]
    # whether the candidate is the last aid of that event type
    + [f"is_last_{t}" for t in _EVENT_TYPES]
    + [
        "num_type_rev",            # position of the aid counted from the session end
        "elapsed_time",            # session duration
        "imf_score_clicks",        # IMF score
        "bpr_score_clicksxcarts",  # BPR score
    ]
    # rank of the aid per type after the session
    + [f"rank_{t}_after_ses" for t in _EVENT_TYPES]
    + ["last_ts"]
    # per-session re-interaction counts
    + [f"ses_{pair}" for pair in _TYPE_PAIRS]
    # number of events per 3-hour slot of the day
    + [f"n_alltypes_{hour}-{hour + 2}" for hour in range(0, 24, 3)]
)

# Per-aid summary columns taken from the aid summary table.
# Disabled family: rate_{pair} (the non-lastweek ratios).
list_aidsum = (
    # totals over all sessions, last week's totals, difference and ratio
    # versus last week -- each for clicks, carts, orders and all types
    [
        f"n_{t}_allses{suffix}"
        for suffix in ("", "_lastweek", "_diff", "_diffrate")
        for t in _EVENT_TYPES + ("alltypes",)
    ]
    # type-to-type transition counts: overall, then last week
    + [f"n_{pair}_allses{suffix}" for suffix in ("", "_lastweek") for pair in _TYPE_PAIRS]
    # type-to-type transition rates for last week
    + [f"rate_{pair}_lastweek" for pair in _TYPE_PAIRS]
)

In [4]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to narrower dtypes.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column is left untouched.

    * Integer columns are cast to the narrowest of int8/int16/int32 whose
      range contains the column's min and max. The bounds are inclusive, so a
      column whose maximum is exactly 127 still fits int8. Columns that fit
      none of them keep their dtype.
    * Float columns are cast to float16 when their range lies strictly inside
      float16's, else float32 under the same rule, else float64.
      NOTE(review): only the value range is checked, so the float16/float32
      casts can lose precision.

    Args:
        df: DataFrame to shrink. Its columns are reassigned in place.
        verbose: When true, print the resulting size and percentage saved.

    Returns:
        The same ``df`` object, with downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Try the integer widths from narrowest to widest; stop at the first fit.
            # An empty column has NaN min/max, fails every comparison and is left as is.
            for int_type in (np.int8, np.int16, np.int32):
                bounds = np.iinfo(int_type)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [5]:
# Build the feature tables.
def _candidate_scores(cf_model, cf_user_idx, cf_user_items, cf_aid2idx, candidates):
    """Return the model score of every aid in ``candidates``, in the same order.

    ``rank_items`` hands back ``(item index, score)`` pairs. They are looked
    up by item index here instead of being consumed positionally, because the
    pairs are returned sorted by score rather than in the order the items
    were passed in; reading them positionally attaches scores to the wrong
    candidates. Aids the model does not know get a score of 0.
    """
    known_idx = [cf_aid2idx[aid] for aid in candidates if aid in cf_aid2idx]
    if not known_idx:
        return [0] * len(candidates)
    score_by_idx = dict(cf_model.rank_items(cf_user_idx, cf_user_items, known_idx))
    # cf_aid2idx.get() yields None for unknown aids, which is never a key of score_by_idx.
    return [score_by_idx.get(cf_aid2idx.get(aid), 0) for aid in candidates]


for i in ["train1"]:
    df_label = pd.read_parquet(path_input_label[i])  # ground-truth labels
    df_label = reduce_mem_usage(df_label)
    # Write the number of ground-truth items per session, one CSV per event type.
    for types in ["clicks", "carts", "orders"]:
        df_label_type = df_label[["session", f"label_{types}"]].copy()
        # clicks are counted as a single ground-truth item; carts/orders use the label list length.
        if types == "clicks": df_label_type["n_ground_truth"] = [1] * len(df_label_type)
        else: df_label_type["n_ground_truth"] = [len(x) for x in df_label_type[f"label_{types}"].values]
        df_label_type.to_csv(path_output_n_truth[i][types], index = False)
    # session -> label lookup, per event type plus the "any" column.
    dict_label = {}
    for types in ["clicks", "carts", "orders", "any"]:
        dict_label[types] = dict(zip(df_label["session"], df_label[f"label_{types}"]))
    del df_label_type, df_label
    gc.collect()

    # Per-aid summary features.
    df_aidsum = pd.read_pickle(path_input_aidsum[i])
    df_aidsum = df_aidsum[["aid"]+list_aidsum]
    flg_first_chunk = True
    chunks = pd.read_json(path_input[i],lines = True, chunksize = sample_size)
    tot_ses = 0
    tot_aid = 0

    # IMF artefacts (clicks only).
    imfmodel, imfsession2idx, imfuser_items, imflist_aid, imfaid2idx = {}, {}, {}, {}, {}
    for types in ["clicks"]:
        imfmodel[types] = pd.read_pickle(path_input_imf[i][types]["imfmodel"])
        imfsession2idx[types] = pd.read_pickle(path_input_imf[i][types]["imfsession2idx"])
        imfuser_items[types] = pd.read_pickle(path_input_imf[i][types]["imfuser_items"])
        imflist_aid[types] = pd.read_pickle(path_input_imf[i][types]["imflist_aid"])
        # aid -> position in the model's aid list
        imfaid2idx[types] = dict(zip(imflist_aid[types], range(len(imflist_aid[types]))))

    # BPR artefacts (clicks x carts only).
    model, session2idx, user_items, list_aid, aid2idx, bpr_simitem = {}, {}, {}, {}, {}, {}
    for types in ["clicksxcarts"]:
        model[types] = pd.read_pickle(path_input_bpr[i][types]["model"])
        session2idx[types] = pd.read_pickle(path_input_bpr[i][types]["session2idx"])
        user_items[types] = pd.read_pickle(path_input_bpr[i][types]["user_items"])
        list_aid[types] = pd.read_pickle(path_input_bpr[i][types]["list_aid"])
        aid2idx[types] = dict(zip(list_aid[types], range(len(list_aid[types]))))

    # Disabled inputs: BPR similar-item tables (path_input_bpr_simitem) and
    # the count-by-time table (path_input_count).
    rank_after = pd.read_pickle(path_input_rank[i])

    for n_chunk,  df_chunk in enumerate(tqdm(chunks, total = n_session[i]//sample_size+1)):
        # Output columns are accumulated as dicts of lists, one entry per (session, candidate).
        dict_id = {feat:[] for feat in list_id}
        dict_feat = {feat:[] for feat in list_feat + list_candidate_rule}

        for j, session in df_chunk.iterrows():
            tot_ses += 1
            ses = session["session"]
            dict_feat_ses = {}
            list_candidate = set()
            # Candidate aids: union over the enabled candidate rules.
            for cand in list_candidate_rule: list_candidate.update(session["candidate"][cand])
            list_candidate = list(list_candidate)
            if i == "train1":
                # Down-sample negatives only: every positive is kept, each
                # negative is kept with probability sampling_rate.
                set_truth = set(dict_label["any"][ses])
                list_candidate = [x for x in list_candidate if x in set_truth or random.random() < sampling_rate]
            tot_aid += len(list_candidate)
            if len(list_candidate) == 0:continue

            for types in ["clicks", "carts", "orders"]:
                dl = set(dict_label[types][ses])
                dict_feat_ses[f"label_{types}"] = [1 if x in dl else 0 for x in list_candidate]

            # Only events before the cutoff are used to compute features.
            event_train = session["events"][:session["cutoff"]]
            len_event = len(event_train)
            n_cand = len(list_candidate)
            # session x candidate aid
            dict_feat_ses["session"] = [ses] * n_cand
            dict_feat_ses["aid"] = list_candidate
            # Which candidate rules produced each candidate.
            for cand in list_candidate_rule:
                ses_cand = set(session["candidate"][cand])
                dict_feat_ses[cand] = [1 if x in ses_cand else 0 for x in list_candidate]

            dict_last_aid = {"clicks": -1, "carts": -1, "orders": -1}  # last aid per event type (-1: none)
            dict_count_event = {}   # aid -> {event type: count}
            dict_num_aid = {}       # aid -> position counted from the end of the session
            dict_count_event2 = {}  # "ses_{earlier type}2{current type}" -> count
            dict_count_time = {}    # 3-hour slot of the day -> event count
            # Single pass over the session's events.
            for n, event in enumerate(event_train):
                if event["aid"] not in dict_count_event: dict_count_event[event["aid"]] = {"clicks": 0, "carts": 0, "orders": 0}
                # For every type already seen on this aid, count the transition to the current type.
                for types in ["clicks", "carts", "orders"]:
                    if dict_count_event[event["aid"]][types] > 0:
                        dict_count_event2[f"ses_{types}2{event['type']}"] = dict_count_event2.get(f"ses_{types}2{event['type']}",0) +1

                dict_count_event[event["aid"]][event["type"]] = dict_count_event[event["aid"]][event["type"]] + 1
                dict_last_aid[event["type"]] = event["aid"]
                # Later events overwrite earlier ones, so this ends up as the
                # aid's last position counted from the end (1 = final event).
                dict_num_aid[event["aid"]] = len_event - n
                last_ts = event["ts"]
                # Hour of day of the event, bucketed into 3-hour slots.
                ts = pd.to_datetime(event["ts"], unit = 'ms').hour
                dict_count_time[ts//3] = dict_count_time.get(ts//3, 0) + 1
            for n in range(8):
                dict_feat_ses[f"n_alltypes_{n*3}-{n*3+2}"] = [dict_count_time.get(n,0)] * n_cand

            # 999 marks candidates that do not occur in the session.
            dict_feat_ses["num_type_rev"] = [dict_num_aid.get(aid,999) for aid in list_candidate]
            # Index of the 3-hour box (since time_min) that holds the last event.
            idx_box = (last_ts - time_min[i]) // (1000*60*60*3)
            dict_feat_ses["last_ts"] = [idx_box] * n_cand
            for pretypes in ["clicks", "carts", "orders"]:
                for types in ["clicks", "carts", "orders"]:
                    dict_feat_ses[f"ses_{pretypes}2{types}"] = [dict_count_event2.get(f"ses_{pretypes}2{types}",0)]*n_cand

            for types in ["clicks", "carts", "orders"]:
                # Session-level count and share of this event type.
                n_type = sum([dict_count_event[aid][types] for aid in dict_count_event.keys()])
                dict_feat_ses[f"n_type_{types}"] = [n_type] * n_cand
                dict_feat_ses[f"rate_type_{types}"] = [n_type/len(event_train)] * n_cand

                # Count of this event type on each candidate aid.
                dict_feat_ses[f"n_type_{types}_aid"] = [
                    dict_count_event.get(aid, {"clicks": 0, "carts": 0, "orders": 0})[types] \
                    for aid in list_candidate
                ]
                # Last aid of this event type, and whether the candidate is that aid.
                dict_feat_ses[f"last_{types}_aid"] = [dict_last_aid[types]] * n_cand
                dict_feat_ses[f"is_last_{types}"] = [1 if x == y else 0 for x,y in zip(list_candidate, dict_feat_ses[f"last_{types}_aid"])]

                # Value stored for the aid in the time box of the last event (0 when absent).
                dict_feat_ses[f"rank_{types}_after_ses"] = [rank_after[idx_box][types].get(aid, 0) for aid in list_candidate]

            # Totals over all event types.
            dict_feat_ses["n_type_all"] = [len(event_train)] * n_cand
            dict_feat_ses["n_type_all_aid"] = [
                sum(dict_count_event.get(aid, {"clicks": 0, "carts": 0, "orders": 0}).values()) \
                for aid in list_candidate
            ]
            # IMF scores, aligned with list_candidate (0 for sessions the model does not know).
            for types in ["clicks"]:
                if ses in imfsession2idx[types]:
                    dict_feat_ses[f"imf_score_{types}"] = _candidate_scores(
                        imfmodel[types], imfsession2idx[types][ses], imfuser_items[types],
                        imfaid2idx[types], list_candidate,
                    )
                else:
                    dict_feat_ses[f"imf_score_{types}"] = [0] * len(list_candidate)

            # BPR scores, aligned with list_candidate (0 for sessions the model does not know).
            for types in ["clicksxcarts"]:
                if ses in session2idx[types]:
                    dict_feat_ses[f"bpr_score_{types}"] = _candidate_scores(
                        model[types], session2idx[types][ses], user_items[types],
                        aid2idx[types], list_candidate,
                    )
                else:
                    dict_feat_ses[f"bpr_score_{types}"] = [0] * len(list_candidate)
            # Session duration in whole hours (timestamps are in milliseconds).
            dict_feat_ses["elapsed_time"] = [(event_train[-1]["ts"] - event_train[0]["ts"])//(60 * 60 * 1000)] * n_cand
            # Append this session's columns to the chunk-level accumulators.
            for feat in list_id: dict_id[feat] += dict_feat_ses[feat]
            for feat in list_feat + list_candidate_rule: dict_feat[feat] += dict_feat_ses[feat]
            if debug: break
        if debug: break
        # Assemble the output frame for this chunk.
        df_feature = pd.DataFrame(dict_feat)
        df_id = pd.DataFrame(dict_id, dtype = np.int32)
        df_feature = pd.concat([df_id, df_feature], axis = 1)
        # Join the per-aid summary features; aids missing from the summary get 0.
        df_feature = pd.merge(df_feature, df_aidsum, how = "left", on = "aid")
        df_feature[list_aidsum] = df_feature[list_aidsum].fillna(0)
        df_feature = reduce_mem_usage(df_feature, verbose = False)

        df_feature = df_feature.drop([
            "aid", "last_clicks_aid", "last_carts_aid", "last_orders_aid"
        ], axis = 1)

        # Features go to one pickle per chunk; each label column goes to its own pickle.
        df_feature.drop(["label_clicks", "label_carts", "label_orders"], axis = 1).to_pickle(path_output_label[i]["feat"].format(n_chunk))
        for types in ["clicks", "carts", "orders"]:
            df_feature[f"label_{types}"].to_pickle(path_output_label[i][types].format(n_chunk))
        del df_feature, dict_feat, dict_id, df_id, dict_feat_ses
        gc.collect()
print(f"num_session: {tot_ses}, num_candidate: {tot_aid}, avg_candidate: {tot_aid/tot_ses}")

Mem. usage decreased to 81.10 Mb (20.8% reduction)


  0%|          | 0/45 [00:00<?, ?it/s]

num_session: 2237925, num_candidate: 17764974, avg_candidate: 7.938145380207112


In [6]:
def process_session():
    """Compute the features of one session and append them to the accumulators.

    This is the per-session body of the feature-building loop wrapped in a
    function so it can be profiled. It takes no arguments and works entirely
    on notebook globals:

    * reads ``session`` (the row to process), ``i`` (split name),
      ``dict_label``, ``list_candidate_rule``, ``list_id``, ``list_feat``,
      ``sampling_rate``, ``time_min``, ``rank_after`` and the IMF/BPR
      artefact dicts (``imfmodel``, ``imfsession2idx``, ``imfuser_items``,
      ``imfaid2idx``, ``model``, ``session2idx``, ``user_items``, ``aid2idx``);
    * extends the lists held in ``dict_id`` and ``dict_feat``.

    Returns:
        None. Nothing is appended when no candidate survives sampling.
    """

    def _aligned_scores(cf_model, cf_user_idx, cf_user_items, cf_aid2idx, candidates):
        # rank_items returns (item index, score) pairs sorted by score, not in
        # input order, so scores are looked up by item index to keep them
        # aligned with `candidates`. Aids unknown to the model score 0.
        known_idx = [cf_aid2idx[aid] for aid in candidates if aid in cf_aid2idx]
        if not known_idx:
            return [0] * len(candidates)
        score_by_idx = dict(cf_model.rank_items(cf_user_idx, cf_user_items, known_idx))
        return [score_by_idx.get(cf_aid2idx.get(aid), 0) for aid in candidates]

    ses = session["session"]
    dict_feat_ses = {}
    list_candidate = set()
    # Candidate aids: union over the enabled candidate rules.
    for cand in list_candidate_rule: list_candidate.update(session["candidate"][cand])
    list_candidate = list(list_candidate)
    if i == "train1":
        # Down-sample negatives only: every positive is kept, each negative
        # is kept with probability sampling_rate.
        set_truth = set(dict_label["any"][ses])
        list_candidate = [x for x in list_candidate if x in set_truth or random.random() < sampling_rate]
    # Same early exit as the main loop: nothing to emit without candidates.
    if len(list_candidate) == 0:
        return

    for types in ["clicks", "carts", "orders"]:
        dl = set(dict_label[types][ses])
        dict_feat_ses[f"label_{types}"] = [1 if x in dl else 0 for x in list_candidate]

    # Only events before the cutoff are used to compute features.
    event_train = session["events"][:session["cutoff"]]
    len_event = len(event_train)
    n_cand = len(list_candidate)
    # session x candidate aid
    dict_feat_ses["session"] = [ses] * n_cand
    dict_feat_ses["aid"] = list_candidate
    # Which candidate rules produced each candidate.
    for cand in list_candidate_rule:
        ses_cand = set(session["candidate"][cand])
        dict_feat_ses[cand] = [1 if x in ses_cand else 0 for x in list_candidate]

    dict_last_aid = {"clicks": -1, "carts": -1, "orders": -1}  # last aid per event type (-1: none)
    dict_count_event = {}   # aid -> {event type: count}
    dict_num_aid = {}       # aid -> position counted from the end of the session
    dict_count_event2 = {}  # "ses_{earlier type}2{current type}" -> count
    dict_count_time = {}    # 3-hour slot of the day -> event count
    # Single pass over the session's events.
    for n, event in enumerate(event_train):
        if event["aid"] not in dict_count_event: dict_count_event[event["aid"]] = {"clicks": 0, "carts": 0, "orders": 0}
        # For every type already seen on this aid, count the transition to the current type.
        for types in ["clicks", "carts", "orders"]:
            if dict_count_event[event["aid"]][types] > 0:
                dict_count_event2[f"ses_{types}2{event['type']}"] = dict_count_event2.get(f"ses_{types}2{event['type']}",0) +1

        dict_count_event[event["aid"]][event["type"]] = dict_count_event[event["aid"]][event["type"]] + 1
        dict_last_aid[event["type"]] = event["aid"]
        # Later events overwrite earlier ones, so this ends up as the aid's
        # last position counted from the end (1 = final event).
        dict_num_aid[event["aid"]] = len_event - n
        last_ts = event["ts"]
        # Hour of day of the event, bucketed into 3-hour slots.
        ts = pd.to_datetime(event["ts"], unit = 'ms').hour
        dict_count_time[ts//3] = dict_count_time.get(ts//3, 0) + 1
    for n in range(8):
        dict_feat_ses[f"n_alltypes_{n*3}-{n*3+2}"] = [dict_count_time.get(n,0)] * n_cand

    # 999 marks candidates that do not occur in the session.
    dict_feat_ses["num_type_rev"] = [dict_num_aid.get(aid,999) for aid in list_candidate]
    # Index of the 3-hour box (since time_min) that holds the last event.
    idx_box = (last_ts - time_min[i]) // (1000*60*60*3)
    dict_feat_ses["last_ts"] = [idx_box] * n_cand
    for pretypes in ["clicks", "carts", "orders"]:
        for types in ["clicks", "carts", "orders"]:
            dict_feat_ses[f"ses_{pretypes}2{types}"] = [dict_count_event2.get(f"ses_{pretypes}2{types}",0)]*n_cand

    for types in ["clicks", "carts", "orders"]:
        # Session-level count and share of this event type.
        n_type = sum([dict_count_event[aid][types] for aid in dict_count_event.keys()])
        dict_feat_ses[f"n_type_{types}"] = [n_type] * n_cand
        dict_feat_ses[f"rate_type_{types}"] = [n_type/len(event_train)] * n_cand

        # Count of this event type on each candidate aid.
        dict_feat_ses[f"n_type_{types}_aid"] = [
            dict_count_event.get(aid, {"clicks": 0, "carts": 0, "orders": 0})[types]
            for aid in list_candidate
        ]
        # Last aid of this event type, and whether the candidate is that aid.
        dict_feat_ses[f"last_{types}_aid"] = [dict_last_aid[types]] * n_cand
        dict_feat_ses[f"is_last_{types}"] = [1 if x == y else 0 for x,y in zip(list_candidate, dict_feat_ses[f"last_{types}_aid"])]

        # Value stored for the aid in the time box of the last event (0 when absent).
        dict_feat_ses[f"rank_{types}_after_ses"] = [rank_after[idx_box][types].get(aid, 0) for aid in list_candidate]

    # Totals over all event types.
    dict_feat_ses["n_type_all"] = [len(event_train)] * n_cand
    dict_feat_ses["n_type_all_aid"] = [
        sum(dict_count_event.get(aid, {"clicks": 0, "carts": 0, "orders": 0}).values())
        for aid in list_candidate
    ]
    # IMF scores (0 for sessions the model does not know).
    for types in ["clicks"]:
        if ses in imfsession2idx[types]:
            dict_feat_ses[f"imf_score_{types}"] = _aligned_scores(
                imfmodel[types], imfsession2idx[types][ses], imfuser_items[types],
                imfaid2idx[types], list_candidate,
            )
        else:
            dict_feat_ses[f"imf_score_{types}"] = [0] * len(list_candidate)

    # BPR scores. list_feat contains "bpr_score_clicksxcarts", so this column
    # must be produced or the accumulation below fails with KeyError.
    for types in ["clicksxcarts"]:
        if ses in session2idx[types]:
            dict_feat_ses[f"bpr_score_{types}"] = _aligned_scores(
                model[types], session2idx[types][ses], user_items[types],
                aid2idx[types], list_candidate,
            )
        else:
            dict_feat_ses[f"bpr_score_{types}"] = [0] * len(list_candidate)
    # Session duration in whole hours (timestamps are in milliseconds).
    dict_feat_ses["elapsed_time"] = [(event_train[-1]["ts"] - event_train[0]["ts"])//(60 * 60 * 1000)] * n_cand
    # Append this session's columns to the global accumulators.
    for feat in list_id: dict_id[feat] += dict_feat_ses[feat]
    for feat in list_feat + list_candidate_rule: dict_feat[feat] += dict_feat_ses[feat]


In [7]:
# debug = True

In [8]:
if debug:
    import line_profiler

    # Line-by-line profile of a single process_session() call: the function
    # is registered when the profiler is constructed, run once under it, and
    # the per-line timing report is printed.
    pr = line_profiler.LineProfiler(process_session)
    pr.runcall(process_session)
    pr.print_stats()

In [9]:
# import requests

# notebook_name = "OTTO - make data - feature"
# message = f"{notebook_name}のコミットが完了しました。"

# line_token = os.environ["LINE_NOTIFY_TOKEN"]  # never hard-code the token; revoke the one that was previously committed here
# endpoint = 'https://notify-api.line.me/api/notify'
# message = "\n{}".format(message)
# payload = {'message': message}
# headers = {'Authorization': 'Bearer {}'.format(line_token)}
# requests.post(endpoint, data=payload, headers=headers)