In [1]:
import pandas as pd
import numpy as np
import json
from tqdm.notebook import tqdm
import random
random.seed(42)

import gc

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [2]:
path_input = {}
path_output = {}
for i in ["train1", "valid"]:
    path_input[i] = f"/kaggle/input/otto-make-data-train-test-split/{i}.jsonl"
    path_output[i] = f"candidate_{i}.jsonl"
    
sample_size = 150000
num_top_item = 5000
event_type = "orders"

debug = False # 1つ分のchunkで動作確認するときTrue
if debug: num_top_item = 100

In [3]:
# aidごとのevent数をカウントする
dict_count = {"clicks": {}, "carts": {}, "orders": {}}
for i in ["train1", "valid"]:
    chunks = pd.read_json(path_input[i],lines = True, chunksize = sample_size)
    for df_chunk in tqdm(chunks):
        for j, session in df_chunk.iterrows():
            if i == "valid": list_event = session["events"][:session["cutoff"]] # validはcutoff以前のeventのみ使える
            else: list_event = session["events"] # trainはすべてのeventを使える
            for event in list_event:
                dict_count[event["type"]][event["aid"]] = dict_count[event["type"]].get(event["aid"], 0) + 1
        if debug: break

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [4]:
# event_typeごとに回数の多い順にソートしてkeyだけ取り出す
list_rank = {}
for i in ["clicks", "carts", "orders"]:
    list_rank[i] = [j[0] for j in sorted(dict_count[i].items(), key = lambda x : x[1], reverse = True)]
# order数上位のaidのリスト
list_aid = list_rank[event_type][:num_top_item]

In [5]:
# list_aidのorder履歴のあるsessionを記録する
list_session = []
for i in ["train1", "valid"]:
    chunks = pd.read_json(path_input[i], lines = True, chunksize = sample_size)
    for df_chunk in tqdm(chunks):
        for j, session in df_chunk.iterrows():
            session = session.to_dict()
            if i == "valid": list_event = session["events"][:session["cutoff"]] # validはcutoff以前のeventのみ使える
            else: list_event = session["events"] # trainはすべてのeventを使える
            list_aid_ses = [event["aid"] for event in list_event if event["aid"] in list_aid]
            if len(list_aid_ses) > 1: # 2つ以上orderがあるsessionのみ保持する
                list_session.append(session["session"])
        if debug: break

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [6]:
print(len(list_aid))
print(len(list_session))

5000
774321


In [7]:
# 空の評価値行列を作成する
mat = np.full((len(list_session), len(list_aid)), False)
mat = pd.DataFrame(mat)
mat.index = list_session
mat.columns = list_aid

In [8]:
if not debug: sample_size = 10000

In [9]:
list_session = set(list_session)
list_aid = set(list_aid)

In [10]:
# 評価値行列を作成する
for i in ["train1", "valid"]:
    chunks = pd.read_json(path_input[i],lines = True, chunksize = sample_size)
    for df_chunk in tqdm(chunks):
        for j, session in df_chunk.iterrows():
            if i == "valid": list_event = session["events"][:session["cutoff"]] # validはcutoff以前のeventのみ使える
            else: list_event = session["events"] # trainはすべてのeventを使える
            for event in list_event:
                if event["type"] == event_type and event["aid"] in list_aid and session["session"] in list_session:
                    mat.loc[session["session"], event["aid"]] = True
        if debug: break

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [11]:
mat

Unnamed: 0,1603001,231487,450505,986164,166037,1733943,332654,409620,801774,258353,...,919479,1450933,1188782,571564,1478957,603622,340581,791120,105110,969402
8643223,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8643224,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8643228,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8643229,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8643231,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12899551,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12899554,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12899580,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
12899602,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [12]:
freq_items = apriori(pd.DataFrame(mat), min_support = 0.00001, use_colnames = True, low_memory = True)
freq_items

Unnamed: 0,support,itemsets
0,0.000558,(1603001)
1,0.000499,(231487)
2,0.000456,(450505)
3,0.000430,(986164)
4,0.000418,(166037)
...,...,...
5604,0.000039,"(933686, 202224, 1446430)"
5605,0.000026,"(202224, 150889, 933686)"
5606,0.000023,"(202224, 150889, 1446430)"
5607,0.000012,"(925887, 1527333, 1699431)"


In [13]:
# アソシエーション・ルール抽出
df_rules = association_rules(
    freq_items,             # supportとitemsetsを持つデータフレーム
    metric = "confidence",  # アソシエーション・ルールの評価指標
    min_threshold = 0.01,    # metricsの閾値
)
df_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(756588),(231487),0.000342,0.000499,0.000080,0.233962,469.331332,0.000080,1.304768
1,(231487),(756588),0.000499,0.000342,0.000080,0.160622,469.331332,0.000080,1.190950
2,(231487),(1111967),0.000499,0.000211,0.000010,0.020725,98.454623,0.000010,1.020949
3,(1111967),(231487),0.000211,0.000499,0.000010,0.049080,98.454623,0.000010,1.051089
4,(1689044),(231487),0.000218,0.000499,0.000019,0.088757,178.048487,0.000019,1.096856
...,...,...,...,...,...,...,...,...,...
1289,"(202224, 1446430)","(150889, 933686)",0.000039,0.000045,0.000023,0.600000,13274.074286,0.000023,2.499887
1290,(933686),"(202224, 150889, 1446430)",0.000092,0.000023,0.000023,0.253521,10905.929577,0.000023,1.339592
1291,(150889),"(202224, 933686, 1446430)",0.000076,0.000039,0.000023,0.305085,7874.450847,0.000023,1.438969
1292,(202224),"(150889, 933686, 1446430)",0.000074,0.000039,0.000023,0.315789,8150.747368,0.000023,1.461482


In [14]:
df_rules[["antecedents", "consequents", "support", "confidence", "lift"]].to_pickle(f"association_rules_{event_type}.pickle")