In [1]:
# Fetch the ProNE source tree; a later cell runs its proNE.py script to embed the graph.
# NOTE(review): the clone is not pinned to a commit or tag, so a re-run may pick up
# a different upstream revision.
!git clone https://github.com/lykeven/ProNE

Cloning into 'ProNE'...
remote: Enumerating objects: 174, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 174 (delta 11), reused 25 (delta 3), pack-reused 137[K
Receiving objects: 100% (174/174), 41.01 MiB | 18.29 MiB/s, done.
Resolving deltas: 100% (64/64), done.


In [2]:
# Work from inside the cloned repository: proNE.py is invoked by relative path later,
# and every relative output path in this notebook is resolved against this directory.
%cd ProNE
!ls

/kaggle/working/ProNE
LICENSE    README.md	  data	 proNE.py
ProNE.cpp  classifier.py  frpca  requirements.txt


In [3]:
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
def load_embeddings(embeddings_file):
    """Read a text-format word2vec embedding file into a dense matrix.

    Row ``k`` of the returned array is the vector stored under the key
    ``str(k)``, for ``k`` from 0 up to (number of keys - 1). The keys are
    therefore expected to be the consecutive integers 0..n-1 written as
    strings; a missing key makes the underlying lookup fail.

    Parameters
    ----------
    embeddings_file : path of the embedding file (word2vec text format).

    Returns
    -------
    numpy.ndarray with one row per key, ordered by integer node id.
    """
    keyed_vectors = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)
    node_count = len(keyed_vectors.index_to_key)
    rows = []
    for node_id in range(node_count):
        # Look vectors up by id rather than by file order, so that the row
        # index always equals the node id.
        rows.append(keyed_vectors[str(node_id)])
    return np.asarray(rows)

In [4]:
import pandas as pd
from numpy import dot

import json
from tqdm.notebook import tqdm

In [5]:
# Dataset splits for which file locations are registered.
split_names = ("train1", "valid")

# Session logs in JSON Lines format, one file per split.
path_input = {
    i: f"/kaggle/input/otto-make-data-train-test-split/{i}.jsonl" for i in split_names
}
# Label tables (parquet), one file per split.
path_input_label = {
    i: f"/kaggle/input/otto-make-data-label/label_{i}.parquet" for i in split_names
}
# Pickled past-event data, one file per split.
path_input_past = {
    i: f"/kaggle/input/otto-make-data-past-event/past_event_{i}.pickle" for i in split_names
}
# Edge-list file handed to ProNE. Every split maps to the same file name.
path_output = {i: f"edgelist.txt" for i in split_names}

# Number of rows per chunk when the JSONL session log is streamed.
sample_size = 150000
# How many of the most frequent items are kept as graph nodes.
num_top_item = 10000

# Number of items predicted per session.
N = 60

# Event type the graph and the predictions are built for.
event_type = "clicks"

In [6]:
# for i in ["train", "valid"]:
for i in ["train1"]:
    dict_count = {"clicks": {}, "carts": {}, "orders": {}}
    chunks = pd.read_json(path_input[i],lines = True, chunksize = sample_size)
    for df_chunk in tqdm(chunks):
        for j, session in df_chunk.iterrows():
            list_event = session["events"][:session["cutoff"]] # cutoff以前のeventのみ使える
            for event in list_event:
                dict_count[event["type"]][event["aid"]] = dict_count[event["type"]].get(event["aid"], 0) + 1
    # event_typeごとに回数の多い順にソートしてkeyだけ取り出す
    list_rank = {}
    for types in ["clicks", "carts", "orders"]:
        list_rank[types] = [j[0] for j in sorted(dict_count[types].items(), key = lambda x : x[1], reverse = True)]
    # order数上位のaidのリスト
    list_aid_rank = list_rank[event_type][:num_top_item]
    set_aid_rank = set(list_aid_rank)      
    
    # list_aidのorder履歴のあるsessionを記録する
    list_session = []
    set_aid = set()
    chunks = pd.read_json(path_input[i], lines = True, chunksize = sample_size)
    for df_chunk in tqdm(chunks):
        for j, session in df_chunk.iterrows():
            session = session.to_dict()
            list_event = session["events"][:session["cutoff"]]
            list_aid_ses = [event["aid"] for event in list_event if event["aid"] in set_aid_rank and event["type"] == event_type]
            if len(list_aid_ses) > 0: # 1つ以上orderがあるsessionのみ保持する
                list_session.append(session["session"])
                set_aid |= set(list_aid_ses)
    list_aid = list(set_aid)
    set_session = set(list_session)
    print(i, len(list_aid))
    print(i, len(list_session))
    
    # idとindexの変換dictを作成する
    session2idx = dict(zip(list_session, range(len(list_session))))
    aid2idx = dict(zip(list_aid, range(len(list_session),len(list_session) + len(list_aid))))

    file = open(path_output[i], "w")
    chunks = pd.read_json(path_input[i],lines = True, chunksize = sample_size)
    for df_chunk in tqdm(chunks):
        for j, session in df_chunk.iterrows():
            if not session["session"] in set_session: continue
            list_event = session["events"][:session["cutoff"]] # validはcutoff以前のeventのみ使える
            for event in list_event:
                if event["type"] == event_type and event["aid"] in set_aid:
                    file.write(str(session2idx[session["session"]]) + "\t" + str(aid2idx[event["aid"]]) + "\n")
    file.close()
    !python proNE.py -graph edgelist.txt -emb1 emb_sparse.emb -emb2 emb_spectral.emb -dimension 128 -step 100 -theta 0.5 -mu 0.2  
    emb = load_embeddings("emb_spectral.emb")
    display(emb.shape)
    pred = {}
    dict_past = pd.read_pickle(path_input_past[i])
    chunksize = 100
    ses_idx_from = 0
    pred = {}
    pbar = tqdm(total = len(list_session)//chunksize + 1)
    while(ses_idx_from < len(list_session)):
        if (ses_idx_from + chunksize) < len(list_session): ses_idx_to = ses_idx_from + chunksize
        else: ses_idx_to = len(list_session)
        list_session_chunk = list_session[ses_idx_from:ses_idx_to]   
        list_session_idx = [session2idx[session] for session in list_session_chunk]
        mat_dist = dot(emb[list_session_idx], emb[range(len(list_session),len(list_session) + len(list_aid))].T)
        mat_dist_p = [[dist if list_aid[idx] not in dict_past[event_type][list_session[ses_idx_from+sesidx]] else -99999 for idx, dist in enumerate(ses) ] for sesidx, ses in enumerate(mat_dist)]   
        pred_chunk = dict(zip(list_session_chunk, [[list_aid[aididx] for aididx in np.argsort(x)[::-1][:N]] for x in mat_dist_p]))
        pred.update(pred_chunk)
        ses_idx_from = ses_idx_to
        pbar.update(1)
        
    pd.to_pickle(pred, f"prone_rules_{event_type}_{i}.pickle")

    # カバー率を計算
    df_label = pd.read_parquet(path_input_label[i])
    n_all, n_hit= 0,0
    for ses, label in tqdm(zip(df_label["session"],df_label[f"label_{event_type}"])):
        n_all += len(label)
        pred_ses = pred.get(ses, [])
        n_hit += sum([1 for x in pred_ses if x in label])

    print(f"{i}: {n_hit/n_all}")


0it [00:00, ?it/s]

0it [00:00, ?it/s]

train1 10000
train1 1150277


0it [00:00, ?it/s]

(1160277, 1160277)
neg 1.172891616821289
svd sparse 3.5700753959714758e-06
sparsesvd time 188.05779695510864
Chebyshev Series -----------------
Bessell time 2 15.322747230529785
Bessell time 3 23.252431392669678
Bessell time 4 30.919480085372925
Bessell time 5 38.68861484527588
Bessell time 6 46.36025524139404
Bessell time 7 53.987035274505615
Bessell time 8 61.639336585998535
Bessell time 9 69.275381565094
Bessell time 10 76.95679879188538
Bessell time 11 84.69575834274292
Bessell time 12 92.37884640693665
Bessell time 13 100.11037135124207
Bessell time 14 107.65508198738098
Bessell time 15 115.33188343048096
Bessell time 16 123.0478925704956
Bessell time 17 130.67584490776062
Bessell time 18 138.26266312599182
Bessell time 19 145.94109749794006
Bessell time 20 153.57037019729614
Bessell time 21 161.26533579826355
Bessell time 22 168.96313786506653
Bessell time 23 176.81374382972717
Bessell time 24 184.47855639457703
Bessell time 25 192.08760976791382
Bess

(1160277, 128)

  0%|          | 0/11503 [00:00<?, ?it/s]

0it [00:00, ?it/s]

train1: 0.0859706494279646
