In [1]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting annoy
  Downloading annoy-1.17.2.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.4/647.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.2-cp310-cp310-linux_x86_64.whl size=582738 sha256=913880408d03a80ee4aafb3871dd9f26ffffece0d64c11b956140a662a509ab5
  Stored in directory: /root/.cache/pip/wheels/7a/d9/59/473fa56df8e39430eeda369500b4e7127f5b243ba24c3c4297
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.2


In [4]:
from collections import defaultdict, Counter
from typing import List, Dict
import pickle
import time

from tqdm import tqdm
import numpy as np
import pandas as pd
from annoy import AnnoyIndex
import polars as pl
import networkx as nx
import scipy.sparse
import scipy.sparse as sp
from scipy import linalg
from scipy.special import iv
from sklearn import preprocessing
from sklearn.utils.extmath import randomized_svd

In [5]:
LOCALES = ["UK", "JP", "DE"]
VER = "03"
DIR = "/gdrive/MyDrive/amazon_kdd_2023/"
TOP_N = 25

# hyper-parameter for proNE
EMB_DIM = 1024
N_EPOCH = 10
MU = 0.0
THETA = 0.5

# functions

In [6]:
def preprocess(df:pl.DataFrame) -> pl.DataFrame:
    df = df.explode(["prev_items"])
    df = df.with_columns(
        df.select(pl.col("session_id").cumcount().over("session_id").alias("sequence_num"))
    )
    return df

In [7]:
def build_graph(df:pl.DataFrame):
    df = df.sort(["session_id", "sequence_num"], descending=[False, False])
    df = df.with_columns(
        pl.col("item").shift().over("session_id").alias("prev_item")
    )
    df = df.filter(
        (pl.col("prev_item").is_not_null()) &
        (pl.col("prev_item") != pl.col("item"))
    )
    df = df[["item", "prev_item"]]
    # df = df.unique()
    return df

In [8]:
def convert_item_id(df:pl.DataFrame):
    unique_item_ids = sorted(list(set(df["item"].unique().to_list() + df["prev_item"].unique().to_list())))
    item_id2index = dict(zip(unique_item_ids, range(len(unique_item_ids))))
    df = df.with_columns([
        pl.col("item").map_dict(item_id2index).alias("item"),
        pl.col("prev_item").map_dict(item_id2index).alias("prev_item"),
    ])
    return df, unique_item_ids, item_id2index

In [9]:
# https://github.com/THUDM/ProNE/blob/master/proNE.py
class ProNE():
	def __init__(self, graph_file, emb_file1, emb_file2, dimension):
		self.graph = graph_file
		self.emb1 = emb_file1
		self.emb2 = emb_file2
		self.dimension = dimension

		self.G = nx.read_edgelist(self.graph, nodetype=int, create_using=nx.DiGraph())
		self.G = self.G.to_undirected()
		self.node_number = self.G.number_of_nodes()
		matrix0 = scipy.sparse.lil_matrix((self.node_number, self.node_number))

		for e in self.G.edges():
			if e[0] != e[1]:
				matrix0[e[0], e[1]] = 1
				matrix0[e[1], e[0]] = 1
		self.matrix0 = scipy.sparse.csr_matrix(matrix0)
		print(matrix0.shape)

	def get_embedding_rand(self, matrix):
		# Sparse randomized tSVD for fast embedding
		t1 = time.time()
		l = matrix.shape[0]
		smat = scipy.sparse.csc_matrix(matrix)  # convert to sparse CSC format
		print('svd sparse', smat.data.shape[0] * 1.0 / l ** 2)
		U, Sigma, VT = randomized_svd(smat, n_components=self.dimension, n_iter=5, random_state=None)
		U = U * np.sqrt(Sigma)
		U = preprocessing.normalize(U, "l2")
		print('sparsesvd time', time.time() - t1)
		return U

	def get_embedding_dense(self, matrix, dimension):
		# get dense embedding via SVD
		t1 = time.time()
		U, s, Vh = linalg.svd(matrix, full_matrices=False, check_finite=False, overwrite_a=True)
		U = np.array(U)
		U = U[:, :dimension]
		s = s[:dimension]
		s = np.sqrt(s)
		U = U * s
		U = preprocessing.normalize(U, "l2")
		print('densesvd time', time.time() - t1)
		return U

	def pre_factorization(self, tran, mask):
		# Network Embedding as Sparse Matrix Factorization
		t1 = time.time()
		l1 = 0.75
		C1 = preprocessing.normalize(tran, "l1")
		neg = np.array(C1.sum(axis=0))[0] ** l1

		neg = neg / neg.sum()

		neg = scipy.sparse.diags(neg, format="csr")
		neg = mask.dot(neg)
		print("neg", time.time() - t1)

		C1.data[C1.data <= 0] = 1
		neg.data[neg.data <= 0] = 1

		C1.data = np.log(C1.data)
		neg.data = np.log(neg.data)

		C1 -= neg
		F = C1
		features_matrix = self.get_embedding_rand(F)
		return features_matrix

	def chebyshev_gaussian(self, A, a, order=10, mu=0.5, s=0.5):
		# NE Enhancement via Spectral Propagation
		print('Chebyshev Series -----------------')
		t1 = time.time()

		if order == 1:
			return a

		A = sp.eye(self.node_number) + A
		DA = preprocessing.normalize(A, norm='l1')
		L = sp.eye(self.node_number) - DA

		M = L - mu * sp.eye(self.node_number)

		Lx0 = a
		Lx1 = M.dot(a)
		Lx1 = 0.5 * M.dot(Lx1) - a

		conv = iv(0, s) * Lx0
		conv -= 2 * iv(1, s) * Lx1
		for i in range(2, order):
			Lx2 = M.dot(Lx1)
			Lx2 = (M.dot(Lx2) - 2 * Lx1) - Lx0
			#         Lx2 = 2*L.dot(Lx1) - Lx0
			if i % 2 == 0:
				conv += 2 * iv(i, s) * Lx2
			else:
				conv -= 2 * iv(i, s) * Lx2
			Lx0 = Lx1
			Lx1 = Lx2
			del Lx2
			print('Bessell time', i, time.time() - t1)
		mm = A.dot(a - conv)
		emb = self.get_embedding_dense(mm, self.dimension)
		return emb

In [10]:
# https://github.com/THUDM/ProNE/blob/master/proNE.py
def save_embedding(emb_file, features):
	# save node embedding into emb_file with word2vec format
	f_emb = open(emb_file, 'w')
	f_emb.write(str(len(features)) + " " + str(features.shape[1]) + "\n")
	for i in range(len(features)):
		s = str(i) + " " + " ".join(str(f) for f in features[i].tolist())
		f_emb.write(s + "\n")
	f_emb.close()

In [11]:
def generate_annoy_index(matrix):
    index = AnnoyIndex(EMB_DIM, 'angular')

    for idx,idx_embedding in enumerate(matrix):
        index.add_item(idx, idx_embedding)
        
    index.build(50)
    
    return index

In [12]:
def make_nns_matrix(annoy_index, item_ids, item_id2index, k=100):
    aid_xs = []
    aid_ys = []
    dists = []

    for item_id in tqdm(item_ids):
        item_index = item_id2index[item_id]
        nns = annoy_index.get_nns_by_item(item_index, k+1, include_distances=True)
        aid_y = [item_ids[idx] for idx in list(nns[0][1:])]
        dist = list(nns[1][1:])
        aid_xs.extend([item_id] * k)
        aid_ys.extend(aid_y)
        dists.extend(dist)
    df = pl.DataFrame({"item": aid_xs, 'candidate_item': aid_ys, 'prone_distance': dists})

    # rank付与
    df = df.with_columns(
        pl.col("prone_distance").rank(descending=False, method="min").over("item").alias("prone_rank")
    )

    return df

# for local train

In [13]:
train = pl.read_parquet(DIR + "data/preprocessed/task1/train_task1.parquet")
test1_1 = pl.read_parquet(DIR + "data/preprocessed/task1/test_task1_phase1.parquet")
test1_2 = pl.read_parquet(DIR + "data/preprocessed/task1/test_task1_phase2.parquet")
test2_1 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase1.parquet").filter( pl.col("locale").is_in(LOCALES))
test2_2 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase2.parquet").filter( pl.col("locale").is_in(LOCALES))
test3_1 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase1.parquet").filter( pl.col("locale").is_in(LOCALES))
test3_2 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase2.parquet").filter( pl.col("locale").is_in(LOCALES))
test1_1 = test1_1.with_columns(
    (pl.col("session_id") + "_from_task1").alias("session_id")
)
test1_2 = test1_2.with_columns(
    (pl.col("session_id") + "_from_task1").alias("session_id")
)
test3_1 = test3_1.with_columns(
    (pl.col("session_id") + "_from_task3").alias("session_id")
)
test3_2 = test3_2.with_columns(
    (pl.col("session_id") + "_from_task3").alias("session_id")
)
test = pl.concat([test1_1, test1_2, test2_1, test2_2, test3_1, test3_2])

In [14]:
train = preprocess(train)
test = preprocess(test)

In [15]:
df = pl.concat([
    train["prev_items", "locale", "session_id", "sequence_num"],
    test["prev_items", "locale", "session_id", "sequence_num"],
])
df = df.rename({"prev_items":"item"})

In [None]:
candidates = []
for locale in LOCALES:
    # filter by loacle
    df_by_locale = df.filter(pl.col("locale") == locale)

    # build graph
    graph_df = build_graph(df_by_locale)
    graph_df, item_ids, item_id2index = convert_item_id(graph_df)
    graph_df.write_csv(DIR + f"data/interim/graph/task1/edge_list_{VER}_{locale}_task1_for_local_train_or_eval.txt", has_header=False, separator=" ")
    item_ids_name = f"item_id2index_{VER}_{locale}_for_train_or_eval.pickle"
    with open(DIR + "data/interim/graph/task1/graph_" + item_ids_name, "wb") as f:
        pickle.dump(item_id2index, f)

    # generate graph embedding by proNE 
    model = ProNE(
        DIR + f"data/interim/graph/task1/edge_list_{VER}_{locale}_task1_for_local_train_or_eval.txt",
        "",
        "",
        EMB_DIM,
    )
    features_matrix = model.pre_factorization(model.matrix0, model.matrix0)
    embeddings_matrix = model.chebyshev_gaussian(model.matrix0, features_matrix, N_EPOCH, MU, THETA)
    np.save(DIR + f"models/task1/graph_embedding_{VER}_{locale}_for_local_train_or_eval.npy", embeddings_matrix)

    # generate_candidates
    annoy_index = generate_annoy_index(embeddings_matrix)
    candidate = make_nns_matrix(annoy_index, item_ids, item_id2index, k=TOP_N)
    candidate = candidate.with_columns(pl.lit(locale).alias("locale"))
    candidates.append(candidate)

candidate = pl.concat(candidates)
candidate.write_parquet(DIR + f"data/interim/candidates/task1/prone_{VER}_for_local_or_eval.parquet")

(473418, 473418)
neg 1.9434046745300293
svd sparse 1.791839874422657e-05
sparsesvd time 891.044180393219
Chebyshev Series -----------------
Bessell time 2 52.31568717956543
Bessell time 3 81.43543481826782
Bessell time 4 107.40635776519775
Bessell time 5 133.32356810569763
Bessell time 6 159.31133365631104
Bessell time 7 185.92783665657043
Bessell time 8 224.64430856704712
Bessell time 9 250.43163800239563
densesvd time 93.66154932975769


100%|██████████| 473418/473418 [09:40<00:00, 816.06it/s]


(373807, 373807)
neg 1.3188929557800293
svd sparse 2.3035149620403156e-05
sparsesvd time 774.3039412498474
Chebyshev Series -----------------
Bessell time 2 60.33335590362549
Bessell time 3 85.61083340644836
Bessell time 4 108.29546689987183
Bessell time 5 130.85231232643127
Bessell time 6 153.3084397315979
Bessell time 7 175.57938289642334
Bessell time 8 197.8818645477295
Bessell time 9 220.40205097198486
densesvd time 96.77939987182617


100%|██████████| 373807/373807 [07:56<00:00, 783.96it/s]


(487500, 487500)
neg 0.9145834445953369
svd sparse 1.6712718474687705e-05
sparsesvd time 992.9665095806122
Chebyshev Series -----------------
Bessell time 2 66.25386333465576
Bessell time 3 93.42643618583679
Bessell time 4 120.89849805831909
Bessell time 5 147.9219901561737
Bessell time 6 175.4986608028412
Bessell time 7 202.8393177986145
Bessell time 8 229.74425148963928
Bessell time 9 256.54119873046875
densesvd time 103.26844429969788


100%|██████████| 487500/487500 [11:48<00:00, 688.01it/s]


In [None]:
candidate.head()

item,candidate_item,prone_distance,prone_rank,locale
str,str,f64,u32,str
"""0001821946""","""0008326045""",0.301143,1,"""UK"""
"""0001821946""","""0007215991""",0.591903,2,"""UK"""
"""0001821946""","""0007892314""",0.59199,3,"""UK"""
"""0001821946""","""1408367289""",0.628259,4,"""UK"""
"""0001821946""","""1408356155""",0.637967,5,"""UK"""


# MRR@100

In [None]:
train = pl.read_parquet(DIR + "data/preprocessed/task1/train_task1.parquet")

In [None]:
candidate = pl.read_parquet(DIR + f"data/interim/candidates/task1/prone_{VER}_for_local_or_eval.parquet")

In [None]:
# last_itemの抽出
last_item_list = []
prev_items_list = train["prev_items"].to_list()
for prev_items in prev_items_list:
    last_item_list.append(prev_items[-1])
train = train.with_columns(pl.Series(name="last_item", values=last_item_list))

In [None]:
label_lists = []
for locale in LOCALES:
    df = train.filter(pl.col("locale") == locale)
    df = df.join(candidate, left_on=["locale", "last_item"], right_on=["locale", "item"], how="left")
    df = df.filter(~pl.col("candidate_item").is_in(pl.col("prev_items")))
    df = df.sort(["session_id", "prone_distance"], descending=[False, False])
    df = df.with_columns((pl.col("candidate_item") == pl.col("next_item")).cast(pl.Int8).alias("label"))
    label_lists.extend(df.groupby("session_id", maintain_order=True).all()["label"].to_list())

In [None]:
# MRRの計算
rr = 0
for labels in label_lists:
    labels = labels[:100]
    for i, label in enumerate(labels):
        if label == 1:
            rr += 1 / (i+1)
            break
mrr = rr / len(label_lists)
print("MRR:", round(mrr, 5))

MRR: 0.11601


# for inference

In [None]:
train = pl.read_parquet(DIR + "data/preprocessed/task1/train_task1.parquet")
test1_1 = pl.read_parquet(DIR + "data/preprocessed/task1/test_task1_phase1.parquet")
test1_2 = pl.read_parquet(DIR + "data/preprocessed/task1/test_task1_phase2.parquet")
test2_1 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase1.parquet").filter( pl.col("locale").is_in(LOCALES))
test2_2 = pl.read_parquet(DIR + "data/preprocessed/task2/test_task2_phase2.parquet").filter( pl.col("locale").is_in(LOCALES))
test3_1 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase1.parquet").filter( pl.col("locale").is_in(LOCALES))
test3_2 = pl.read_parquet(DIR + "data/preprocessed/task3/test_task3_phase2.parquet").filter( pl.col("locale").is_in(LOCALES))
test1_1 = test1_1.with_columns(
    (pl.col("session_id") + "_from_task1").alias("session_id")
)
test1_2 = test1_2.with_columns(
    (pl.col("session_id") + "_from_task1").alias("session_id")
)
test3_1 = test3_1.with_columns(
    (pl.col("session_id") + "_from_task3").alias("session_id")
)
test3_2 = test3_2.with_columns(
    (pl.col("session_id") + "_from_task3").alias("session_id")
)
test = pl.concat([test1_1, test1_2, test2_1, test2_2, test3_1, test3_2])

In [None]:
# trainのnext_itemをprev_itemsにappendする
prev_items_list = train["prev_items"].to_list()
next_item_list = train["next_item"].to_list()
prev_items_list_updated = []
for prev_items, next_item in zip(prev_items_list, next_item_list):
    prev_items.append(next_item)
    prev_items_list_updated.append(prev_items)

train = train.with_columns(
    pl.Series(name="prev_items", values=prev_items_list_updated)
)

In [None]:
train = preprocess(train)
test = preprocess(test)
df = pl.concat([
    train["prev_items", "locale", "session_id", "sequence_num"],
    test["prev_items", "locale", "session_id", "sequence_num"],
])
df = df.rename({"prev_items":"item"})

In [None]:
candidates = []
for locale in LOCALES:
    # filter by loacle
    df_by_locale = df.filter(pl.col("locale") == locale)

    # build graph
    graph_df = build_graph(df_by_locale)
    graph_df, item_ids, item_id2index = convert_item_id(graph_df)
    graph_df.write_csv(DIR + f"data/interim/graph/task1/edge_list_{VER}_{locale}_task1_for_inference.txt", has_header=False, separator=" ")
    item_ids_name = f"item_id2index_{VER}_{locale}_for_inference.pickle"
    with open(DIR + "data/interim/graph/task1/graph_" + item_ids_name, "wb") as f:
        pickle.dump(item_id2index, f)

    # generate graph embedding by proNE 
    model = ProNE(
        DIR + f"data/interim/graph/task1/edge_list_{VER}_{locale}_task1_for_inference.txt",
        "",
        "",
        EMB_DIM,
    )
    features_matrix = model.pre_factorization(model.matrix0, model.matrix0)
    embeddings_matrix = model.chebyshev_gaussian(model.matrix0, features_matrix, N_EPOCH, MU, THETA)
    np.save(DIR + f"models/task1/graph_embedding_{VER}_{locale}_for_inference.npy", embeddings_matrix)

    # generate_candidates
    annoy_index = generate_annoy_index(embeddings_matrix)
    candidate = make_nns_matrix(annoy_index, item_ids, item_id2index, k=TOP_N)
    candidate = candidate.with_columns(pl.lit(locale).alias("locale"))
    candidates.append(candidate)

candidate = pl.concat(candidates)
candidate.write_parquet(DIR + f"data/interim/candidates/task1/prone_{VER}_for_inference.parquet")

(500011, 500011)
neg 1.487830400466919
svd sparse 2.1335357233955394e-05
sparsesvd time 1099.088896036148
Chebyshev Series -----------------
Bessell time 2 65.78074622154236
Bessell time 3 98.96438550949097
Bessell time 4 132.31445574760437
Bessell time 5 165.17317962646484
Bessell time 6 198.03654098510742
Bessell time 7 231.00388836860657
Bessell time 8 264.68839597702026
Bessell time 9 297.62259244918823
densesvd time 108.29792356491089


100%|██████████| 500011/500011 [11:03<00:00, 753.49it/s]


(394847, 394847)
neg 0.6726453304290771
svd sparse 2.680233918488267e-05
sparsesvd time 866.7321724891663
Chebyshev Series -----------------
Bessell time 2 54.446372270584106
Bessell time 3 81.49535989761353
Bessell time 4 108.56445717811584
Bessell time 5 135.0883378982544
Bessell time 6 178.81355047225952
Bessell time 7 208.0460982322693
Bessell time 8 236.47908878326416
Bessell time 9 261.9902503490448
densesvd time 85.89026021957397


100%|██████████| 394847/394847 [08:16<00:00, 794.73it/s]


(518182, 518182)
neg 1.2353928089141846
svd sparse 1.9364790658131597e-05
sparsesvd time 1092.1635653972626
Chebyshev Series -----------------
Bessell time 2 64.86362791061401
Bessell time 3 101.11236810684204
Bessell time 4 134.40990376472473
Bessell time 5 168.28982377052307
Bessell time 6 201.42335033416748
Bessell time 7 234.67707228660583
Bessell time 8 268.4160146713257
Bessell time 9 301.6223249435425
densesvd time 120.01172637939453


100%|██████████| 518182/518182 [11:04<00:00, 780.00it/s]
