In [1]:
import pickle
import walker
import polars as pl
from gensim.models import Word2Vec

# Load Co-occurrence Data from the Amazon KDD Cup 2023

[Download data](https://www.aicrowd.com/challenges/amazon-kdd-cup-23-multilingual-recommendation-challenge/problems/task-1-next-product-recommendation/dataset_files)

Steps: 

- Build a list of lists of sessions (one inner list of product ids per session); for simplicity I'll keep only the 'ES' locale.

```python

def str2list(x):
    """Parse the stringified item array of a session into a list of tokens.

    Square brackets and single quotes are removed, newlines and carriage
    returns are treated as separators, and what remains is split on
    whitespace. An input with no tokens yields an empty list.
    """
    # Single-pass character table: delete brackets/quotes, blank out line breaks.
    cleanup = str.maketrans({'[': None, ']': None, "'": None, '\n': ' ', '\r': ' '})
    # `split()` with no argument never produces empty strings.
    return x.translate(cleanup).split()

# Lazily scan the training sessions so only the rows/column we need are read.
lazy_sessions = pl.scan_csv("sessions_train.csv")

# Keep the Spanish ('ES') locale and only the raw `prev_items` string column.
es_prev_items = lazy_sessions.filter(pl.col("locale") == "ES").select("prev_items")

# Materialise the query and unwrap the single-column frame into a Series.
df = es_prev_items.collect().to_series()

# Parse every raw string into a list of product ids -> list of lists.
# NOTE(review): newer polars releases rename `Series.apply` to
# `Series.map_elements`; switch if `apply` is unavailable in your version.
sessions = df.apply(str2list).to_list()
```


- From the list of lists of sessions you can build the co-occurrence graph quite easily, like so:

```python
import pickle
from itertools import chain

import networkx as nx
from networkx.algorithms import bipartite

# Unique product ids in first-seen order. Materialising them once gives a
# re-iterable, duplicate-free node list that is reused for every step below.
item_nodes = list(dict.fromkeys(chain.from_iterable(sessions)))

# Bipartite graph: session indices (ints) on one side, product ids on the other.
G = nx.Graph()
G.add_nodes_from(range(len(sessions)), bipartite=0)
G.add_nodes_from(item_nodes, bipartite=1)

# One edge per (session index, product id) pair; repeats collapse in a simple Graph.
edges = [(i, tgt) for i, s in enumerate(sessions) for tgt in s]
G.add_edges_from(edges)

# Build the co-occurrence graph: project the bipartite graph onto the product
# nodes, linking two products whenever they share at least one session.
# `projected_graph` walks `nodes` more than once (first to add the nodes, then
# to add the edges), so it must receive a re-iterable collection: a one-shot
# iterator such as `chain(*sessions)` would be exhausted after the first pass
# and the projection would end up with no edges.
P = bipartite.projected_graph(G, item_nodes)

with open("co_ocurrence_graph.pickle", 'wb') as handle:
    pickle.dump(P, handle, protocol=pickle.HIGHEST_PROTOCOL)
```

In [2]:
# Restore the pre-built product co-occurrence graph from disk.
# NOTE: only unpickle files you produced yourself; pickle can run arbitrary code.
with open("co_ocurrence_graph.pickle", mode="rb") as graph_file:
    G = pickle.load(graph_file)

# Quick sanity check on the size of the loaded graph.
node_report = f'{G.number_of_nodes()=}'
edge_report = f'{G.number_of_edges()=}'
print(node_report)
print(edge_report)

G.number_of_nodes()=38850
G.number_of_edges()=234347


In [3]:
%%time
# Sample random walks over the co-occurrence graph and convert the result to
# plain Python lists so it can be fed to Word2Vec as a corpus of "sentences".
# NOTE(review): p / q look like node2vec-style walk-bias parameters, and the
# walks presumably contain positional node indices (see the i2node mapping
# built further down); confirm against the `walker` documentation.
walks = walker.random_walks(
    G,
    n_walks=10,
    walk_len=10,
    p=0.5,
    q=1.0,
).tolist()

Random walks - T=1.09s
CPU times: user 2.85 s, sys: 133 ms, total: 2.98 s
Wall time: 1.68 s


In [6]:
%%time
# Train a skip-gram Word2Vec model, treating every random walk as a sentence
# and every visited node as a token.
model = Word2Vec(
    walks,
    vector_size=128,    # embedding dimensionality
    window=4,           # context size within a walk
    min_count=1,        # keep every node, even ones that are rarely visited
    sg=1,               # 1 = skip-gram
    workers=4,
    epochs=10,
    compute_loss=True,
)

CPU times: user 4min 20s, sys: 589 ms, total: 4min 20s
Wall time: 1min 7s


In [27]:
# Two-way lookup between a node's position in `G.nodes()` and the node itself.
i2node = {position: node for position, node in enumerate(G.nodes())}
node2i = {node: position for position, node in i2node.items()}

In [None]:
# Load the product training data; it is only used as metadata to assess the
# quality of the embeddings.
dicts = pl.read_csv("products_train_es.csv").to_dicts()

# Index every product record by its 'id' (later duplicates overwrite earlier ones).
node2meta = {}
for record in dicts:
    node2meta[record['id']] = record

In [36]:
# Inspect the 40 nearest neighbours of one product in embedding space.
product_id = "B0BDKSHZK6"
src_title = node2meta[product_id]['title']

# The model is queried by the node's positional index, so map the product id
# to its index first and map every returned index back to a product id.
nearest = model.wv.most_similar(node2i[product_id], topn=40)
for neighbour_idx, similarity in nearest:
    neighbour_title = node2meta[i2node[neighbour_idx]]["title"]
    print(f'({src_title}, {neighbour_title}), {similarity=}')

(Apple iPhone 14 (256 GB) - Azul, Apple iPhone 14 (128 GB) - (Product) Red), similarity=0.893774688243866
(Apple iPhone 14 (256 GB) - Azul, Nuevo Apple iPhone 12 Mini (64 GB) - Azul), similarity=0.8876607418060303
(Apple iPhone 14 (256 GB) - Azul, Nuevo Apple iPhone 12 Mini (128 GB) - en Negro), similarity=0.8843467235565186
(Apple iPhone 14 (256 GB) - Azul, Nuevo Apple iPhone 12 Mini (128 GB) - Azul), similarity=0.8785219192504883
(Apple iPhone 14 (256 GB) - Azul, Nuevo Apple iPhone 12 Mini (128 GB) - en Blanco), similarity=0.870318591594696
(Apple iPhone 14 (256 GB) - Azul, Nuevo Apple iPhone 12 Mini (128 GB) - de en Verde), similarity=0.8670912384986877
(Apple iPhone 14 (256 GB) - Azul, Apple iPhone 12 Mini (64 GB) - de en Malva), similarity=0.8460684418678284
(Apple iPhone 14 (256 GB) - Azul, Apple iPhone 14 (256 GB) - (Product) Red), similarity=0.827444851398468
(Apple iPhone 14 (256 GB) - Azul, Apple iPhone 13 Mini (128 GB) - en Blanco Estrella), similarity=0.8270866274833679
(Ap