In [None]:
# Mount Google Drive (Colab only) so that the /content/drive/MyDrive/... paths
# read and written by the cells below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install polars



In [None]:
!pip install transformers==4.18.0 fugashi==1.1.0 ipadic==1.0.0

Collecting transformers==4.18.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fugashi==1.1.0
  Downloading fugashi-1.1.0.tar.gz (336 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.9/336.9 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ipadic==1.0.0
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacremoses (from transformers==4.18.0)
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1 (

In [None]:
!pip install annoy

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.3-cp310-cp310-linux_x86_64.whl size=550740 sha256=1a78359efc8253c9306aff64329adacaf31d5ea9124c476bb3b0c84287fb5901
  Stored in directory: /root/.cache/pip/wheels/64/8a/da/f714bcf46c5efdcfcac0559e63370c21abe961c48e3992465a
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.3


In [None]:
import concurrent.futures
import math
import pickle

from tqdm import tqdm
import numpy as np
import polars as pl
import pandas as pd
from annoy import AnnoyIndex
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cosine

In [None]:
from transformers import AutoModel, AutoTokenizer

In [None]:
# Locale value used to filter the product and train tables and to name the output files.
LOCALE = "DE"
# Number of nearest neighbours kept as candidates for every product.
TOP_N = 100
# Vector dimensionality handed to AnnoyIndex; must equal the width of the sentence embeddings.
EMB_DIM = 768

In [None]:
# Read the shared product table and keep only the rows of the configured locale.
product = (
    pl.read_parquet("/content/drive/MyDrive/kddcup2023/data/preprocessed/common/product.parquet")
    .filter(pl.col("locale") == LOCALE)
)

# generate candidates

In [None]:
# Plain Python views of the product columns used by the embedding and lookup steps.
product_id_list = product["id"].to_list()
title_list = product["title"].to_list()
desc_list = product["desc"].to_list()

# Product id -> row position in the lists above (a repeated id keeps its last position).
product_id2index = {pid: position for position, pid in enumerate(product_id_list)}

# for debug
# product_id_list = product_id_list[:30000]
# desc_list = desc_list[:30000]

In [None]:
# German BERT checkpoint. The name is defined once and used for BOTH the
# tokenizer and the encoder so the two can never drift apart (the previous
# constant named the uncased checkpoint while the cased one was actually loaded).
MODEL_NAME = 'dbmdz/bert-base-german-cased'

# Titles longer than this many tokens are truncated.
max_length = 256
# Number of titles encoded per forward pass.
batch_size = 64

# Prefer the GPU, but keep the cell runnable on a CPU-only runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()  # inference only: make sure dropout is disabled

# One title per product, in product_id_list order (mirrors the truncation a
# zip over both lists would apply if one of them is shortened for debugging).
n_products = min(len(product_id_list), len(title_list))

sentence_vectors = []
for start in tqdm(range(0, n_products, batch_size)):
    stop = min(start + batch_size, n_products)
    # str() keeps the original handling of non-string titles (e.g. None -> "None").
    batch_titles = [str(title) for title in title_list[start:stop]]
    encoding = tokenizer(
        batch_titles,
        max_length=max_length,
        padding=True,  # pad only up to the longest title of this batch
        truncation=True,
        return_tensors='pt'
    )
    encoding = {k: v.to(device) for k, v in encoding.items()}
    # (batch, seq, 1) mask: 1 for real tokens, 0 for padding.
    token_mask = encoding['attention_mask'].unsqueeze(-1)

    # Calculate sentence vectors: mean of the last hidden states over the
    # non-padding tokens of each title.
    with torch.no_grad():
        last_hidden_state = model(**encoding).last_hidden_state
        averaged_hidden_state = (last_hidden_state * token_mask).sum(1) / token_mask.sum(1)
    sentence_vectors.append(averaged_hidden_state.cpu().numpy())

# Stack the per-batch arrays into a single (n_products, hidden_size) matrix.
sentence_vectors = np.vstack(sentence_vectors)
print(sentence_vectors.shape)

Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/234k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/422M [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 518327/518327 [1:59:32<00:00, 72.27it/s]


(518327, 768)


In [None]:
# Annoy index over the title embeddings using angular distance.
# Item ids are the row positions in sentence_vectors, i.e. positions in product_id_list.
index = AnnoyIndex(EMB_DIM, 'angular')

for row_position in range(len(sentence_vectors)):
    index.add_item(row_position, sentence_vectors[row_position])

# Build the forest with 50 trees; the call's return value is the cell output.
index.build(50)

True

In [None]:
# Build the long-format candidate table: one row per (product, neighbour) pair.
aid_xs = []   # query product id, repeated once per neighbour
aid_ys = []   # neighbour product id
dists = []    # Annoy angular distance between the two embeddings
locales = []  # constant locale column (used as a join key downstream)

for product_id in tqdm(product_id_list):
    item_index = product_id2index[product_id]
    # Ask for one extra neighbour because the query item is normally part of its own result.
    neighbor_idxs, neighbor_dists = index.get_nns_by_item(item_index, TOP_N + 1, include_distances=True)
    # Annoy is approximate: the query item is not guaranteed to come first
    # (e.g. products with identical embeddings) and fewer than TOP_N + 1 results
    # may be returned. Remove the query item explicitly and size every column
    # from the neighbours actually kept, so the four lists always stay aligned.
    neighbors = [
        (idx, dist)
        for idx, dist in zip(neighbor_idxs, neighbor_dists)
        if idx != item_index
    ][:TOP_N]
    aid_xs.extend([product_id] * len(neighbors))
    aid_ys.extend(product_id_list[idx] for idx, _ in neighbors)
    dists.extend(dist for _, dist in neighbors)
    locales.extend([LOCALE] * len(neighbors))
df = pl.DataFrame({"item": aid_xs, 'candidate_item': aid_ys, 'bert_distance': dists, 'locale': locales})

# Assign a rank within each query item (1 = closest; ties share the smaller rank).
df = df.with_columns(
    pl.col("bert_distance").rank(descending=False, method="min").over("item").alias("bert_rank")
)

100%|██████████| 518327/518327 [17:03<00:00, 506.53it/s]


In [None]:
# Persist the candidate table for this locale to the shared candidates folder.
file_name = f"similar_products_19_{LOCALE}.parquet"
candidates_out_path = "/content/drive/MyDrive/kddcup2023/data/interim/candidates/task1/" + file_name
df.write_parquet(candidates_out_path)

In [None]:
# Product id -> embedding row, looked up through the id -> position mapping.
product_id2vec = {
    pid: sentence_vectors[product_id2index[pid]]
    for pid in product_id_list
}

# Serialise the mapping so other notebooks can reuse the embeddings.
file_path = f"/content/drive/MyDrive/kddcup2023/models/task1/product_vector_19_{LOCALE}.pickle"
with open(file_path, mode='wb') as out_file:
    pickle.dump(product_id2vec, out_file)

# MRR@100

In [None]:
# Read the task-1 training sessions and keep only those of the configured locale.
train = (
    pl.read_parquet("/content/drive/MyDrive/kddcup2023/data/preprocessed/task1/train_task1.parquet")
    .filter(pl.col("locale") == LOCALE)
)

In [None]:
# Reload the candidate table written above for this locale.
candidates_path = f"/content/drive/MyDrive/kddcup2023/data/interim/candidates/task1/similar_products_19_{LOCALE}.parquet"
candidates = pl.read_parquet(candidates_path)

In [None]:
# Add a "last_item" column holding the final element of each session's prev_items.
prev_items_list = train["prev_items"].to_list()
last_item_list = [session_items[-1] for session_items in prev_items_list]
last_item_column = pl.Series(name="last_item", values=last_item_list)
train = train.with_columns(last_item_column)

In [None]:
# Keep only the columns needed for the candidate join and the label computation.
train = train.select(["locale", "session_id", "last_item", "next_item"])

In [None]:
# For every session, collect the labels of the candidates attached to its last item:
# 1 where the candidate equals next_item, 0 otherwise. The join is a left join, so a
# session whose last item has no candidates still yields one (null-labelled) row.
label_lists = []
n_rows = 500_000
n_slices = math.ceil(train.height / n_rows)  # passed to tqdm so the progress bar has a total
for train_slice in tqdm(train.iter_slices(n_rows=n_rows), total=n_slices):
    # Work slice by slice to bound the size of the joined frame.
    joined = train_slice.join(
        candidates,
        left_on=["last_item", "locale"],
        right_on=["item", "locale"],
        how="left",
    )
    is_hit = (pl.col("candidate_item") == pl.col("next_item")).cast(pl.Int8)
    labelled = joined.with_columns(is_hit.alias("label"))
    per_session = labelled.groupby("session_id", maintain_order=True).all()
    label_lists.extend(per_session["label"].to_list())

In [None]:
# MRR: reciprocal rank of the first hit among the first 100 candidates of each
# session, averaged over all sessions (sessions without a hit contribute 0).
rr = 0
for labels in label_lists:
    first_hit_rank = next(
        (rank for rank, label in enumerate(labels[:100], start=1) if label == 1),
        None,
    )
    if first_hit_rank is not None:
        rr += 1 / first_hit_rank
mrr = rr / len(label_lists)
print("MRR:", round(mrr, 5))