In [1]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
!pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install transformers==4.18.0 fugashi==1.1.0 ipadic==1.0.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.18.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.0/4.0 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fugashi==1.1.0
  Downloading fugashi-1.1.0.tar.gz (336 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m336.9/336.9 kB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ipadic==1.0.0
  Downloading ipadic-1.0.0.tar.gz (13.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m106.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.1.0 (from transformers==4.18.0)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [4]:
!pip install annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting annoy
  Downloading annoy-1.17.2.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.4/647.4 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.2-cp310-cp310-linux_x86_64.whl size=581411 sha256=f0c52193279ca846a8e96852e2f5700c097dc9e58204e7a7c049a0c4a2a7ee11
  Stored in directory: /root/.cache/pip/wheels/7a/d9/59/473fa56df8e39430eeda369500b4e7127f5b243ba24c3c4297
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.2


In [5]:
import concurrent.futures
import math
import pickle

from tqdm import tqdm
import numpy as np
import polars as pl
import pandas as pd
from annoy import AnnoyIndex
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cosine

In [6]:
from transformers import BertTokenizer, BertModel

In [7]:
LOCALE = "UK"
TOP_N = 100
EMB_DIM = 768

In [8]:
product = pl.read_parquet("/gdrive/MyDrive/amazon_kdd_2023/data/preprocessed/common/product.parquet")
product = product.filter(pl.col("locale") == LOCALE)

# generate candidates

In [9]:
product_id_list = product["id"].to_list()
product_id2index = dict(zip(product_id_list, range(len(product_id_list))))
title_list = product["title"].to_list()
desc_list = product["desc"].to_list()

# for debug
# product_id_list = product_id_list[:30000]
# desc_list = desc_list[:30000]

In [10]:
# BERTの日本語モデル
MODEL_NAME = 'bert-base-uncased'

#トークナイザとモデルのロード
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
model = model.cuda()

#各データの形式を整える
max_length = 256

sentence_vectors = []
for id, title in tqdm(zip(product_id_list, title_list), total=len(product_id_list)):
    text = str(title)
    encoding = tokenizer(
        str(text),
        max_length = max_length,
        padding = 'max_length',
        truncation = True,
        return_tensors = 'pt'
        )
    encoding = {k: v.cuda() for k, v in encoding.items()}
    attention_mask = encoding['attention_mask']

    #文章ベクトルを計算
    with torch.no_grad():
        output = model(**encoding)
        last_hidden_state = output.last_hidden_state
        averaged_hidden_state =(last_hidden_state*attention_mask.unsqueeze(-1)).sum(1)/attention_mask.sum(1,keepdim=True) 
    sentence_vectors.append(averaged_hidden_state[0].cpu().numpy())

sentence_vectors = np.vstack(sentence_vectors)
print(sentence_vectors.shape)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 500180/500180 [1:53:08<00:00, 73.68it/s]


(500180, 768)


In [11]:
index = AnnoyIndex(EMB_DIM, 'angular')

for idx,idx_embedding in enumerate(sentence_vectors):
    index.add_item(idx, idx_embedding)
    
index.build(50)

True

In [12]:
aid_xs = []
aid_ys = []
dists = []
locales = []

for product_id in tqdm(product_id_list):
    item_index = product_id2index[product_id]
    nns = index.get_nns_by_item(item_index, TOP_N+1, include_distances=True)
    aid_y = [product_id_list[idx] for idx in list(nns[0][1:])]
    dist = list(nns[1][1:])
    aid_xs.extend([product_id] * TOP_N)
    aid_ys.extend(aid_y)
    dists.extend(dist)
    locales.extend([LOCALE] * TOP_N)
df = pl.DataFrame({"item": aid_xs, 'candidate_item': aid_ys, 'bert_distance': dists, 'locale': locales})

# rank付与
df = df.with_columns(
    pl.col("bert_distance").rank(descending=False, method="min").over("item").alias("bert_rank")
)

100%|██████████| 500180/500180 [15:48<00:00, 527.41it/s]


In [13]:
file_name = f"similar_products_19_{LOCALE}.parquet"
df.write_parquet("/gdrive/MyDrive/amazon_kdd_2023/data/interim/candidates/task1/" + file_name)

In [14]:
product_id2vec = {}
for product_id in product_id_list:
    product_index = product_id2index[product_id]
    product_id2vec[product_id] = sentence_vectors[product_index]

file_path = f"/gdrive/MyDrive/amazon_kdd_2023/models/task1/product_vector_19_{LOCALE}.pickle"
with open(file_path, mode='wb') as f:
    pickle.dump(product_id2vec, f)

# MRR@100

In [15]:
train = pl.read_parquet("/gdrive/MyDrive/amazon_kdd_2023/data/preprocessed/task1/train_task1.parquet")
train = train.filter(pl.col("locale") == LOCALE)

In [16]:
candidates = pl.read_parquet(f"/gdrive/MyDrive/amazon_kdd_2023/data/interim/candidates/task1/similar_products_19_{LOCALE}.parquet")

In [17]:
# last_itemの抽出
last_item_list = []
prev_items_list = train["prev_items"].to_list()
for prev_items in prev_items_list:
    last_item_list.append(prev_items[-1])
train = train.with_columns(pl.Series(name="last_item", values=last_item_list))

In [18]:
train = train[["locale", "session_id", "last_item", "next_item"]]

In [19]:
label_lists = []
n_rows = 500_000
for df in tqdm(train.iter_slices(n_rows=n_rows), total=math.ceil(train.height/n_rows)): # specify "total" parameter to display tqdm progress bar 
    # process data
    df = df.join(candidates, left_on=["last_item", "locale"], right_on=["item", "locale"], how="left")
    df = df.with_columns((pl.col("candidate_item") == pl.col("next_item")).cast(pl.Int8).alias("label"))
    label_lists.extend(df.groupby("session_id", maintain_order=True).all()["label"].to_list())

100%|██████████| 3/3 [00:32<00:00, 10.75s/it]


In [20]:
# MRRの計算
rr = 0
for labels in label_lists:
    labels = labels[:100]
    for i, label in enumerate(labels):
        if label == 1:
            rr += 1 / (i+1)
            break
mrr = rr / len(label_lists)
print("MRR:", round(mrr, 5))

MRR: 0.16984
