# Setting

In [1]:
import os
from typing import List, Dict, Union

import numpy as np
import polars as pl
from tqdm import tqdm
from sklearn.neighbors import BallTree

from scripts.metrics import map_at_k



In [2]:
INPUT_DIR = "../../input/raw/"
OUTPUT_DIR = "./candidates/"

In [3]:
yados = pl.read_csv(os.path.join(INPUT_DIR, "yado.csv"))

In [4]:
yado_vec = yados \
    .to_dummies(columns = ["yad_type", "lrg_cd", "sml_cd"]) \
    .fill_null(0) \
    .sort("yad_no", descending=False) \
    .drop(["yad_no", "total_room_cnt", "wid_cd", "ken_cd"]) \
    .cast(pl.UInt8) \
    .to_numpy()

In [5]:
yado_vec.shape

(13806, 844)

In [6]:
# BallTreeの構築
tree = BallTree(yado_vec)

In [7]:
# k近傍の数
k_neighbors = 100

In [8]:
target_indice_list = []
nn_indice_list = []
distances_list = []

for idx in tqdm(range(0, len(yados))):
    # k近傍探索
    query_point = yado_vec[idx:idx+1, :]
    distances, indices = tree.query(query_point, k=k_neighbors)

    # 結果の格納
    target_indice_list.extend([idx + 1 for _ in range(k_neighbors)])
    nn_indice_list.extend(list(indices[0]+1))
    distances_list.extend(list(distances[0]+1))

similar_yados = pl.DataFrame({
    "yad_no" : target_indice_list,
    "candidate_yad_no" : nn_indice_list,
    "nn_distance" : distances_list,
}).filter(pl.col("yad_no") != pl.col("candidate_yad_no")).filter(pl.col("nn_distance") == 1.0)

100% 13806/13806 [04:22<00:00, 52.67it/s]


In [9]:
similar_yados.write_parquet(os.path.join(OUTPUT_DIR, "similar_yados.parquet"))

In [10]:
similar_yados.head()

yad_no,candidate_yad_no,nn_distance
i64,i64,f64
1,6118,1.0
1,7367,1.0
1,5206,1.0
1,6125,1.0
1,6193,1.0


# MAP@k=10

In [11]:
train_log = pl.read_csv(os.path.join(INPUT_DIR, "train_log.csv"))
train_label = pl.read_csv(os.path.join(INPUT_DIR, "train_label.csv")).rename({"yad_no":"label_yad_no"})

In [12]:
last_items = train_log.group_by("session_id").last()

In [13]:
similar_yados = pl.read_parquet(os.path.join(OUTPUT_DIR, "similar_yados.parquet"))

In [14]:
prediction = last_items \
    .join(similar_yados, on="yad_no", how="left") \
    .join(train_label, on="session_id", how="left") \
    .sort(["session_id", "nn_distance"], descending=[False, False]) \
    .with_columns((pl.col("candidate_yad_no") == pl.col("label_yad_no")).cast(pl.Int8).alias("user_relevance")) \
    .fill_null(0)

In [15]:
user_relevances = prediction.group_by("session_id", maintain_order=True).all()["user_relevance"].to_list()

In [16]:
map_at_k(user_relevances, 10)

0.06357899318527727