In [None]:
# Mount Google Drive at /content/drive; the data paths used later in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install polars



In [None]:
import concurrent.futures
import math

from tqdm import tqdm
import numpy as np
import polars as pl
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cosine

In [None]:
# Locales whose products are processed by the candidate-generation loop.
LOCALES = ["JP"]
# Maximum number of most-similar candidate products kept for each product.
TOP_N = 200

# Generate candidates

In [None]:
# Product table as a pandas DataFrame. The candidate-generation code reads its
# "id", "locale", "title", "color", "size", "model", "material", "author",
# "brand" and "desc" columns.
product = pd.read_parquet("/content/drive/MyDrive/kddcup2023/data/preprocessed/common/product.parquet")
# Session data is not needed for content-based candidates; loads kept for reference.
# train = pl.read_parquet("/content/drive/MyDrive/kddcup2023-master/data/preprocessed/task1/train_task1.parquet")
# test = pl.read_parquet("/content/drive/MyDrive/kddcup2023-master/data/preprocessed/task1/test_task1.parquet")

In [None]:
def calculate_similar_items(product_id):
    """Return the products most similar to ``product_id`` by TF-IDF cosine similarity.

    Reads the module-level ``df`` (products of the current locale),
    ``tfidf_matrix`` (one TF-IDF row per row of ``df``, in the same order) and
    ``TOP_N``.

    Args:
        product_id: Value of the ``id`` column identifying the query product.

    Returns:
        A tuple ``(product_id, similar_items_ids, similar_items_scores)``. The
        two arrays are aligned, ordered by descending similarity, and contain
        at most ``TOP_N`` entries (fewer when ``df`` has fewer than
        ``TOP_N + 1`` rows). The query product itself is never included.

    Raises:
        IndexError: If ``product_id`` does not occur in ``df["id"]``.
    """
    # Row position of the query product (first match if the id is duplicated).
    query_pos = np.flatnonzero((df["id"] == product_id).to_numpy())[0]
    query_vec = tfidf_matrix[query_pos]

    # Cosine similarity of every product against the query, as a 1-D array.
    similarity = cosine_similarity(tfidf_matrix, query_vec).ravel()

    # Exclude the query row explicitly. Dropping "whatever sorts first" is not
    # reliable: a product with identical text ties at 1.0, and an all-zero
    # query vector has similarity 0 with every row including itself.
    similarity[query_pos] = -np.inf

    n_candidates = min(TOP_N, similarity.shape[0] - 1)
    if n_candidates <= 0:
        # The query is the only product: nothing to recommend.
        return (product_id, df["id"].values[:0], similarity[:0])

    # Partial selection of the top-k rows, then order only those k by score;
    # this avoids fully sorting the whole similarity vector on every call.
    top_pos = np.argpartition(-similarity, n_candidates - 1)[:n_candidates]
    top_pos = top_pos[np.argsort(-similarity[top_pos], kind="stable")]

    similar_items_ids = df["id"].values[top_pos]
    similar_items_scores = similarity[top_pos]

    return (product_id, similar_items_ids, similar_items_scores)


# For every locale: vectorise product text with TF-IDF, find the most similar
# products for each product in parallel, rank them, and write the result to parquet.
for locale in LOCALES:
    print(f"start {locale}...")
    locales, items, candidate_items, similality_scores = [], [], [], []

    # Restrict to the products of this locale. The fresh 0..n-1 index keeps
    # DataFrame rows aligned with the rows of the TF-IDF matrix built below.
    df = product[product["locale"] == locale].reset_index()
    product_ids = df["id"].to_list()

    # Create the TF-IDF vectoriser.
    tfidf = TfidfVectorizer()

    # Build one text document per product by joining its descriptive columns
    # with single spaces (missing values become empty strings).
    text_columns = ["title", "color", "size", "model", "material", "author", "brand", "desc"]
    text_data = df[text_columns[0]].fillna("")
    for column in text_columns[1:]:
        text_data = text_data + " " + df[column].fillna("")

    # Build the TF-IDF matrix (one row per product).
    tfidf_matrix = tfidf.fit_transform(text_data)

    # NOTE(review): calculate_similar_items reads `df` and `tfidf_matrix` as
    # module-level globals, so the worker processes must be able to see the
    # values assigned above — presumably this relies on the "fork" start
    # method; confirm before running on a platform that spawns workers.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = list(tqdm(executor.map(calculate_similar_items, product_ids), total=len(product_ids)))

    for result in results:
        product_id, similar_items_ids, similar_items_scores = result

        # Pad the locale/item columns by the number of candidates actually
        # returned: a product can have fewer than TOP_N candidates when the
        # locale is small, and the four columns must stay the same length.
        n_found = len(similar_items_ids)
        locales.extend([locale] * n_found)
        items.extend([product_id] * n_found)
        candidate_items.extend(similar_items_ids)
        similality_scores.extend(similar_items_scores)

    # Convert the accumulated columns into a DataFrame.
    similar_products = pd.DataFrame({
        "locale": locales,
        "item": items,
        "candidate_item": candidate_items,
        "similarity_score": similality_scores,
    })

    # Add a per-item rank (1 = most similar; ties share the lowest rank).
    similar_products = pl.from_pandas(similar_products)
    similar_products = (
        similar_products
        .sort(["item", "similarity_score"], descending=[False, True])
        .with_columns(
            pl.col("similarity_score")
            .rank(descending=True, method="min")
            .over("item")
            .alias("similarity_rank")
        )
    )

    # Write the result to parquet.
    # NOTE(review): the file name says "10" while TOP_N is 200 — confirm which
    # name downstream readers expect before changing it.
    file_name = f"similar_products_10_{locale}.parquet"
    similar_products.write_parquet("/content/drive/MyDrive/kddcup2023/data/interim/candidates/task1/" + file_name)

start JP...


100%|██████████| 395009/395009 [4:59:17<00:00, 22.00it/s]
