In [None]:
# Mount Google Drive at /content/drive so that later cells can read inputs from,
# and write outputs to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install polars



In [None]:
import concurrent.futures

from tqdm import tqdm
import numpy as np
import polars as pl
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from scipy.spatial.distance import cosine

In [None]:
# Locales to generate candidates for; the generation loop filters the product
# table by each value and writes one output file per locale.
LOCALES = ["DE"]
# Number of most-similar products kept as candidates for each item.
TOP_N = 200

# Generate candidates

In [None]:
# Product table (loaded with pandas): provides the locale, id and text columns
# that the TF-IDF similarity below is built from.
product = pd.read_parquet("/content/drive/MyDrive/kddcup2023/data/preprocessed/common/product.parquet")
# Task 1 training data (loaded with polars).
# NOTE(review): `train` is not referenced by the candidate generation visible in
# this notebook — confirm whether this load is still needed.
train = pl.read_parquet("/content/drive/MyDrive/kddcup2023/data/preprocessed/task1/train_task1.parquet")
# Test-set load is disabled. NOTE(review): its path uses "kddcup2023-master",
# unlike the two paths above — verify before re-enabling.
# test = pl.read_parquet("/content/drive/MyDrive/kddcup2023-master/data/preprocessed/task1/test_task1.parquet")

In [None]:
def calculate_similar_items(product_id):
    """Return the products most similar to ``product_id`` by TF-IDF cosine similarity.

    Reads the module-level ``df`` (products of the locale being processed),
    ``tfidf_matrix`` (one TF-IDF row per row of ``df``, in the same order) and
    ``TOP_N``.

    Args:
        product_id: A value from ``df["id"]`` identifying the query product.
            If the id occurs more than once, the first matching row is used.

    Returns:
        A tuple ``(product_id, similar_items_ids, similar_items_scores)``. The two
        arrays are aligned and sorted by descending similarity, and hold at most
        ``TOP_N`` entries (fewer when ``df`` has fewer than ``TOP_N + 1`` rows).
        The query row itself is never among the candidates.

    Raises:
        KeyError: If ``product_id`` does not occur in ``df["id"]``.
    """
    # Locate the query row by position: tfidf_matrix is addressed positionally,
    # so do not rely on df's index labels happening to equal row positions.
    matches = np.flatnonzero((df["id"] == product_id).to_numpy())
    if matches.size == 0:
        raise KeyError(f"product id {product_id!r} not found in df")
    query_pos = int(matches[0])

    # Get query product vector (list index keeps the result 2-D: 1 x vocabulary)
    query_vec = tfidf_matrix[[query_pos]]

    # Similarity of every product in the locale to the query product
    similarity = cosine_similarity(tfidf_matrix, query_vec).ravel()

    # Exclude the query product explicitly instead of assuming it sorts first.
    # Exact duplicates tie with it at the top score, and a row whose text is
    # empty is a zero vector whose similarity to everything, itself included,
    # is 0 — in both cases "skip the first sorted entry" may drop another
    # product and keep the query product as its own candidate.
    candidate_scores = pd.Series(similarity).drop(query_pos)

    # Extract the top ones with high similarity. nlargest returns them in
    # descending order without sorting the whole score vector; ties keep the
    # original row order.
    top = candidate_scores.nlargest(TOP_N)
    similar_items_ids = df["id"].to_numpy()[top.index.to_numpy()]
    similar_items_scores = top.to_numpy()

    return (product_id, similar_items_ids, similar_items_scores)


# For each locale: build a TF-IDF matrix over the concatenated product text,
# find the most similar products for every product, rank them per item and write
# the result to a parquet file on Drive.
for locale in LOCALES:
    print(f"start {locale}...")

    # Filter by locale. drop=True discards the old index instead of adding it as
    # an unused "index" column; the fresh 0..n-1 index matches the row order of
    # tfidf_matrix, which calculate_similar_items reads alongside df.
    df = product[product["locale"] == locale].reset_index(drop=True)
    if df.empty:
        # Fail early with a clear message instead of the vectorizer's
        # "empty vocabulary" error further down.
        raise ValueError(f"no products found for locale {locale!r}")
    product_ids = df["id"].to_list()

    # Create TF-IDF vectorizer
    tfidf = TfidfVectorizer()

    # Create text data that combines product information (missing values are
    # treated as empty strings, fields are separated by a single space)
    text_columns = ["title", "color", "size", "model", "material", "author", "brand", "desc"]
    text_data = df[text_columns[0]].fillna("")
    for column in text_columns[1:]:
        text_data = text_data + " " + df[column].fillna("")

    # Create TF-IDF matrix
    tfidf_matrix = tfidf.fit_transform(text_data)

    # calculate_similar_items receives only the product id and reads df,
    # tfidf_matrix and TOP_N as module-level globals, so the worker processes
    # must inherit them (e.g. via the fork start method).
    # NOTE(review): confirm the start method on the target platform.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = list(tqdm(executor.map(calculate_similar_items, product_ids), total=len(product_ids)))

    # Convert result to data frame. Every column is sized from the number of
    # candidates actually returned per item (which is below TOP_N when the
    # locale has few products) so the columns always have equal length, and the
    # columns are assembled as arrays rather than element-by-element lists.
    candidate_counts = [len(similar_items_ids) for _, similar_items_ids, _ in results]
    query_ids = np.empty(len(results), dtype=object)
    query_ids[:] = [product_id for product_id, _, _ in results]
    similar_products = pd.DataFrame({
        "locale": locale,
        "item": np.repeat(query_ids, candidate_counts),
        "candidate_item": np.concatenate([similar_items_ids for _, similar_items_ids, _ in results]),
        "similarity_score": np.concatenate([similar_items_scores for _, _, similar_items_scores in results]),
    })

    # Rank candidates within each item: 1 = most similar, ties share the lowest rank
    similar_products = pl.from_pandas(similar_products)
    similar_products = similar_products \
    .sort(["item", "similarity_score"], descending=[False, True]) \
    .with_columns(pl.col("similarity_score").rank(descending=True, method="min").over("item").alias("similarity_rank"))

    # file output
    # NOTE(review): the "_10_" in the file name does not follow TOP_N — confirm
    # the intended name before changing it, since other code may read this path.
    file_name = f"similar_products_10_{locale}.parquet"
    similar_products.write_parquet("/content/drive/MyDrive/kddcup2023/data/interim/candidates/task1/" + file_name)

start DE...


100%|██████████| 518327/518327 [13:21:49<00:00, 10.77it/s]
