In [None]:
# Mount Google Drive at /content/drive; every data/model path used in this
# notebook is located under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install implicit

Collecting implicit
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.7.2


In [None]:
!pip install polars



In [None]:
import random
from collections import defaultdict, Counter
from typing import List, Dict, Union
import pickle

import numpy as np
import implicit
from scipy.sparse import lil_matrix
from tqdm import tqdm
import pandas as pd
import polars as pl



In [None]:
TOP_N = 200  # number of candidates requested per session (passed as N to model.recommend)
LOCALE = "UK"  # only rows whose `locale` column equals this value are used
VER = "15"  # version tag embedded in every saved model/candidate file name
DIR = "/content/drive/MyDrive/kddcup2023/"  # project root on the mounted Drive; keep the trailing slash, paths are built by string concatenation

# parameters for IMF
# (each one is forwarded to implicit.als.AlternatingLeastSquares)
ALPHA = 50000  # -> alpha
LAMBDA = 0.01  # -> regularization
FACTORS = 200  # -> factors
N_EPOCHS = 15  # -> iterations
SEED = 42  # -> random_state

In [None]:
def preprocess(df:pl.DataFrame) -> pl.DataFrame:
    """Flatten sessions to one row per item and number the rows inside each session.

    Parameters
    ----------
    df : pl.DataFrame
        Must contain a ``session_id`` column and a ``prev_items`` column that
        can be exploded (one value per resulting row).

    Returns
    -------
    pl.DataFrame
        The exploded frame with an extra ``sequence_num`` column holding the
        cumulative count of rows within each ``session_id``
        (presumably 0-based in polars versions that provide ``cumcount`` --
        TODO confirm against the installed version).
    """
    exploded = df.explode(["prev_items"])
    # Running counter restarted for every session_id window.
    sequence_num = pl.col("session_id").cumcount().over("session_id").alias("sequence_num")
    return exploded.with_columns(sequence_num)

In [None]:
def train_and_generate_candidates(df:pd.DataFrame):
    """Fit an ALS model on binary session/item interactions and emit top-N candidates.

    Parameters
    ----------
    df : pd.DataFrame
        One row per interaction. Must contain the columns ``session_id`` and
        ``prev_items`` (a single item id per row). Repeated (session, item)
        pairs are collapsed to a single interaction of weight 1.0.

    Returns
    -------
    model : implicit.als.AlternatingLeastSquares
        The fitted model.
    user_id2index : dict
        Maps each session id to its row index in the interaction matrix.
    item_id2index : dict
        Maps each item id to its column index in the interaction matrix.
    candidates : pd.DataFrame
        Columns ``session_id``, ``candidate_item``, ``imf_score`` and
        ``imf_rank`` (rank of the score within the session, highest score
        first, ties share the minimum rank). Items the session already
        interacted with are excluded by ``filter_already_liked_items=True``.
    """
    # Local import: the notebook's import cell only brings in lil_matrix.
    from scipy.sparse import csr_matrix

    # Stable id -> index mappings (ids are sorted so the mapping is reproducible).
    unique_user_ids = sorted(df["session_id"].unique())
    unique_item_ids = sorted(df["prev_items"].unique())
    user_id2index = dict(zip(unique_user_ids, range(len(unique_user_ids))))
    item_id2index = dict(zip(unique_item_ids, range(len(unique_item_ids))))

    # Build the interaction matrix in one vectorized step instead of assigning
    # cell by cell through iterrows() into a LIL matrix.
    row_index = df["session_id"].map(user_id2index).to_numpy(dtype=np.int64)
    col_index = df["prev_items"].map(item_id2index).to_numpy(dtype=np.int64)
    matrix = csr_matrix(
        (np.ones(len(df), dtype=np.float64), (row_index, col_index)),
        shape=(len(unique_user_ids), len(unique_item_ids)),
    )
    # Duplicate (session, item) pairs are summed on construction; reset every
    # stored value to 1.0 so the matrix stays binary like the original.
    matrix.sum_duplicates()
    matrix.data[:] = 1.0

    # model initialization
    model = implicit.als.AlternatingLeastSquares(
        factors = FACTORS,
        alpha = ALPHA,
        iterations = N_EPOCHS,
        regularization = LAMBDA,
        calculate_training_loss = True,
        random_state = SEED,
    )

    # train model
    model.fit(matrix)

    # Generate candidates in batches: recommend() accepts an array of user
    # indexes plus the matching rows of the interaction matrix and returns 2-D
    # (users x N) arrays, which avoids one Python-level call per session.
    n_users = len(unique_user_ids)
    batch_size = 1_000  # bounds the size of the per-call score buffers
    # pd.Index keeps the original id dtype (object for strings, int for ints).
    user_id_array = pd.Index(unique_user_ids).to_numpy()
    item_id_array = pd.Index(unique_item_ids).to_numpy()

    user_chunks = []
    item_chunks = []
    score_chunks = []
    for start in tqdm(range(0, n_users, batch_size)):
        stop = min(start + batch_size, n_users)
        item_indexes, scores = model.recommend(
            np.arange(start, stop),
            matrix[start:stop],
            N=TOP_N,
            filter_already_liked_items=True,
        )
        # Row i of the result belongs to user index start + i.
        user_chunks.append(np.repeat(user_id_array[start:stop], item_indexes.shape[1]))
        item_chunks.append(item_id_array[item_indexes.ravel()])
        score_chunks.append(scores.ravel())

    candidates = pd.DataFrame({
        "session_id": np.concatenate(user_chunks),
        "candidate_item": np.concatenate(item_chunks),
        "imf_score": np.concatenate(score_chunks),
    })

    # Rank candidates inside each session (1 = highest score).
    candidates = pl.from_pandas(candidates)
    candidates = candidates \
    .sort(["session_id", "imf_score"], descending=[False, True]) \
    .with_columns(pl.col("imf_score").rank(descending=True, method="min").over("session_id").alias("imf_rank"))
    candidates = candidates.to_pandas()

    return model, user_id2index, item_id2index, candidates

## For local train/eval

In [None]:
# Read the task-1 train/test sessions from the project directory (DIR) rather
# than repeating the absolute Drive path.
task1_dir = DIR + "data/preprocessed/task1/"
train = pl.read_parquet(task1_dir + "train_task1.parquet")
test = pl.read_parquet(task1_dir + "test_task1_phase2.parquet")

# One row per (session, item); next_item is NOT added here, so it stays
# unseen by the model used for local train/eval.
train = preprocess(train)
test = preprocess(test)
session_columns = ["prev_items", "locale", "session_id", "sequence_num"]
session_df = pl.concat([
    train.select(session_columns),
    test.select(session_columns),
])
session_df = session_df.to_pandas()
# Keep only the configured locale.
session_df = session_df[session_df["locale"] == LOCALE]

In [None]:
%%time
# Fit the model on the train+test sessions built above and generate the
# candidates used for local train/eval.
model, user_id2index, item_id2index, candidates = train_and_generate_candidates(session_df)

5336525it [05:08, 17320.14it/s]
  check_blas_config()


  0%|          | 0/15 [00:00<?, ?it/s]

100%|██████████| 1298118/1298118 [5:33:29<00:00, 64.87it/s]


CPU times: user 1d 6h 2min 15s, sys: 13h 7min 18s, total: 1d 19h 9min 33s
Wall time: 5h 57min


In [None]:
# Write the train/eval candidates under the project directory (DIR) instead of
# a hard-coded absolute path, consistent with the other save cells.
file_name = f"imf_{VER}_{LOCALE}_for_train_or_eval.parquet"
candidates.to_parquet(DIR + "data/interim/candidates/task1/" + file_name)

In [None]:
# Persist the fitted model together with both id -> index lookups.
models_dir = DIR + "models/task1/"
model_name = f"imf_{VER}_{LOCALE}_model_for_train_or_eval.npz"
user_ids_name = f"imf_{VER}_{LOCALE}_user_id2index_for_train_or_eval.pickle"
item_ids_name = f"imf_{VER}_{LOCALE}_item_id2index_for_train_or_eval.pickle"

model.save(models_dir + model_name)
for pickle_name, mapping in ((user_ids_name, user_id2index), (item_ids_name, item_id2index)):
    with open(models_dir + pickle_name, "wb") as f:
        pickle.dump(mapping, f)

# Persist the candidate table for the train/eval split.
file_name = f"imf_{VER}_{LOCALE}_for_train_or_eval.parquet"
candidates_dir = DIR + "data/interim/candidates/task1/"
candidates.to_parquet(candidates_dir + file_name)

## For test inference

In [None]:
# Reload the raw (un-exploded) task-1 sessions from the project directory
# (DIR) rather than repeating the absolute Drive path.
task1_dir = DIR + "data/preprocessed/task1/"
train = pl.read_parquet(task1_dir + "train_task1.parquet")
test = pl.read_parquet(task1_dir + "test_task1_phase2.parquet")

In [None]:
# Extend every training session's prev_items with that session's next_item,
# then write the extended lists back into the `prev_items` column.
prev_items_list = train["prev_items"].to_list()
next_item_list = train["next_item"].to_list()
prev_items_list_updated = [
    history + [target]
    for history, target in zip(prev_items_list, next_item_list)
]

train = train.with_columns(
    pl.Series(name="prev_items", values=prev_items_list_updated)
)

In [None]:
# Flatten both splits to one row per (session, item), stack them, and keep
# only the rows of the configured locale.
train = preprocess(train)
test = preprocess(test)
session_columns = ["prev_items", "locale", "session_id", "sequence_num"]
session_df = pl.concat(
    [split.select(session_columns) for split in (train, test)]
).to_pandas()
locale_mask = session_df["locale"] == LOCALE
session_df = session_df[locale_mask]

In [None]:
%%time
# Refit on the sessions built above (train sessions now include next_item)
# and generate the candidates used for test inference.
model, user_id2index, item_id2index, candidates = train_and_generate_candidates(session_df)

In [None]:
# Persist the inference-time model together with both id -> index lookups.
models_dir = DIR + "models/task1/"
model_name = f"imf_{VER}_{LOCALE}_model_for_inference.npz"
user_ids_name = f"imf_{VER}_{LOCALE}_user_id2index_for_inference.pickle"
item_ids_name = f"imf_{VER}_{LOCALE}_item_id2index_for_inference.pickle"

model.save(models_dir + model_name)
for pickle_name, mapping in ((user_ids_name, user_id2index), (item_ids_name, item_id2index)):
    with open(models_dir + pickle_name, "wb") as f:
        pickle.dump(mapping, f)

# Persist the candidate table for test inference.
file_name = f"imf_{VER}_{LOCALE}_for_inference.parquet"
candidates_dir = DIR + "data/interim/candidates/task1/"
candidates.to_parquet(candidates_dir + file_name)