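**About:** Generates per-session candidate items from precomputed matrices, first with Chris's rule-based functions and then with Theo's version.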

In [1]:
cd ../src

/home/datvodinh/Workspace/otto-recsys/kaggle_otto_rs/src


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import gc
import sys
import cudf
import json
import glob
import pickle
import warnings
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from datetime import datetime
from collections import Counter
from multiprocessing import Pool
from pandarallel import pandarallel
from numerize.numerize import numerize


warnings.simplefilter(action="ignore", category=FutureWarning)
pandarallel.initialize(nb_workers=16, progress_bar=False, use_memory_fs=True)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
from params import *
from utils.metrics import get_coverage

### Params

In [5]:
# Dataset split to process: "val", "test" or "extra".
# Any other value makes the path-selection cell below raise NotImplementedError.
MODE = "val"

In [6]:
# Resolve the input parquet glob for the selected MODE.
# Unknown modes are rejected up front.
if MODE not in ("val", "test", "extra"):
    raise NotImplementedError

if MODE == "extra":
    # The trimmed validation sessions come with their own label file.
    REGEX = "../output/val_trimmed_parquet/*"
    GT_FILE = "../output/val_labels_trimmed.parquet"
else:
    REGEX = "../output/test_parquet/*" if MODE == "test" else "../output/val_parquet/*"

In [7]:
# Maximum number of candidates returned per session by suggest_clicks / suggest_orders.
ITEM_CT = 50
# Maximum number of candidates returned per session by suggest_aids ("fast clicks").
ITEM_CT2 = 50
# Number of fast-click candidates read back per session from click_df.
ITEM_CT3 = 10

In [8]:
# Folder containing the precomputed matrix_*.pqt files loaded in the "Matrices" section.
MATRIX_FOLDER = "../output/matrices/"
# MATRIX_FOLDER = "../output/matrices_2/"

In [9]:
# CLICKS selects which suggester the main cell runs:
# suggest_clicks when True, suggest_orders otherwise.
CLICKS = True
# NOTE(review): MULTIPLIER is not referenced anywhere else in the visible notebook.
MULTIPLIER = 1

# SUFFIX tags the saved file: candidates_{SUFFIX}_{MODE}.parquet.
# NOTE(review): the trailing numbers presumably record the candidate count used
# for that version -- TODO confirm.
if CLICKS:
    SUFFIX = "c-clicks-v7"  # 50
else:
#     SUFFIX = "c-orders-v7"  # 50
    SUFFIX = "c-orders-v8"  # 75
#     SUFFIX = "c-orders-v9"  # 100

## Chris

In [10]:
from data.candidates_chris import (
    load_parquets,
    df_parallelize_run,
    explode,
    matrix_to_candids_dict,
)

### Make valid lists

In [11]:
# Folder holding the pickled per-session lists (group_{PART}.pkl) for this MODE.
LIST_FOLDER = f"../output/lists/{MODE}/"

In [12]:
# Session events for the selected MODE; load_parquets comes from data.candidates_chris.
# NOTE(review): presumably concatenates every parquet file matching REGEX -- confirm in the helper.
df_val = load_parquets(REGEX)

In [13]:
if MODE == "extra":  # Remove useless sessions for speed-up
    gt = pd.read_parquet(GT_FILE)

    # One row per session that has at least one non-click label.
    kept_sessions = (
        gt.loc[gt["type"] != "clicks"]
        .drop(columns="ground_truth")
        .drop_duplicates(subset="session", keep="first")
    )

    prev_len = len(df_val)

    # Left merge + dropna keeps only the events whose session was matched.
    df_val = df_val.merge(
        kept_sessions, on="session", how="left", suffixes=("", "_x")
    )
    df_val = df_val.dropna().drop(columns="type_x").reset_index(drop=True)

    factor = prev_len / len(df_val)
    print(f"Reduced dataset size by {factor:.1f}x")

### Matrices

#### Popular Items

In [14]:
# 100 most frequent aids per event type (0 -> clicks, 1 -> carts, 2 -> orders),
# used by the suggesters as fallback / padding candidates.
top_clicks, top_carts, top_orders = (
    df_val.loc[df_val["type"] == event_type, "aid"]
    .value_counts()
    .index.values[:100]
    .tolist()
    for event_type in (0, 1, 2)
)

#### Matrices

In [15]:
# Matrix files only exist per "val"/"test" suffix; the "extra" mode reuses the "val" ones.
MODE_ = "val" if MODE == "extra" else MODE

In [16]:
# Each matrix parquet is converted by matrix_to_candids_dict into a lookup that
# the suggesters index by aid (`aid in d`, `d[aid][:k]`).

# Expanded from the session's cart/order aids (unique_buys) in suggest_orders.
top_20_buy2buy = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_cpu-90_{MODE_}.pqt")
)

# Second buy-based lookup, also expanded from unique_buys in suggest_orders.
top_20_buy2buy2 = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_cpu-99_{MODE_}.pqt")
)

# Expanded from every unique session aid in the short-session branch of suggest_orders.
top_20_orders = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_cpu-95_{MODE_}.pqt")
)
# Alias of the orders lookup.
# NOTE(review): top_20_carts is not referenced elsewhere in the visible notebook.
top_20_carts = top_20_orders

# Used by suggest_orders (expanded from session aids and from fast-click candidates).
top_20_test = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-116_{MODE_}.pqt")
)

# Used by suggest_aids and suggest_clicks (expanded from the first two entries of unique_aids).
top_20_test2 = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-115_{MODE_}.pqt")
)

In [17]:
# Used by suggest_aids (all unique session aids) and suggest_clicks
# (fast-click candidates and unique session aids).
top_20 = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-93_{MODE_}.pqt")
)

# Used by all three suggesters.
top_20b = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-217_{MODE_}.pqt")
)

# Used by all three suggesters, including the "second hop" expansion of the
# current best candidate in suggest_aids / suggest_clicks.
top_20c = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-220_{MODE_}.pqt")
)

# Used by suggest_orders only (expanded from unique_buys[:1]).
top_20d = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-226_{MODE_}.pqt")
)

# Used by all three suggesters (expanded from unique_aids3; also unique_aids2 in suggest_orders).
top_20e = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-232_{MODE_}.pqt")
)

# Used by all three suggesters (expanded from unique_aids4).
top_20f = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-235_{MODE_}.pqt")
)

In [18]:
# Used by suggest_orders (expanded from fast-click candidates and from unique_aids5).
top_20_buy = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-239_{MODE_}.pqt")
)

# Used by suggest_orders for sessions with a single unique aid.
top_20_new = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-700_{MODE_}.pqt")
)

# Used by all three suggesters for sessions with a single unique aid.
top_20_new2 = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-701_{MODE_}.pqt")
)

In [19]:
# Bound to `top_day` inside suggest_orders (expanded from unique_aids6).
top_40_day = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-155_{MODE_}.pqt")
)

# Bound to `top_day` inside suggest_clicks (expanded from unique_aids6).
top_40_day2 = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-157_{MODE_}.pqt")
)

### Chris Functions

In [20]:
# Per-event-type multiplier applied to the recency weight of each session event
# in the long-session branches of the suggesters.
# Keys are the event type codes used in df_val["type"]: 0 = clicks, 1 = carts, 2 = orders.
type_weight_multipliers = {0: 1, 1: 6, 2: 3}


def suggest_aids(df):
    """Score "fast click" candidates for a single session.

    Args:
        df: Indexable per-session record. Only positions 0 (session id),
            1 (aids), 2 (event types), 4 (``ds``) and 6 (``ds2``) are read.
            NOTE(review): aids / types / ds / ds2 are assumed to be parallel
            sequences in chronological order -- confirm against the code that
            builds the ``group_*.pkl`` lists.

    Returns:
        ``(session, candidates)``. ``candidates`` holds at most ``ITEM_CT2``
        aids, none of which already occur in the session. Because the
        "already seen" filter runs after ``most_common(ITEM_CT2)``, fewer than
        ``ITEM_CT2`` aids can be returned. When nothing survives the filter the
        single most popular click (``top_clicks[:1]``) is returned instead.

    Relies on module-level globals: the ``top_20*`` lookups, ``top_clicks``,
    ``type_weight_multipliers`` and ``ITEM_CT2``.
    """
    # aids=df.aid.tolist()
    # types = df.type.tolist()
    session = df[0]
    aids = df[1]
    types = df[2]
    ds = df[4]
    ds2 = df[6]

    # Unique aids in reverse event order (first entry = last event of the session).
    unique_aids = list(dict.fromkeys(aids[::-1]))

    # df2 = df.sort_values('ts',ascending=False).drop_duplicates('d')
    # aids2 = df2.aid.tolist()
    # unique_aids3 = list(dict.fromkeys(aids2[::-1] )) #last of each session
    # Aids of the events flagged with ds2 == 1 (per the legacy pandas code above:
    # the last event of each `d` group).
    unique_aids3 = list(
        dict.fromkeys([f for i, f in enumerate(aids) if ds2[i] == 1][::-1])
    )

    # mx = df.d.max()
    # aids2 = df.loc[df.d==mx].aid.tolist()
    # unique_aids4 = list(dict.fromkeys(aids2[::-1] ))
    # Aids of the events whose `ds` value equals the session maximum.
    mx = np.max(ds)
    unique_aids4 = list(
        dict.fromkeys([f for i, f in enumerate(aids) if ds[i] == mx][::-1])
    )

    # df = df.loc[ df['type'].isin([1,2]) ]
    # unique_buys = list(dict.fromkeys( df.aid.tolist()[::-1] ))
    # NOTE(review): unique_buys is computed but never used in this function.
    unique_buys = list(
        dict.fromkeys([f for i, f in enumerate(aids) if types[i] in [1, 2]][::-1])
    )

    ln = len(unique_aids)

    # Long sessions (>= 15 unique aids): score session events by recency and type,
    # then add fixed bonuses for neighbours found in the lookups.
    if len(unique_aids) >= 15:
        # Recency weights grow from 2**0.1 - 1 (first event) to 2**1 - 1 (last event).
        weights = np.logspace(0.1, 1, len(aids), base=2, endpoint=True) - 1
        aids_temp = Counter()
        for aid, w, t in zip(aids, weights, types):
            aids_temp[aid] += w * type_weight_multipliers[t]
        aids3 = list(
            itertools.chain(
                *[top_20c[aid][:20] for aid in unique_aids[:2] if aid in top_20c]
            )
        )
        for i, aid in enumerate(aids3):
            aids_temp[aid] += 0.6
        aids3 = list(
            itertools.chain(
                *[top_20b[aid][:15] for aid in unique_aids3 if aid in top_20b]
            )
        )
        for i, aid in enumerate(aids3):
            aids_temp[aid] += 0.3
        aids3 = list(
            itertools.chain(
                *[
                    top_20_test2[aid][:20]
                    for aid in unique_aids[:2]
                    if aid in top_20_test2
                ]
            )
        )
        for i, aid in enumerate(aids3):
            aids_temp[aid] += 0.6

        # Session aids are scored above but filtered out here, so only new aids remain.
        result = [k for k, v in aids_temp.most_common(ITEM_CT2) if k not in unique_aids]

        if len(result) < 1:
            result += top_clicks[:1]
        return session, result[:ITEM_CT2]
    #         return session, (result + top_clicks[: ITEM_CT2 - len(result)])[:ITEM_CT2]

    # Short sessions: accumulate neighbour scores from several lookups.
    # The order of the updates matters: Counter.most_common breaks ties by
    # insertion order.
    aids_temp = Counter()

    weights3 = [2, 2] + [1] * 28
    if len(unique_aids) == 1:
        aids5 = list(
            itertools.chain(
                *[
                    top_20_new2[aid][:30]
                    for aid in unique_aids[-1:]
                    if aid in top_20_new2
                ]
            )
        )
        # NOTE(review): the weights are tiled per full block of 30 neighbours;
        # if the lookup returns fewer than 30 entries, w5 is empty and zip()
        # silently drops every neighbour.
        w5 = weights3 * int(len(aids5) // 30)
        for aid, w in zip(aids5, w5):
            aids_temp[aid] += w

    aids2 = list(
        itertools.chain(*[top_20[aid][:20] for aid in unique_aids if aid in top_20])
    )
    for i, aid in enumerate(aids2):
        # i // 20 is used as the rank of the source aid; the score decays with it.
        # Blocks are assumed to hold 20 entries each.
        m = 0.1 + 0.9 * (ln - (i // (20))) / ln
        aids_temp[aid] += m
        # The first neighbour of each block of 20 gets the score twice.
        if i % (20) == 0:
            aids_temp[aid] += m

    aids3 = list(
        itertools.chain(
            *[top_20b[aid][:20] for aid in unique_aids[:2] if aid in top_20b]
        )
    )
    for i, aid in enumerate(aids3):
        aids_temp[aid] += 1
        if i % (20) == 0:
            aids_temp[aid] += 1

    aids3 = list(
        itertools.chain(
            *[top_20_test2[aid][:20] for aid in unique_aids[:2] if aid in top_20_test2]
        )
    )
    for i, aid in enumerate(aids3):
        aids_temp[aid] += 1
        if i % (20) == 0:
            aids_temp[aid] += 1

    aids4 = list(
        itertools.chain(*[top_20f[aid][:10] for aid in unique_aids4 if aid in top_20f])
    )
    for i, aid in enumerate(aids4):
        w = i // (10)
        aids_temp[aid] += 1 - w * 0.1
        if i % (10) == 0:
            aids_temp[aid] += 1 - w * 0.1

    aids5 = list(
        itertools.chain(*[top_20e[aid][:20] for aid in unique_aids3 if aid in top_20e])
    )
    for i, aid in enumerate(aids5):
        aids_temp[aid] += 1
        if i % (20) == 0:
            aids_temp[aid] += 1
    # Second hop: take the current best candidate (if it is not a session aid)
    # and boost its own top_20c neighbours.
    top_aids2 = [k for k, v in aids_temp.most_common(1) if k not in unique_aids]

    aids3 = list(
        itertools.chain(*[top_20c[aid][:10] for aid in top_aids2 if aid in top_20c])
    )
    for i, aid in enumerate(aids3):
        aids_temp[aid] += 1
        if i % (10) == 0:
            aids_temp[aid] += 1
    top_aids2 = [k for k, v in aids_temp.most_common(ITEM_CT2) if k not in unique_aids]
    result = top_aids2

    if len(result) < 1:
        result += top_clicks[:1]
    return session, result[:ITEM_CT2]


#     return session, (result + top_clicks[: ITEM_CT2 - len(result)])[:ITEM_CT2]

In [21]:
def suggest_clicks(df):
    """Build the click-oriented candidate list for a single session.

    Args:
        df: Indexable per-session record. Positions read: 0 (session id),
            1 (aids), 2 (event types), 3 (timestamps), 4 (``ds``), 6 (``ds2``).
            NOTE(review): the sequences are assumed to be parallel and in
            chronological order -- confirm against the code that builds the
            ``group_*.pkl`` lists.

    Returns:
        ``(session, candidates)`` with at most ``ITEM_CT`` aids. Unlike
        ``suggest_aids``, session aids are kept in the output, and the list is
        padded with ``top_clicks`` when it is shorter than ``ITEM_CT``.
        NOTE(review): the padding is not de-duplicated against ``result``.

    Raises:
        KeyError: if ``session`` is missing from the global ``click_df`` dict.

    Relies on module-level globals: ``click_df``, the ``top_20*`` /
    ``top_40_day2`` lookups, ``top_clicks``, ``type_weight_multipliers``,
    ``ITEM_CT`` and ``ITEM_CT3``.
    """
    # aids=df.aid.tolist()
    # types = df.type.tolist()

    session = df[0]
    aids = df[1]
    types = df[2]
    tss = df[3]
    ds = df[4]
    ds2 = df[6]
    # days = df[7]

    top_day = top_40_day2
    # First ITEM_CT3 "fast click" candidates previously produced for this session.
    click_aids = click_df[session][:ITEM_CT3]

    # Unique aids in reverse event order (first entry = last event of the session).
    unique_aids = list(dict.fromkeys(aids[::-1]))

    # df2 = df.sort_values('ts',ascending=False).drop_duplicates('d')
    # aids2 = df2.aid.tolist()
    # unique_aids3 = list(dict.fromkeys(aids2[::-1] )) #last of each session
    unique_aids3 = list(
        dict.fromkeys([f for i, f in enumerate(aids) if ds2[i] == 1][::-1])
    )

    # mx = df.d.max()
    # aids2 = df.loc[df.d==mx].aid.tolist()
    # unique_aids4 = list(dict.fromkeys(aids2[::-1] ))
    mx = np.max(ds)
    unique_aids4 = list(
        dict.fromkeys([f for i, f in enumerate(aids) if ds[i] == mx][::-1])
    )

    # aids2 = df.loc[df.ts >= mx - 60*60*24].aid.tolist()
    # unique_aids6 = list(dict.fromkeys(aids2[::-1] )) #recent 1 day
    # Aids seen within 60 * 60 * 24 timestamp units of the latest event
    # (one day if timestamps are in seconds -- TODO confirm the unit).
    mx = np.max(tss)
    unique_aids6 = list(
        dict.fromkeys(
            [f for i, f in enumerate(aids) if tss[i] >= mx - 60 * 60 * 24][::-1]
        )
    )

    # df = df.loc[ df['type'].isin([1,2]) ]
    # unique_buys = list(dict.fromkeys( df.aid.tolist()[::-1] ))
    # NOTE(review): unique_buys is computed but never used in this function.
    unique_buys = list(
        dict.fromkeys([f for i, f in enumerate(aids) if types[i] in [1, 2]][::-1])
    )

    # NOTE(review): ln is only referenced by the commented-out block further down.
    ln = len(unique_aids)

    # Long sessions (>= 15 unique aids): score session events by recency and type,
    # then add fixed bonuses for neighbours found in the lookups.
    if len(unique_aids) >= 15:
        # Recency weights grow from 2**0.1 - 1 (first event) to 2**1 - 1 (last event).
        weights = np.logspace(0.1, 1, len(aids), base=2, endpoint=True) - 1
        aids_temp = Counter()
        for aid, w, t in zip(aids, weights, types):
            aids_temp[aid] += w * type_weight_multipliers[t]
        aids3 = list(
            itertools.chain(
                *[top_20c[aid][: 20 * 2] for aid in unique_aids[:2] if aid in top_20c]
            )
        )
        for i, aid in enumerate(aids3):
            aids_temp[aid] += 0.6
        aids3 = list(
            itertools.chain(
                *[top_20b[aid][: 15 * 2] for aid in unique_aids3 if aid in top_20b]
            )
        )
        for i, aid in enumerate(aids3):
            aids_temp[aid] += 0.3
        aids3 = list(
            itertools.chain(
                *[
                    top_20_test2[aid][: 20 * 2]
                    for aid in unique_aids[:2]
                    if aid in top_20_test2
                ]
            )
        )
        for i, aid in enumerate(aids3):
            aids_temp[aid] += 0.6

        # aids3 = list(itertools.chain(*[top_20[aid][:10] for aid in click_aids[:5] if aid in top_20]))
        # for i,aid in enumerate(aids3):
        #    aids_temp[aid] += 0.3

        # Session aids are NOT filtered out here; pad with popular clicks if short.
        result = [k for k, v in aids_temp.most_common(ITEM_CT)]
        return session, (result + top_clicks[: ITEM_CT - len(result)])[:ITEM_CT]
        # return sorted_aids

    # Short sessions: accumulate neighbour scores from several lookups.
    # The order of the updates matters: Counter.most_common breaks ties by
    # insertion order.
    aids_temp = Counter()

    # NEW
    MM = 4
    aids2 = list(
        itertools.chain(
            *[top_day[aid][: 10 * MM] for aid in unique_aids6 if aid in top_day]
        )
    )
    for i, aid in enumerate(aids2):
        aids_temp[aid] += 1

    # NEW NEW
    # ln0 = len(click_aids)
    aids2 = list(
        itertools.chain(*[top_20[aid][:20] for aid in click_aids if aid in top_20])
    )
    for i, aid in enumerate(aids2):
        aids_temp[aid] += 0.5
        # if i%20==0: aids_temp[aid] += 0.5

    weights3 = [2, 2] + [1] * 28
    if len(unique_aids) == 1:
        aids5 = list(
            itertools.chain(
                *[
                    top_20_new2[aid][:30]
                    for aid in unique_aids[-1:]
                    if aid in top_20_new2
                ]
            )
        )
        # NOTE(review): the weights are tiled per full block of 30 neighbours;
        # if the lookup returns fewer than 30 entries, w5 is empty and zip()
        # silently drops every neighbour.
        w5 = weights3 * int(len(aids5) // 30)
        for aid, w in zip(aids5, w5):
            aids_temp[aid] += w

    # aids2 = list(itertools.chain(*[top_20[aid][:20*2] for aid in unique_aids if aid in top_20]))
    # for i,aid in enumerate(aids2):
    #    m = 0.1 + 0.9*(ln-(i//(20*2)))/ln
    #    aids_temp[aid] += m
    #    if i%(20*2)==0: aids_temp[aid] += m

    # FROM GIBA
    for i, a in enumerate(unique_aids):
        w0 = np.max(
            [1 - (0.35 * i), 0.001]
        )  # Weight aid order starting from the last one.
        if a in top_20:
            for j, aj in enumerate(top_20[a]):
                w1 = np.max(
                    [1 - (0.005 * j), 0.01]
                )  # Weight the candidate aid from the dict
                aids_temp[aj] += w0 * w1

    aids3 = list(
        itertools.chain(
            *[top_20b[aid][: 20 * 2] for aid in unique_aids[:2] if aid in top_20b]
        )
    )
    for i, aid in enumerate(aids3):
        aids_temp[aid] += 1
        # The first neighbour of each block of 40 gets the bonus twice.
        # Blocks are assumed to hold 40 entries each.
        if i % (20 * 2) == 0:
            aids_temp[aid] += 1

    aids3 = list(
        itertools.chain(
            *[
                top_20_test2[aid][: 20 * 2]
                for aid in unique_aids[:2]
                if aid in top_20_test2
            ]
        )
    )
    for i, aid in enumerate(aids3):
        aids_temp[aid] += 1
        if i % (20 * 2) == 0:
            aids_temp[aid] += 1

    # TRY GIBA HERE
    aids4 = list(
        itertools.chain(
            *[top_20f[aid][: 10 * 2] for aid in unique_aids4 if aid in top_20f]
        )
    )
    for i, aid in enumerate(aids4):
        w = i // (10 * 2)
        aids_temp[aid] += 1 - w * 0.1
        if i % (10 * 2) == 0:
            aids_temp[aid] += 1 - w * 0.1

    aids5 = list(
        itertools.chain(
            *[top_20e[aid][: 20 * 2] for aid in unique_aids3 if aid in top_20e]
        )
    )
    for i, aid in enumerate(aids5):
        aids_temp[aid] += 1
        if i % (20 * 2) == 0:
            aids_temp[aid] += 1
    # Second hop: take the current best candidate (if it is not a session aid)
    # and boost its own top_20c neighbours.
    top_aids2 = [k for k, v in aids_temp.most_common(1) if k not in unique_aids]

    aids3 = list(
        itertools.chain(
            *[top_20c[aid][: 10 * 2] for aid in top_aids2 if aid in top_20c]
        )
    )
    for i, aid in enumerate(aids3):
        aids_temp[aid] += 1
        if i % (10 * 2) == 0:
            aids_temp[aid] += 1
    top_aids2 = [k for k, v in aids_temp.most_common(ITEM_CT) if k not in unique_aids]

    # Session aids come first, then the best new candidates, then popular clicks as padding.
    result = unique_aids + top_aids2[: ITEM_CT - len(unique_aids)]
    return session, (result + top_clicks[: ITEM_CT - len(result)])[:ITEM_CT]

In [22]:
def suggest_orders(df):
    """Build the cart/order-oriented candidate list for a single session.

    Args:
        df: Indexable per-session record. Positions read: 0 (session id),
            1 (aids), 2 (event types), 3 (timestamps), 4 (``ds``),
            5 (``ds1``), 6 (``ds2``), 7 (``days``).
            NOTE(review): the sequences are assumed to be parallel and in
            chronological order -- confirm against the code that builds the
            ``group_*.pkl`` lists.

    Returns:
        ``(session, candidates)`` with at most ``ITEM_CT`` aids; session aids
        are kept in the output. When ``MODE == "test"`` the list is padded
        with ``top_orders`` up to ``ITEM_CT``; otherwise it is returned
        unpadded, except that an empty result falls back to ``top_orders[:1]``.

    Raises:
        KeyError: if ``session`` is missing from the global ``click_df`` dict.

    Relies on module-level globals: ``click_df``, the ``top_20*`` /
    ``top_40_day`` lookups, ``top_orders``, ``type_weight_multipliers``,
    ``MODE``, ``ITEM_CT`` and ``ITEM_CT3``.
    """
    session = df[0]
    aids = df[1]
    types = df[2]
    tss = df[3]
    ds = df[4]
    ds1 = df[5]
    ds2 = df[6]
    # NOTE(review): days is read but never used in this function.
    days = df[7]

    # top_day = top_40_day[ df.day.values[0] ]
    top_day = top_40_day
    # click_aids = click_df[df.session.values[0]][:ITEM_CT2]
    # First ITEM_CT3 "fast click" candidates previously produced for this session.
    click_aids = click_df[session][:ITEM_CT3]

    # aids = df.aid.tolist()
    # types = df.type.tolist()
    # Unique aids in reverse event order (first entry = last event of the session).
    unique_aids = list(dict.fromkeys(aids[::-1]))

    # mx = df.d.max()
    # aids2 = df.loc[df.d==mx].aid.tolist()
    # unique_aids4 = list(dict.fromkeys(aids2[::-1] )) # last session
    mx = np.max(ds)
    unique_aids4 = list(
        dict.fromkeys([f for i, f in enumerate(aids) if ds[i] == mx][::-1])
    )

    # mx = df.ts.max()
    # aids2 = df.loc[df.ts >= mx - 60*60/2].aid.tolist()
    # unique_aids5 = list(dict.fromkeys(aids2[::-1] ))
    # Aids seen within 60 * 60 / 2 = 1800 timestamp units of the latest event.
    # NOTE(review): the legacy comment said "recent 1 hour", but 1800 is half an
    # hour if timestamps are in seconds -- TODO confirm the unit and the intent.
    mx = np.max(tss)
    unique_aids5 = list(
        dict.fromkeys(
            [f for i, f in enumerate(aids) if tss[i] >= mx - 60 * 60 / 2][::-1]
        )
    )

    # aids2 = df.loc[df.ts >= mx - 60*60*24].aid.tolist()
    # unique_aids6 = list(dict.fromkeys(aids2[::-1] )) #recent 1 day
    unique_aids6 = list(
        dict.fromkeys(
            [f for i, f in enumerate(aids) if tss[i] >= mx - 60 * 60 * 24][::-1]
        )
    )

    # df2 = df.drop_duplicates('d')
    # aids2 = df2.aid.tolist()
    # unique_aids2 = list(dict.fromkeys(aids2[::-1] )) #first of each session
    unique_aids2 = list(
        dict.fromkeys([f for i, f in enumerate(aids) if ds1[i] == 1][::-1])
    )

    # df2 = df.sort_values('ts',ascending=False).drop_duplicates('d')
    # aids2 = df2.aid.tolist()
    # unique_aids3 = list(dict.fromkeys(aids2 )) #last of each session
    unique_aids3 = list(
        dict.fromkeys([f for i, f in enumerate(aids) if ds2[i] == 1][::-1])
    )

    # df = df.loc[ df['type'].isin([1,2]) ]
    # unique_buys = list(dict.fromkeys( df.aid.tolist()[::-1] ))
    # Unique aids that were carted or ordered (type 1 or 2), in reverse event order.
    unique_buys = list(
        dict.fromkeys([f for i, f in enumerate(aids) if types[i] in [1, 2]][::-1])
    )

    # Long sessions (>= 20 unique aids): rank mostly the session's own aids,
    # with small bonuses for neighbours found in the lookups.
    if len(unique_aids) >= 20:
        # Recency weights grow from 2**0.5 - 1 (first event) to 2**1 - 1 (last event).
        weights = np.logspace(0.5, 1, len(aids), base=2, endpoint=True) - 1
        aids_temp = Counter()
        for aid, w, t in zip(aids, weights, types):
            aids_temp[aid] += w * type_weight_multipliers[t]
        for aid in unique_aids2:
            aids_temp[aid] += 0.5
        for aid in unique_aids3:
            aids_temp[aid] += 0.5

        aids3 = list(
            itertools.chain(
                *[
                    top_20_buy2buy[aid][:40]
                    for aid in unique_buys
                    if aid in top_20_buy2buy
                ]
            )
        )
        for i, aid in enumerate(aids3):
            aids_temp[aid] += 0.05
            # The first neighbour of each block of 40 gets the bonus twice.
            # Blocks are assumed to hold 40 entries each.
            if i % 40 == 0:
                aids_temp[aid] += 0.05
        aids3 = list(
            itertools.chain(
                *[
                    top_20_buy2buy2[aid][:40]
                    for aid in unique_buys
                    if aid in top_20_buy2buy2
                ]
            )
        )
        for i, aid in enumerate(aids3):
            aids_temp[aid] += 0.1
            if i % 40 == 0:
                aids_temp[aid] += 0.1

        aids4 = list(
            itertools.chain(
                *[top_20_test[aid][:40] for aid in unique_aids if aid in top_20_test]
            )
        )
        for i, aid in enumerate(aids4):
            aids_temp[aid] += 0.05
            if i % 40 == 0:
                aids_temp[aid] += 0.05
        aids5 = list(
            itertools.chain(
                *[top_20c[aid][:20] for aid in unique_aids[:1] if aid in top_20c]
            )
        )
        for i, aid in enumerate(aids5):
            aids_temp[aid] += 0.05
            if i % 20 == 0:
                aids_temp[aid] += 0.05
        aids6 = list(
            itertools.chain(
                *[top_20d[aid][:20] for aid in unique_buys[:1] if aid in top_20d]
            )
        )
        for i, aid in enumerate(aids6):
            aids_temp[aid] += 0.05
            if i % 20 == 0:
                aids_temp[aid] += 0.05

        aids7 = list(
            itertools.chain(
                *[top_20b[aid][:5] for aid in unique_aids3 if aid in top_20b]
            )
        )
        for i, aid in enumerate(aids7):
            aids_temp[aid] += 0.25
            if i % 5 == 0:
                aids_temp[aid] += 0.25
        aids7 = list(
            itertools.chain(
                *[top_20b[aid][:5] for aid in unique_aids2 if aid in top_20b]
            )
        )
        for i, aid in enumerate(aids7):
            aids_temp[aid] += 0.125
            if i % 5 == 0:
                aids_temp[aid] += 0.125

        # NEW STUFF
        aids4 = list(
            itertools.chain(
                *[top_day[aid][:40] for aid in unique_aids6 if aid in top_day]
            )
        )
        for i, aid in enumerate(aids4):
            aids_temp[aid] += 0.05
            if i % 40 == 0:
                aids_temp[aid] += 0.05
        aids4 = list(
            itertools.chain(
                *[top_20_test[aid][:20] for aid in click_aids if aid in top_20_test]
            )
        )
        for i, aid in enumerate(aids4):
            aids_temp[aid] += 0.05
            if i % 20 == 0:
                aids_temp[aid] += 0.05
        aids4 = list(
            itertools.chain(
                *[top_20_buy[aid][:20] for aid in click_aids if aid in top_20_buy]
            )
        )
        for i, aid in enumerate(aids4):
            aids_temp[aid] += 0.05
            if i % 20 == 0:
                aids_temp[aid] += 0.05
        for aid in click_aids:
            aids_temp[aid] += 0.05

        result = [k for k, v in aids_temp.most_common(ITEM_CT)]

        # Only the test split is padded with popular orders; other modes keep
        # the raw list (with a one-item fallback when it is empty).
        if MODE != "test":
            if len(result) < 1:
                result += top_orders[:1]
            return session, result[:ITEM_CT]
        else:
            return session, (result + top_orders[: ITEM_CT - len(result)])[:ITEM_CT]

    #         return session, (result + top_orders[: ITEM_CT - len(result)])[:ITEM_CT]

    # Per-block weight templates: the first two neighbours of a block count double.
    # Lengths are 10, 55, 20, 40 and 30, matching the slice sizes used below.
    # NOTE(review): each template is tiled with `template * (len(neighbours) // block)`
    # and consumed via zip(); if a lookup returns fewer entries than the block
    # size, the weights misalign and zip() silently truncates the neighbours.
    weights = [2, 2] + [1] * 8  # + [0]*30
    weights2 = [2, 2] + [1] * 53  # + [0]*25
    weights3 = [2, 2] + [1] * 18  # + [0]*70
    weights4 = [2, 2] + [1] * 38  # + [0]*70
    weights5 = [2, 2] + [1] * 28  # + [0]*70

    ln = len(unique_aids)

    # Short sessions: accumulate neighbour scores from several lookups.
    # The order of the updates matters: Counter.most_common breaks ties by
    # insertion order.
    MM = 3
    aids_temp = Counter()
    aids2 = list(
        itertools.chain(
            *[
                top_20_orders[aid][: 10 * MM]
                for aid in unique_aids
                if aid in top_20_orders
            ]
        )
    )
    w2 = weights5 * int(len(aids2) // (10 * MM))
    aids3 = list(
        itertools.chain(
            *[
                top_20_buy2buy[aid][: 10 * MM]
                for aid in unique_buys
                if aid in top_20_buy2buy
            ]
        )
    )
    w3 = weights5 * int(len(aids3) // (10 * MM))
    aids4 = list(
        itertools.chain(
            *[top_20_test[aid][: 10 * MM] for aid in unique_aids if aid in top_20_test]
        )
    )
    w4 = weights5 * int(len(aids4) // (10 * MM))
    aids5 = list(
        itertools.chain(
            *[
                top_20_buy2buy2[aid][: 10 * MM]
                for aid in unique_buys
                if aid in top_20_buy2buy2
            ]
        )
    )
    w5 = weights5 * int(len(aids5) // (10 * MM))
    for i, (aid, w) in enumerate(zip(aids2, w2)):
        # i // block is used as the rank of the source aid; m decays from 1.0 towards 0.25.
        m = 0.25 + 0.75 * (ln - (i // (10 * MM))) / ln
        aids_temp[aid] += w * m
    for i, (aid, w) in enumerate(zip(aids3, w3)):
        aids_temp[aid] += w / 2
    for i, (aid, w) in enumerate(zip(aids4, w4)):
        m = 0.25 + 0.75 * (ln - (i // (10 * MM))) / ln
        aids_temp[aid] += w * m
    for i, (aid, w) in enumerate(zip(aids5, w5)):
        aids_temp[aid] += w / 2

    # NEW
    MM = 4
    aids2 = list(
        itertools.chain(
            *[top_day[aid][: 10 * MM] for aid in unique_aids6 if aid in top_day]
        )
    )
    w2 = weights4 * int(len(aids2) // (10 * MM))
    for i, (aid, w) in enumerate(zip(aids2, w2)):
        # NOTE(review): m and w are computed but unused here; a flat +1 is applied.
        # w2 still bounds how many neighbours are visited via zip().
        m = 0.25 + 0.75 * (ln - (i // (10 * MM))) / ln
        aids_temp[aid] += 1  # w*m

    # NEW
    ln0 = len(click_aids)
    aids4 = list(
        itertools.chain(
            *[top_20_test[aid][:20] for aid in click_aids if aid in top_20_test]
        )
    )
    w4 = weights3 * int(len(aids4) // (20))
    for i, (aid, w) in enumerate(zip(aids4, w4)):
        m = 0.25 + 0.75 * (ln0 - (i // (20))) / ln0
        aids_temp[aid] += w * m
    aids4 = list(
        itertools.chain(
            *[top_20_buy[aid][:20] for aid in click_aids if aid in top_20_buy]
        )
    )
    w4 = weights3 * int(len(aids4) // (20))
    for i, (aid, w) in enumerate(zip(aids4, w4)):
        m = 0.25 + 0.75 * (ln0 - (i // (20))) / ln0
        aids_temp[aid] += w * m
    for aid in click_aids:
        aids_temp[aid] += 1

    aids5 = list(
        itertools.chain(
            *[top_20c[aid][:55] for aid in unique_aids[:1] if aid in top_20c]
        )
    )
    w5 = weights2 * int(len(aids5) // 55)
    for aid, w in zip(aids5, w5):
        aids_temp[aid] += w

    # Extra lookups for sessions that contain a single unique aid.
    if len(unique_aids) == 1:
        aids5 = list(
            itertools.chain(
                *[
                    top_20_new2[aid][:20]
                    for aid in unique_aids[-1:]
                    if aid in top_20_new2
                ]
            )
        )
        w5 = weights3 * int(len(aids5) // 20)
        for aid, w in zip(aids5, w5):
            aids_temp[aid] += w
        aids5 = list(
            itertools.chain(
                *[top_20_new[aid][:20] for aid in unique_aids[-1:] if aid in top_20_new]
            )
        )
        w5 = weights3 * int(len(aids5) // 20)
        for aid, w in zip(aids5, w5):
            aids_temp[aid] += w

    aids5 = list(
        itertools.chain(
            *[top_20d[aid][:20] for aid in unique_buys[:1] if aid in top_20d]
        )
    )
    w5 = weights3 * int(len(aids5) // 20)
    for aid, w in zip(aids5, w5):
        aids_temp[aid] += w

    ln2 = len(unique_aids5)
    aids5 = list(
        itertools.chain(
            *[top_20_buy[aid][:20] for aid in unique_aids5 if aid in top_20_buy]
        )
    )
    w5 = weights3 * int(len(aids5) // 20)
    for aid, w in zip(aids5, w5):
        # Normalised by the number of recent aids; the loop body only runs when
        # aids5 is non-empty, which requires ln2 > 0.
        aids_temp[aid] += 2 * w / ln2

    aids4 = list(
        itertools.chain(*[top_20f[aid][:5] for aid in unique_aids4 if aid in top_20f])
    )
    for i, aid in enumerate(aids4):
        w = i // 5
        aids_temp[aid] += 1 / 2 - w * 0.05
        if i % 5 == 0:
            aids_temp[aid] += 1 / 2 - w * 0.05
    aids5 = list(
        itertools.chain(*[top_20e[aid][:55] for aid in unique_aids3 if aid in top_20e])
    )
    w5 = weights2 * int(len(aids5) // 55)
    for i, (aid, w) in enumerate(zip(aids5, w5)):
        w2 = i // 55
        aids_temp[aid] += w - w2 * 0.1
    aids5 = list(
        itertools.chain(*[top_20e[aid][:10] for aid in unique_aids2 if aid in top_20e])
    )
    w5 = weights * int(len(aids5) // 10)
    for i, (aid, w) in enumerate(zip(aids5, w5)):
        w2 = i // 10
        aids_temp[aid] += w / 2.0 - w2 * 0.05

    # Session aids come first, followed by the best-scoring new candidates.
    sorted_aids = [k for k, v in aids_temp.most_common(ITEM_CT) if k not in unique_aids]
    result = unique_aids + sorted_aids[: ITEM_CT - len(unique_aids)]

    if MODE != "test":
        if len(result) < 1:
            result += top_orders[:1]
        return session, result[:ITEM_CT]
    else:
        return session, (result + top_orders[: ITEM_CT - len(result)])[:ITEM_CT]

## Main

In [23]:
%%time
# Number of pickled chunks the per-session lists were split into.
PIECES = 10

# Concatenate every chunk into one flat list of per-session records.
# NOTE: pickle.load executes arbitrary code; only load files produced by this project.
valid_bysession_list = []
for PART in range(PIECES):
    with open(LIST_FOLDER + f"group_{PART}.pkl", "rb") as f:
        valid_bysession_list += pickle.load(f)

print(len(valid_bysession_list))

FileNotFoundError: [Errno 2] No such file or directory: '../output/lists/val/group_0.pkl'

### Fast Clicks

In [24]:
%%time
# The fast-click candidates are cached on disk; they are only recomputed when
# the parquet file for this MODE is missing.
fast_clicks_path = f"../output/candidates/clicks_fast_{MODE}.parquet"

if not os.path.exists(fast_clicks_path):
    preds = df_parallelize_run(suggest_aids, valid_bysession_list)

    # Each prediction is a (session, candidates) pair.
    pred_df = cudf.DataFrame(
        cudf.Series(
            [pred[1] for pred in preds],
            index=[pred[0] for pred in preds],
        )
    ).reset_index()
    pred_df.columns = ["session", "candidates"]

    pred_df.to_parquet(fast_clicks_path)

ValueError: Number of processes must be at least 1

In [25]:
# Plain dict mapping session -> fast-click candidates.
# suggest_clicks and suggest_orders read it as a global via click_df[session],
# so the fast-clicks parquet file for this MODE must exist before they run.
click_df = (
    cudf.read_parquet(f"../output/candidates/clicks_fast_{MODE}.parquet")
    .set_index("session")["candidates"]
    .to_pandas()
    .to_dict()
)

FileNotFoundError: /home/datvodinh/Workspace/otto-recsys/kaggle_otto_rs/src/../output/candidates/clicks_fast_val.parquet

### Main

In [None]:
%%time
# Pick the suggester once, then run it over every session in parallel.
suggest_fn = suggest_clicks if CLICKS else suggest_orders

preds = df_parallelize_run(suggest_fn, valid_bysession_list)

# Series indexed by session id whose values are the candidate lists.
pred_df = pd.Series(
    [pred[1] for pred in preds],
    index=[pred[0] for pred in preds],
)

### Coverage

In [None]:
# The same candidate list is replicated for the three targets by suffixing the
# session index ("<session>_clicks", "<session>_orders", "<session>_carts").
# NOTE: this cell rebinds pred_df from a Series to a DataFrame, so it cannot be
# re-run without re-running the prediction cell first.
clicks_pred_df, orders_pred_df, carts_pred_df = (
    pd.DataFrame(pred_df.add_suffix(suffix), columns=["labels"]).reset_index()
    for suffix in ("_clicks", "_orders", "_carts")
)

pred_df = pd.concat(
    [clicks_pred_df, orders_pred_df, carts_pred_df],
    ignore_index=True,
)
pred_df.columns = ["session_type", "labels_l"]

# Space-separated string version of each candidate list.
pred_df["labels"] = pred_df["labels_l"].apply(
    lambda aids: " ".join(str(aid) for aid in aids)
)

In [None]:
# Candidate coverage against the ground truth (skipped for the test split).
# NOTE(review): GT_FILE is only assigned in this notebook for MODE == "extra";
# for "val" it must come from `from params import *` -- confirm.
if MODE != "test":
    gt = pd.read_parquet(GT_FILE)

    df_pred = pred_df[["session_type", "labels_l"]].rename(
        columns={"labels_l": "candidates"}
    )

    # "session_type" looks like "<session>_<type>"; split it back into two columns.
    session_type_parts = df_pred["session_type"].str.split("_")
    df_pred["session"] = session_type_parts.str[0].astype(int)
    df_pred["type"] = session_type_parts.str[1]

    df_pred = df_pred.merge(gt, on=["session", "type"], how="left")

    recs = []
    for col in CLASSES:
        df_pred_c = df_pred[df_pred["type"] == col]

        n_preds, n_gts, n_found = get_coverage(
            df_pred_c["candidates"].values,
            df_pred_c["ground_truth"].values,
        )
        print(
            f"- {col} \t- Found {numerize(n_found)} GTs with {numerize(n_preds)} candidates (pos_prop={n_found / n_preds * 100 :.2f}%)\t-  Highest reachable Recall : {n_found / n_gts :.4f}"
        )

        recs.append(n_found / n_gts)

    # Weighted average of the per-class reachable recalls.
    cv = np.average(recs, weights=WEIGHTS)
    print(f"\n-> CV : {cv:.4f}")

    # Free the large intermediate frames.
    del clicks_pred_df, orders_pred_df, carts_pred_df, df_pred
    gc.collect()

- 75
 - clicks 	- Found 1.11M GTs with 134.98M candidates (pos_prop=0.82%)	-  Highest reachable Recall : 0.6312
 - carts 	- Found 293.95K GTs with 134.98M candidates (pos_prop=0.22%)	-  Highest reachable Recall : 0.5103
 - orders 	- Found 222.1K GTs with 134.98M candidates (pos_prop=0.16%)	-  Highest reachable Recall : 0.7090
- 50
 - clicks 	- Found 1.05M GTs with 90.01M candidates (pos_prop=1.17%)	-  Highest reachable Recall : 0.5999
 - carts 	- Found 279.92K GTs with 90.01M candidates (pos_prop=0.31%)	-  Highest reachable Recall : 0.4859
 - orders 	- Found 217.58K GTs with 90.01M candidates (pos_prop=0.24%)	-  Highest reachable Recall : 0.6946
- 50 clicks
 - clicks 	- Found 1.09M GTs with 90.06M candidates (pos_prop=1.21%)	-  Highest reachable Recall : 0.6224
 - carts 	- Found 272.36K GTs with 90.06M candidates (pos_prop=0.30%)	-  Highest reachable Recall : 0.4728
 - orders 	- Found 211.46K GTs with 90.06M candidates (pos_prop=0.23%)	-  Highest reachable Recall : 0.6751

### Save

In [None]:
# One row per session: strip the "_<type>" suffix, keep the first of the three
# replicated rows and order by session id.
df_candids = (
    pred_df[["session_type", "labels_l"]]
    .rename(columns={"session_type": "session", "labels_l": "candidates"})
    .assign(
        session=lambda frame: frame["session"]
        .apply(lambda session_type: session_type.split("_")[0])
        .astype("int32")
    )
    .drop_duplicates(subset="session", keep="first")
    .reset_index(drop=True)
    .sort_values("session", ignore_index=True)
)

In [None]:
# Attach one ground-truth column per class (gt_<class>) to the candidates.
if MODE != "test":
    gt = pd.read_parquet(GT_FILE)
    gt["ground_truth"] = gt["ground_truth"].apply(lambda x: x.tolist())

    for col in CLASSES:
        # Skip classes that were already merged so the cell can be re-run.
        if f"gt_{col}" in df_candids.columns:
            continue

        # The merge uses the shared columns of both frames as keys.
        df_candids = df_candids.merge(
            gt.loc[gt["type"] == col].drop(columns="type"), how="left"
        ).rename(columns={"ground_truth": f"gt_{col}"})

In [None]:
# Convert the candidates with the project's `explode` helper, then save them.
# NOTE(review): presumably yields one row per (session, candidate) -- confirm in data.candidates_chris.
df_candids = explode(df_candids, test=(MODE == "test"))

# The output name combines the version tag (SUFFIX) and the dataset split (MODE).
df_candids.to_parquet(
    f"../output/candidates/candidates_{SUFFIX}_{MODE}.parquet", index=False
)
print(f"Saved to ../output/candidates/candidates_{SUFFIX}_{MODE}.parquet")

## Theo's version

In [None]:
from data.candidates import (
    load_parquets,
    create_candidates,
    explode,
    matrix_to_candids_dict,
)

### Params

In [None]:
# Dataset variant for this section; the path-selection cell below accepts
# "val", "test" or "extra". This overrides the MODE set earlier in the notebook.
MODE = "extra"
# Tag embedded in the name of the saved candidates file.
SUFFIX = "v5"  # 6 for new matrices

In [None]:
# Number embedded in the matrix file names that are loaded further down.
N_MATRIX = 20
# Forwarded as `max_cooc` to `create_candidates`.
MAX_COOC = 100

In [None]:
# Input locations for the selected MODE.
# GT_FILE is assigned for every mode that has labels ("val" and "extra") so the
# coverage cell cannot silently reuse a stale GT_FILE left over from an earlier
# section run with a different MODE.
if MODE == "val":
    PARQUET_FILES = "../output/val_parquet/*"
    GT_FILE = "../output/val_labels.parquet"
elif MODE == "test":
    PARQUET_FILES = "../output/test_parquet/*"
elif MODE == "extra":
    PARQUET_FILES = "../output/val_trimmed_parquet/*"
    GT_FILE = "../output/val_labels_trimmed.parquet"
else:
    raise NotImplementedError(f"Unsupported MODE: {MODE!r}")

# Folder holding the precomputed matrices read by the loading cell.
MATRIX_FOLDER = "../output/matrices/"
# MATRIX_FOLDER = "../output/matrices_2/"

### Load

In [None]:
# Load the parquet files matched by PARQUET_FILES (project helper) and order the
# rows by "session", then "ts", with a fresh index.
df = (
    load_parquets(PARQUET_FILES)
    .sort_values(["session", "ts"])
    .reset_index(drop=True)
)

In [None]:
if MODE == "extra":  # Remove useless sessions for speed-up
    gt = pd.read_parquet(GT_FILE)
    # Sessions that have at least one ground-truth row whose type is not "clicks".
    kept_sessions = gt[gt["type"] != "clicks"].drop("ground_truth", axis=1)
    kept_sessions = kept_sessions.drop_duplicates(subset="session", keep="first")

    prev_len = len(df)
    df = (
        # Left join tags kept sessions with a non-null "type_x"; other rows get NaN.
        df.merge(kept_sessions, on="session", how="left", suffixes=("", "_x"))
        # `axis` must be passed by keyword: positional arguments to `dropna`
        # were deprecated in pandas 1.5 and removed in pandas 2.0.
        # Note that this drops rows with a missing value in ANY column.
        .dropna(axis=0)
        .drop("type_x", axis=1)
        .reset_index(drop=True)
    )

    factor = prev_len / len(df)
    print(f"Reduced dataset size by {factor:.1f}x")

In [None]:
# "extra" reuses the matrices saved under the "val" tag; other modes use their own.
MODE_ = MODE if MODE != "extra" else "val"

# Each matrix is read with cudf and handed straight to the project helper, so no
# extra reference to the intermediate frame is kept.
clicks_candids = matrix_to_candids_dict(
    cudf.read_parquet(f"{MATRIX_FOLDER}matrix_123_temporal_{N_MATRIX}_{MODE_}.pqt")
)
type_weighted_candids = matrix_to_candids_dict(
    cudf.read_parquet(f"{MATRIX_FOLDER}matrix_123_type0.590.5_{N_MATRIX}_{MODE_}.pqt")
)

### Candidates

In [None]:
%%time
# Run the project helper `create_candidates` on the session frame with both
# candidate dictionaries; MAX_COOC is forwarded as `max_cooc`.
# NOTE(review): `df` is overwritten with the helper's output, so this cell is
# presumably not meant to be re-run without reloading `df` — confirm.
df = create_candidates(df, clicks_candids, type_weighted_candids, max_cooc=MAX_COOC)

### Coverage

In [None]:
# Candidate coverage per entry of CLASSES, plus the WEIGHTS-weighted average.
# Skipped when MODE is "test".
if MODE != "test":
    gt = pd.read_parquet(GT_FILE)
    recalls = []

    for col in CLASSES:
        gt_col = f"gt_{col}"

        # Merge this type's ground truth once; later runs find the column in place.
        if gt_col not in df.columns:
            labels_of_type = gt[gt["type"] == col].drop(columns="type")
            df = df.merge(labels_of_type, how="left").rename(
                columns={"ground_truth": gt_col}
            )

        n_preds, n_gts, n_found = get_coverage(
            df["candidates"].values, df[gt_col].values
        )

        print(
            f"- {col} \t- Found {numerize(n_found)} GTs with {numerize(n_preds)} candidates (pos_prop={n_found / n_preds * 100 :.2f}%)\t-  Highest reachable Recall : {n_found / n_gts :.4f}"
        )
        recalls.append(n_found / n_gts)

    cv = np.average(recalls, weights=WEIGHTS)
    print(f"\n-> Highest reachable CV : {cv:.3f}")

- clicks 	- Found 1.02M GTs with 89.04M candidates (pos_prop=1.14%)	-  Highest reachable Recall : 0.5806
- carts 	- Found 277.39K GTs with 89.04M candidates (pos_prop=0.31%)	-  Highest reachable Recall : 0.4816
- orders 	- Found 217.5K GTs with 89.04M candidates (pos_prop=0.24%)	-  Highest reachable Recall : 0.6944

-> Highest reachable CV : 0.619

### Explode & save

In [None]:
# Run the project helper `explode` on the candidate frame; its `test` flag is
# True only when MODE is "test".
# NOTE(review): `df` is overwritten in place, so re-running this cell feeds the
# already-exploded frame back into `explode` — confirm that is harmless.
df = explode(df, test=(MODE == "test"))

In [None]:
# Write the candidates to parquet (index not stored); the file name is built
# from SUFFIX and MODE, and the same path is echoed for the log.
df.to_parquet(f"../output/candidates/candidates_{SUFFIX}_{MODE}.parquet", index=False)
print(f"Saved to ../output/candidates/candidates_{SUFFIX}_{MODE}.parquet")

In [None]:
# Drop the reference to the saved frame and trigger a garbage-collection pass
# before the blend step loads the saved files again.
del df
gc.collect()

### Blend Candidates

In [None]:
# Blend configuration.
MODE = "extra"  # "test"  "extra"
CLICKS = False

# Tag of the blended output file; the clicks variant is prefixed with "clicks_".
SUFFIX = ("clicks_" if CLICKS else "") + "cv7+-tv5"

# Labels are only configured here for the "extra" split.
if MODE == "extra":
    GT_FILE = "../output/val_labels_trimmed.parquet"

In [None]:
# Load the two "c-*-v7" candidate files saved for the current MODE, using cudf.
# NOTE(review): the inline TODO mentions v7 and the path already points at v7 —
# confirm whether the TODO is still open.
chris_candids_clicks = cudf.read_parquet(
    f"../output/candidates/candidates_c-clicks-v7_{MODE}.parquet"  # TODO : v7
)

chris_candids = cudf.read_parquet(
    f"../output/candidates/candidates_c-orders-v7_{MODE}.parquet"
)

In [None]:
# Load the "v5" candidate file saved for the current MODE, using cudf.
theo_candids = cudf.read_parquet(f"../output/candidates/candidates_v5_{MODE}.parquet")

In [None]:
# Stack the candidate sources. Order matters: for a repeated
# (session, candidates) pair only the first occurrence is kept.
candids = cudf.concat(
    [
        chris_candids_clicks,  # comment for cv7-tv5
        chris_candids,
        theo_candids,
    ]
)
candids = candids.drop_duplicates(subset=["session", "candidates"], keep="first")
candids = candids.reset_index(drop=True)

In [None]:
# dfg = candids.groupby('session').count()

In [None]:
# Write the blended candidates to parquet (index not stored); the file name is
# built from SUFFIX and MODE, and the same path is echoed for the log.
candids.to_parquet(
    f"../output/candidates/candidates_{SUFFIX}_{MODE}.parquet", index=False
)
print(f"Saved to ../output/candidates/candidates_{SUFFIX}_{MODE}.parquet")

#### Coverage

In [None]:
# Coverage of the blended candidates per entry of CLASSES, plus the weighted CV.
# Skipped when MODE is "test".
if MODE != "test":
    # Collapse the (session, candidate) rows into one candidate list per session
    # and move the result from cudf to pandas.
    df = candids[["session", "candidates"]].groupby("session").agg(list)
    df = df.reset_index().to_pandas()

    GT_FILE = '../output/val_labels.parquet' if MODE == "val" else '../output/val_labels_trimmed.parquet'
    gt = pd.read_parquet(GT_FILE)

    recalls = []
    for col in CLASSES:
        # Merge this type's ground truth once; later runs find the column in place.
        if f"gt_{col}" not in df.columns:
            df = df.merge(
                gt[gt["type"] == col].drop("type", axis=1), how="left"
            ).rename(columns={"ground_truth": f"gt_{col}"})

        n_preds, n_gts, n_found = get_coverage(
            df["candidates"].values, df[f"gt_{col}"].values
        )

        print(
            f"- {col} \t- Found {numerize(n_found)} GTs with {numerize(n_preds)} candidates (pos_prop={n_found / n_preds * 100 :.2f}%)\t-  Highest reachable Recall : {n_found / n_gts :.4f}"
        )
        recalls.append(n_found / n_gts)

    # `recalls` used to be collected and then dropped; report the weighted
    # average the same way the single-source coverage cell does.
    cv = np.average(recalls, weights=WEIGHTS)
    print(f"\n-> Highest reachable CV : {cv:.3f}")

**Chris v7 clicks + Theo v5**
- clicks 	- Found 1.12M GTs with 128.21M candidates (pos_prop=0.87%)	-  Highest reachable Recall : 0.6371
- carts 	- Found 293.55K GTs with 128.21M candidates (pos_prop=0.23%)	-  Highest reachable Recall : 0.5096
- orders 	- Found 222.12K GTs with 128.21M candidates (pos_prop=0.17%)	-  Highest reachable Recall : 0.7091

**Chris v9 + Theo v5**
- clicks 	- Found 1.16M GTs with 206.7M candidates (pos_prop=0.56%)	-  Highest reachable Recall : 0.6590
- carts 	- Found 309.85K GTs with 206.7M candidates (pos_prop=0.15%)	-  Highest reachable Recall : 0.5379
- orders 	- Found 227.4K GTs with 206.7M candidates (pos_prop=0.11%)	-  Highest reachable Recall : 0.7260

**Chris v8 + Theo v5**
- clicks 	- Found 1.13M GTs with 167.12M candidates (pos_prop=0.68%)	-  Highest reachable Recall : 0.6443
- carts 	- Found 303.47K GTs with 167.12M candidates (pos_prop=0.18%)	-  Highest reachable Recall : 0.5268
- orders 	- Found 225.46K GTs with 167.12M candidates (pos_prop=0.13%)	-  Highest reachable Recall : 0.7198

**Chris v7 clicks + orders + Theo v5**
- clicks 	- Found 1.14M GTs with 151.14M candidates (pos_prop=0.75%)	-  Highest reachable Recall : 0.6473
- carts 	- Found 301.08K GTs with 151.14M candidates (pos_prop=0.20%)	-  Highest reachable Recall : 0.5227
- orders 	- Found 224.5K GTs with 151.14M candidates (pos_prop=0.15%)	-  Highest reachable Recall : 0.7167

**Chris v7 + Theo v5**
- clicks 	- Found 1.09M GTs with 129.87M candidates (pos_prop=0.84%)	-  Highest reachable Recall : 0.6224
- carts 	- Found 295.05K GTs with 129.87M candidates (pos_prop=0.23%)	-  Highest reachable Recall : **0.5122**
- orders 	- Found 222.88K GTs with 129.87M candidates (pos_prop=0.17%)	-  Highest reachable Recall : **0.7115**

**Clicks Chris v3 + Theo v5**
- clicks 	- Found 1.12M GTs with 129M candidates (pos_prop=0.87%)	-  Highest reachable Recall : **0.6361**
- carts 	- Found 293.98K GTs with 129M candidates (pos_prop=0.23%)	-  Highest reachable Recall : **0.5104**
- orders 	- Found 222.29K GTs with 129M candidates (pos_prop=0.17%)	-  Highest reachable Recall : **0.7097**

**Chris v3 + Theo v5**
- clicks	- Found 1.11M GTs with 128.42M candidates (pos_prop=0.86%)	-  Highest reachable Recall : 0.6298
- carts	- Found 292.58K GTs with 128.42M candidates (pos_prop=0.23%)	-  Highest reachable Recall : 0.5079
- orders	- Found 222.05K GTs with 128.42M candidates (pos_prop=0.17%)	-  Highest reachable Recall : 0.7089

**Chris v4 + Theo v5**
- clicks 	- Found 1.14M GTs with 161.5M candidates (pos_prop=0.70%)	-  Highest reachable Recall : 0.6471
- carts 	- Found 299.54K GTs with 161.5M candidates (pos_prop=0.19%)	-  Highest reachable Recall : 0.5200
- orders 	- Found 224.13K GTs with 161.5M candidates (pos_prop=0.14%)	-  Highest reachable Recall : 0.7155

Done !