# About

## What is done in notebook:

1. Generate covisitation matrices for the train set. Covisitation matrices contain information about which items are "clicked" together. The code for the covisitation matrices is forked and modified (an additional matrix is added for carts, and the constants used in matrix generation are adjusted) from Chris Deotte's covisitation matrices from [here](https://www.kaggle.com/code/cdeotte/candidate-rerank-model-lb-0-575). 

2. Generate 40 candidates for each session from the train set, using the covisitation matrices. While generating candidates, the "weights of similarity" between items are also saved, to be used as features in further models.

# Import

In [1]:
# Version tag that is embedded in the file names of the matrix parts this
# notebook writes (e.g. f'top_20_clicks_v{VER}_{PART}.pqt').
VER = 1

import pandas as pd, numpy as np
import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
import matplotlib.pyplot as plt
import datetime

We will use RAPIDS version 21.10.01


## Config

In [3]:
class cfg:
    """Notebook-wide switches and tuning constants (used as a plain namespace)."""

    # --- environment -------------------------------------------------------
    # Selects between the two sets of input paths / file patterns.
    local = False
    # When True, parquet files are first read into a CPU-side dict and then
    # wrapped into cuDF frames on demand instead of being re-read from disk.
    load_in_cache = False
    print_steps = True

    # --- co-visitation matrices --------------------------------------------
    # When True the lists of pre-computed matrix files start empty, which makes
    # every matrix cell rebuild its matrix.
    recreate_matrices = False
    # When True the matrix parts are re-written into the working directory.
    save_matrices = True
    # Number of top-weighted partners kept per item in every matrix.
    mat_keep_amount = 35

    # --- candidate generation ----------------------------------------------
    # Default number of candidates kept per session.
    keep_amount = 40
    # NOTE(review): presumably passed as `weight_const1` when generating
    # carts / orders candidates -- the call sites are not in this part of the file.
    weight_const1_carts = 0.1
    weight_const1_orders = 1.5
    add_most_common = False # very low (0.0001) score improvement

    # --- scoring / submission ----------------------------------------------
    # Default truncation length for predictions in the recall calculation.
    clip_amount_in_score_calc = 40
    create_submission = False
    run_final_score_calc = False

In [None]:
# Outside of local runs, re-initialise the RAPIDS memory manager with a pooled
# allocator whose initial pool is 15 GiB (15 * 1024**3 bytes).
# NOTE(review): rmm is imported here rather than in the import cell, presumably
# because it is not needed/available for local runs -- confirm.
if not cfg.local:
    import rmm
    rmm.reinitialize(pool_allocator=True, initial_pool_size=15*1024**3)

## Files

In [4]:
# Integer code for each event type; the readers below map the string `type`
# column through this dict and store the result as int8.
type_labels = {'clicks':0, 'carts':1, 'orders':2}

def read_file(f):
    """Return the events stored in parquet file `f` as a cuDF DataFrame.

    With `cfg.load_in_cache` set, the frame is built from the CPU-side copy in
    `data_cache[f]`; otherwise the file is read and pre-processed directly by
    `read_file_to_cudf`.
    """
    if not cfg.load_in_cache:
        return read_file_to_cudf(f)
    return cudf.DataFrame( data_cache[f] )
def read_file_to_cudf(f):
    """Read parquet file `f` with cuDF and normalise its `ts` and `type` columns.

    `ts` is divided by 1000 and stored as int32 (presumably milliseconds ->
    seconds); `type` is mapped through `type_labels` and stored as int8.
    """
    events = cudf.read_parquet(f)
    scaled_ts = events['ts'] / 1000
    events['ts'] = scaled_ts.astype('int32')
    type_codes = events['type'].map(type_labels)
    events['type'] = type_codes.astype('int8')
    return events
def read_file_to_cache(f):
    """Read parquet file `f` with pandas (CPU) and normalise `ts` and `type`.

    Applies the same transformation as `read_file_to_cudf`: `ts` is divided by
    1000 and stored as int32, `type` is mapped through `type_labels` and stored
    as int8.
    """
    events = pd.read_parquet(f)
    scaled_ts = events['ts'] / 1000
    events['ts'] = scaled_ts.astype('int32')
    type_codes = events['type'].map(type_labels)
    events['type'] = type_codes.astype('int8')
    return events

In [5]:
%%time

# CACHE THE DATA ON CPU BEFORE PROCESSING ON GPU
data_cache = {}

# Both modes read the same session files. `test_files` has to be defined in
# both modes as well, because load_test() iterates over it; it used to be bound
# only in the non-local branch, so load_test() raised NameError for local runs.
files = glob.glob('../input/otto-validation/*_parquet/*')
test_files = glob.glob('../input/otto-validation/test_parquet/*')

# Only the location of the validation labels differs between the two modes.
if cfg.local:
    test_labels_file = '../../../downloaded_data/split_data_local_validation/test_labels.parquet'
else:
    test_labels_file = '../input/otto-validation/test_labels.parquet'

# Optionally pre-load every file into CPU memory so read_file() can serve it
# from data_cache instead of re-reading the parquet file.
if cfg.load_in_cache:
    for f in files: data_cache[f] = read_file_to_cache(f)

# CHUNK PARAMETERS
# The files are processed in DIVIDE_BY outer chunks of CHUNK files each, and
# every outer chunk is read in groups of READ_CT files.
READ_CT = 5
DIVIDE_BY = 6
CHUNK = int( np.ceil( len(files)/DIVIDE_BY ))
print(f'We will process {len(files)} files, in groups of {READ_CT} and chunks of {CHUNK}.')

We will process 120 files, in groups of 5 and chunks of 20.
CPU times: user 1.93 ms, sys: 647 µs, total: 2.58 ms
Wall time: 27 ms


In [6]:
# Decide where each co-visitation matrix comes from. An empty list makes the
# corresponding matrix cell below rebuild that matrix; a non-empty list makes
# it reuse the listed parquet parts.
if cfg.recreate_matrices:
    files_for_top_20_clicks = []
    files_for_top_20_buys = []
    files_for_top_20_buy2buy = []
    files_for_click2cart = []
else:
    if cfg.local:
        # Local runs look for parts in the working directory.
        # NOTE(review): only the buy2buy pattern is restricted to the current
        # VER; the other three match every version -- confirm this is intended.
        files_for_top_20_clicks = glob.glob('top_20_clicks_v*')
        files_for_top_20_buys = glob.glob('top_15_carts_orders_v*')
        files_for_top_20_buy2buy = glob.glob(f'top_15_buy2buy_v{VER}*')
        files_for_click2cart = glob.glob('top_click2cart_v*')
    else:
        # Non-local runs read the parts from the attached input dataset.
        files_for_top_20_clicks = glob.glob('/kaggle/input/covisitation-to-canidates-dataset/top_20_clicks_*')
        files_for_top_20_buys = glob.glob('/kaggle/input/covisitation-to-canidates-dataset/top_15_carts_orders_*')
        files_for_top_20_buy2buy = glob.glob('/kaggle/input/covisitation-to-canidates-dataset/top_15_buy2buy_*')
        files_for_click2cart = glob.glob('/kaggle/input/covisitation-to-canidates-dataset/top_click2cart_v*')


# Number of parts found per matrix (0 means the matrix will be rebuilt).
print(len(files_for_top_20_clicks))
print(len(files_for_top_20_buys))
print(len(files_for_top_20_buy2buy))
print(len(files_for_click2cart))

4
4
1
4


In [7]:
# assert False

# Helpers

In [8]:
# Handle to GPU device 0, obtained through the CuPy module that cuDF exposes.
dev0 = cudf.cupy.cuda.Device(0)

def print_gpu_mem_info():
    """Print the free and total memory of `dev0`, both in whole MiB."""
    bytes_per_mib = 1024**2
    free_memory, total_memory = dev0.mem_info
    free_mib = free_memory // bytes_per_mib
    total_mib = total_memory // bytes_per_mib
    print('Free -', free_mib, 'MiB', '\tTotal -', total_mib, 'MiB')

def print_gpu_mem():
    """Shorter alias for `print_gpu_mem_info`."""
    print_gpu_mem_info()

# Report the memory state once when the helpers are defined.
print_gpu_mem()

Free - 447 MiB 	Total - 16280 MiB


In [9]:
def ts_format(ts):
    """Format epoch timestamp `ts` (seconds) as e.g. 'Aug 01 2022  12:30:05'.

    The conversion uses `datetime.datetime.fromtimestamp`, i.e. local time.
    """
    moment = datetime.datetime.fromtimestamp(ts)
    return moment.strftime("%b %d %Y  %H:%M:%S")
def print_date_ts(ts, end='\n'):
    """Print epoch timestamp `ts` formatted by `ts_format`, followed by `end`."""
    formatted = ts_format(ts)
    print(formatted, end=end)
def print_date(d, end='\n'):
    """Print datetime-like `d` as e.g. 'Aug 01 2022  12:30:05', followed by `end`."""
    formatted = d.strftime("%b %d %Y  %H:%M:%S")
    print(formatted, end=end)
def mils_format(x):
    """Return number `x` as a string with ',' as the thousands separator."""
    return format(x, ',')
def len_mils(x):
    """Return `len(x)` formatted with thousands separators (see `mils_format`)."""
    item_count = len(x)
    return mils_format(item_count)
def preview_df(df, head_show=1):
    """Print the row count of `df` and display its first `head_show` rows."""
    row_count_text = len_mils(df)
    print(row_count_text)
    first_rows = df.head(head_show)
    display(first_rows)

# Compute Co-visitation Matrices

We will compute 4 co-visitation matrices using RAPIDS cuDF on GPU. This is 30x faster than using Pandas CPU like other public notebooks! For maximum speed, set the variable `DISK_PIECES` to the smallest number possible based on the GPU you are using without incurring memory errors. If you run this code offline with 32GB GPU ram, then you can use `DISK_PIECES = 1` and compute each co-visitation matrix in almost 1 minute! Kaggle's GPU only has 16GB ram, so we use `DISK_PIECES = 4` and it takes an amazing 3 minutes each! Below are some of the tricks to speed up computation
* Use RAPIDS cuDF GPU instead of Pandas CPU
* Read disk once and save in CPU RAM for later GPU multiple use
* Process largest amount of data possible on GPU at one time
* Merge data in two stages. Multiple small to single medium. Multiple medium to single large.
* Write result as parquet instead of dictionary

## 1) "Carts Orders" Co-visitation Matrix - Type Weighted

In [11]:
%%time
# Build the type-weighted "carts/orders" co-visitation matrix, but only when no
# pre-computed parts were found. Uncomment the next line to force a rebuild.
# files_for_top_20_buys = []
if len(files_for_top_20_buys) == 0:
    # Weight of a pair, chosen by the event type of the co-visited item
    # (type_y): 0 = clicks, 1 = carts, 2 = orders (see type_labels).
    type_weight = {0:1, 1:6, 2:3}

    # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
    DISK_PIECES = 4
    # Width of the aid_x range handled by one part.
    SIZE = 1.86e6/DISK_PIECES

    # COMPUTE IN PARTS FOR MEMORY MANAGEMENT
    for PART in range(DISK_PIECES):
        print()
        print('### DISK PART',PART+1)

        # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
        # => OUTER CHUNKS (DIVIDE_BY / CHUNK / READ_CT come from earlier cells)
        for j in range(DIVIDE_BY):
            a = j*CHUNK
            b = min( (j+1)*CHUNK, len(files) )
            print('|', end=' ')

            # => INNER CHUNKS
            for k in range(a,b,READ_CT):
                print(k, end=' ')
                # READ UP TO READ_CT FILES, NEVER CROSSING THE OUTER CHUNK END
                df = [read_file(files[k])]
                for i in range(1,READ_CT):
                    if k+i<b:
                        print(k+i, end=' ')
                        df.append( read_file(files[k+i]) )
                df = cudf.concat(df,ignore_index=True,axis=0)
                df = df.sort_values(['session','ts'],ascending=[True,False])
                # USE TAIL OF SESSION (the 30 most recent events)
                df = df.reset_index(drop=True)
                df['n'] = df.groupby('session').cumcount()
                df = df.loc[df.n<30].drop('n',axis=1)
                # CREATE PAIRS: different items of one session, less than 30 minutes apart
                df = df.merge(df,on='session')
                df = df.loc[ ((df.ts_x - df.ts_y).abs()< 30 * 60) & (df.aid_x != df.aid_y) ]
                # MEMORY MANAGEMENT COMPUTE IN PARTS
                df = df.loc[(df.aid_x >= PART*SIZE)&(df.aid_x < (PART+1)*SIZE)]
                # ASSIGN WEIGHTS (every pair is counted once per session)
                df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
                df['wgt'] = df.type_y.map(type_weight)
                df = df[['aid_x','aid_y','wgt']]
                df.wgt = df.wgt.astype('float32')
                df = df.groupby(['aid_x','aid_y']).wgt.sum()
                # COMBINE INNER CHUNKS
                if k==a: tmp2 = df
                else: tmp2 = tmp2.add(df, fill_value=0)
            # COMBINE OUTER CHUNKS
            if a==0: tmp = tmp2
            else: tmp = tmp.add(tmp2, fill_value=0)
            del tmp2, df
            gc.collect()
        # TURN THE (aid_x, aid_y) -> wgt SERIES INTO A FLAT TABLE
        tmp = tmp.reset_index()
        tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
        # KEEP THE cfg.mat_keep_amount HEAVIEST PARTNERS PER aid_x
        tmp = tmp.reset_index(drop=True)
        tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
        tmp = tmp.loc[tmp.n < cfg.mat_keep_amount].drop('n',axis=1)
        # SAVE PART TO DISK (convert to pandas first uses less memory)
        tmp = tmp.to_pandas()
        # Weights are stored on a log scale.
        tmp.wgt = np.log(1.73+tmp.wgt)
        tmp.to_parquet(f'top_15_carts_orders_v{VER}_{PART}.pqt')
    del tmp

    # List the parts once, after all of them are written, and only those of the
    # current VER. The former unversioned pattern ('..._v*') also picked up parts
    # of other versions lying in the working directory, which would then be
    # loaded together with the fresh ones.
    files_for_top_20_buys = glob.glob(f'top_15_carts_orders_v{VER}_*')
print(files_for_top_20_buys)

['/kaggle/input/covisitation-to-canidates-dataset/top_15_carts_orders_v6_0.pqt', '/kaggle/input/covisitation-to-canidates-dataset/top_15_carts_orders_v6_1.pqt', '/kaggle/input/covisitation-to-canidates-dataset/top_15_carts_orders_v6_2.pqt', '/kaggle/input/covisitation-to-canidates-dataset/top_15_carts_orders_v6_3.pqt']
CPU times: user 62 µs, sys: 3 µs, total: 65 µs
Wall time: 70.1 µs


In [12]:
# Copy every carts/orders matrix part into the working directory under its
# base name.
if cfg.save_matrices:
    for f in files_for_top_20_buys:
        file_name = os.path.basename(f)
        print(file_name)
        # A part that was just generated already lives in the working
        # directory; re-reading it only to overwrite itself is wasted work.
        if os.path.abspath(f) == os.path.abspath(file_name):
            continue
        # No name is bound to the frame, so its GPU memory can be released as
        # soon as the part is written (the old code kept the last part alive
        # in the global `tmp`).
        cudf.read_parquet(f).to_parquet(file_name)

top_15_carts_orders_v6_0.pqt
top_15_carts_orders_v6_1.pqt
top_15_carts_orders_v6_2.pqt
top_15_carts_orders_v6_3.pqt


## 2) "Buy2Buy" Co-visitation Matrix

In [13]:
%%time
# Build the "buy2buy" co-visitation matrix (carts and orders only), but only
# when no pre-computed parts were found. Uncomment the next line to force a rebuild.
# files_for_top_20_buy2buy = []
if len(files_for_top_20_buy2buy) == 0:

    # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
    DISK_PIECES = 1
    # Width of the aid_x range handled by one part.
    SIZE = 1.86e6/DISK_PIECES
    # NOTE: these three assignments rebind the notebook-level chunk parameters;
    # the matrix cells that run after this one read the same names.
    DIVIDE_BY = 5
    READ_CT = 5
    CHUNK = int( np.ceil( len(files)/DIVIDE_BY ))
    print(f'We will process {len(files)} files, in groups of {READ_CT} and chunks of {CHUNK}.')

    # COMPUTE IN PARTS FOR MEMORY MANAGEMENT
    for PART in range(DISK_PIECES):
        print()
        print('### DISK PART',PART+1)

        # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
        # => OUTER CHUNKS
        for j in range(DIVIDE_BY):
            a = j*CHUNK
            b = min( (j+1)*CHUNK, len(files) )
            print('|', end=' ')

            # => INNER CHUNKS
            for k in range(a,b,READ_CT):
                # READ UP TO READ_CT FILES, NEVER CROSSING THE OUTER CHUNK END
                df = [read_file(files[k])]
                print(k, end=' ')
                for i in range(1,READ_CT): 
                    if k+i<b:
                        print(k+i, end=' ')
                        df.append( read_file(files[k+i]) )

                df = cudf.concat(df,ignore_index=True,axis=0)
                df = df.loc[df['type'].isin([1,2])] # ONLY WANT CARTS AND ORDERS
                df = df.sort_values(['session','ts'],ascending=[True,False])
                # USE TAIL OF SESSION (the 20 most recent carts/orders)
                df = df.reset_index(drop=True)
                df['n'] = df.groupby('session').cumcount()
                df = df.loc[df.n<20].drop('n',axis=1)
                # CREATE PAIRS: different items of one session, less than 14 days apart
                df = df.merge(df,on='session')
                df = df.loc[ ((df.ts_x - df.ts_y).abs()< 14 * 24 * 60 * 60) & (df.aid_x != df.aid_y) ] # 14 DAYS
                # MEMORY MANAGEMENT COMPUTE IN PARTS
                df = df.loc[(df.aid_x >= PART*SIZE)&(df.aid_x < (PART+1)*SIZE)]
                # ASSIGN WEIGHTS

                # (disabled alternative: time-based weights)
                # df = df[['session', 'aid_x', 'aid_y','ts_x']].drop_duplicates(['session', 'aid_x', 'aid_y'])
                # df['wgt'] = 1 + 3*(df.ts_x - ts_min_used)/(ts_max_used-ts_min_used)

                # Every pair counts once per session, with a constant weight of 1.
                df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
                df['wgt'] = 1

                df = df[['aid_x','aid_y','wgt']]
                df.wgt = df.wgt.astype('float32')
                df = df.groupby(['aid_x','aid_y']).wgt.sum()


                # COMBINE INNER CHUNKS
                if k==a: tmp2 = df
                else: tmp2 = tmp2.add(df, fill_value=0)
                # print(k,', ',end='')
            print()
            # COMBINE OUTER CHUNKS
            if a==0: tmp = tmp2
            else: tmp = tmp.add(tmp2, fill_value=0)
            del tmp2, df
            gc.collect()
        # TURN THE (aid_x, aid_y) -> wgt SERIES INTO A FLAT TABLE
        tmp = tmp.reset_index()
        tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
        # KEEP THE cfg.mat_keep_amount HEAVIEST PARTNERS PER aid_x
        tmp = tmp.reset_index(drop=True)
        tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
        tmp = tmp.loc[tmp.n<cfg.mat_keep_amount].drop('n',axis=1)
        # SAVE PART TO DISK (convert to pandas first uses less memory)

        tmp = tmp.to_pandas()
        # Weights are stored on a log scale.
        tmp.wgt = np.log(1.73+tmp.wgt)
        tmp.to_parquet(f'top_15_buy2buy_v{VER}_{PART}.pqt')

        # tmp.to_pandas().to_parquet(f'top_15_buy2buy_v{VER}_{PART}.pqt')
    del tmp
    # From here on the matrix is read back from the parts of the current VER.
    files_for_top_20_buy2buy = glob.glob(f'top_15_buy2buy_v{VER}*')

    # Carts and orders occurring together:

    # Reload the freshly written parts into one GPU frame and preview it.
    top_20_buy2buy_df = cudf.DataFrame()
    for f in files_for_top_20_buy2buy:
        print(f)
        top_20_buy2buy_df = cudf.concat([top_20_buy2buy_df, cudf.read_parquet(f)])
    preview_df(top_20_buy2buy_df)
    
print(files_for_top_20_buy2buy)

# 1,314,327

['/kaggle/input/covisitation-to-canidates-dataset/top_15_buy2buy_v6_0.pqt']
CPU times: user 54 µs, sys: 13 µs, total: 67 µs
Wall time: 71.3 µs


In [14]:
# Copy every buy2buy matrix part into the working directory under its base name.
if cfg.save_matrices:
    for f in files_for_top_20_buy2buy:
        file_name = os.path.basename(f)
        print(file_name)
        # A part that was just generated already lives in the working
        # directory; re-reading it only to overwrite itself is wasted work.
        if os.path.abspath(f) == os.path.abspath(file_name):
            continue
        # No name is bound to the frame, so its GPU memory can be released as
        # soon as the part is written (the old code kept the last part alive
        # in the global `tmp`).
        cudf.read_parquet(f).to_parquet(file_name)

top_15_buy2buy_v6_0.pqt


## 3) "Clicks" Co-visitation Matrix - Time Weighted

In [15]:
%%time
# Build the time-weighted "clicks" co-visitation matrix, but only when no
# pre-computed parts were found. Uncomment the next line to force a rebuild.
# files_for_top_20_clicks = []
if len(files_for_top_20_clicks) == 0:

    # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
    DISK_PIECES = 4
    # Width of the aid_x range handled by one part.
    SIZE = 1.86e6/DISK_PIECES

    # COMPUTE IN PARTS FOR MEMORY MANAGEMENT
    for PART in range(DISK_PIECES):
        print()
        print('### DISK PART',PART+1)

        # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
        # => OUTER CHUNKS
        # NOTE: DIVIDE_BY, CHUNK and READ_CT are not set in this cell; whatever
        # values an earlier cell left in the notebook namespace are used.
        for j in range(DIVIDE_BY):
            a = j*CHUNK
            b = min( (j+1)*CHUNK, len(files) )
            print('|', end=' ')

            # => INNER CHUNKS
            for k in range(a,b,READ_CT):
                print(k, end=' ')
                # READ UP TO READ_CT FILES, NEVER CROSSING THE OUTER CHUNK END
                df = [read_file(files[k])]
                for i in range(1,READ_CT): 
                    if k+i<b: 
                        print(k+i, end=' ')
                        df.append( read_file(files[k+i]) )
                        
                df = cudf.concat(df,ignore_index=True,axis=0)
                df = df.sort_values(['session','ts'],ascending=[True,False])
                # USE TAIL OF SESSION (the 30 most recent events)
                df = df.reset_index(drop=True)
                df['n'] = df.groupby('session').cumcount()
                df = df.loc[df.n<30].drop('n',axis=1)
                # CREATE PAIRS: different items of one session, less than 30 minutes apart
                df = df.merge(df,on='session')
                df = df.loc[ ((df.ts_x - df.ts_y).abs()< 30 * 60) & (df.aid_x != df.aid_y) ]
                # MEMORY MANAGEMENT COMPUTE IN PARTS
                df = df.loc[(df.aid_x >= PART*SIZE)&(df.aid_x < (PART+1)*SIZE)]
                # ASSIGN WEIGHTS (every pair is counted once per session)
                df = df[['session', 'aid_x', 'aid_y','ts_x']].drop_duplicates(['session', 'aid_x', 'aid_y'])
                # Linear in ts_x: 1 at ts_x == 1659304800, 4 at ts_x == 1662328791.
                # NOTE(review): the two epoch-second constants are presumably the
                # first and last timestamp of the data -- confirm.
                df['wgt'] = 1 + 3*(df.ts_x - 1659304800)/(1662328791-1659304800)
                df = df[['aid_x','aid_y','wgt']]
                df.wgt = df.wgt.astype('float32')
                df = df.groupby(['aid_x','aid_y']).wgt.sum()
                # COMBINE INNER CHUNKS
                if k==a: tmp2 = df
                else: tmp2 = tmp2.add(df, fill_value=0)
#                 print(k,', ',end='')
            print()
            # COMBINE OUTER CHUNKS
            if a==0: tmp = tmp2
            else: tmp = tmp.add(tmp2, fill_value=0)
            del tmp2, df
            gc.collect()
        # TURN THE (aid_x, aid_y) -> wgt SERIES INTO A FLAT TABLE
        tmp = tmp.reset_index()
        tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
        # KEEP THE cfg.mat_keep_amount HEAVIEST PARTNERS PER aid_x
        tmp = tmp.reset_index(drop=True)
        tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
        tmp = tmp.loc[tmp.n < cfg.mat_keep_amount].drop('n',axis=1)
        # SAVE PART TO DISK (convert to pandas first uses less memory)

        tmp = tmp.to_pandas()
        # Weights are stored on a log scale.
        tmp.wgt = np.log(1+tmp.wgt)
        tmp.to_parquet(f'top_20_clicks_v{VER}_{PART}.pqt')

        # tmp.to_pandas().to_parquet(f'top_20_clicks_v{VER}_{PART}.pqt')
        
        # Re-list the parts of the current VER written so far (runs once per part).
        files_for_top_20_clicks = glob.glob(f'top_20_clicks_v{VER}_*')
    del tmp
print(files_for_top_20_clicks)

['/kaggle/input/covisitation-to-canidates-dataset/top_20_clicks_v6_3.pqt', '/kaggle/input/covisitation-to-canidates-dataset/top_20_clicks_v6_0.pqt', '/kaggle/input/covisitation-to-canidates-dataset/top_20_clicks_v6_2.pqt', '/kaggle/input/covisitation-to-canidates-dataset/top_20_clicks_v6_1.pqt']
CPU times: user 62 µs, sys: 9 µs, total: 71 µs
Wall time: 76.5 µs


In [16]:
# Copy every clicks matrix part into the working directory under its base name.
if cfg.save_matrices:
    for f in files_for_top_20_clicks:
        file_name = os.path.basename(f)
        print(file_name)
        # A part that was just generated already lives in the working
        # directory; re-reading it only to overwrite itself is wasted work.
        if os.path.abspath(f) == os.path.abspath(file_name):
            continue
        # No name is bound to the frame, so its GPU memory can be released as
        # soon as the part is written (the old code kept the last part alive
        # in the global `tmp`).
        cudf.read_parquet(f).to_parquet(file_name)

top_20_clicks_v6_3.pqt
top_20_clicks_v6_0.pqt
top_20_clicks_v6_2.pqt
top_20_clicks_v6_1.pqt


In [17]:
# _ = plt.hist(top_20_clicks_df.wgt.to_pandas(), log=True)

## 4) Click2Cart

In [18]:
%%time
# Build the "click2cart" co-visitation matrix (clicks and carts only), but only
# when no pre-computed parts were found. Uncomment the next line to force a rebuild.
# files_for_click2cart = []
if len(files_for_click2cart) == 0:

    # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
    DISK_PIECES = 4
    # Width of the aid_x range handled by one part.
    SIZE = 1.86e6/DISK_PIECES
    READ_CT = 4
    DIVIDE_BY = 5
    # CHUNK has to be recomputed whenever DIVIDE_BY changes. It used to keep the
    # value left by an earlier cell, so with the initial CHUNK (computed for
    # DIVIDE_BY = 6) only DIVIDE_BY*CHUNK files were visited and the remaining
    # files were silently left out of the matrix.
    CHUNK = int( np.ceil( len(files)/DIVIDE_BY ))
    print(f'We will process {len(files)} files, in groups of {READ_CT} and chunks of {CHUNK}.')

    # COMPUTE IN PARTS FOR MEMORY MANAGEMENT
    for PART in range(DISK_PIECES):
        print()
        print('### DISK PART',PART+1)

        # MERGE IS FASTEST PROCESSING CHUNKS WITHIN CHUNKS
        # => OUTER CHUNKS
        for j in range(DIVIDE_BY):
            a = j*CHUNK
            b = min( (j+1)*CHUNK, len(files) )
            print('|', end=' ')

            # => INNER CHUNKS
            for k in range(a,b,READ_CT):
                # READ UP TO READ_CT FILES, NEVER CROSSING THE OUTER CHUNK END
                df = [read_file(files[k])]
                print(k, end=' ')
                for i in range(1,READ_CT):
                    # Only report indices of files that are actually read.
                    if k+i<b:
                        print(k+i, end=' ')
                        df.append( read_file(files[k+i]) )
                df = cudf.concat(df,ignore_index=True,axis=0)

                df = df.loc[df['type'] < 2] # CLICKS and CARTS
                df = df.sort_values(['session','ts'],ascending=[True,False])
                # USE TAIL OF SESSION (the 20 most recent clicks/carts)
                df = df.reset_index(drop=True)
                df['n'] = df.groupby('session').cumcount()
                df = df.loc[df.n<20].drop('n',axis=1)

                # CREATE PAIRS: different items of one session, less than 14 days apart
                df = df.merge(df,on='session')
                df = df.loc[ ((df.ts_x - df.ts_y).abs()< 14 * 24 * 60 * 60) & (df.aid_x != df.aid_y) ] # 14 DAYS

                # MEMORY MANAGEMENT COMPUTE IN PARTS
                df = df.loc[(df.aid_x >= PART*SIZE)&(df.aid_x < (PART+1)*SIZE)]

                # ASSIGN WEIGHTS: every pair counts once per session, with a constant weight of 1
                df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
                df['wgt'] = 1

                df = df[['aid_x','aid_y','wgt']]
                df.wgt = df.wgt.astype('float32')
                df = df.groupby(['aid_x','aid_y']).wgt.sum()

                # COMBINE INNER CHUNKS
                if k==a: tmp2 = df
                else: tmp2 = tmp2.add(df, fill_value=0)
            print()
            # COMBINE OUTER CHUNKS
            if a==0: tmp = tmp2
            else: tmp = tmp.add(tmp2, fill_value=0)
            del tmp2, df
            gc.collect()
        # TURN THE (aid_x, aid_y) -> wgt SERIES INTO A FLAT TABLE
        tmp = tmp.reset_index()
        tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
        # KEEP THE cfg.mat_keep_amount HEAVIEST PARTNERS PER aid_x
        tmp = tmp.reset_index(drop=True)
        tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
        tmp = tmp.loc[tmp.n < cfg.mat_keep_amount].drop('n',axis=1)
        # SAVE PART TO DISK (convert to pandas first uses less memory)
        tmp = tmp.to_pandas()
        # Weights are stored on a log scale.
        tmp.wgt = np.log(1.73+tmp.wgt)
        tmp.to_parquet(f'top_click2cart_v{VER}_{PART}.pqt')
    del tmp
    # List only the parts of the current VER, so parts of other versions lying
    # in the working directory are not loaded together with the fresh ones.
    files_for_click2cart = glob.glob(f'top_click2cart_v{VER}_*')

    # Carts and clicks occurring together:

    # Reload the freshly written parts into one GPU frame and preview it.
    top_click2cart_df = cudf.DataFrame()
    for f in files_for_click2cart:
        print(f)
        top_click2cart_df = cudf.concat([top_click2cart_df, cudf.read_parquet(f)])
    preview_df(top_click2cart_df)

print(files_for_click2cart)

['/kaggle/input/covisitation-to-canidates-dataset/top_click2cart_v6_2.pqt', '/kaggle/input/covisitation-to-canidates-dataset/top_click2cart_v6_1.pqt', '/kaggle/input/covisitation-to-canidates-dataset/top_click2cart_v6_0.pqt', '/kaggle/input/covisitation-to-canidates-dataset/top_click2cart_v6_3.pqt']
CPU times: user 82 µs, sys: 0 ns, total: 82 µs
Wall time: 86.8 µs


In [19]:
# Copy every click2cart matrix part into the working directory under its base name.
if cfg.save_matrices:
    for f in files_for_click2cart:
        file_name = os.path.basename(f)
        print(file_name)
        # A part that was just generated already lives in the working
        # directory; re-reading it only to overwrite itself is wasted work.
        if os.path.abspath(f) == os.path.abspath(file_name):
            continue
        # No name is bound to the frame, so its GPU memory can be released as
        # soon as the part is written (the old code kept the last part alive
        # in the global `tmp`).
        cudf.read_parquet(f).to_parquet(file_name)

top_click2cart_v6_2.pqt
top_click2cart_v6_1.pqt
top_click2cart_v6_0.pqt
top_click2cart_v6_3.pqt


## Cleanup

In [20]:
# FREE MEMORY
# Drop the CPU-side file cache (only populated when cfg.load_in_cache is set)
# and run the garbage collector. Any error -- e.g. data_cache already deleted
# by a previous run of this cell -- is printed instead of stopping the notebook.
try:
    if cfg.load_in_cache:
        del data_cache
except Exception as e:
    print(e)
# Binding the result to `_` keeps the collected-object count out of the cell output.
_ = gc.collect()

# Step 2 - ReRank - choose candidates using handcrafted rules

In [22]:
def load_test():
    """Read every parquet file in `test_files` and return one pandas DataFrame.

    For each file `ts` is divided by 1000 and stored as int32, and `type` is
    mapped through `type_labels` and stored as int8. The per-file frames are
    concatenated in `test_files` order and the index is renumbered from 0.
    """
    chunks = []
    for chunk_file in test_files:
        chunk = pd.read_parquet(chunk_file)
        chunk['ts'] = (chunk['ts'] / 1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        chunks.append(chunk)
    combined = pd.concat(chunks)
    return combined.reset_index(drop=True)

# Load all test events into one pandas frame, report its shape and show the
# first rows.
test_df = load_test()
print('Test data has shape',test_df.shape)
test_df.head()

Test data has shape (7683577, 4)


Unnamed: 0,session,aid,ts,type
0,12089221,700554,1661448002,0
1,12089221,619488,1661448024,0
2,12089221,579241,1661449547,0
3,12089221,619488,1661449585,0
4,12089221,619488,1661456661,0


In [23]:
# TOP CLICKS, CARTS AND ORDERS IN TEST
# For each event type (0 = clicks, 1 = carts, 2 = orders) take the 20 most
# frequent aids in the test data; value_counts() orders them by frequency.

top_clicks = test_df.loc[test_df['type']==0,'aid'].value_counts().index.values[:20]
top_carts = test_df.loc[test_df['type']==1,'aid'].value_counts().index.values[:20]
top_orders = test_df.loc[test_df['type']==2,'aid'].value_counts().index.values[:20]

# The same lists as single-column ('aid') frames.
top_clicks_df = pd.DataFrame({'aid':top_clicks})
top_carts_df = pd.DataFrame({'aid':top_carts})
top_orders_df = pd.DataFrame({'aid':top_orders})

## Functions that generate candidates

In [24]:
def load_top_20_clicks_df():
    """Load all parts of the clicks co-visitation matrix into one cuDF frame.

    Side effect: `files_for_top_20_clicks` is sorted in place before the parts
    are read, so the parts are concatenated in sorted path order. Each path is
    printed as it is read.
    """
    files_for_top_20_clicks.sort()
    combined = cudf.DataFrame()
    for part_path in files_for_top_20_clicks:
        print(part_path, end=' ')
        part = cudf.read_parquet(part_path)
        combined = cudf.concat([combined, part])
    return combined
def load_top_20_buys_df():
    """Load all parts of the carts/orders co-visitation matrix into one cuDF frame.

    The parts are concatenated in the order of `files_for_top_20_buys`; each
    path is printed as it is read.
    """
    combined = cudf.DataFrame()
    for part_path in files_for_top_20_buys:
        print(part_path, end=' ')
        part = cudf.read_parquet(part_path)
        combined = cudf.concat([combined, part])
    return combined
def load_top_20_buy2buy_df():
    """Load all parts of the buy2buy matrix (carts and orders occurring together).

    The parts are concatenated into one cuDF frame in the order of
    `files_for_top_20_buy2buy`; each path is printed as it is read.
    """
    combined = cudf.DataFrame()
    for part_path in files_for_top_20_buy2buy:
        print(part_path, end=' ')
        part = cudf.read_parquet(part_path)
        combined = cudf.concat([combined, part])
    return combined
def load_top_click2cart_df():
    """Load all parts of the click2cart matrix (carts and clicks occurring together).

    The parts are concatenated into one cuDF frame in the order of
    `files_for_click2cart`; each path is printed as it is read.
    """
    combined = cudf.DataFrame()
    for part_path in files_for_click2cart:
        print(part_path, end=' ')
        part = cudf.read_parquet(part_path)
        combined = cudf.concat([combined, part])
    return combined

In [25]:
def calc_event_recall(ev_preds_df, event_name, clip_amount=cfg.clip_amount_in_score_calc):
    """Compute and print the recall of the predictions for one event type.

    Parameters
    ----------
    ev_preds_df : pandas.DataFrame
        Predictions with a `session_type` column (strings whose part before the
        first '_' is the session id) and a `labels` column (sequence of
        predicted aids per row). The frame is not modified.
    event_name : str
        Value of the `type` column of the labels file to score
        (compared as-is, e.g. 'clicks').
    clip_amount : int
        Predictions are truncated to their first `clip_amount` items, and the
        ground-truth count of a session is capped at the same number. The
        default is read from `cfg` once, when this function is defined.

    Returns
    -------
    float
        sum(hits) / sum(capped ground-truth counts) over all labelled sessions.
    """
    # Work on a copy: the old code added a column to, and truncated the labels
    # of, the caller's frame, so scoring the same frame again with a larger
    # clip_amount silently reused the shorter labels.
    sub = ev_preds_df.copy()
    sub['session'] = sub['session_type'].apply(lambda x: int(x.split('_')[0]))
    sub['labels'] = sub['labels'].apply(lambda x: x[:clip_amount])

    test_labels = pd.read_parquet(test_labels_file)
    test_labels = test_labels.loc[test_labels['type']==event_name]
    test_labels = test_labels.merge(sub, how='left', on=['session'])

    def _count_hits(row):
        # The left merge leaves NaN in `labels` for sessions without any
        # prediction; those sessions simply score zero hits instead of raising.
        predicted = row['labels']
        if not hasattr(predicted, '__len__'):
            return 0
        return len(set(row['ground_truth']).intersection(predicted))

    test_labels['hits'] = test_labels.apply(_count_hits, axis=1)
    test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0, clip_amount)

    recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
    print(f'{event_name} recall =',recall)

    return recall
    
def calc_score(event_name, clip_amount):
    """Load the saved predictions of `event_name` and return their recall.

    All files in the working directory matching f'{event_name}_pred_df*' are
    read and stacked into one frame, which is then scored by
    `calc_event_recall` with the given `clip_amount`.
    """
    predictions = pd.DataFrame()
    for pred_path in glob.glob(f'{event_name}_pred_df*'):
        part = pd.read_parquet(pred_path)
        predictions = pd.concat([predictions, part], ignore_index=True)

    return calc_event_recall(predictions, event_name, clip_amount)

In [26]:
def add_most_common(df, most_common, event_name, keep_amount=20):
    """Pad each session's candidates with globally popular items.

    Parameters
    ----------
    df : object with `.to_pandas()`
        Candidate rows; must contain `session`, `aid` and
        f'{event_name}_final_wgt'.
    most_common : pandas.DataFrame
        Items to offer to every session (cross-joined with the sessions).
    event_name : str
        'clicks' selects the clicks feature columns; any other value selects
        the `top_buys_wgt` / `top_buy2buy_wgt` columns.
    keep_amount : int
        Maximum number of rows kept per session.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by session and final weight (both descending), at most
        `keep_amount` per session. Padding rows carry -1 in every feature
        column and lose against an existing row for the same (session, aid).
    """
    final_col = f'{event_name}_final_wgt'
    candidates = df.to_pandas()

    # One padding row per (session, popular item).
    sessions = candidates[['session']].drop_duplicates('session').copy()
    padding = pd.merge(sessions, most_common, how='cross')

    if event_name == 'clicks':
        placeholder_cols = ['type', 'repetitions', 'clicks_similarity_wgt', final_col]
    else:
        placeholder_cols = ['type', 'repetitions', 'top_buys_wgt', 'top_buy2buy_wgt', final_col]
    for col in placeholder_cols:
        padding[col] = -1

    # Real candidates sort ahead of the -1 padding rows, so drop_duplicates
    # (which keeps the first occurrence) prefers them.
    sort_keys = ['session', final_col]
    merged = pd.concat([candidates, padding]).sort_values(sort_keys, ascending=False)
    merged = merged.drop_duplicates(['session', 'aid'], ignore_index=True)
    merged = merged.sort_values(sort_keys, ascending=False).reset_index(drop=True)

    # Keep the first keep_amount rows of every session.
    merged['n'] = merged.groupby('session').cumcount()
    merged = merged.loc[merged['n'] < keep_amount].drop(columns='n')

    return merged.reset_index(drop=True)

In [27]:
def add_items_from_buys_matrices(df, for_event, keep_amount=cfg.keep_amount, weight_const1=1.0):
    """Generate carts/orders candidates for every session from the co-visitation matrices.

    Reads the notebook-level frames `top_20_buys_df` and, depending on
    `for_event`, `top_click2cart_df` ('carts') or `top_20_buy2buy_df` (any
    other value).

    Parameters
    ----------
    df : cudf.DataFrame
        Session events with at least `session`, `aid` and `type`. The frame is
        not modified.
    for_event : str
        Event the candidates are generated for; selects the second matrix and
        the per-type bonus, and names the output weight column.
    keep_amount : int
        Maximum number of candidates kept per session. The default is read
        from `cfg` once, when this function is defined.
    weight_const1 : float
        Multiplier applied to the second matrix's weight before it is added to
        the carts/orders matrix weight.

    Returns
    -------
    cudf.DataFrame
        Columns `session`, `aid`, `type`, `repetitions`, `top_buys_wgt`,
        `top_buy2buy_wgt` and f'{for_event}_final_wgt'. Rows that come from the
        session's own history have -1 in both matrix-weight columns; rows that
        come only from the matrices have -1 in `type` and `repetitions`.
    """
    # Collapse the history to one row per (session, aid, type) with the number
    # of occurrences. Only the needed columns are copied, so no helper column
    # is added to the caller's frame.
    df = df[['session', 'aid', 'type']].copy()
    df['repetitions'] = 1
    df = df.groupby(['session', 'aid', 'type']).repetitions.sum().reset_index()

    # Candidates from the carts/orders matrix. The inner merge drops sessions
    # whose items are all missing from the matrix; they come back when the
    # history rows are appended further down.
    aids2_df = cudf.merge(df[['session', 'aid']], top_20_buys_df, left_on='aid', right_on='aid_x', how='inner')
    aids2_df.wgt = aids2_df.wgt.astype('float32')
    aids2_df = aids2_df.groupby(['session', 'aid_y']).wgt.sum().reset_index()
    aids2_df = aids2_df.sort_values(['session', 'wgt'], ascending=[True, False], ignore_index=True).reset_index(drop=True)
    aids2_df = aids2_df.rename(columns={'aid_y': 'aid', 'wgt': 'top_buys_wgt'})
    aids2_df['top_buy2buy_wgt'] = 0

    # Candidates from the second matrix: click2cart for carts, buy2buy
    # otherwise. Either way the weight is stored in `top_buy2buy_wgt`.
    if for_event == 'carts':
        second_matrix_df = top_click2cart_df
    else:
        second_matrix_df = top_20_buy2buy_df
    aids3_df = cudf.merge(df[['session', 'aid']], second_matrix_df, left_on='aid', right_on='aid_x', how='inner')
    aids3_df = aids3_df.groupby(['session', 'aid_y']).wgt.sum().reset_index()
    aids3_df.wgt = aids3_df.wgt.astype('float32')
    aids3_df = aids3_df.sort_values(['session', 'wgt'], ascending=[True, False], ignore_index=True).reset_index(drop=True)
    aids3_df = aids3_df.rename(columns={'aid_y': 'aid', 'wgt': 'top_buy2buy_wgt'})
    aids3_df['top_buys_wgt'] = 0

    # Sum both sources per (session, aid) and build the combined weight.
    aids4_df = cudf.concat([aids2_df, aids3_df], ignore_index=True)
    aids4_df.top_buys_wgt = aids4_df.top_buys_wgt.astype('float32')
    aids4_df.top_buy2buy_wgt = aids4_df.top_buy2buy_wgt.astype('float32')
    aids4_df = aids4_df.groupby(['session', 'aid']).sum().reset_index()
    aids4_df['wgt'] = aids4_df['top_buys_wgt'] + weight_const1 * aids4_df['top_buy2buy_wgt']
    aids4_df = aids4_df.sort_values(['session', 'wgt'], ascending=[True, False], ignore_index=True).reset_index(drop=True)

    # History items start from the best candidate weight of their session (1
    # when the session got no candidates), scaled by their repetition count,
    # plus a bonus that depends on the event type of the history row.
    max_weight_df = aids4_df.groupby('session').wgt.max().reset_index()
    df = cudf.merge(df, max_weight_df, on='session', how='left').fillna(1)
    df.wgt = df.wgt.astype('float32')
    df.wgt = df.wgt * df.repetitions
    if for_event=='orders':
        type_bonus = [1, 3, 2]   # indexed by type: clicks, carts, orders
    else:
        type_bonus = [2, 3, 1]
    for i in range(3):
        df.loc[df.type==i, 'wgt'] = df.loc[df.type==i, 'wgt'] + type_bonus[i]
    # -1 marks "no matrix weight" on history rows ...
    df['top_buys_wgt'] = -1
    df['top_buy2buy_wgt'] = -1

    # ... and "not part of the session history" on matrix rows.
    aids4_df['type'] = -1
    aids4_df['repetitions'] = -1

    # Merge both sets; for an item present in both, the row with the larger
    # weight sorts first and is the one drop_duplicates keeps.
    aids4_df = cudf.concat([df, aids4_df])
    aids4_df = aids4_df.sort_values(['session', 'wgt'], ascending=[True, False], ignore_index=True)
    aids4_df = aids4_df.drop_duplicates(['session', 'aid'], ignore_index=True)
    aids4_df = aids4_df.sort_values(['session', 'wgt'], ascending=[True, False], ignore_index=True).reset_index(drop=True)

    # Keep the keep_amount best rows per session.
    aids4_df['n'] = aids4_df.groupby('session').cumcount()
    aids4_df = aids4_df.loc[aids4_df.n < keep_amount]
    del aids4_df['n']

    # Compact dtypes for the output.
    # NOTE(review): int8 holds at most 127 -- confirm that no item can repeat
    # more often than that within one session.
    aids4_df.repetitions = aids4_df.repetitions.astype('int8')
    aids4_df.type = aids4_df.type.astype('int8')
    aids4_df.top_buys_wgt = aids4_df.top_buys_wgt.astype('float32')
    aids4_df.top_buy2buy_wgt = aids4_df.top_buy2buy_wgt.astype('float32')

    aids4_df.wgt = aids4_df.wgt.astype('float32')
    aids4_df = aids4_df.rename(columns={'wgt': f'{for_event}_final_wgt'})

    return aids4_df

In [28]:
def add_items_from_click_matrix(df, keep_amount=cfg.keep_amount):
    """Rank each session's own items together with click-covisitation candidates.

    The covisitation matrix is read from the module-level ``top_20_clicks_df``
    (columns ``aid_x``, ``aid_y``, ``wgt``); it is not passed as an argument.

    Args:
        df: cuDF frame of session events with ``session``, ``aid``, ``type``
            and ``ts`` columns. The caller's frame is not modified.
        keep_amount: maximum number of rows kept per session.

    Returns:
        cuDF frame with one row per (session, aid), at most ``keep_amount``
        rows per session, holding ``session``, ``aid``, ``type``,
        ``repetitions``, ``clicks_similarity_wgt`` and ``clicks_final_wgt``.
        Rows that come from the matrix carry ``type == -1`` and
        ``repetitions == -1``; rows that come from the session history carry
        ``clicks_similarity_wgt == -1``.
    """
    # Every event counts 0.5; the sum per (session, aid, type) is attached back
    # to each event row. assign() returns a new frame, so the caller's frame
    # does not gain a 'repetitions' column.
    df = df.assign(repetitions=0.5)
    df = cudf.merge(
        df[['session', 'aid', 'type', 'ts']],
        df.groupby(['session', 'aid', 'type']).repetitions.sum().reset_index(),
        on=['session', 'aid', 'type'],
        how='inner'
    )

    # Timestamp of the last event of each session.
    df = cudf.merge(
        df,
        df.groupby('session').ts.max().reset_index().rename(columns={'ts': 'ts_max'}),
        on='session', how='inner'
    )

    # Log-scaled distance to the session's last event, flipped so that recent
    # events get the larger value. The flip uses the maximum over the whole
    # frame passed in, not a per-session maximum.
    df['ts_distance'] = df['ts_max'] - df['ts']
    df['ts_distance'] = np.log(df['ts_distance']+1)
    df['ts_distance'] = df['ts_distance'].max() - df['ts_distance']

    # Candidates: every aid_y paired in the matrix with an aid of the session.
    # The inner merge drops events whose aid is missing from the matrix.
    aids2_df = cudf.merge(df, top_20_clicks_df, left_on='aid', right_on='aid_x', how='inner')

    aids2_df = aids2_df.groupby(['session', 'aid_y']).wgt.sum().reset_index()
    aids2_df.wgt = aids2_df.wgt.astype('float32')
    # Cap the summed similarity at 100.
    aids2_df.loc[aids2_df.wgt > 100, 'wgt'] = 100

    aids2_df = aids2_df.sort_values(['session', 'wgt'], ascending=[True, False], ignore_index=True).reset_index(drop=True)
    aids2_df = aids2_df.rename(columns={'aid_y': 'aid'})
    aids2_df['clicks_similarity_wgt'] = aids2_df['wgt']

    # History rows start from the best candidate weight of their session.
    # fillna(1) covers sessions without any candidate (it fills every column
    # of the merged frame, not only 'wgt').
    max_weight_df = aids2_df.groupby('session').wgt.max().reset_index()
    df = cudf.merge(df, max_weight_df, on='session', how='left').fillna(1)
    df.wgt = df.wgt.astype('float32')

    df = df.sort_values(['session', 'ts'], ignore_index=True).reset_index(drop=True)

    df.wgt = df.wgt + df.ts_distance
    df.wgt = df.wgt + df.repetitions
    # Bonus per event type code: 0 -> +1, 1 -> +3, 2 -> +2.
    type_weights = [1, 3, 2]
    for i in range(3):
        df.loc[df.type==i, 'wgt'] = df.loc[df.type==i, 'wgt'] + type_weights[i]

    del df['ts'], df['ts_max'], df['ts_distance']
    # -1 marks "not applicable" on both sides of the concat.
    df['clicks_similarity_wgt'] = -1
    aids2_df['type'] = -1
    aids2_df['repetitions'] = -1
    aids2_df = cudf.concat([df, aids2_df])

    # Sort by weight so that drop_duplicates (keep='first' by default) keeps the
    # highest-weighted row of every (session, aid).
    aids2_df = aids2_df.sort_values(['session', 'wgt'], ascending=[True, False], ignore_index=True)
    aids2_df = aids2_df.drop_duplicates(['session', 'aid'], ignore_index=True)
    aids2_df = aids2_df.sort_values(['session', 'wgt'], ascending=[True, False], ignore_index=True).reset_index(drop=True)

    # Keep the first keep_amount rows of every session.
    aids2_df['n'] = aids2_df.groupby('session').cumcount()
    aids2_df = aids2_df.loc[aids2_df.n < keep_amount]
    del aids2_df['n']

    aids2_df.type = aids2_df.type.astype('int8')
    # NOTE(review): repetitions is accumulated in 0.5 steps, so the int8 cast
    # cannot represent the half steps - confirm this loss is intended.
    aids2_df.repetitions = aids2_df.repetitions.astype('int8')
    aids2_df.clicks_similarity_wgt = aids2_df.clicks_similarity_wgt.astype('float32')

    aids2_df.wgt = aids2_df.wgt.astype('float32')
    aids2_df = aids2_df.rename(columns={'wgt': 'clicks_final_wgt'})

    return aids2_df.reset_index(drop=True)

In [29]:
# Recall per event type; the clicks, orders and carts cells below each store their entry here.
recall_score = {}

## Clicks

In [30]:
# Clicks covisitation matrix; add_items_from_click_matrix reads it as a global.
top_20_clicks_df = load_top_20_clicks_df()

/kaggle/input/covisitation-to-canidates-dataset/top_20_clicks_v6_0.pqt /kaggle/input/covisitation-to-canidates-dataset/top_20_clicks_v6_1.pqt /kaggle/input/covisitation-to-canidates-dataset/top_20_clicks_v6_2.pqt /kaggle/input/covisitation-to-canidates-dataset/top_20_clicks_v6_3.pqt 

In [31]:
# Sanity check on the loaded clicks matrix (len_mils is defined elsewhere; presumably its row count).
print(len_mils(top_20_clicks_df))

49067985

In [32]:
# Generate click candidates for the test sessions in 10 chunks and write, per chunk:
#   pred_df_clicks_{i}.pqt  - one row per (session, candidate aid) with its weights
#   clicks_pred_df_{i}.pqt  - one row per session: "<session>_clicks" and the aid list
uniq_test_sessions = test_df.session.unique()
uniq_test_sessions_df = cudf.DataFrame({'session': uniq_test_sessions})
chunks = 10
chunk_size = int(np.ceil(len(uniq_test_sessions) / chunks))
for i in range(chunks):
    print(i, end=' ')
    from_i = i*chunk_size
    to_i = min((i+1)*chunk_size, len(uniq_test_sessions))

    # Events of the sessions that belong to this chunk.
    tmp = cudf.merge(cudf.DataFrame(test_df), uniq_test_sessions_df[from_i:to_i], on='session', how='inner')

    pred_df_clicks = add_items_from_click_matrix(
        cudf.DataFrame(tmp).sort_values(["session", "ts"]),
        keep_amount=cfg.keep_amount,
    )

    if cfg.add_most_common:
        # Sessions with fewer than keep_amount candidates are handed to add_most_common.
        tmp_sizes = pred_df_clicks.groupby('session').size().reset_index()
        tmp_sizes.columns=['session', 'session_size']

        less_then_keep_preds = cudf.merge(pred_df_clicks, tmp_sizes.loc[tmp_sizes.session_size < cfg.keep_amount], on='session', how='inner')
        del less_then_keep_preds['session_size']
        less_then_keep_preds = add_most_common(less_then_keep_preds, top_clicks_df, event_name='clicks', keep_amount=cfg.keep_amount)

        pred_df_clicks = cudf.merge(pred_df_clicks, tmp_sizes.loc[tmp_sizes.session_size >= cfg.keep_amount], on='session', how='inner')
        # Drop the helper column here as well; otherwise it is written to the
        # parquet file and the schema differs from the add_most_common=False path.
        del pred_df_clicks['session_size']
        pred_df_clicks = pred_df_clicks.to_pandas()
        pred_df_clicks = pd.concat([pred_df_clicks, less_then_keep_preds], ignore_index=True).sort_values('session').reset_index(drop=True)
        pred_df_clicks.to_parquet(f'pred_df_clicks_{i}.pqt')

        del tmp_sizes, less_then_keep_preds
    else:
        pred_df_clicks = pred_df_clicks.to_pandas()
        pred_df_clicks.to_parquet(f'pred_df_clicks_{i}.pqt')

    print('Predictions per session', pred_df_clicks.groupby('session').size().mean())

    # Collapse to one row per session; add_suffix turns the session id in the
    # index into the "<session>_clicks" label.
    pred_df_clicks = pred_df_clicks.groupby('session').aid.apply(list)
    pred_df_clicks = pred_df_clicks.add_suffix("_clicks")
    clicks_pred_df = pred_df_clicks.reset_index()
    clicks_pred_df.columns = ["session_type", "labels"]

    clicks_pred_df.to_parquet(f'clicks_pred_df_{i}.pqt')

    # Free the chunk before the next iteration.
    del pred_df_clicks, clicks_pred_df, tmp
    _ = gc.collect()

print('---' * 10)
recall_score['clicks'] = calc_score('clicks', clip_amount=cfg.clip_amount_in_score_calc)

print(recall_score['clicks'])

0 

  "When using a sequence of booleans for `ascending`, "
  "When using a sequence of booleans for `ascending`, "


Predictions per session 37.245417096921045
1 Predictions per session 37.25388894440558
2 Predictions per session 37.173228739882084
3 Predictions per session 37.160654208720565
4 Predictions per session 37.21449429843554
5 Predictions per session 37.4111066697756
6 Predictions per session 37.16907609118062
7 Predictions per session 37.29300045523689
8 Predictions per session 37.28649389871534
9 Predictions per session 37.229362025794345
------------------------------
clicks recall = 0.5734790667682882
0.5734790667682882


In [33]:
# When the main score was computed with a clip above 20, also report the score at clip_amount=20.
# NOTE(review): the cell output shows the recall twice, so calc_score appears to print it itself
# and this print() repeats the returned value.
if cfg.clip_amount_in_score_calc > 20:
    print(calc_score('clicks', clip_amount=20))

clicks recall = 0.5195336575651625
0.5195336575651625


## Orders

In [35]:
# Load the covisitation matrices that add_items_from_buys_matrices reads as globals:
# top_20_buys_df is used for every event type, top_20_buy2buy_df only when for_event != 'carts'.

top_20_buys_df = load_top_20_buys_df()
top_20_buy2buy_df = load_top_20_buy2buy_df()

/kaggle/input/covisitation-to-canidates-dataset/top_15_carts_orders_v6_0.pqt /kaggle/input/covisitation-to-canidates-dataset/top_15_carts_orders_v6_1.pqt /kaggle/input/covisitation-to-canidates-dataset/top_15_carts_orders_v6_2.pqt /kaggle/input/covisitation-to-canidates-dataset/top_15_carts_orders_v6_3.pqt /kaggle/input/covisitation-to-canidates-dataset/top_15_buy2buy_v6_0.pqt 

In [36]:
def add_items_from_buys_matrices(df, for_event, keep_amount=cfg.keep_amount, weight_const1=1.0):
    """Rank each session's own items together with candidates from two covisitation matrices.

    The matrices are read from module-level globals (columns ``aid_x``,
    ``aid_y``, ``wgt``): ``top_20_buys_df`` always, plus ``top_click2cart_df``
    when ``for_event == 'carts'`` and ``top_20_buy2buy_df`` otherwise.

    Args:
        df: cuDF frame of session events with ``session``, ``aid``, ``type``
            and ``ts`` columns. The caller's frame is not modified.
        for_event: ``'carts'`` or ``'orders'``. Selects the second matrix, the
            per-type bonus and the name of the final weight column.
        keep_amount: maximum number of rows kept per session.
        weight_const1: multiplier applied to the second matrix's weight when
            the two matrix weights are combined.

    Returns:
        cuDF frame with one row per (session, aid), at most ``keep_amount``
        rows per session, holding ``session``, ``aid``, ``type``,
        ``repetitions``, ``top_buys_wgt``, ``top_buy2buy_wgt`` and
        ``f'{for_event}_final_wgt'``. Rows that come from the matrices carry
        ``type == -1`` and ``repetitions == -1``; rows that come from the
        session history carry ``-1`` in both matrix weight columns.
    """
    # Every event counts 1; the count per (session, aid, type) is attached back
    # to each event row. assign() returns a new frame, so the caller's frame
    # does not gain a 'repetitions' column.
    df = df.assign(repetitions=1)
    df = cudf.merge(
        df[['session', 'aid', 'type', 'ts']],
        df.groupby(['session', 'aid', 'type']).repetitions.sum().reset_index(),
        on=['session', 'aid', 'type'],
        how='inner'
    )

    # Timestamp of the last event of each session.
    df = cudf.merge(
        df,
        df.groupby('session').ts.max().reset_index().rename(columns={'ts': 'ts_max'}),
        on='session', how='inner'
    )

    # Log-scaled distance to the session's last event, flipped so that recent
    # events get the larger value. The flip uses the maximum over the whole
    # frame passed in, not a per-session maximum.
    df['ts_distance'] = df['ts_max'] - df['ts']
    df['ts_distance'] = np.log(df['ts_distance']+1)
    df['ts_distance'] = df['ts_distance'].max() - df['ts_distance']

    # First matrix. The inner merge drops sessions whose aids are all missing
    # from the matrix; they come back when the history rows are concatenated
    # at the end of the function.
    aids2_df = cudf.merge(df[['session', 'aid']], top_20_buys_df, left_on='aid', right_on='aid_x', how='inner')
    aids2_df.wgt = aids2_df.wgt.astype('float32')
    # NOTE(review): here each pair weight is capped at 200 BEFORE summing, while
    # the second matrix below is capped AFTER summing - confirm this is intended.
    aids2_df.loc[aids2_df.wgt > 200, 'wgt'] = 200

    aids2_df = aids2_df.groupby(['session', 'aid_y']).wgt.sum().reset_index()
    aids2_df = aids2_df.sort_values(['session', 'wgt'], ascending=[True, False], ignore_index=True).reset_index(drop=True)
    aids2_df = aids2_df.rename(columns={'aid_y': 'aid', 'wgt': 'top_buys_wgt'})
    aids2_df['top_buy2buy_wgt'] = 0

    # Second matrix: click-to-cart pairs for carts, buy-to-buy pairs otherwise.
    # Only the selected global is looked up, so the other one need not be loaded.
    second_matrix_df = top_click2cart_df if for_event == 'carts' else top_20_buy2buy_df
    aids3_df = cudf.merge(df[['session', 'aid']], second_matrix_df, left_on='aid', right_on='aid_x', how='inner')

    aids3_df = aids3_df.groupby(['session', 'aid_y']).wgt.sum().reset_index()
    aids3_df.wgt = aids3_df.wgt.astype('float32')
    aids3_df.loc[aids3_df.wgt > 200, 'wgt'] = 200

    aids3_df = aids3_df.sort_values(['session', 'wgt'], ascending=[True, False], ignore_index=True).reset_index(drop=True)
    # The column is named top_buy2buy_wgt for both choices of the second matrix.
    aids3_df = aids3_df.rename(columns={'aid_y': 'aid', 'wgt': 'top_buy2buy_wgt'})
    aids3_df['top_buys_wgt'] = 0

    # One row per (session, aid) with both matrix weights side by side.
    aids4_df = cudf.concat([aids2_df, aids3_df], ignore_index=True)
    aids4_df.top_buys_wgt = aids4_df.top_buys_wgt.astype('float32')
    aids4_df.top_buy2buy_wgt = aids4_df.top_buy2buy_wgt.astype('float32')

    aids4_df = aids4_df.groupby(['session', 'aid']).sum().reset_index()
    aids4_df['wgt'] = aids4_df['top_buys_wgt'] + weight_const1 * aids4_df['top_buy2buy_wgt']

    aids4_df = aids4_df.sort_values(['session', 'wgt'], ascending=[True, False], ignore_index=True).reset_index(drop=True)

    # History rows start from the best candidate weight of their session.
    # fillna(1) covers sessions without any candidate (it fills every column
    # of the merged frame, not only 'wgt').
    max_weight_df = aids4_df.groupby('session').wgt.max().reset_index()
    df = cudf.merge(df, max_weight_df, on='session', how='left').fillna(1)

    df = df.sort_values(['session', 'ts'], ascending=[True, True], ignore_index=True).reset_index(drop=True)

    df.wgt = df.wgt.astype('float32')
    df.wgt = df.wgt + df.repetitions
    df.wgt = df.wgt + df.ts_distance
    # Bonus per event type code (index 0, 1, 2 of the list).
    if for_event=='orders':
        type_weights = [1, 10, 2]
    else:
        type_weights = [2, 3, 1]
    for i in range(3):
        df.loc[df.type==i, 'wgt'] = df.loc[df.type==i, 'wgt'] + type_weights[i]
    # -1 marks "not applicable" on both sides of the concat.
    df['top_buys_wgt'] = -1
    df['top_buy2buy_wgt'] = -1
    del df['ts'], df['ts_max'], df['ts_distance']

    aids4_df['type'] = -1
    aids4_df['repetitions'] = -1

    aids4_df = cudf.concat([df, aids4_df])

    # Sort by weight so that drop_duplicates keeps the highest-weighted row of
    # every (session, aid).
    aids4_df = aids4_df.sort_values(['session', 'wgt'], ascending=[True, False], ignore_index=True)
    aids4_df = aids4_df.drop_duplicates(['session', 'aid'], keep='first', ignore_index=True)
    aids4_df = aids4_df.sort_values(['session', 'wgt'], ascending=[True, False], ignore_index=True)
    aids4_df = aids4_df.reset_index(drop=True)

    # Keep the first keep_amount rows of every session.
    aids4_df['n'] = aids4_df.groupby('session').cumcount()
    aids4_df = aids4_df.loc[aids4_df.n < keep_amount]
    del aids4_df['n']

    aids4_df.repetitions = aids4_df.repetitions.astype('int8')
    aids4_df.type = aids4_df.type.astype('int8')
    aids4_df.top_buys_wgt = aids4_df.top_buys_wgt.astype('float32')
    aids4_df.top_buy2buy_wgt = aids4_df.top_buy2buy_wgt.astype('float32')

    aids4_df.wgt = aids4_df.wgt.astype('float32')
    aids4_df = aids4_df.rename(columns={'wgt': f'{for_event}_final_wgt'})

    return aids4_df

In [37]:
# Generate order candidates for the test sessions in 10 chunks and write, per chunk:
#   pred_df_orders_{i}.pqt  - one row per (session, candidate aid) with its weights
#   orders_pred_df_{i}.pqt  - one row per session: "<session>_orders" and the aid list
uniq_test_sessions = test_df.session.unique()
uniq_test_sessions_df = cudf.DataFrame({'session': uniq_test_sessions})
chunks = 10
chunk_size = int(np.ceil(len(uniq_test_sessions) / chunks))
for i in range(chunks):
    print(i, end=' ')
    from_i = i*chunk_size
    to_i = min((i+1)*chunk_size, len(uniq_test_sessions))

    # Events of the sessions that belong to this chunk.
    tmp = cudf.merge(cudf.DataFrame(test_df), uniq_test_sessions_df[from_i:to_i], on='session', how='inner')

    pred_df_orders = add_items_from_buys_matrices(
        cudf.DataFrame(tmp).sort_values(["session", "ts"]),
        keep_amount=cfg.keep_amount,
        weight_const1=cfg.weight_const1_orders,
        for_event='orders'
    )

    if cfg.add_most_common:
        # Sessions with fewer than keep_amount candidates are handed to add_most_common.
        tmp_sizes = pred_df_orders.groupby('session').size().reset_index()
        tmp_sizes.columns=['session', 'session_size']

        less_then_keep_preds = cudf.merge(pred_df_orders, tmp_sizes.loc[tmp_sizes.session_size < cfg.keep_amount], on='session', how='inner')
        del less_then_keep_preds['session_size']
        less_then_keep_preds = add_most_common(less_then_keep_preds, top_orders_df, event_name='orders', keep_amount=cfg.keep_amount)

        pred_df_orders = cudf.merge(pred_df_orders, tmp_sizes.loc[tmp_sizes.session_size >= cfg.keep_amount], on='session', how='inner')
        # Drop the helper column here as well; otherwise it is written to the
        # parquet file and the schema differs from the add_most_common=False path.
        del pred_df_orders['session_size']
        pred_df_orders = pred_df_orders.to_pandas()
        pred_df_orders = pd.concat([pred_df_orders, less_then_keep_preds], ignore_index=True).sort_values('session').reset_index(drop=True)
        pred_df_orders.to_parquet(f'pred_df_orders_{i}.pqt')

        # Release both temporaries, as the clicks and carts loops do.
        del tmp_sizes, less_then_keep_preds
    else:
        pred_df_orders = pred_df_orders.to_pandas()
        pred_df_orders.to_parquet(f'pred_df_orders_{i}.pqt')

    print('Predictions per session', pred_df_orders.groupby('session').size().mean())

    # Collapse to one row per session; add_suffix turns the session id in the
    # index into the "<session>_orders" label.
    pred_df_orders = pred_df_orders.groupby('session').aid.apply(list)
    pred_df_orders = pred_df_orders.add_suffix("_orders")
    orders_pred_df = pred_df_orders.reset_index()
    orders_pred_df.columns = ["session_type", "labels"]

    orders_pred_df.to_parquet(f'orders_pred_df_{i}.pqt')

    # Free the chunk before the next iteration.
    del pred_df_orders, orders_pred_df, tmp
    _ = gc.collect()

print('---' * 10)
recall_score['orders'] = calc_score('orders', clip_amount=cfg.clip_amount_in_score_calc)

0 Predictions per session 39.063566614480976
1 Predictions per session 39.0377069384764
2 Predictions per session 39.03462576196662
3 Predictions per session 38.98121870246383
4 Predictions per session 39.04541820725492
5 Predictions per session 39.15981035497374
6 Predictions per session 39.04358060468783
7 Predictions per session 39.073826099508125
8 Predictions per session 39.101118106214535
9 Predictions per session 39.06642904334405
------------------------------
orders recall = 0.6724584327200813


In [38]:
# When the main score was computed with a clip above 20, also report the score at clip_amount=20.
# NOTE(review): the cell output shows the recall twice, so calc_score appears to print it itself
# and this print() repeats the returned value.
if cfg.clip_amount_in_score_calc > 20:
    print(calc_score('orders', clip_amount=20))

orders recall = 0.6478010105233591
0.6478010105233591


## Carts

In [39]:
# Make sure both matrices used for cart candidates are available, loading any
# that no earlier cell has defined. Only NameError ("variable not defined") is
# treated as "not loaded yet"; any other failure is a real error and propagates.
try:
    print(len_mils(top_20_buys_df))
except NameError:
    top_20_buys_df = load_top_20_buys_df()
    print(len_mils(top_20_buys_df))
try:
    print(len_mils(top_click2cart_df))
except NameError:
    top_click2cart_df = load_top_click2cart_df()
    print(len_mils(top_click2cart_df))

49,067,985
/kaggle/input/covisitation-to-canidates-dataset/top_click2cart_v6_2.pqt /kaggle/input/covisitation-to-canidates-dataset/top_click2cart_v6_1.pqt /kaggle/input/covisitation-to-canidates-dataset/top_click2cart_v6_0.pqt /kaggle/input/covisitation-to-canidates-dataset/top_click2cart_v6_3.pqt 53,503,866


In [40]:
# Generate cart candidates for the test sessions in 10 chunks and write, per chunk:
#   pred_df_carts_{i}.pqt  - one row per (session, candidate aid) with its weights
#   carts_pred_df_{i}.pqt  - one row per session: "<session>_carts" and the aid list
uniq_test_sessions = test_df.session.unique()
uniq_test_sessions_df = cudf.DataFrame({'session': uniq_test_sessions})
chunks = 10
chunk_size = int(np.ceil(len(uniq_test_sessions) / chunks))
for i in range(chunks):
    print(i, end=' ')
    from_i = i*chunk_size
    to_i = min((i+1)*chunk_size, len(uniq_test_sessions))

    # Events of the sessions that belong to this chunk.
    tmp = cudf.merge(cudf.DataFrame(test_df), uniq_test_sessions_df[from_i:to_i], on='session', how='inner')

    pred_df_carts = add_items_from_buys_matrices(
        cudf.DataFrame(tmp).sort_values(["session", "ts"]),
        keep_amount=cfg.keep_amount,
        # NOTE(review): cfg.weight_const1_carts (0.1 in the config cell) is
        # bypassed by this literal. The value is kept so the recorded recall
        # stays reproducible; move it into cfg once the intended value is settled.
#         weight_const1=cfg.weight_const1_carts,
        weight_const1=0.5,
        for_event='carts'
    )

    if cfg.add_most_common:
        # Sessions with fewer than keep_amount candidates are handed to add_most_common.
        tmp_sizes = pred_df_carts.groupby('session').size().reset_index()
        tmp_sizes.columns=['session', 'session_size']

        less_then_keep_preds = cudf.merge(pred_df_carts, tmp_sizes.loc[tmp_sizes.session_size < cfg.keep_amount], on='session', how='inner')
        del less_then_keep_preds['session_size']
        less_then_keep_preds = add_most_common(less_then_keep_preds, top_carts_df, event_name='carts', keep_amount=cfg.keep_amount)

        pred_df_carts = cudf.merge(pred_df_carts, tmp_sizes.loc[tmp_sizes.session_size >= cfg.keep_amount], on='session', how='inner')
        # Drop the helper column here as well; otherwise it is written to the
        # parquet file and the schema differs from the add_most_common=False path.
        del pred_df_carts['session_size']
        pred_df_carts = pred_df_carts.to_pandas()
        pred_df_carts = pd.concat([pred_df_carts, less_then_keep_preds], ignore_index=True).sort_values('session').reset_index(drop=True)
        pred_df_carts.to_parquet(f'pred_df_carts_{i}.pqt')

        del tmp_sizes, less_then_keep_preds
    else:
        pred_df_carts = pred_df_carts.to_pandas()
        pred_df_carts.to_parquet(f'pred_df_carts_{i}.pqt')

    print('Predictions per session', pred_df_carts.groupby('session').size().mean())

    # Collapse to one row per session; add_suffix turns the session id in the
    # index into the "<session>_carts" label.
    pred_df_carts = pred_df_carts.groupby('session').aid.apply(list)
    pred_df_carts = pred_df_carts.add_suffix("_carts")
    carts_pred_df = pred_df_carts.reset_index()
    carts_pred_df.columns = ["session_type", "labels"]

    carts_pred_df.to_parquet(f'carts_pred_df_{i}.pqt')

    # Free the chunk before the next iteration.
    del pred_df_carts, carts_pred_df, tmp
    _ = gc.collect()

print('---' * 10)

recall_score['carts'] = calc_score('carts', clip_amount=cfg.clip_amount_in_score_calc)

0 Predictions per session 39.399259407303774
1 Predictions per session 39.35649489801583
2 Predictions per session 39.37376614147874
3 Predictions per session 39.329169581293094
4 Predictions per session 39.3644004752229
5 Predictions per session 39.47136448930193
6 Predictions per session 39.373099941152304
7 Predictions per session 39.412555655485605
8 Predictions per session 39.42992127732809
9 Predictions per session 39.38752588595191
------------------------------
carts recall = 0.4424275647125343


In [41]:
# When the main score was computed with a clip above 20, also report the score at clip_amount=20.
# NOTE(review): the cell output shows the recall twice, so calc_score appears to print it itself
# and this print() repeats the returned value.
if cfg.clip_amount_in_score_calc > 20:
    print(calc_score('carts', clip_amount=20))

carts recall = 0.4040715928684677
0.4040715928684677


# Overall score

In [44]:
# Recall per event type as stored by the clicks, orders and carts cells above.
print(recall_score)

{'clicks': 0.5734790667682882, 'orders': 0.6724584327200813, 'carts': 0.4424275647125343}


In [45]:
def calc_total_score(recall_score):
    """Print the recall of each event type and return their weighted sum.

    Args:
        recall_score: mapping with the keys 'clicks', 'carts' and 'orders'.

    Returns:
        0.10 * clicks + 0.30 * carts + 0.60 * orders.

    Raises:
        KeyError: if one of the three event types is missing from the mapping.
    """
    # Insertion order (clicks, carts, orders) fixes both the print order and
    # the order in which the terms are accumulated.
    event_weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}

    total = 0
    for event_name, event_weight in event_weights.items():
        event_recall = recall_score[event_name]
        total = total + event_weight * event_recall
        print(f'{event_name} recall =',event_recall)

    separator = '============='
    print(separator)
    print('Overall Recall =',total)
    print(separator)

    return total

# Weighted overall recall; as the last expression of the cell, the returned score is also displayed.
calc_total_score(recall_score)

clicks recall = 0.5734790667682882
carts recall = 0.4424275647125343
orders recall = 0.6724584327200813
Overall Recall = 0.5935512357226379


0.5935512357226379

# Create Submission CSV

In [47]:
# Assemble submission.csv from the per-chunk "<event>_pred_df_{i}.pqt" files
# written by the candidate loops (columns: session_type, labels).
if cfg.create_submission:
    ev_preds_df = {}
    events_list = ['clicks', 'carts', 'orders']
    for ev in events_list:
        # glob returns matches in arbitrary order; sorting makes the row order
        # of the submission reproducible.
        files_for_ev = sorted(glob.glob(f'{ev}_pred_df*'))
        print(ev, len(files_for_ev))
        # Read every chunk first and concatenate once, instead of re-copying a
        # growing frame on every iteration.
        frames = [pd.read_parquet(f) for f in files_for_ev]
        # pd.concat raises on an empty list, so keep the empty-frame result
        # when no chunk file exists.
        ev_preds_df[ev] = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
        preview_df(ev_preds_df[ev])

    print('Submission: ')
    pred_df = pd.concat([ev_preds_df[ev] for ev in events_list], ignore_index=True).reset_index(drop=True)
    # labels: list of aids -> single space-separated string.
    pred_df["labels"] = pred_df.labels.apply(lambda x: " ".join(map(str,x)))
    pred_df.to_csv("submission.csv", index=False)
    preview_df(pred_df,5)

# Compute Validation Score
This code is from Radek [here][1]. It has been modified to use less memory.

[1]: https://www.kaggle.com/competitions/otto-recommender-system/discussion/364991

In [49]:
%%time
# COMPUTE METRIC
# Recomputes recall per event type from the submission frame `pred_df` and the
# ground-truth labels file, then combines them with weights 0.10 / 0.30 / 0.60.
# Runs only when both cfg flags are set; `pred_df` is created by the submission
# cell, which is itself guarded by cfg.create_submission.

# if True:
if cfg.run_final_score_calc and cfg.create_submission:
    # FREE MEMORY
    # NOTE(review): other cells of this notebook refer to `top_clicks_df` /
    # `top_orders_df`; confirm that `top_clicks` and `top_orders` exist,
    # otherwise this `del` raises NameError.
    del top_clicks, top_orders, test_df
    _ = gc.collect()
    
    score = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    for t in ['clicks','carts','orders']:
        # Rows of this event type: session_type has the form "<session>_<type>".
        sub = pred_df.loc[pred_df.session_type.str.contains(t)].copy()
        sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
        # Only the first 20 predicted aids per session are scored.
        sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])

        # The labels file is re-read for every event type instead of being kept around.
        test_labels = pd.read_parquet(test_labels_file)
        test_labels = test_labels.loc[test_labels['type']==t]
        # NOTE(review): with how='left', a labelled session that has no prediction
        # row gets NaN in `labels`; set(NaN) in the next line would then fail -
        # confirm every labelled session has predictions.
        test_labels = test_labels.merge(sub, how='left', on=['session'])
        test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)

        # At most 20 ground-truth items per row count towards the denominator.
        test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)

        recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
        score += weights[t]*recall
        print(f'{t} recall =',recall)

    print('=============')
    print('Overall Recall =',score)
    print('=============')

CPU times: user 17 µs, sys: 0 ns, total: 17 µs
Wall time: 22.9 µs
