## 1. Setup

In [1]:
import pathlib
from tqdm import tqdm
import pandas as pd

In [2]:
# Input locations: the raw competition files and the pre-pickled copies of them
competition_dataset_directory = pathlib.Path('../input/otto-recommender-system')
pickled_dataset_directory = pathlib.Path('../input/otto-multi-objective-recommender-system-pickle')

# Load the training frame first, then the test frame, from the pickled dataset.
# NOTE(review): unpickling executes arbitrary code, so only load pickles from a trusted source.
df_train, df_test = (
    pd.read_pickle(pickled_dataset_directory / pickle_name)
    for pickle_name in ('train.pkl', 'test.pkl')
)

# Report the shape and the memory footprint (bytes converted to MB) of both frames
print(f'Training Shape: {df_train.shape} - Memory Usage: {df_train.memory_usage().sum() / 1024 ** 2:.2f} MB')
print(f'Test Shape: {df_test.shape} - Memory Usage: {df_test.memory_usage().sum() / 1024 ** 2:.2f} MB')

Training Shape: (216716096, 4) - Memory Usage: 3513.50 MB
Test Shape: (6928123, 4) - Memory Usage: 112.32 MB


## 2. Dataset Statistics

The 20 most frequent aids are extracted from the training, test, and training + test sets, both across all event types combined and for each event type separately. The extracted aids are used to fill predictions for sessions that have fewer than 20 unique aids.

In [3]:
def _aid_count_frame(df_grouped_counts):
    """Flatten a groupby-count result indexed by aid into a frame with 'aid' and 'count' columns."""
    return df_grouped_counts.rename(columns={'aid': 'count'}).reset_index()


def _combine_aid_counts(df_first_counts, df_second_counts):
    """Add up the 'count' column of two ['aid', 'count'] frames per aid."""
    # ignore_index=True already produces a fresh RangeIndex, so no further reset_index is needed
    df_stacked = pd.concat((df_first_counts, df_second_counts), axis=0, ignore_index=True)
    return df_stacked.groupby('aid')['count'].sum().reset_index()


def _most_frequent_aids(df_aid_counts, n=20):
    """Return an {aid: count} dict of the n rows with the largest 'count', most frequent first."""
    # Non-inplace sort: the input frame is left untouched, so re-running the cell is safe
    df_sorted = df_aid_counts.sort_values(by='count', ascending=False)
    return df_sorted.set_index('aid').head(n).to_dict()['count']


def _most_frequent_aids_of_type(df_train_by_types, df_test_by_types, type_code, n=20):
    """
    Extract the most frequent aids of a single event type.

    Parameters
    ----------
    df_train_by_types, df_test_by_types: counts grouped by ['type', 'aid'] for the training and test sets
    type_code: value of the 'type' index level to select
    n: number of most frequent aids to keep

    Returns
    -------
    Tuple of three {aid: count} dicts for the training, test and training + test sets
    """
    df_train_counts = _aid_count_frame(df_train_by_types.loc[type_code])
    df_test_counts = _aid_count_frame(df_test_by_types.loc[type_code])
    df_all_counts = _combine_aid_counts(df_train_counts, df_test_counts)
    return (
        _most_frequent_aids(df_train_counts, n),
        _most_frequent_aids(df_test_counts, n),
        _most_frequent_aids(df_all_counts, n),
    )


# Most frequent aids over every event type at once
df_train_aid_counts = _aid_count_frame(df_train.groupby('aid')[['aid']].count())
df_test_aid_counts = _aid_count_frame(df_test.groupby('aid')[['aid']].count())
df_all_aid_counts = _combine_aid_counts(df_train_aid_counts, df_test_aid_counts)

train_20_most_frequent_aids = _most_frequent_aids(df_train_aid_counts)
test_20_most_frequent_aids = _most_frequent_aids(df_test_aid_counts)
all_20_most_frequent_aids = _most_frequent_aids(df_all_aid_counts)
del df_train_aid_counts, df_test_aid_counts, df_all_aid_counts

# Most frequent aids for each event type separately
df_train_aids_by_types = df_train.groupby(['type', 'aid'])[['aid']].count()
df_test_aids_by_types = df_test.groupby(['type', 'aid'])[['aid']].count()

# 'type' values selected per event: 0 -> click, 1 -> cart, 2 -> order
# (assumes the pickled dataset encodes the event types this way)
(
    train_20_most_frequent_click_aids,
    test_20_most_frequent_click_aids,
    all_20_most_frequent_click_aids,
) = _most_frequent_aids_of_type(df_train_aids_by_types, df_test_aids_by_types, type_code=0)

(
    train_20_most_frequent_cart_aids,
    test_20_most_frequent_cart_aids,
    all_20_most_frequent_cart_aids,
) = _most_frequent_aids_of_type(df_train_aids_by_types, df_test_aids_by_types, type_code=1)

(
    train_20_most_frequent_order_aids,
    test_20_most_frequent_order_aids,
    all_20_most_frequent_order_aids,
) = _most_frequent_aids_of_type(df_train_aids_by_types, df_test_aids_by_types, type_code=2)

del df_train_aids_by_types, df_test_aids_by_types

## 3. aid Frequency Baseline

[@Tawara](https://www.kaggle.com/ttahara) shared an aid frequency baseline in [this](https://www.kaggle.com/code/ttahara/otto-mors-aid-frequency-baseline) notebook. However, his implementation doesn't predict 20 aids if the number of unique aids in a session is less than 20.

The aim of this notebook is to compare different ways of filling the empty prediction slots and to improve the frequency baseline a little bit further.

* top 20 most frequent aids per session **-> 0.482** (Tawara's baseline)
* top 20 most frequent aids per session concatenated with top x (number of aids left to predict) most frequent aids in training set **-> 0.484**
* top 20 most frequent aids per session concatenated with top x (number of aids left to predict) most frequent aids in training + test set **-> 0.484**
* top 20 most frequent aids per session concatenated with top x (number of aids left to predict) most frequent aids per event type in training set **-> 0.484**
* top 20 most frequent aids per session concatenated with top x (number of aids left to predict) most frequent aids per event type in training + test set **-> 0.484**

There wasn't much improvement after concatenation. All of the concatenation types scored 0.484 on the public leaderboard; however, "most frequent aids per event type in training + test set" yields the largest boost compared to the others.

In [4]:
submission = []
# Number of aids predicted for every session/event type pair
n_predictions = 20

df_test_session_aid_frequencies = df_test.groupby(['session', 'aid'])['aid'].count()
# Sort values inside groups
df_test_session_aid_frequencies = df_test_session_aid_frequencies.sort_values(ascending=False).sort_index(level='session', sort_remaining=False)
df_test_session_aid_frequencies = df_test_session_aid_frequencies.rename('count').reset_index()
# Create a dictionary of session id keys and list of top 20 most frequent aid values
df_test_session_aid_frequencies = df_test_session_aid_frequencies.groupby('session')['aid'].agg(lambda x: list(x)[:n_predictions]).to_dict()

# Fallback aids per event type, most frequent first; built once instead of on every iteration
fallback_aids_by_event_type = {
    'click': list(all_20_most_frequent_click_aids.keys()),
    'cart': list(all_20_most_frequent_cart_aids.keys()),
    'order': list(all_20_most_frequent_order_aids.keys()),
}

for session_id, aids in tqdm(df_test_session_aid_frequencies.items()):

    # Both values are shared by the three event types of a session
    n_missing = n_predictions - len(aids)
    session_aids = set(aids)

    for event_type, fallback_aids in fallback_aids_by_event_type.items():

        predictions = aids.copy()

        if n_missing > 0:
            # Skip fallback aids that are already predicted for the session, otherwise
            # the label would contain duplicates and fewer than 20 unique aids.
            # Each fallback list holds 20 aids and at most len(aids) of them can be
            # skipped, so enough candidates always remain to fill every empty slot.
            predictions += [aid for aid in fallback_aids if aid not in session_aids][:n_missing]

        predictions = ' '.join([str(aid) for aid in predictions])
        submission.append({
            'session_type': f'{session_id}_{event_type}s',
            'labels': predictions
        })


100%|██████████| 1671803/1671803 [00:38<00:00, 43033.78it/s]


In [5]:
# Collect the per-session prediction records into a frame with one row per session/event type pair
df_submission = pd.DataFrame(data=submission)
# Bare last expression so that the notebook renders the frame
df_submission

Unnamed: 0,session_type,labels
0,12899779_clicks,59625 1460571 108125 485256 29735 1733943 1849...
1,12899779_carts,59625 485256 152547 33343 166037 1733943 23148...
2,12899779_orders,59625 231487 166037 1733943 1445562 1022566 80...
3,12899780_clicks,1142000 736515 973453 582732 1460571 108125 48...
4,12899780_carts,1142000 736515 973453 582732 485256 152547 333...
...,...,...
5015404,14571580_carts,202353 485256 152547 33343 166037 1733943 2314...
5015405,14571580_orders,202353 231487 166037 1733943 1445562 1022566 8...
5015406,14571581_clicks,1100210 1460571 108125 485256 29735 1733943 18...
5015407,14571581_carts,1100210 485256 152547 33343 166037 1733943 231...


In [6]:
# Write the submission file without the frame's index column
df_submission.to_csv(path_or_buf='submission.csv', index=False)