In [1]:
pip install polars

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install pickle5


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pickle5
  Downloading pickle5-0.0.11.tar.gz (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.1/132.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pickle5
  Building wheel for pickle5 (setup.py) ... [?25l[?25hdone
  Created wheel for pickle5: filename=pickle5-0.0.11-cp310-cp310-linux_x86_64.whl size=256403 sha256=9a80d0eedd1f568a96525f08509601fd02b2665797206df9b38f1d1b475833e7
  Stored in directory: /root/.cache/pip/wheels/7d/14/ef/4aab19d27fa8e58772be5c71c16add0426acf9e1f64353235c
Successfully built pickle5
Installing collected packages: pickle5
Successfully installed pickle5-0.0.11


In [None]:
# Notebook-wide settings. 'train_session_num' is the number of sessions that
# preprocess_training_data samples from the training set.
config = dict(train_session_num=12_899_779)

In [None]:


import polars as pl
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import seaborn as sns
import pandas as pd
from heapq import nlargest
import pickle5 as pickle
import os
from datetime import datetime
from tqdm import tqdm
import math
import numpy as np
import random
import copy
from collections import defaultdict, Counter
import gc


In [None]:
def load_datasets_and_mappings(train_path='train.parquet',
                               id2type_path='id2type.pkl',
                               type2id_path='type2id.pkl'):
    """Load the training events and the two pickled type mappings.

    Args:
        train_path: Parquet file holding the training data.
        id2type_path: Pickle file holding the id -> type mapping.
        type2id_path: Pickle file holding the type -> id mapping.

    Returns:
        Tuple ``(training_data, id_to_type_mapping, type_to_id_mapping)``;
        ``training_data`` is the parquet content converted to a pandas
        DataFrame.
    """
    def _load_pickle(path):
        # SECURITY: unpickling can execute arbitrary code -- only point this
        # at files from a trusted source.
        with open(path, "rb") as fh:
            return pickle.load(fh)

    training_data = pl.read_parquet(train_path).to_pandas()
    id_to_type_mapping = _load_pickle(id2type_path)
    type_to_id_mapping = _load_pickle(type2id_path)
    return training_data, id_to_type_mapping, type_to_id_mapping



In [None]:
# Load the training frame and the id -> type mapping; the third return value
# (the type -> id mapping) is discarded.
training_data, id_to_type_mapping, _ = load_datasets_and_mappings()

In [None]:
def preprocess_training_data(training_data, config):
    """Sample sessions and derive the string ``aid`` and ``time_stamp`` columns.

    The input frame is not modified; a new frame is returned.

    Args:
        training_data: DataFrame with at least ``session``, ``aid`` and ``ts``
            columns. ``ts`` is interpreted as seconds since the epoch.
        config: Mapping with ``'train_session_num'``, the number of sessions
            to sample. If it exceeds the number of distinct sessions, every
            session is kept. An optional ``'seed'`` entry makes the sample
            reproducible; without it the global ``random`` state is used.

    Returns:
        DataFrame restricted to the sampled sessions (index reset), with
        ``aid`` converted to ``str`` and a ``time_stamp`` column formatted as
        ``YYYY-MM-DD``.

    Raises:
        ValueError: If ``config['train_session_num']`` is negative.
    """
    session_ids = list(training_data['session'].unique())
    # Clamp so that asking for more sessions than exist keeps them all
    # instead of making random.sample raise.
    sample_size = min(config['train_session_num'], len(session_ids))

    if sample_size < len(session_ids):
        rng = random.Random(config['seed']) if 'seed' in config else random
        sampled_sessions = rng.sample(session_ids, sample_size)
        in_sample = training_data['session'].isin(sampled_sessions)
        sampled = training_data[in_sample].reset_index(drop=True)
    else:
        # Every session is requested: no sampling needed, just take a new frame.
        sampled = training_data.reset_index(drop=True)

    # Column edits happen on the new frame so the caller's data stays intact.
    sampled['aid'] = sampled['aid'].astype('int32').astype('str')
    sampled['time_stamp'] = pd.to_datetime(sampled['ts'], unit='s').dt.strftime('%Y-%m-%d')

    return sampled

In [None]:
# Rebind training_data to the sampled frame that carries the string `aid`
# and the `time_stamp` column.
training_data = preprocess_training_data(training_data, config)

In [None]:
# `sampled_sessions` only exists inside preprocess_training_data and is released
# when that call returns, so there is no notebook-level name to delete here
# (deleting it raised NameError). `gc` is already imported in the imports cell.
gc.collect()

In [None]:
def generate_similarity_pairs(data, top_k=50):
    """Count, for every item, which items most often follow it.

    Events are ordered by ``session`` then ``ts`` and each event is paired with
    the one that follows it. The last event of every (session, day) group is
    dropped, so a pair never spans two sessions or two calendar days.

    Args:
        data: DataFrame with ``session``, ``aid``, ``ts`` and ``time_stamp``
            (day string) columns. It is not modified.
        top_k: Maximum number of follower items kept per item (default 50).

    Returns:
        Dict mapping each ``aid`` to a ``Counter`` of its ``top_k`` most
        frequent follower aids.
    """
    data = data.sort_values(by=['session', 'ts'])
    # After sorting, the next row holds the next event; rows whose successor
    # belongs to another session/day are removed below.
    data['next_aid'] = data['aid'].shift(-1)

    data['session_day'] = data['session'].astype('str') + '_' + data['time_stamp']
    data['session_day_count'] = data['session_day'].map(data['session_day'].value_counts())
    data['ranking'] = data.groupby(['session_day'])['ts'].rank(method='first', ascending=True)
    # rank == group size identifies the final event of each session-day.
    has_successor = data['session_day_count'] != data['ranking']
    data = data[has_successor].reset_index(drop=True)

    # Select the single needed column before apply so that no per-group
    # DataFrame (including the grouping column) has to be built.
    top_followers = (
        data.groupby('aid')['next_aid']
        .apply(lambda followers: Counter(followers).most_common(top_k))
        .to_dict()
    )
    return {aid: Counter(dict(top)) for aid, top in top_followers.items()}

In [None]:
# Build the aid -> Counter(follower aid -> count) lookup from the preprocessed
# training frame.
similar_items = generate_similarity_pairs(training_data)

In [None]:
# The submission cell further down still reads training_data['aid'] to rank
# items by popularity, so deleting the frame here made that cell fail with
# NameError. Keep only the column that is still needed and release the rest.
training_data = training_data[['aid']]
gc.collect()

In [None]:
def load_and_preprocess_test_data(test_path='test.parquet'):
    """Load the test events and group item ids by session.

    Args:
        test_path: Parquet file holding the test data; it is expected to
            provide ``session``, ``aid``, ``type`` and ``ts`` columns.

    Returns:
        Dict mapping each session id to the list of its ``aid`` values (as
        strings), ordered by ``type`` and then ``ts`` within the session.
    """
    test_data = pl.read_parquet(test_path).to_pandas()
    test_data['aid'] = test_data['aid'].astype('int32').astype('str')
    # NOTE(review): sorting on `type` before `ts` means each list is not purely
    # chronological -- confirm this ordering is intended.
    test_data = test_data.sort_values(["session", "type", "ts"])
    # The former `time_stamp` column was never read, so it is no longer computed.
    return test_data.groupby('session')['aid'].agg(list).to_dict()

In [None]:
def generate_recommendations(session_to_item_ids, similar_items, popular_items):
    """Run ``recommend_items`` once per session and collect the results.

    ``recommend_items`` is not defined in this part of the notebook; it must
    be available in the kernel before this function is called.

    Returns:
        Tuple ``(session_ids, recommended_item_lists)`` of two parallel lists,
        in the iteration order of ``session_to_item_ids``.
    """
    recommended_item_lists = [
        recommend_items(history, similar_items, popular_items)
        for history in tqdm(session_to_item_ids.values())
    ]
    session_ids = list(session_to_item_ids)
    return session_ids, recommended_item_lists

In [None]:
def create_submission_file(session_ids, recommended_item_lists, id_to_type_mapping,
                           output_path='submission.csv'):
    """Write the submission CSV, repeating every session once per event type.

    Args:
        session_ids: Session identifiers, one per recommendation list.
        recommended_item_lists: For each session, an iterable of recommended
            items; they are space-joined into the ``labels`` column.
        id_to_type_mapping: Lookup from type id (0, 1, 2) to the type name
            used as the ``session_type`` suffix.
        output_path: Destination CSV path (default ``'submission.csv'``).

    Returns:
        DataFrame with ``session_type`` and ``labels`` columns: all rows for
        type 0, then type 1, then type 2, with a unique 0..n-1 index.
    """
    base_df = pd.DataFrame({
        'session_type': session_ids,
        'labels': [' '.join(str(item) for item in item_list)
                   for item_list in recommended_item_lists],
    })

    per_type_frames = []
    for type_ in [0, 1, 2]:
        suffix = id_to_type_mapping[type_]
        per_type_frames.append(
            base_df.assign(session_type=base_df['session_type'].astype(str) + f'_{suffix}')
        )

    # ignore_index gives the stacked frame a unique index instead of repeating
    # 0..n-1 once per type.
    submission_df = pd.concat(per_type_frames, axis=0, ignore_index=True)

    submission_df.to_csv(output_path, index=False)
    return submission_df

In [None]:
# Build per-session test histories, rank training items by how often they
# occur, produce recommendations, and write the submission file.
session_to_item_ids = load_and_preprocess_test_data()
popular_items = training_data['aid'].value_counts().index.tolist()
session_ids, recommended_item_lists = generate_recommendations(
    session_to_item_ids,
    similar_items,
    popular_items,
)
submission_df = create_submission_file(
    session_ids, recommended_item_lists, id_to_type_mapping
)

In [None]:
# Preview the first rows of the submission frame.
submission_df.head()

Unnamed: 0,session_type,labels
0,12899779_clicks,59625 1243845 1223875 1266598 1275942 1272658 ...
1,12899779_carts,59625 1243845 1223875 1266598 1275942 1272658 ...
2,12899779_orders,59625 1243845 1223875 1266598 1275942 1272658 ...
3,12899780_clicks,1142000 736515 582732 973453 1502122 889686 48...
4,12899780_carts,1142000 736515 582732 973453 1502122 889686 48...
