In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from multiprocessing.pool import Pool

In [2]:
# load data: read the product and transaction tables from CSV, then attach
# the product columns to each transaction row through the shared product_id key
prods = pd.read_csv('products.csv')
trans = pd.read_csv('transactions.csv')
df = pd.merge(trans, prods, on='product_id')

In [3]:
# preprocessing and optimisation
# note which columns are stored as 64-bit numbers before changing anything
int64 = df.select_dtypes('int64').columns
float64 = df.select_dtypes('float64').columns

# missing days_since_prior_order values are replaced by 9999 before the
# float64 columns are cast to an integer dtype below
df['days_since_prior_order'] = df['days_since_prior_order'].fillna(9999)

# shrink every 64-bit numeric column (integer and float alike) to int32
df = df.astype(dict.fromkeys([*int64, *float64], 'int32'))

# order rows by user, then order number, then cart position; the rebuilt
# 0..n-1 index is then stored as an int32 sequence number in `time`
df = df.sort_values(['user_id', 'order_number', 'add_to_cart_order'], ignore_index=True)
df['time'] = df.index.values.astype('int32')

In [4]:
# sanity check: number of missing values left in each column
df.isnull().sum()

order_id                  0
user_id                   0
order_number              0
order_dow                 0
order_hour_of_day         0
days_since_prior_order    0
product_id                0
add_to_cart_order         0
reordered                 0
product_name              0
aisle_id                  0
department_id             0
aisle                     0
department                0
time                      0
dtype: int64

In [5]:
# calc frequency: count the rows for every (user_id, product_id) pair
# and list the most frequent pairs first
aggr = (
    df.groupby(['user_id', 'product_id'])
    .agg(cnt=('order_id', 'count'))
    .sort_values('cnt', ascending=False)
)
aggr.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,cnt
user_id,product_id,Unnamed: 2_level_1
41356,6583,99
41356,14366,99
41356,38652,99
41356,29671,98
17997,4210,98


In [6]:
# calc top10 prods for cold start: the ten product_ids with the most rows
# overall, with product_name attached from the products table
top10 = (
    df.groupby('product_id')
    .agg(cnt=('order_id', 'count'))
    .sort_values(by='cnt', ascending=False)
    .head(10)
    .reset_index()
)
top10 = pd.merge(top10, prods[['product_id', 'product_name']], on='product_id')

In [None]:
# make submission sync version
# For every user, write a space-separated list of n_recs product ids: the
# user's most frequent products first, padded with globally popular products
# when the user has fewer than n_recs distinct products.
n_recs = 10
fallback_ids = top10.product_id.to_list()

# Pre-compute each user's n_recs most frequent products in a single pass.
# aggr is ordered by cnt (descending) and groupby/head keep that row order,
# so every list is already ranked. Doing this once replaces a per-user
# aggr.loc[user] lookup on a MultiIndex that is no longer lexsorted.
user_top = (
    aggr.reset_index()
    .groupby('user_id', sort=False)
    .head(n_recs)
    .groupby('user_id', sort=False)['product_id']
    .agg(list)
    .to_dict()
)

results = {"user_id": [],
           "product_id": []}

for user in tqdm(df.user_id.unique()):
    # deliberately not named `prods`: that name holds the products DataFrame
    # and rebinding it here would break a re-run of the cells that read it
    user_prods = user_top.get(user, [])
    if len(user_prods) < n_recs:
        # short history: pad with popular products, skipping those the user
        # already has so that no product id is listed twice
        already = set(user_prods)
        padding = [p for p in fallback_ids if p not in already]
        user_prods = user_prods + padding[:n_recs - len(user_prods)]
    results["user_id"].append(user)
    results["product_id"].append(" ".join(str(x) for x in user_prods))

sub = pd.DataFrame(results)
sub.to_csv('submission.csv', index=False)
sub.head()

  0%|          | 0/100000 [00:00<?, ?it/s]

In [37]:
# both result lists should have one entry per user
print(*(len(results[key]) for key in ('product_id', 'user_id')))

100000 100000
