In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt

# Directory prefix that is concatenated with each CSV filename when the
# data files are read (note the trailing slash: paths are built with `+`).
IDIR = 'data/'

In [2]:
# The prior and train order-line files have the same four columns, so a
# single dtype map is shared by both reads instead of being repeated.
order_products_dtypes = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
}

print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv',
                     dtype=order_products_dtypes)
print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))

print('loading train')
train = pd.read_csv(IDIR + 'order_products__train.csv',
                    dtype=order_products_dtypes)
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))

print('loading orders')
orders = pd.read_csv(IDIR + 'orders.csv',
                     dtype={
                         'order_id': np.int32,
                         'user_id': np.int32,
                         'eval_set': 'category',
                         'order_number': np.int16,
                         'order_dow': np.int8,
                         'order_hour_of_day': np.int8,
                         # float (not int) so that missing values can be stored
                         'days_since_prior_order': np.float32})
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))

print('loading products')
# Only the three columns listed in `usecols` are read, so the dtype map names
# exactly those columns (the former 'order_id' entry referred to a column
# that is never loaded here and has been dropped).
products = pd.read_csv(IDIR + 'products.csv',
                       dtype={
                           'product_id': np.uint16,
                           'aisle_id': np.uint8,
                           'department_id': np.uint8},
                       usecols=['product_id', 'aisle_id', 'department_id'])

loading prior
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
loading train
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered
loading orders
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
loading products


In [3]:
print('split orders : train, test')

# Partition the orders table by its `eval_set` label.
test_orders = orders.loc[orders['eval_set'] == 'test']
train_orders = orders.loc[orders['eval_set'] == 'train']

# Index the train order lines by (order_id, product_id); drop=False keeps
# both key columns available as ordinary columns as well.
train.set_index(keys=['order_id', 'product_id'], drop=False, inplace=True)

split orders : train, test


In [4]:
# Prior orders that belong to users who also have a test order, joined with
# the products contained in each of those orders.
test_users_prior_orders = orders.loc[
    (orders['eval_set'] == 'prior')
    & orders['user_id'].isin(test_orders['user_id'])
].merge(priors, on='order_id')

# Count the joined rows per (user_id, product_id) pair, name that column
# 'count', and move product_id out of the index so only user_id remains there.
test_users_prior_orders_products = (
    test_users_prior_orders[['user_id', 'product_id', 'order_id']]
    .groupby(['user_id', 'product_id'])
    .count()
    .rename(columns={'order_id': 'count'})
    .reset_index(level=1)
)

# The joined frame is no longer needed; release it.
del test_users_prior_orders

summary_shape = test_users_prior_orders_products.shape
summary_columns = ', '.join(test_users_prior_orders_products.columns)
print('test_users_prior_orders_products {}: {}'.format(summary_shape, summary_columns))

test_users_prior_orders_products (4833292, 2): product_id, count


#### Predict products for each user

In [5]:
def dummy_predict_products_previous_users(df, std_thresh=3.0):
    """Select, per user, the products whose count stands out from the rest.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by user id, with one row per product and the columns
        ``product_id`` and ``count``.
    std_thresh : float, default 3.0
        A product is selected when its count is strictly greater than
        ``mean + std_thresh * std`` of that user's counts.

    Returns
    -------
    dict
        Maps each user id to a space-separated string of the selected
        product ids, or to the string ``'None'`` when nothing is selected.
        Users appear in the order they first occur in ``df``'s index.
    """
    res = {}

    # groupby always yields a DataFrame per user, unlike `df.loc[user_id]`,
    # which collapses a single-row user into a Series. sort=False keeps the
    # users in order of first appearance.
    for user_id, user_df in df.groupby(level=0, sort=False):
        if len(user_df) == 1:
            # A lone product has no spread to compare against (its std is
            # NaN), so it is predicted directly. Previously this branch never
            # set `predict_prods`, which raised NameError for the first user
            # or silently reused the previous user's predictions.
            predict_prods = user_df['product_id'].values
        else:
            counts = user_df['count']
            thresh = counts.mean() + std_thresh * counts.std()
            predict_prods = user_df.loc[counts > thresh, 'product_id'].values

        if len(predict_prods) == 0:
            # str(None) below produces the literal 'None' marker.
            predict_prods = [None]

        res[user_id] = ' '.join(str(e) for e in predict_prods)

    return res

In [6]:
# One prediction string per user id.
res = dummy_predict_products_previous_users(df=test_users_prior_orders_products)

# Turn the {user_id: products} mapping into a two-column frame, then swap the
# user id for that user's test order id.
sub = (
    pd.DataFrame.from_dict(res, orient='index')
    .reset_index()
    .set_axis(['user_id', 'products'], axis=1)
    .merge(test_orders[['order_id', 'user_id']], on='user_id')
    .drop(labels='user_id', axis=1)
)

In [7]:
# Write the submission file with exactly these two columns, in this order,
# and without the row index.
sub.to_csv('baseline.csv', columns=['order_id', 'products'], index=False)