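Implementation of the paper: this notebook trains user–item and item–item latent-factor matrices on each user's prior orders with pairwise SGD updates, then ranks every test user's previously bought products and writes the top ones to a submission file.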

In [1]:
import numpy as np
import pandas as pd
import pickle
from scipy.special import expit

In [3]:
# Order-level table. Later cells read its user_id, order_id and eval_set columns
# to find each user's 'prior' and 'test' orders.
orders = pd.read_csv('orders.csv')
orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [4]:
# Product lines of the prior orders. learn() and vard() index this frame by
# order_id and read product_id to rebuild the basket of a given order.
order_products = pd.read_csv('order_products__prior.csv')
order_products.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [5]:
# Product lines from the train file.
# NOTE(review): in the visible notebook this frame is only named in `global`
# statements inside learn() and vard(); nothing reads its contents.
order_products_train = pd.read_csv('order_products__train.csv')
order_products_train.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


In [6]:
# Precomputed array indexed by user id; vard() uses mean_basket[user] as the
# number of top-ranked products to write for that user.
# presumably an integer array, since the value is used as a slice bound — TODO confirm
mean_basket = np.load('mean_basket.npy')

In [7]:
# Mapping looked up with str(user) keys in learn() and vard(); each value is the
# collection of product ids considered for that user.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load a file produced by a trusted pipeline.
# The context manager closes the file handle, which the bare open() call leaked.
with open('user_prods.pkl', 'rb') as user_prods_file:
    user_products = pickle.load(user_prods_file)

In [18]:
# Dimensions of the factor matrices that the next cell draws from a normal distribution.
# U: row count of Vui; I: row count of Viu, Vil and Vli.
# Ku: column count of Vui and Viu; Kl: column count of Vil and Vli.
# NOTE(review): user and product ids are used directly as row indices, so U and I
# presumably have to exceed the largest user id / product id — confirm against the data.
U = 207000
I = 50000
Ku = 70
Kl = 70

In [19]:
# Seed NumPy's global generator so that this initialisation, and the sampling
# done later through np.random in learn(), can be reproduced run to run.
np.random.seed(0)

# Factor matrices drawn from N(0, 0.2^2):
#   Vui (U x Ku) and Viu (I x Ku) are multiplied together in the user-item term,
#   Vil (I x Kl) and Vli (I x Kl) in the item / previous-basket term of predict().
Vui = np.random.normal(loc=0.0, scale=0.2, size=(U, Ku))
Viu = np.random.normal(loc=0.0, scale=0.2, size=(I, Ku))
Vil = np.random.normal(loc=0.0, scale=0.2, size=(I, Kl))
Vli = np.random.normal(loc=0.0, scale=0.2, size=(I, Kl))

In [11]:
# Raw preference score of item i for user u, given the previous basket Bt_1.
def predict(u, i, Vui, Viu, Vil, Vli, Bt_1):
    """Score item ``i`` for user ``u`` whose previous basket was ``Bt_1``.

    The score is the sum of two terms:
      * the dot product of the user's row of ``Vui`` with the item's row of ``Viu``;
      * the element-wise products of the item's row of ``Vil`` with the ``Vli``
        rows of every item in ``Bt_1``, summed and divided by ``len(Bt_1)``.

    No squashing function is applied here; callers compare or rank the raw
    values. ``Bt_1`` must be non-empty, because its length is the divisor.
    """
    user_item_term = Vui[u, :].dot(Viu[i, :])
    pairwise = np.multiply(Vil[i, :], Vli[Bt_1, :])
    basket_term = np.sum(pairwise) / len(Bt_1)
    return user_item_term + basket_term

In [12]:
def learn(Vui, Viu, Vil, Vli, alpha, lam, T):
    """Run ``T`` pairwise SGD steps on the four factor matrices.

    Each step samples a user, two consecutive prior orders of that user
    (``Bt_1`` followed by ``Bt``), an item ``i`` from ``Bt`` and an item ``j``
    from the user's known products that is not in ``Bt``. The matrices are then
    nudged so that ``predict`` scores ``i`` above ``j`` given ``Bt_1``.

    Reads the module-level ``orders``, ``order_products`` and ``user_products``.

    Parameters
    ----------
    Vui, Viu, Vil, Vli : numpy.ndarray
        Factor matrices; they are updated in place.
    alpha : float
        Step size applied to every update.
    lam : float
        Weight of the shrinkage term subtracted in every update.
    T : int
        Number of sampled steps. Draws that are skipped (user with fewer than
        two prior orders, or no candidate ``j``) still count towards ``T``.

    Returns
    -------
    tuple
        ``(Vui, Viu, Vil, Vli)`` — the same array objects that were passed in.
    """
    user_ids = orders['user_id'].unique()

    # Build each user's chronological list of prior order ids once. Doing a
    # .loc lookup on the non-unique user_id index inside the loop would scan
    # the whole table on every step.
    prior_sorted = (
        orders[orders['eval_set'] == 'prior']
        .sort_values(['user_id', 'order_number'])
    )
    orders_by_user = {
        uid: order_ids.values
        for uid, order_ids in prior_sorted.groupby('user_id')['order_id']
    }
    basket = order_products.set_index('order_id')['product_id']

    def basket_items(order_id):
        # .loc yields a scalar for a one-product order and a Series otherwise;
        # normalise both to a 1-D array of product ids.
        return np.atleast_1d(np.asarray(basket.loc[order_id]))

    for _ in range(T):
        # Randomly pick a user and a pair of consecutive prior orders.
        user = np.random.choice(user_ids)
        u_ords = orders_by_user.get(user)
        if u_ords is None or len(u_ords) < 2:
            # No consecutive pair exists; randint(1, 1) would raise.
            continue

        rand_index = np.random.randint(low=1, high=len(u_ords))
        Bt_1 = basket_items(u_ords[rand_index - 1])
        Bt = basket_items(u_ords[rand_index])

        # Candidate negatives: products of this user that are absent from Bt.
        diff = np.setdiff1d(user_products[str(user)], Bt)
        if len(diff) == 0:
            continue

        i = np.random.choice(Bt)
        j = np.random.choice(diff)

        # Scores of the bought item i and the not-bought item j.
        x = predict(user, i, Vui, Viu, Vil, Vli, Bt_1)
        y = predict(user, j, Vui, Viu, Vil, Vli, Bt_1)
        delta = 1 - expit(x - y)

        # Updates are applied one after another: each later line reads the
        # factors already refreshed by the lines above it.
        Vui[user, :] = Vui[user, :] + alpha * (delta * (Viu[i, :] - Viu[j, :]) - lam * Vui[user, :])
        Viu[i, :] = Viu[i, :] + alpha * (delta * Vui[user, :] - lam * Viu[i, :])
        Viu[j, :] = Viu[j, :] - alpha * (delta * Vui[user, :] + lam * Viu[j, :])

        # Mean Vli row over the previous basket.
        eta = np.sum(Vli[Bt_1, :], axis=0) / len(Bt_1)
        Vil[i, :] = Vil[i, :] + alpha * (delta * eta - lam * Vil[i, :])
        Vil[j, :] = Vil[j, :] - alpha * (delta * eta + lam * Vil[j, :])
        Vli[Bt_1, :] = Vli[Bt_1, :] + alpha * (delta * (Vil[i, :] - Vil[j, :]) / len(Bt_1) - lam * Vli[Bt_1, :])

    return Vui, Viu, Vil, Vli

In [13]:
def vard(Vui, Viu, Vil, Vli, f):
    """Write one ``order_id,products`` row per test order to the open text file ``f``.

    For every test order, the user's most recent prior order is taken as the
    previous basket. Every product in ``user_products[str(user)]`` is scored
    with ``predict`` and the ``mean_basket[user]`` highest-scoring product ids
    are written, space separated, in ascending score order.

    Reads the module-level ``orders``, ``order_products``, ``user_products``
    and ``mean_basket``.

    Parameters
    ----------
    Vui, Viu, Vil, Vli : numpy.ndarray
        Factor matrices passed through to ``predict``; they are not modified.
    f : writable text file object
        Receives the header line followed by one line per test order.
    """
    f.write('order_id,products\n')

    # Most recent prior order of every user, computed once rather than through
    # a .loc lookup on a non-unique index for each user.
    last_prior_order = (
        orders[orders['eval_set'] == 'prior']
        .sort_values(['user_id', 'order_number'])
        .groupby('user_id')['order_id']
        .last()
    )
    test_orders = orders[orders['eval_set'] == 'test']
    basket = order_products.set_index('order_id')['product_id']

    rows = zip(test_orders['user_id'].values, test_orders['order_id'].values)
    for t, (user, test_order_id) in enumerate(rows, start=1):
        # Previous basket = products of the user's last prior order. .loc
        # yields a scalar for a one-product order, hence atleast_1d.
        Bt_1 = np.atleast_1d(np.asarray(basket.loc[last_prior_order.loc[user]]))

        # Score every known product of this user. float64 keeps the ranking
        # faithful; a float16 buffer would merge near-equal scores into ties.
        products = np.asarray(user_products[str(user)])
        probs = np.array(
            [predict(user, product, Vui, Viu, Vil, Vli, Bt_1) for product in products],
            dtype=np.float64,
        )
        ranked = products[np.argsort(probs, kind='stable')]

        # Keep the k best. Computing the start index explicitly makes k == 0
        # yield no products (a bare [-0:] slice would return all of them) and
        # k larger than the list yield the whole list.
        k = int(mean_basket[user])
        top = ranked[max(len(ranked) - k, 0):]

        # Join the ids by hand: str(ndarray) wraps long arrays over several
        # lines and abbreviates very long ones, which corrupts the CSV row.
        f.write(str(test_order_id))
        f.write(',')
        f.write(' '.join(str(product) for product in top))
        f.write('\n')

        if t % 1000 == 0:
            print(t)

In [None]:
%%time
Vui, Viu, Vil, Vli = learn(Vui, Viu, Vil, Vli, 0.3, 0.1, 20000000)

In [None]:
%%time
with open('submission_fmc5.csv', 'w') as f:
    vard(Vui, Viu, Vil, Vli, f)

In [28]:
# Sanity check: total count of NaN entries across the four factor matrices.
sum(np.sum(np.isnan(matrix)) for matrix in (Vui, Viu, Vil, Vli))

0

In [31]:
# Persist the trained factor matrices; np.save appends the '.npy' extension
# to each of these file names.
np.save(file='Vui', arr=Vui)
np.save(file='Viu', arr=Viu)
np.save(file='Vil', arr=Vil)
np.save(file='Vli', arr=Vli)