In [1]:
import pandas as pd
import numpy as np
import sys
from itertools import combinations, groupby
from collections import Counter
from scipy import sparse
from lightfm import LightFM
from sklearn.metrics.pairwise import cosine_similarity
import Apriori
import Evrecsys
import lightfm_form
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k

In [2]:
#Import data
order_products_prior = pd.read_csv("order_products__prior.csv", dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

orders = pd.read_csv("orders.csv", dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})

products = pd.read_csv("products.csv", dtype={
        'product_id': np.uint16,
        'order_id': np.int32,
        'aisle_id': np.uint8,
        'department_id': np.uint8},
        usecols=['product_id', 'aisle_id', 'department_id'])

orders=orders.loc[orders['eval_set']=='prior']

#We will work with 5000 users, last order will be in test set
number_users=5000
users=range(1,number_users+1)
orders_set=orders[orders["user_id"].isin(users)]
idx_test = orders_set.groupby(['user_id'])['order_number'].transform(max) == orders_set['order_number']
orders_set_test=orders_set[idx_test]

orders_set_testlist=np.unique(orders_set_test.order_id)
orders_set_train = orders_set[-orders_set["order_id"].isin(orders_set_testlist)]
orders_set_trainlist=np.unique(orders_set_train.order_id)
train= order_products_prior[order_products_prior["order_id"].isin(orders_set_trainlist)]
test=order_products_prior[order_products_prior["order_id"].isin(orders_set_testlist)]
train = pd.merge(train, orders_set_train, on='order_id', how='left')
test=pd.merge(test, orders_set_test, on='order_id', how='left')

print('Train set has %f orders',len(np.unique(train.order_id)))
print('Test set has %f orders',len(np.unique(test.order_id)))
print('Train set has %f users',len(np.unique(train.user_id)))
print('Test set has %f users',len(np.unique(test.user_id)))

Train set has %f orders 71832
Test set has %f orders 5000
Train set has %f users 5000
Test set has %f users 5000


In [28]:
def last_order(train, test):
    last_orders = train.groupby("user_id")["order_number"].aggregate(np.max)
    t = pd.merge(left=last_orders.reset_index(), right=train, how='inner', on=['user_id', 'order_number'])
    t_last_order = t.groupby('order_id').aggregate({'product_id': lambda x: list(x)})
    t_last_order = pd.merge(t_last_order, train[['order_id', 'user_id']], on='order_id')
    t_last_order = t_last_order.drop_duplicates(subset=['order_id', 'user_id'], keep='first')

    test_history = test[test['reordered']==1].groupby('order_id').aggregate({'product_id': lambda x: list(x)})
    test_history = pd.merge(test_history, test[['order_id', 'user_id']], on='order_id')
    test_history = test_history.drop_duplicates(subset=['order_id', 'user_id'], keep='first')

    t_last_order = pd.merge(t_last_order, test_history, on='user_id')
    t_last_order = t_last_order.sort_values('user_id')
    y_pred=t_last_order['product_id_x']
    y_true=t_last_order['product_id_y']

    return y_pred , y_true

In [29]:
def last_order_reorder(train, test):
    last_orders = train.groupby("user_id")["order_number"].aggregate(np.max)
    t = pd.merge(left=last_orders.reset_index(), right=train[train.reordered == 1], how='inner',
                 on=['user_id', 'order_number'])
    t_last_order = t.groupby('order_id').aggregate({'product_id': lambda x: list(x)})
    t_last_order = pd.merge(t_last_order, train[['order_id', 'user_id']], on='order_id')
    t_last_order = t_last_order.drop_duplicates(subset=['order_id', 'user_id'], keep='first')

    test_history = test[test['reordered']==1].groupby('order_id').aggregate({'product_id': lambda x: list(x)})
    test_history = pd.merge(test_history, test[['order_id', 'user_id']], on='order_id')
    test_history = test_history.drop_duplicates(subset=['order_id', 'user_id'], keep='first')

    t_last_order = pd.merge(t_last_order, test_history, on='user_id')
    t_last_order = t_last_order.sort_values('user_id')
    y_pred=t_last_order['product_id_x']
    y_true=t_last_order['product_id_y']

    return y_pred , y_true

In [30]:
def top10(train, test):
    top_reorder_train=train.groupby("product_id")["reordered"].aggregate({'Total_reorders': 'sum'})['Total_reorders'].sort_values(ascending=False).head(10)
    top_reorder_train=np.array(top_reorder_train.values)
    test_history=test[test['reordered']==1].groupby('order_id').aggregate({'product_id':lambda x: list(x)})
    return top_reorder_train, test_history['product_id']

In [31]:
#lightfm
from lightfm import LightFM
def use_lightfm(train, test):
    set1 = set(np.unique(train.product_id))
    set2 = set(np.unique(test.product_id))
    missing =  pd.DataFrame.from_dict(list(sorted(set1 - set2)))
    added =  pd.DataFrame.from_dict(list(sorted(set2 - set1)))
    
    for i in range(len(missing)): 
        a=missing[0][i]
        test=test.append({'product_id': a},ignore_index=True)
    for i in range(len(added)): 
        a=added[0][i]
        train=train.append({'product_id': a},ignore_index=True)
    
    train=train.fillna(0)
    test=test.fillna(0)
    
    grouped_train_i = train.groupby(["user_id", "product_id"])["reordered"].aggregate("sum").reset_index()
    grouped_test_i = test.groupby(["user_id", "product_id"])["reordered"].aggregate("sum").reset_index()
    
    interactions_i = lightfm_form.create_interaction_matrix(df = grouped_train_i,
                                         user_col = 'user_id',
                                         item_col = 'product_id',
                                         rating_col = 'reordered')
    
    interactions_test_i = lightfm_form.create_interaction_matrix(df = grouped_test_i,
                                         user_col = 'user_id',
                                         item_col = 'product_id',
                                         rating_col = 'reordered')
    
    mf_model = lightfm_form.runMF(interactions = interactions_i,
                 n_components = 30,loss = 'warp',epoch = 40,n_jobs = 4)
    
    test_history = test[test['reordered']==1].groupby('order_id').aggregate({'product_id': lambda x: list(x)})
    test_history = pd.merge(test_history, test[['order_id', 'user_id']], on='order_id')
    test_history = test_history.drop_duplicates(subset=['order_id', 'user_id'], keep='first')
    n_users, n_items = interactions_i.shape
    
    results=[]
    test_history['pred']=0
    for user_id in test_history['user_id']:
        print(user_id)
        recom = mf_model.predict(user_id, np.arange(n_items), num_threads=4)
        recom = pd.Series(recom)
        recom.sort_values(ascending=False, inplace=True)
        if(len(results) == 0):
            results = np.array(recom.iloc[0:10].index.values)
        else:
            results= np.vstack((results, recom.iloc[0:10].index.values))
    
    results_df = pd.DataFrame(data=results)
    columns= results_df.columns.values
    test_history['pred'] = results_df[columns].values.tolist()
           
    y_pred=test_history['pred']
    y_true=test_history['product_id']
    
    test_precision = precision_at_k(mf_model, sparse.csr_matrix(interactions_test_i.values), k=10).mean()
    test_recall = recall_at_k(mf_model, sparse.csr_matrix(interactions_test_i.values), k=10).mean()
    f_test=2 * test_precision * test_recall / (test_precision + test_recall)
    print(f_test)
    return y_pred , y_true, test_precision, test_recall, f_test
 


In [32]:
def apriori(train, test):
    train_orders_i = train.set_index('order_id')['product_id'].rename('item_id')
    test_orders_i = test.set_index('order_id')['product_id'].rename('item_id')
    
    item_name   = pd.read_csv('products.csv')
    item_name   = item_name.rename(columns={'product_id':'item_id', 'product_name':'item_name'})
    rules_i = Apriori.association_rules(train_orders_i, 0.01)  
    rules_final_i = Apriori.merge_item_name(rules_i, item_name).sort_values('lift', ascending=False)
    display(rules_final_i)
    
    #Train set pairs
    train_pairs_gen_i=Apriori.get_item_pairs(train_orders_i)
    train_pairs_i = Apriori.freq(train_pairs_gen_i).to_frame("freqAB")
    train_pairs_i = train_pairs_i.reset_index().rename(columns={'level_0': 'item_A', 'level_1': 'item_B'})
    train_pairs_i['pair']=train_pairs_i.item_A.astype(str).str.cat(train_pairs_i.item_B.astype(str), sep='-')
    
    #Test set pairs
    test_pairs_gen_i=Apriori.get_item_pairs(test_orders_i)
    test_pairs_i = Apriori.freq(test_pairs_gen_i).to_frame("freqAB")
    test_pairs_i = test_pairs_i.reset_index().rename(columns={'level_0': 'item_A', 'level_1': 'item_B'})
    test_pairs_i['pair']=test_pairs_i.item_A.astype(str).str.cat(test_pairs_i.item_B.astype(str), sep='-')
    
    #Rules set pairs
    rules_i['pair']=rules_i.item_A.astype(str).str.cat(rules_i.item_B.astype(str), sep='-')
    
    test_pair_set_i=set(np.unique(test_pairs_i.pair))
    train_pair_set_i=set(np.unique(train_pairs_i.pair))
    rules_pair_set_i=set(np.unique(rules_i.pair))

    #TP= Pairs that exist in a priori pred and test
    tp=len(list(test_pair_set_i&rules_pair_set_i))

    #TN= pairs that exists train set but not in test
    tn=len(list(test_pair_set_i-train_pair_set_i))

    #FN= Pairs that exists in test but not in a priori
    fn=len(list(rules_pair_set_i-test_pair_set_i))

    #FP= Pairs that exists in a priori but not in test
    fp=len(list(test_pair_set_i-rules_pair_set_i))
    
    return tp, tn, fn, fp

In [None]:
y_pred_last_order , y_true_last_order= last_order(train, test)
print('finish')
y_pred_last_reorder , y_true_last_reorder= last_order_reorder(train, test)
print('finish')
y_pred_top10 , y_true_top10=top10(train, test)
print('finish')
y_pred_lightfm , y_true_lightfm, test_precision_light, test_recall_light, f_test_light=use_lightfm(train, test)

finish
finish


is deprecated and will be removed in a future version
  


finish


In [19]:
results=[]
#top10
sum_recall=0
sum_precision=0
sum_fscore=0
sum_auc=0
sum_ap=0
sum_coverage=0
n=len(y_true_top10)
for i in y_true_top10.index:
    p,r,f1=Evrecsys.calculate_prf(y_true_top10[i],y_pred_top10)
    sum_precision=sum_precision+p
    sum_recall=sum_recall+r
    sum_fscore=sum_fscore+f1

    ap=Evrecsys.apk(y_true_top10[i],y_pred_top10)
    sum_ap=sum_ap+ap
    
results.append(['top10',sum_recall/n,sum_precision/n ,sum_fscore/n, sum_auc/n,sum_ap/n])



In [20]:
#last order
sum_recall=0
sum_precision=0
sum_fscore=0
sum_auc=0
sum_ap=0
n=len(y_pred_last_order)
for i in y_pred_last_order.index:
    p,r,f1=Evrecsys.calculate_prf(y_true_last_order[i],y_pred_last_order[i])
    sum_precision=sum_precision+p
    sum_recall=sum_recall+r
    sum_fscore=sum_fscore+f1
    
    #auc=Evrecsys.auc_score(y_pred_last_order[i], y_true_last_order[i])
    #sum_auc=sum_auc+auc
    
    ap=Evrecsys.apk(y_true_last_order[i],y_pred_last_order[i])
    sum_ap=sum_ap+ap
    
results.append(['last order',sum_recall/n,sum_precision/n ,sum_fscore/n, sum_auc/n,sum_ap/n])

In [21]:
#last reorder
sum_recall=0
sum_precision=0
sum_fscore=0
sum_auc=0
sum_ap=0
n=len(y_pred_last_reorder)
for i in y_pred_last_reorder.index:
    p,r,f1=Evrecsys.calculate_prf(y_true_last_reorder[i],y_pred_last_reorder[i])
    sum_precision=sum_precision+p
    sum_recall=sum_recall+r
    sum_fscore=sum_fscore+f1
    #auc=Evrecsys.auc_score(y_pred_last_order[i], y_true_last_order[i])
    #sum_auc=sum_auc+auc
    
    ap=Evrecsys.apk(y_true_last_reorder[i],y_pred_last_reorder[i])
    sum_ap=sum_ap+ap
    
results.append(['last reorder',sum_recall/n,sum_precision/n ,sum_fscore/n, sum_auc/n,sum_ap/n])

In [22]:
#lightfm
sum_recall=0
sum_precision=0
sum_fscore=0
sum_auc=0
sum_ap=0
n=len(y_pred_lightfm)
for i in y_pred_lightfm.index:
    p,r,f1=Evrecsys.calculate_prf(y_true_lightfm[i],y_pred_lightfm[i])
    sum_precision=sum_precision+p
    sum_recall=sum_recall+r
    sum_fscore=sum_fscore+f1
    #auc=Evrecsys.auc_score(y_pred_last_order[i], y_true_last_order[i])
    #sum_auc=sum_auc+auc
    
    ap=Evrecsys.apk(y_true_lightfm[i],y_pred_lightfm[i])
    sum_ap=sum_ap+ap
    
results.append(['lightfm',sum_recall/n,sum_precision/n ,sum_fscore/n, sum_auc/n,sum_ap/n])
results.append(['lightfm1',test_recall_light,test_precision_light ,f_test_light,'-','-'])             

In [23]:
#apriori
tp, tn, fn, fp=apriori(train, test)
precision=tp/(tp+fp)
recall=tp/(tp+fn)
f1=2*(recall*precision)/(recall+precision)
results.append(['apriori',recall, precision, f1, '-', '-'])

Starting order_item:                 710227
Items with support >= 0.01:           10544
Remaining order_item:                664072
Remaining orders with 2+ items:       67473
Remaining order_item:                660034


  order_item = order_item.reset_index().as_matrix()


Item pairs:                         2308248
Item pairs with support >= 0.01:      80043



Unnamed: 0,itemA,itemB,freqAB,supportAB,freqA,supportA,freqB,supportB,confidenceAtoB,confidenceBtoA,lift
0,Cherry Caffeine Free Unsweetened Soft Drink Mix,Orange Unsweetened Soft Drink Mix,8,0.011857,10,0.014821,9,0.013339,0.800000,0.888889,59.976000
5,Dora and Friends Fruit Snacks,Sponge Bob Square Pants Fruit Snacks,7,0.010375,10,0.014821,8,0.011857,0.700000,0.875000,59.038875
1,Tropical Punch Unsweetened Soft Drink Mix,Orange Unsweetened Soft Drink Mix,8,0.011857,11,0.016303,9,0.013339,0.727273,0.888889,54.523636
17,Strawberry ZFruit,Kids Mixed Berry Twisted Fruit,7,0.010375,10,0.014821,9,0.013339,0.700000,0.777778,52.479000
2,Grape Unsweetened Soft Drink Mix,Orange Unsweetened Soft Drink Mix,7,0.010375,10,0.014821,9,0.013339,0.700000,0.777778,52.479000
18,Organic Bold Original Cacao Superfood Drink,Coconut Cacao Drink,8,0.011857,14,0.020749,8,0.011857,0.571429,1.000000,48.195000
3,Cherry Caffeine Free Unsweetened Soft Drink Mix,Grape Unsweetened Soft Drink Mix,7,0.010375,10,0.014821,10,0.014821,0.700000,0.700000,47.231100
1144,Chocolate Breakfast Bites,Asian Pear Crisps,7,0.010375,13,0.019267,8,0.011857,0.538462,0.875000,45.414519
4,Cherry Caffeine Free Unsweetened Soft Drink Mix,Tropical Punch Unsweetened Soft Drink Mix,7,0.010375,10,0.014821,11,0.016303,0.700000,0.636364,42.937364
19,Cacao Chocolate Mint,Coconut Cacao Drink,7,0.010375,14,0.020749,8,0.011857,0.500000,0.875000,42.170625


In [25]:
results.append(['10000 users','-', '-', '-', '-', '-'])

In [26]:
result_df = pd.DataFrame(results,columns=['method', 'recall', 'precision','f1', 'au', 'ap'])
result_df

Unnamed: 0,method,recall,precision,f1,au,ap
0,top10,0.00116363,0.00116,0.00107729,0,0.00036129
1,last order,0.290462,0.29307,0.269538,0,0.222318
2,last reorder,0.249678,0.390526,0.274065,0,0.226576
3,lightfm,8.47872e-05,0.000239952,9.21618e-05,0,0.000127982
4,lightfm1,0.153612,0.0810817,0.106139,-,-
5,apriori,0.427045,0.0972981,0.158487,-,-
6,10000 users,-,-,-,-,-
