In [1]:
# The main purpose of this .ipynb file is to generate the training and validation sets required
# for our stage 2 ranking model. It requires the file containing the top X predictions per user
# generated by Pipeline_Main_Stage1.ipynb. This notebook produces the training set, the
# validation set, and the ground truth set indicating which items were actually purchased by
# the user in their last order.

In [1]:
import numpy as np
import pandas as pd
import os
import pickle
import pyarrow.parquet as pq
import pyarrow as pa

In [3]:
# read in pickle file containing top X predictions
# NOTE(review): unpickling can execute arbitrary code -- only load artifacts
# produced by our own stage 1 pipeline.

cg_exploded = pd.read_pickle("cg_exploded_rank350_allusers_50.pkl")
#cg_exploded = pd.read_pickle("cg_exploded_rank425_40ksample_50.pkl")


# read in 0-based user_id index map (original user_id -> 0-based index)

# file_name = 'user_study_dict_40k.pkl'
file_name = 'user_study_dict_ALLUSERS.pkl'
# the context manager guarantees the handle is closed even if unpickling fails
with open(file_name, "rb") as open_file:
    user_zerobased_map = pickle.load(open_file)

# inverse lookup: 0-based index -> original user_id
reverse_user_map = {v: k for k, v in user_zerobased_map.items()}

# convert user_id back to original 1-based index
# (the lambda lookup is kept on purpose: an id missing from the map raises KeyError
# instead of silently turning into NaN)

cg_exploded.user_id = cg_exploded.user_id.apply(lambda x: reverse_user_map[x])

# original user_ids of the population under study; used below with .isin(...)
user_study = user_zerobased_map.keys()

In [4]:
# adding in all previously purchased items into cg_exploded
# this is only if we want to include wpp

# prior = pd.read_csv('order_products__prior.csv')
# orders = pd.read_csv('orders.csv')
# orders_products_prior = orders.merge(prior, how = 'inner', on = 'order_id')

# df_previous_orders = orders_products_prior[orders_products_prior.user_id.isin(user_study)]
# df_previous_orders = df_previous_orders[['user_id', 'product_id']].drop_duplicates().sort_values("user_id")
# df_previous_orders.product_id = df_previous_orders.product_id - 1

# cg_exploded = pd.concat([cg_exploded, df_previous_orders], axis=0).drop_duplicates().reset_index(drop=True)
# print (cg_exploded.shape)

(3959210, 2)


#### Load Historical Order Data

In [4]:
myfolder = ''

# Read the raw tables with narrow integer dtypes to keep memory use down.
prior = pd.read_csv(
    myfolder + 'order_products__prior.csv',
    dtype={
        'order_id': np.uint32,
        'product_id': np.uint16,
        'add_to_cart_order': np.uint8,
    },
)

train_orders = pd.read_csv(
    myfolder + 'order_products__train.csv',
    dtype={
        'order_id': np.uint32,
        'product_id': np.uint16,
        'reordered': np.int8,
        'add_to_cart_order': np.uint8,
    },
)

orders = pd.read_csv(
    myfolder + 'orders.csv',
    dtype={
        'order_hour_of_day': np.uint8,
        'order_number': np.uint8,
        'order_id': np.uint32,
        'user_id': np.uint32,
        'order_dow': np.uint8,
        'days_since_prior_order': np.float16,
    },
)

# encode the eval_set labels as small integers: prior=0, train=1, test=2
orders['eval_set'] = orders['eval_set'].replace({'prior': 0, 'train': 1, 'test':2}).astype(np.uint8)

# missing days_since_prior_order values are filled with 30 before the integer cast
orders['days_since_prior_order'] = orders['days_since_prior_order'].fillna(30).astype(np.uint8)

products = pd.read_csv(
    myfolder + 'products.csv',
    usecols=['product_id', 'aisle_id', 'department_id'],
    dtype={
        'product_id': np.uint16,
        'aisle_id': np.uint8,
        'department_id': np.uint8,
    },
)

print('done loading')
print('merge prior and orders and keep train separate ...')

# prior order lines joined with their order header and product attributes
orders_products_prior = pd.merge(orders, prior, how='inner', on='order_id')
orders_products_prior = pd.merge(orders_products_prior, products, how='inner', on='product_id')
#orders_products_prior.to_csv('instacart_prior_set.csv', index=False)

# same join for the train order lines; every row here is an actual purchase
orders_products_train = pd.merge(orders, train_orders, how='inner', on='order_id')
orders_products_train = pd.merge(orders_products_train, products, how='inner', on='product_id')
orders_products_train['purchased_label'] = 1
#orders_products_train.to_csv('instacart_train_set.csv', index=False)

# attach user_id to the raw train order lines
train_orders = pd.merge(
    train_orders,
    orders[['user_id', 'order_id']],
    how='inner',
    left_on='order_id',
    right_on='order_id',
)

# sorted product ids and their position in that ordering (0-based)
all_products = np.sort(products.product_id.unique())
product_zerobased_map = {product_id: position for position, product_id in enumerate(all_products)}

done loading
merge prior and orders and keep train separate ...


#### Extract all Relevant Prior Interactions (Purchases)
- This will serve as the training set for the stage 2 ranking model

In [5]:
# returns a generator serving n records at a time

def batch(iterable, n=1):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items each.

    ``iterable`` must support ``len()`` and slicing (list, tuple, str, numpy
    array, ...). The final slice is shorter when the length is not a multiple
    of ``n``; an empty input yields nothing.
    """
    total = len(iterable)
    for start in range(0, total, n):
        stop = start + n
        if stop > total:
            # clamp the last chunk to the end of the sequence
            stop = total
        yield iterable[start:stop]

In [6]:
# file to save to
# Output CSV for the prior-interaction (stage 2 training) rows; the batch loop
# further down writes its rows to this path. The commented-out names are the
# paths used for other runs (40k-user sample, with/without "wpp").

#filename = "interaction_prior_40k_sample_300recs_wpp.csv"
#filename = "interaction_prior_40k_sample_300recs.csv"
filename = "interaction_prior_allusers_rank350_50recs.csv"

In [7]:
# Build the stage 2 training rows: every stage 1 candidate (user, product) pair in
# cg_exploded is written out with interaction = 1 if the user bought the product in a
# prior order and interaction = 0 otherwise. Users are processed in batches of 500
# because the intermediate users x products grid is large.

# filtering previous purchase data for population we've selected (user_study)
df_interaction_prior = orders_products_prior[orders_products_prior.user_id.isin(user_study)][["user_id", "product_id"]].reset_index(drop=True)
#df_interaction_prior = orders_products_prior[["user_id", "product_id"]].reset_index(drop=True)

# specifying that our target variable is 1 for all previously purchased items
# (one row per purchase: repeat purchases are not de-duplicated here)
df_interaction_prior['interaction'] = 1

df = products[["product_id"]].copy()
all_product_ids = df["product_id"].to_numpy()
n_products = len(all_product_ids)

counter = 0
# The first batch overwrites any file left over from an earlier run; later batches
# append. Always appending would silently duplicate every row when the cell is re-run.
is_first_batch = True
for user_id_group in batch(df_interaction_prior.user_id.unique(), 500):
    # count the users actually in this batch (the last batch may hold fewer than 500)
    counter += len(user_id_group)

    print (counter)

    # creating our negative training examples: the full (user, product) grid for this
    # batch, ordered as all products for the first user, then all products for the next
    df_user = pd.DataFrame({
        "product_id": np.tile(all_product_ids, len(user_id_group)),
        "user_id": np.repeat(user_id_group, n_products),
    })
    df_user["interaction"] = 0

    # concatenating positive (purchased items) and negative training examples
    df_positive = df_interaction_prior[df_interaction_prior.user_id.isin(user_id_group)]
    df_user = pd.concat([df_positive, df_user], axis=0, ignore_index=True)

    # removing instances where we have a positive and negative instance for the same product. The negative instance
    # will be dropped and the positive example will remain
    df_user['distinct_count'] = df_user.groupby(by=["user_id", "product_id"])['interaction'].transform('nunique')
    is_shadowed_negative = (df_user.interaction == 0) & (df_user.distinct_count == 2)
    df_user_unique = df_user[~is_shadowed_negative].drop(columns='distinct_count').reset_index(drop=True)

    # plain ints for the join; product_id is shifted to the 0-based ids used by cg_exploded
    df_user_unique["user_id"] = df_user_unique["user_id"].astype(int)
    df_user_unique["product_id"] = df_user_unique["product_id"].astype(int) - 1

    # only want to include products that are included in our top X predictions from stage 1 model
    df_user_unique = pd.merge(df_user_unique, cg_exploded[cg_exploded.user_id.isin(user_id_group)], how='inner', on=['user_id', 'product_id'])

    # converting user_id to its 0-based index; every id here came from user_study,
    # i.e. from the keys of user_zerobased_map
    df_user_unique["user_id"] = df_user_unique["user_id"].map(user_zerobased_map)

    print ('null values:', int(df_user_unique.product_id.isnull().sum()))

    # shuffling data for better training mechanics later
    df_user_unique = df_user_unique.sample(frac=1).reset_index(drop=True)

    # writing to file (header only once, with the first batch)
    df_user_unique.to_csv(filename, mode='w' if is_first_batch else 'a', index=False, header=is_first_batch)
    is_first_batch = False

    # free the large per-batch frames before the next iteration
    del df_user
    del df_user_unique

print ('done with for loop')

500
null values: 0
1000
null values: 0
1500
null values: 0
2000
null values: 0
2500
null values: 0
3000
null values: 0
3500
null values: 0
4000
null values: 0
4500
null values: 0
5000
null values: 0
5500
null values: 0
6000
null values: 0
6500
null values: 0
7000
null values: 0
7500
null values: 0
8000
null values: 0
8500
null values: 0
9000
null values: 0
9500
null values: 0
10000
null values: 0
10500
null values: 0
11000
null values: 0
11500
null values: 0
12000
null values: 0
12500
null values: 0
13000
null values: 0
13500
null values: 0
14000
null values: 0
14500
null values: 0
15000
null values: 0
15500
null values: 0
16000
null values: 0
16500
null values: 0
17000
null values: 0
17500
null values: 0
18000
null values: 0
18500
null values: 0
19000
null values: 0
19500
null values: 0
20000
null values: 0
20500
null values: 0
21000
null values: 0
21500
null values: 0
22000
null values: 0
22500
null values: 0
23000
null values: 0
23500
null values: 0
24000
null values: 0
24500
null v

#### Extract Ground Truth (Last order purchases)

In [10]:
# filename to write ground truth instances to

ground_truth_filename = "cg_interaction_ground_truth_allusers_rank350_50recs.pkl"

# lookup table (columns: zerobased, user_id) so the id conversion can be done as a
# merge instead of a per-row dictionary lookup

df_user_zerobased_map = (
    pd.DataFrame.from_dict(user_zerobased_map, orient='index', columns=['zerobased'])
    .assign(user_id=lambda frame: frame.index)
    .reset_index(drop=True)
)

# last-order purchases of the study population, all labelled as positives;
# the original user_id is then swapped for its 0-based index
df_interaction_train_ground_truth = (
    orders_products_train[orders_products_train.user_id.isin(user_study)][["user_id", "product_id"]]
    .reset_index(drop=True)
    .assign(interaction=1)
    .astype({"user_id": int})
    .merge(df_user_zerobased_map, how='left', on='user_id')
    .drop(columns=['user_id'])
    .rename(columns={'zerobased': 'user_id'})
)
df_interaction_train_ground_truth = df_interaction_train_ground_truth[['user_id', 'product_id', 'interaction']]

# shift product_id to its 0-based index
df_interaction_train_ground_truth['product_id'] = df_interaction_train_ground_truth['product_id'] - 1

# saving to file
df_interaction_train_ground_truth.to_pickle(ground_truth_filename)

#### Extract Validation Set to Test Model on

In [11]:
# file to save to
# Output CSV for the validation rows. Note that this rebinds `filename`, which
# earlier in the notebook held the path of the prior-interaction (training) CSV.

filename = "cg_interaction_train_allusers_rank350_50recs.csv"

In [12]:
# Build the validation rows: every stage 1 candidate (user, product) pair in cg_exploded
# is written out with interaction = 1 if the product is in the user's last order
# (orders_products_train) and interaction = 0 otherwise. Users are processed in batches
# of 500 because the intermediate users x products grid is large.

# filtering last-order purchase data for population we've selected (user_study)

df_interaction_train = orders_products_train[orders_products_train.user_id.isin(user_study)][["user_id", "product_id"]].reset_index(drop=True)
df_interaction_train['interaction'] = 1

df = products[["product_id"]].copy()
all_product_ids = df["product_id"].to_numpy()
n_products = len(all_product_ids)

counter = 0
# The first batch overwrites any file left over from an earlier run; later batches
# append. Always appending would silently duplicate every row when the cell is re-run.
is_first_batch = True
for user_id_group in batch(df_interaction_train.user_id.unique(), 500):
    # count the users actually in this batch (the last batch may hold fewer than 500)
    counter += len(user_id_group)

    print (counter)

    # creating our negative examples: the full (user, product) grid for this batch,
    # ordered as all products for the first user, then all products for the next
    df_user = pd.DataFrame({
        "product_id": np.tile(all_product_ids, len(user_id_group)),
        "user_id": np.repeat(user_id_group, n_products),
    })
    df_user["interaction"] = 0

    # concatenating positive (purchased items) and negative examples
    df_positive = df_interaction_train[df_interaction_train.user_id.isin(user_id_group)]
    df_user = pd.concat([df_positive, df_user], axis=0, ignore_index=True)

    # removing instances where we have a positive and negative instance for the same product. The negative instance
    # will be dropped and the positive example will remain
    df_user['distinct_count'] = df_user.groupby(by=["user_id", "product_id"])['interaction'].transform('nunique')
    is_shadowed_negative = (df_user.interaction == 0) & (df_user.distinct_count == 2)
    df_user_unique = df_user[~is_shadowed_negative].drop(columns='distinct_count').reset_index(drop=True)

    # plain ints for the join; product_id is shifted to the 0-based ids used by cg_exploded
    df_user_unique["user_id"] = df_user_unique["user_id"].astype(int)
    df_user_unique["product_id"] = df_user_unique["product_id"].astype(int) - 1

    # only want to include products that are included in our top X predictions from stage 1 model
    df_user_unique = pd.merge(df_user_unique, cg_exploded[cg_exploded.user_id.isin(user_id_group)], how='inner', on=['user_id', 'product_id'])

    # converting user_id to its 0-based index via the lookup table built in the
    # ground truth cell (columns: zerobased, user_id)
    df_user_unique = (
        pd.merge(df_user_unique, df_user_zerobased_map, how='left', on='user_id')
        .drop(columns=['user_id'])
        .rename(columns={'zerobased': 'user_id'})
    )
    df_user_unique = df_user_unique[['user_id', 'product_id', 'interaction']]

    # shuffling data for better training mechanics later
    df_user_unique = df_user_unique.sample(frac=1).reset_index(drop=True)

    # writing to file (header only once, with the first batch)
    df_user_unique.to_csv(filename, mode='w' if is_first_batch else 'a', index=False, header=is_first_batch)
    is_first_batch = False

    # free the large per-batch frames before the next iteration
    del df_user
    del df_user_unique

print ('done with for loop')

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500
24000
24500
25000
25500
26000
26500
27000
27500
28000
28500
29000
29500
30000
30500
31000
31500
32000
32500
33000
33500
34000
34500
35000
35500
36000
36500
37000
37500
38000
38500
39000
39500
40000
40500
41000
41500
42000
42500
43000
43500
44000
44500
45000
45500
46000
46500
47000
47500
48000
48500
49000
49500
50000
50500
51000
51500
52000
52500
53000
53500
54000
54500
55000
55500
56000
56500
57000
57500
58000
58500
59000
59500
60000
60500
61000
61500
62000
62500
63000
63500
64000
64500
65000
65500
66000
66500
67000
67500
68000
68500
69000
69500
70000
70500
71000
71500
72000
72500
73000
73500
74000
74500
75000
75500
76000
76500
77000
77500
78000
78500
79000
79500
80000
80500
81000
81500
82000
82500
83000
83500
84000
84500
85000


In [35]:
# Counts the distinct user_id values in the given CSV.
# NOTE(review): this reads "cg_interaction_train_random_40k_50_rank425.csv", which is
# not the `filename` written by the validation loop in this notebook -- confirm which
# file this check is meant to cover.
pd.read_csv("cg_interaction_train_random_40k_50_rank425.csv").user_id.nunique()

26257