In [None]:
# The main purpose of this .ipynb file is to generate the training and validation sets required
# for our stage 2 ranking model. Required is the file containing the top X predictions per user
# generated from Pipeline_Main_Stage1.ipynb. Returned from this notebook are the training set,
# validation set, and ground truth set indicating which items were actually purchased by the user
# in their last order.

In [1]:
# Scratch calculation: relative change (b - a) / a between two hard-coded values.
# NOTE(review): the operands look like evaluation metrics from two runs -- TODO confirm
# their origin and record it in a markdown cell.
(0.09981707889495871-0.09631531038969282)/0.09631531038969282

0.03635734018919414

In [87]:
import os
import pickle
import random
import time

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from scipy import spatial

In [88]:
# read in pickle file containing top X predictions

cg_exploded = pd.read_pickle("~/work/cg_exploded_rank250_20k_100.pkl")

# read in 0-based user_id index map
# NOTE(review): pickle.load can execute arbitrary code embedded in the file; only load
# artifacts produced by a trusted run of the stage 1 pipeline.

file_name = "user_study_dict_20k.pkl"
#file_name = 'user_study_dict_ALLUSERS.pkl'
# context manager guarantees the handle is closed even if unpickling fails
with open(file_name, "rb") as open_file:
    user_zerobased_map = pickle.load(open_file)

# inverse lookup: 0-based index -> original user_id
reverse_user_map = {v: k for k, v in user_zerobased_map.items()}

# convert user_id back to original 1-based index

cg_exploded.user_id = cg_exploded.user_id.apply(lambda x: reverse_user_map[x])

# the study population is every original user_id present in the map
user_study = user_zerobased_map.keys()

# marker column used after left merges to tell candidate rows from non-candidate rows
cg_exploded['candidate_generated_set'] = True

#### Load Historical Order Data

In [89]:
myfolder='~/work/'

# explicit narrow dtypes for each CSV that is read below
prior_dtypes = {'order_id': np.uint32, 'product_id': np.uint16, 'add_to_cart_order': np.uint8}
train_dtypes = {'order_id': np.uint32, 'product_id': np.uint16, 'reordered': np.int8,
                'add_to_cart_order': np.uint8}
orders_dtypes = {'order_hour_of_day': np.uint8, 'order_number': np.uint8, 'order_id': np.uint32,
                 'user_id': np.uint32, 'order_dow': np.uint8, 'days_since_prior_order': np.float16}
products_dtypes = {'product_id': np.uint16, 'aisle_id': np.uint8, 'department_id': np.uint8}

prior = pd.read_csv(myfolder + 'order_products__prior.csv', dtype=prior_dtypes)
train_orders = pd.read_csv(myfolder + 'order_products__train.csv', dtype=train_dtypes)
orders = pd.read_csv(myfolder + 'orders.csv', dtype=orders_dtypes)

# encode the eval_set labels as small integers
eval_set_codes = {'prior': 0, 'train': 1, 'test':2}
orders['eval_set'] = orders['eval_set'].replace(eval_set_codes).astype(np.uint8)

# missing gaps (no prior order) are filled with 30 before the integer cast
orders['days_since_prior_order'] = orders['days_since_prior_order'].fillna(30).astype(np.uint8)

products = pd.read_csv(myfolder + 'products.csv', dtype=products_dtypes,
                       usecols=['product_id', 'aisle_id', 'department_id'])

print('done loading')
print('merge prior and orders and keep train separate ...')

# prior order lines joined with order metadata and product metadata
orders_products_prior = pd.merge(orders, prior, how='inner', on='order_id')
orders_products_prior = pd.merge(orders_products_prior, products, how='inner', on='product_id')
#orders_products_prior.to_csv('instacart_prior_set.csv', index=False)

# same join for the train order lines, plus a constant positive label
orders_products_train = pd.merge(orders, train_orders, how='inner', on='order_id')
orders_products_train = pd.merge(orders_products_train, products, how='inner', on='product_id')
orders_products_train['purchased_label'] = 1
#orders_products_train.to_csv('instacart_train_set.csv', index=False)

# attach user_id to every train order line
train_orders = train_orders.merge(orders[['user_id','order_id']], on='order_id', how='inner')

# sorted product ids -> position in that sorted order
all_products = np.sort(products.product_id.unique())
product_zerobased_map = {product_id: position for position, product_id in enumerate(all_products)}

# product vectors loaded from a pickle; the list form is what the KD-tree helpers index into
with open('np_instacart_product_vectors.pkl', 'rb') as handle:
    np_instacart_product_vectors = pickle.load(handle)
vector_list_instacart_products = np_instacart_product_vectors.tolist()

done loading
merge prior and orders and keep train separate ...


## Helper Functions

In [90]:
# returns a generator serving n records at a time

def batch(iterable, n=1):
    """Yield consecutive slices of `iterable`, each containing at most `n` items.

    `iterable` must support `len()` and slicing; the final slice may be shorter
    than `n` when the length is not a multiple of `n`.
    """
    total = len(iterable)
    for offset in range(0, total, n):
        upper = offset + n
        if upper > total:
            upper = total
        yield iterable[offset:upper]

def grab_users_hard_negatives(user, products_purchased, n, tree):
    """Build a frame of hard-negative rows for one user.

    For every purchased product the `n` closest neighbours (as returned by
    `grab_closest_n_neighbors`) are collected, then the per-product neighbour
    lists are exploded so each neighbour gets its own row.

    Returns a DataFrame with columns `user_id` (always `user`), `product_id`
    (a neighbour index) and `interaction` (always 0).
    """
    purchased = pd.Series(products_purchased, name='product_id')
    # one list of neighbour indices per purchased product
    neighbor_lists = purchased.apply(lambda pid: grab_closest_n_neighbors(n, pid, tree)[0:n])

    row_count = len(neighbor_lists)
    user_column = pd.Series([user] * row_count, name='user_id')
    label_column = pd.Series([0] * row_count, name='interaction')

    hard_negatives = pd.concat([user_column, neighbor_lists, label_column], axis=1)
    return hard_negatives.explode('product_id')

# def update_neighbor_dict(user_cohort_purchased_products, n):
#     for x in user_cohort_purchased_products:
#         if x in dict_all_nn:
#             closest_neighbor_dict[x] = dict_all_nn[x]
#         else:
#             closest_neighbor_dict[x] = grab_closest_n_neighbors(n, x, tree)
#     return closest_neighbor_dict

def update_neighbor_dict(user_cohort_purchased_products, dict_all_nn, n):
    """Fill `dict_all_nn` with neighbour lists for products it does not contain yet.

    Products already present as keys are left untouched, so a cached entry is
    reused even if it was computed with a different `n`. The neighbour search
    uses the module-level `tree`. The dictionary is mutated in place and also
    returned.
    """
    for product_id in user_cohort_purchased_products:
        if product_id in dict_all_nn:
            continue
        dict_all_nn[product_id] = grab_closest_n_neighbors(n, product_id, tree)
    return dict_all_nn

# Module-level KD-tree built from vector_list_instacart_products; update_neighbor_dict
# reads this global when it looks up nearest neighbours.
tree = spatial.cKDTree(vector_list_instacart_products)
def grab_closest_n_neighbors(n, product_id, tree):
    """Return the indices of up to `n` nearest neighbours of `product_id`.

    Parameters
    ----------
    n : int
        Number of neighbours wanted. Values <= 0 yield an empty list.
    product_id : int
        Index into the module-level `vector_list_instacart_products`; the vector
        at that index is the query point.
    tree : scipy.spatial.cKDTree
        Tree to search.

    Returns
    -------
    list of int
        Neighbour indices ordered from closest to farthest, with `product_id`
        itself removed when it appears in the result. Fewer than `n` entries are
        returned only when the tree does not hold enough points.
    """
    if n <= 0:
        return []

    study_index_vector = vector_list_instacart_products[product_id]

    # Ask for one extra neighbour because the query point itself is normally the
    # closest hit (the previous hard-coded k=5 capped the result at 4 neighbours).
    # Capping at tree.n avoids the out-of-range filler index scipy returns when
    # k exceeds the number of points.
    k = min(n + 1, tree.n)
    _, indices = tree.query(study_index_vector, k=k, workers=-1)

    # k == 1 makes scipy return a scalar; normalise to a flat Python list
    tree_search_indices = np.atleast_1d(indices).tolist()

    try:
        tree_search_indices.remove(product_id)
    except ValueError:
        # the query point was not among the hits (e.g. duplicate vectors); keep all hits
        pass
    return tree_search_indices[0:n]

#### Extract all Relevant Prior Interactions (Purchases)
- This will serve as the training set for stage 2 ranking model

In [98]:
def _drop_shadowed_negatives(frame):
    """Drop negative rows whose (user_id, product_id) pair also has a positive row.

    A pair that carries both labels has two distinct `interaction` values; for
    such pairs only the interaction == 0 rows are removed. The index is reset.
    """
    label_variety = frame.groupby(by=["user_id", "product_id"])['interaction'].transform('nunique')
    shadowed = (frame.interaction == 0) & (label_variety == 2)
    return frame[~shadowed].reset_index(drop=True)


def train_data(filename, orders_products_prior, products, user_study, include_hard_negative, n_hard_negatives, all_positives, include_random_popularity_sampling, n_rns):
    """Build the stage 2 training set and append it to `filename` as CSV.

    Users are processed in batches of 500. For each batch every (user, product)
    pair is labelled 1 if the user bought the product in `orders_products_prior`
    and 0 otherwise, restricted to the stage 1 candidates in the module-level
    `cg_exploded`, optionally augmented with sampled negatives, shuffled and
    appended to the output file.

    Parameters
    ----------
    filename : str
        CSV path to append to; the header is written only if the file does not exist.
    orders_products_prior : DataFrame
        Prior order lines; the `user_id` and `product_id` columns are used.
    products : DataFrame
        Product catalogue; the `product_id` column is used.
    user_study : iterable
        Original user ids to include.
    include_hard_negative : bool
        Add nearest-neighbour products of each positive as negatives.
    n_hard_negatives : int
        Neighbours requested per positive product.
    all_positives : bool
        If True keep every historical purchase in addition to the candidate rows;
        if False keep candidate rows only.
    include_random_popularity_sampling : bool
        Add negatives sampled proportionally to product purchase frequency.
    n_rns : int
        Number of popularity-sampled negatives per unique positive row.

    Notes
    -----
    Written rows use the zero-based user index from `user_zerobased_map` and
    `product_id - 1`. Popularity-sampled ids are shifted by -1 as well so they
    share that id space (previously they were added with the raw product id).
    Sampling and shuffling are unseeded, so output order and sampled negatives
    differ between runs.
    """
    # filtering previous purchase data for population we've selected (user_study)
    in_study = orders_products_prior.user_id.isin(user_study)
    df_interaction_prior = orders_products_prior.loc[in_study, ["user_id", "product_id"]].reset_index(drop=True)

    # specifying that our target variable is 1 for all previously purchased items
    df_interaction_prior['interaction'] = 1

    # popularity of each item, used as sampling weights later. Reading index/values
    # directly avoids depending on the column name value_counts produces.
    popularity = orders_products_prior.product_id.value_counts(normalize=True)
    sample_population = (popularity.index.to_numpy().astype(int) - 1).tolist()
    sample_weights = popularity.to_numpy().tolist()

    catalog_ids = products.product_id.to_numpy()

    counter = 0
    dict_all_nn = {}  # product -> neighbour list, shared across batches
    for user_id_group in batch(df_interaction_prior.user_id.unique(), 500):
        counter += 500
        print (counter)

        # creating our negative training examples: every user paired with every product
        all_pairs = pd.DataFrame({
            'product_id': np.tile(catalog_ids, len(user_id_group)),
            'user_id': np.repeat(user_id_group, len(catalog_ids)),
            'interaction': 0,
        })

        # concatenating positive (purchased items) and negative training examples
        cohort_positives = df_interaction_prior[df_interaction_prior.user_id.isin(user_id_group)]
        df_user = pd.concat([cohort_positives, all_pairs], axis=0, ignore_index=True)

        # where a pair has both a positive and a negative row, keep only the positive
        df_user_unique = _drop_shadowed_negatives(df_user)

        # shift product_id by -1 so it matches the ids used in cg_exploded
        df_user_unique['user_id'] = df_user_unique.user_id.astype(int)
        df_user_unique['product_id'] = df_user_unique.product_id.astype(int) - 1

        cohort_candidates = cg_exploded[cg_exploded.user_id.isin(user_id_group)]
        if all_positives:
            # keep products in our top X predictions as well as all previously purchased items
            merged = pd.merge(df_user_unique, cohort_candidates, how='left', on=['user_id', 'product_id'])
            from_candidates = merged.candidate_generated_set == True
            df_user_unique = merged[from_candidates | (merged.interaction == 1)].reset_index(drop=True)
            print (str(df_user_unique.shape[0] - int(from_candidates.sum())), 'positives were added from historical purchases not in top K predictions from ALS.')
        else:
            # only want to include products that are included in our top X predictions from stage 1 model
            df_user_unique = pd.merge(df_user_unique, cohort_candidates, how='inner', on=['user_id', 'product_id'])

        # original user_id -> zero-based index (KeyError if a user is missing from the map)
        df_user_unique['user_id'] = df_user_unique.user_id.apply(lambda x: user_zerobased_map[x])
        df_user_unique = df_user_unique.drop(['candidate_generated_set'], axis=1)

        rows_before = df_user_unique.shape[0]
        # add in random popularity sampling
        if include_random_popularity_sampling:
            unique_positives = df_user_unique[df_user_unique.interaction == 1].drop_duplicates()
            positive_users = unique_positives.user_id.to_numpy()
            sample_count = len(positive_users) * n_rns
            if sample_count > 0:
                sample_product_ids = random.choices(sample_population, weights=sample_weights, k=sample_count)
                negative_random_samples = pd.DataFrame({
                    'user_id': np.tile(positive_users, n_rns),
                    'product_id': sample_product_ids,
                    'interaction': 0,
                })
                df_user_unique = pd.concat([df_user_unique, negative_random_samples], axis=0).reset_index(drop=True)
                df_user_unique = _drop_shadowed_negatives(df_user_unique)
            print (str(df_user_unique.shape[0] - rows_before), 'negatives were added from random negative sampling.')

        rows_before = df_user_unique.shape[0]
        # add in hard negatives based on parameter, n
        if include_hard_negative:
            unique_positives = df_user_unique[df_user_unique.interaction == 1].drop_duplicates()
            user_cohort_purchased_products = unique_positives.product_id.unique()

            start = time.time()
            dict_all_nn = update_neighbor_dict(user_cohort_purchased_products, dict_all_nn, n_hard_negatives)
            end = time.time()
            print (len(dict_all_nn.items()), 'items in dictionary.')

            print ('completed neighbor search in', np.round((end-start), 2), 'seconds.')

            # replace each positive product by its neighbour list, one row per neighbour
            df_nn = unique_positives.assign(product_id=unique_positives.product_id.map(dict_all_nn))
            df_nn = df_nn.explode('product_id')
            df_nn['interaction'] = 0

            df_user_unique = pd.concat([df_user_unique, df_nn], axis=0).reset_index(drop=True)
            df_user_unique = _drop_shadowed_negatives(df_user_unique)
            print (str(df_user_unique.shape[0] - rows_before), 'negatives were added from hard negative sampling.')

        print ('null values:', df_user_unique[df_user_unique.product_id.isnull()].shape[0])
        print ('')
        print ('')

        # shuffling data for better training mechanics later
        df_user_unique = df_user_unique.sample(frac=1).reset_index(drop=True)

        # writing to file
        df_user_unique.to_csv(filename, mode='a', index=False, header=(not os.path.exists(filename)))

In [92]:
# # testing

# include_hard_negative = False
# all_positives = True
# n_hard_negatives = 0
# include_random_popularity_sampling = True


# # filtering previous purchase data for population we've selected (user_study)
# df_interaction_prior = orders_products_prior[orders_products_prior.user_id.isin(user_study)][["user_id", "product_id"]].reset_index(drop=True)
# #df_interaction_prior = orders_products_prior[["user_id", "product_id"]].reset_index(drop=True)

# # specifying that our target variable is 1 for all previously purchased items
# df_interaction_prior['interaction'] = 1

# # taking into account the popularity of each item for sampling purposes later

# relative_frequencies = pd.DataFrame(orders_products_prior.product_id.value_counts(normalize=True))
# relative_frequencies.rename({'product_id': 'frequency'}, axis=1, inplace=True)
# relative_frequencies['product_id'] = relative_frequencies.index
# relative_frequencies.reset_index(drop=True, inplace=True)

# df = products[["product_id"]].copy()

# list_user_df = []
# counter = 0
# dict_all_nn = {}
# for user_id_group in batch(df_interaction_prior.user_id.unique(), 500):
#     counter += 500
#     print (counter)
#     # creating our negative training examples
#     df_user = pd.concat([pd.concat([df.copy() for x in range(len(user_id_group))], axis=0).reset_index(drop=True), 
#                          pd.concat([pd.Series(np.ones(products.shape[0]) * user_id, name='user_id') for user_id in user_id_group], axis=0).reset_index(drop=True)], axis=1)
#     df_user["interaction"] = 0

#     # concatenating positive (purchased items) and negative training examples
#     df_user = pd.concat([df_interaction_prior[df_interaction_prior.user_id.isin(user_id_group)], df_user], 
#                             axis=0, ignore_index=True)

#     # removing instances where we have a positive and negative instance for the same product. The negative instance
#     # will be dropped and the positive example will remain
#     df_user['distinct_count'] = df_user.groupby(by=["user_id", "product_id"])['interaction'].transform('nunique')
#     df_user_unique = df_user[~((df_user.interaction == 0) & (df_user.distinct_count == 2))].reset_index(drop=True)
#     df_user_unique.drop('distinct_count', axis=1, inplace=True)

#     # converting user_id and product_id back to their original index
#     df_user_unique.user_id = df_user_unique.user_id.astype(int)
#     df_user_unique.product_id = df_user_unique.product_id.astype(int)
#     df_user_unique.product_id = df_user_unique.product_id - 1
#     original_shape = df_user_unique.shape[0]
#     if all_positives:
#         # want to include both products that are included in our top X predictions as well as all previously purchased items
#         df_user_unique = pd.merge(df_user_unique, cg_exploded[cg_exploded.user_id.isin(user_id_group)], how='left', on=['user_id', 'product_id'])
#         df_user_unique = df_user_unique[(df_user_unique.candidate_generated_set == True) | (test.interaction == 1)]
#         df_user_unique.user_id = df_user_unique.user_id.apply(lambda x: user_zerobased_map[x])
#         print (str(df_user_unique.shape[0] - original_shape), 'positives were added from historical purchases not in top K predictions from ALS.')
#     else:
#         # only want to include products that are included in our top X predictions from stage 1 model
#         df_user_unique = pd.merge(df_user_unique, cg_exploded[cg_exploded.user_id.isin(user_id_group)], how='inner', on=['user_id', 'product_id'])
#         df_user_unique.user_id = df_user_unique.user_id.apply(lambda x: user_zerobased_map[x])
#     df_user_unique.drop(['candidate_generated_set'], axis=1 ,inplace=True)
#     original_shape = df_user_unique.shape[0]
#     # add in random popularity sampling
#     if include_random_popularity_sampling:
#         unique_positive_count = df_user_unique[df_user_unique.interaction == 1].drop_duplicates().shape[0]
#         sample_product_ids = random.choices(list(relative_frequencies.product_id), list(relative_frequencies.frequency), 
#                                       k=unique_positive_count*n_rns)
#         negative_random_samples = pd.concat([pd.concat([df_user_unique[df_user_unique.interaction == 1].drop_duplicates().user_id.reset_index(drop=True) for i in range(n_rns)], axis=0).reset_index(drop=True), 
#                                              pd.Series(sample_product_ids, name='product_id'), 
#                                              pd.Series([0] * unique_positive_count*n_rns, name='interaction')], axis=1)
#         df_user_unique = pd.concat([df_user_unique, negative_random_samples], axis=0)
#         df_user_unique.reset_index(drop=True, inplace=True)
#         df_user_unique['distinct_count'] = df_user_unique.groupby(by=["user_id", "product_id"])['interaction'].transform('nunique')
#         df_user_unique = df_user_unique[~((df_user_unique.interaction == 0) & (df_user_unique.distinct_count == 2))].reset_index(drop=True)
#         df_user_unique.drop('distinct_count', axis=1, inplace=True)
#         print (str(df_user_unique.shape[0] - original_shape), 'negatives were added from random negative sampling.')
#     original_shape = df_user_unique.shape[0]
#     # add in hard negatives based on parameter, n
#     if include_hard_negative:

#         df_nn = df_user_unique.copy()
#         df_nn = df_nn.drop_duplicates()
#         df_nn = df_nn[df_nn.interaction == 1].copy()
#         user_cohort_purchased_products = df_user_unique[df_user_unique.interaction == 1].product_id.unique()
#         start = time.time()
#         dict_all_nn = update_neighbor_dict(user_cohort_purchased_products, dict_all_nn, n_hard_negatives)
#         end = time.time()
#         print (len(dict_all_nn.items()), 'items in dictionary.')

#         print ('completed neighbor search in', np.round((end-start), 2), 'seconds.')

#         df_nn['nearest_neighbors'] = df_nn.product_id.map(dict_all_nn)
#         df_nn = df_nn.explode('nearest_neighbors')
#         df_nn.drop(['product_id'], axis=1, inplace=True)
#         df_nn = df_nn.rename({'nearest_neighbors': 'product_id'}, axis=1)
#         df_nn['interaction'] = 0
#         df_user_unique = pd.concat([df_user_unique, df_nn], axis=0).reset_index(drop=True)
#         df_user_unique['distinct_count'] = df_user_unique.groupby(by=["user_id", "product_id"])['interaction'].transform('nunique')
#         df_user_unique = df_user_unique[~((df_user_unique.interaction == 0) & (df_user_unique.distinct_count == 2))].reset_index(drop=True)
#         df_user_unique.drop('distinct_count', axis=1, inplace=True)
#         print (str(df_user_unique.shape[0] - original_shape), 'negatives were added from hard negative sampling.')
# #         user_cohort_purchased_products = df_user_unique[df_user_unique.interaction == 1].product_id.unique()

# #         start = time.time()
# #         closest_neighbor_dict = update_neighbor_dict(user_cohort_purchased_products, n)
# #         end = time.time()
# #         print ('completed neighbor search in', np.round((end-start), 2), 'seconds.')
# #         dict_all_nn.update(closest_neighbor_dict)

# #         df_closest_neighbors = pd.DataFrame(closest_neighbor_dict.items(), columns=['product_id', 'nearest_neighbors']).explode('nearest_neighbors').reset_index(drop=True)
# #         df_closest_neighbors['interaction'] = 0
# #         hard_negatives = pd.merge(df_user_unique[df_user_unique.interaction == 1].drop_duplicates(), df_closest_neighbors, how='left', on='product_id')
# #         hard_negatives.drop(['interaction_x', 'product_id'], axis=1, inplace=True)
# #         hard_negatives = hard_negatives.rename({'interaction_y': 'interaction', 'nearest_neighbors': 'product_id'}, axis=1)
# #         df_user_unique = pd.concat([df_user_unique, hard_negatives], axis=0).reset_index(drop=True)
# #         df_user_unique['distinct_count'] = df_user_unique.groupby(by=["user_id", "product_id"])['interaction'].transform('nunique')
# #         df_user_unique = df_user_unique[~((df_user_unique.interaction == 0) & (df_user_unique.distinct_count == 2))].reset_index(drop=True)
# #         df_user_unique.drop('distinct_count', axis=1, inplace=True)

#     print ('null values:', df_user_unique[df_user_unique.product_id.isnull()].shape[0])
#     print ('')
#     print ('')
#     # shuffling data for better training mechanics later
#     df_user_unique = df_user_unique.sample(frac=1).reset_index(drop=True)

#     # writing to file
#     df_user_unique.to_csv(filename, mode='a', index=False, header=(not os.path.exists(filename)))

#     del df_user
#     del df_user_unique

In [93]:
# pd.concat([df_user_unique[df_user_unique.interaction == 1].drop_duplicates().user_id.reset_index(drop=True) for i in range(2)], axis=0).reset_index(drop=True)



# negative_random_samples

# relative_frequencies = pd.DataFrame(orders_products_prior.product_id.value_counts(normalize=True))
# relative_frequencies.rename({'product_id': 'frequency'}, axis=1, inplace=True)
# relative_frequencies['product_id'] = relative_frequencies.index
# relative_frequencies.reset_index(drop=True, inplace=True)
# sample_product_ids = choices(list(relative_frequencies.product_id), list(relative_frequencies.frequency), 
#                                       k=orders_products_prior.shape[0])

# pd.merge(relative_frequencies, pd.read_csv('~/work/products.csv'), how='left', on='product_id')

# import random

# df_user_unique

# unique_positive_count = df_user_unique[df_user_unique.interaction == 1].drop_duplicates().shape[0]

# sample_product_ids = random.choices(list(relative_frequencies.product_id), list(relative_frequencies.frequency), 
#                                       k=unique_positive_count)

# negative_random_samples = pd.concat([df_user_unique[df_user_unique.interaction == 1].drop_duplicates().user_id.reset_index(drop=True), pd.Series(sample_product_ids, name='product_id'), pd.Series([0] * unique_positive_count, name='interaction')], axis=1)

# negative_random_samples

# df_user_unique = pd.concat([df_user_unique, negative_random_samples], axis=0)
# df_user_unique.reset_index(drop=True, inplace=True)

# 24918838 - df_user_unique.shape[0]

# pd.merge(pd.Series(sample_product_ids, name='product_id'), pd.read_csv('~/work/products.csv'), how='left', on='product_id')

# relative_frequencies

# sample_product_ids = choices(list(relative_frequencies.product_id), list(relative_frequencies.frequency), 
#                                       k=orders_products_prior.shape[0])

# df_user_unique.head()

# cg_exploded['candidate_generated_set'] = True

# pd.merge(df_user_unique, cg_exploded[cg_exploded.user_id.isin(user_id_group)], how='inner', on=['user_id', 'product_id'])

# test = pd.merge(df_user_unique, cg_exploded[cg_exploded.user_id.isin(user_id_group)], how='left', on=['user_id', 'product_id'])

# test = test[(test.candidate_generated_set == True) | (test.interaction == 1)]

# test[test.candidate_generated_set.isnull()]

In [95]:
def val_data(filename, orders_products_train, products, user_study):
    """Build the validation set and append it to `filename` as CSV.

    Users from `user_study` found in `orders_products_train` are handled in
    batches of 500. Each (user, product) pair is labelled 1 when it appears in
    `orders_products_train` and 0 otherwise; only pairs present in the
    module-level `cg_exploded` are kept. Written rows carry the zero-based user
    index from `user_zerobased_map`, `product_id - 1`, and the label.
    """
    # purchases of the selected population, all labelled positive
    study_mask = orders_products_train.user_id.isin(user_study)
    positives = orders_products_train[study_mask][["user_id", "product_id"]].reset_index(drop=True)
    positives['interaction'] = 1

    # lookup frame: original user_id -> zero-based index
    id_lookup = pd.DataFrame.from_dict(user_zerobased_map, orient='index', columns=['zerobased'])
    id_lookup['user_id'] = id_lookup.index
    id_lookup.reset_index(drop=True, inplace=True)

    catalog = products[["product_id"]].copy()
    catalog_size = products.shape[0]
    processed = 0
    for cohort in batch(positives.user_id.unique(), 500):
        processed += 500

        print (processed)

        # negative examples: the full catalogue repeated once per user in the cohort
        product_column = pd.concat([catalog.copy() for _ in range(len(cohort))], axis=0).reset_index(drop=True)
        user_column = pd.concat(
            [pd.Series(np.ones(catalog_size) * uid, name='user_id') for uid in cohort],
            axis=0,
        ).reset_index(drop=True)
        grid = pd.concat([product_column, user_column], axis=1)
        grid["interaction"] = 0

        # positives first, then the all-negative grid
        cohort_positives = positives[positives.user_id.isin(cohort)]
        labelled = pd.concat([cohort_positives, grid], axis=0, ignore_index=True)

        # a pair with both labels keeps its positive row(s); the negative row is dropped
        labelled['distinct_count'] = labelled.groupby(by=["user_id", "product_id"])['interaction'].transform('nunique')
        shadowed = (labelled.interaction == 0) & (labelled.distinct_count == 2)
        deduped = labelled[~shadowed].reset_index(drop=True)
        deduped.drop('distinct_count', axis=1, inplace=True)
        deduped.user_id = deduped.user_id.astype(int)
        deduped.product_id = deduped.product_id - 1

        # only keep products that are in the top X predictions from the stage 1 model
        cohort_candidates = cg_exploded[cg_exploded.user_id.isin(cohort)]
        ranked = pd.merge(deduped, cohort_candidates, how='inner', on=['user_id', 'product_id'])

        # swap the original user_id for its zero-based index
        ranked = pd.merge(ranked, id_lookup, how='left', on='user_id')
        ranked.drop(['user_id'], axis=1, inplace=True)
        ranked.rename({'zerobased': 'user_id'}, axis=1, inplace=True)
        ranked = ranked[['user_id', 'product_id', 'interaction']]

        # shuffling data for better training mechanics later
        ranked = ranked.sample(frac=1).reset_index(drop=True)

        # append to the output file; header only on first write
        write_header = not os.path.exists(filename)
        ranked.to_csv(filename, mode='a', index=False, header=write_header)

        del labelled
        del ranked

    print ('done with for loop')

In [96]:
def extract_ground_truth(filename, user_study):
    """Write the ground-truth interactions for a user cohort to a pickle file.

    Rows of the notebook-level ``orders_products_train`` frame belonging to the
    users in ``user_study`` are kept, every kept row is labelled
    ``interaction = 1``, ``user_id`` is replaced by the value it maps to in the
    notebook-level ``user_zerobased_map`` (original id -> 0-based id), and
    ``product_id`` is shifted down by one so it is 0-based as well.

    Parameters
    ----------
    filename : str or path-like
        Destination of the pickled DataFrame. The result has the columns
        ``user_id``, ``product_id`` and ``interaction``, in that order.
    user_study : list-like
        User ids (in the same id space as ``orders_products_train.user_id``)
        whose purchases should be exported.

    Returns
    -------
    None
        The DataFrame is only written to ``filename``.
    """
    # Lookup table with one row per user: 'user_id' holds the keys of
    # user_zerobased_map, 'zerobased' the value each key maps to. Merging
    # against it is faster than applying the dict row by row.
    df_user_zerobased_map = pd.DataFrame.from_dict(user_zerobased_map, orient='index', columns=['zerobased'])
    df_user_zerobased_map['user_id'] = df_user_zerobased_map.index
    df_user_zerobased_map.reset_index(drop=True, inplace=True)

    # Purchases of the selected users; every row is a positive example.
    df_interaction_train_ground_truth = orders_products_train[orders_products_train.user_id.isin(user_study)][["user_id", "product_id"]].reset_index(drop=True)
    df_interaction_train_ground_truth['interaction'] = 1

    # Swap user_id for its 0-based counterpart. The cast keeps the join key an
    # integer so it lines up with the lookup table's 'user_id' column.
    df_interaction_train_ground_truth.user_id = df_interaction_train_ground_truth.user_id.astype(int)
    df_interaction_train_ground_truth = pd.merge(df_interaction_train_ground_truth, df_user_zerobased_map, how='left', on='user_id')
    df_interaction_train_ground_truth.drop(['user_id'], axis=1, inplace=True)
    df_interaction_train_ground_truth.rename({'zerobased': 'user_id'}, axis=1, inplace=True)
    df_interaction_train_ground_truth = df_interaction_train_ground_truth[['user_id', 'product_id', 'interaction']]

    # Shift product_id to a 0-based index.
    df_interaction_train_ground_truth.product_id = df_interaction_train_ground_truth.product_id - 1

    # Save to the path the caller asked for. Previously this wrote to the
    # global `ground_truth_filename`, silently ignoring the `filename` argument.
    df_interaction_train_ground_truth.to_pickle(filename)

In [None]:
# # testing

# n=2
# include_hard_negative = True


# # filtering previous purchase data for population we've selected (user_study)
# df_interaction_prior = orders_products_prior[orders_products_prior.user_id.isin(user_study)][["user_id", "product_id"]].reset_index(drop=True)
# #df_interaction_prior = orders_products_prior[["user_id", "product_id"]].reset_index(drop=True)

# # specifying that our target variable is 1 for all previously purchased items
# df_interaction_prior['interaction'] = 1

# df = products[["product_id"]].copy()

# list_user_df = []
# counter = 0
# dict_all_nn = {}
# for user_id_group in batch(df_interaction_prior.user_id.unique(), 500):
#     counter += 500
#     print (counter)
#     # creating our negative training examples
#     df_user = pd.concat([pd.concat([df.copy() for x in range(len(user_id_group))], axis=0).reset_index(drop=True), 
#                          pd.concat([pd.Series(np.ones(products.shape[0]) * user_id, name='user_id') for user_id in user_id_group], axis=0).reset_index(drop=True)], axis=1)
#     df_user["interaction"] = 0

#     # concatenating positive (purchased items) and negative training examples
#     df_user = pd.concat([df_interaction_prior[df_interaction_prior.user_id.isin(user_id_group)], df_user], 
#                             axis=0, ignore_index=True)

#     # removing instances where we have a positive and negative instance for the same product. The negative instance
#     # will be dropped and the positive example will remain
#     df_user['distinct_count'] = df_user.groupby(by=["user_id", "product_id"])['interaction'].transform('nunique')
#     df_user_unique = df_user[~((df_user.interaction == 0) & (df_user.distinct_count == 2))].reset_index(drop=True)
#     df_user_unique.drop('distinct_count', axis=1, inplace=True)

#     # converting user_id and product_id back to their original index
#     df_user_unique.user_id = df_user_unique.user_id.astype(int)
#     df_user_unique.product_id = df_user_unique.product_id.astype(int)
#     df_user_unique.product_id = df_user_unique.product_id - 1

#     # only want to include products that are included in our top X predictions from stage 1 model
#     df_user_unique = pd.merge(df_user_unique, cg_exploded[cg_exploded.user_id.isin(user_id_group)], how='inner', on=['user_id', 'product_id'])
#     df_user_unique.user_id = df_user_unique.user_id.apply(lambda x: user_zerobased_map[x])

#     # add in hard negatives based on parameter, n
#     if include_hard_negative:
        
#         df_nn = df_user_unique.copy()
#         df_nn = df_nn.drop_duplicates()
#         df_nn = df_nn[df_nn.interaction == 1].copy()
#         user_cohort_purchased_products = df_user_unique[df_user_unique.interaction == 1].product_id.unique()
#         start = time.time()
#         dict_all_nn = update_neighbor_dict(user_cohort_purchased_products, dict_all_nn, n)
#         print (len(dict_all_nn.items()), 'items in dictionary.')
#         end = time.time()
#         print ('completed neighbor search in', np.round((end-start), 2), 'seconds.')
        
#         df_nn['nearest_neighbors'] = df_nn.product_id.map(dict_all_nn)
#         df_nn = df_nn.explode('nearest_neighbors')
#         df_nn.drop(['product_id'], axis=1, inplace=True)
#         df_nn = df_nn.rename({'nearest_neighbors': 'product_id'}, axis=1)
#         df_nn['interaction'] = 0
#         df_user_unique = pd.concat([df_user_unique, df_nn], axis=0).reset_index(drop=True)
#         df_user_unique['distinct_count'] = df_user_unique.groupby(by=["user_id", "product_id"])['interaction'].transform('nunique')
#         df_user_unique = df_user_unique[~((df_user_unique.interaction == 0) & (df_user_unique.distinct_count == 2))].reset_index(drop=True)
#         df_user_unique.drop('distinct_count', axis=1, inplace=True)
        
# #         user_cohort_purchased_products = df_user_unique[df_user_unique.interaction == 1].product_id.unique()
        
# #         start = time.time()
# #         closest_neighbor_dict = update_neighbor_dict(user_cohort_purchased_products, n)
# #         end = time.time()
# #         print ('completed neighbor search in', np.round((end-start), 2), 'seconds.')
# #         dict_all_nn.update(closest_neighbor_dict)

# #         df_closest_neighbors = pd.DataFrame(closest_neighbor_dict.items(), columns=['product_id', 'nearest_neighbors']).explode('nearest_neighbors').reset_index(drop=True)
# #         df_closest_neighbors['interaction'] = 0
# #         hard_negatives = pd.merge(df_user_unique[df_user_unique.interaction == 1].drop_duplicates(), df_closest_neighbors, how='left', on='product_id')
# #         hard_negatives.drop(['interaction_x', 'product_id'], axis=1, inplace=True)
# #         hard_negatives = hard_negatives.rename({'interaction_y': 'interaction', 'nearest_neighbors': 'product_id'}, axis=1)
# #         df_user_unique = pd.concat([df_user_unique, hard_negatives], axis=0).reset_index(drop=True)
# #         df_user_unique['distinct_count'] = df_user_unique.groupby(by=["user_id", "product_id"])['interaction'].transform('nunique')
# #         df_user_unique = df_user_unique[~((df_user_unique.interaction == 0) & (df_user_unique.distinct_count == 2))].reset_index(drop=True)
# #         df_user_unique.drop('distinct_count', axis=1, inplace=True)
    
#     print ('null values:', df_user_unique[df_user_unique.product_id.isnull()].shape[0])
#     print ('')
#     print ('')
#     # shuffling data for better training mechanics later
#     df_user_unique = df_user_unique.sample(frac=1).reset_index(drop=True)

#     # writing to file
#     df_user_unique.to_csv(filename, mode='a', index=False, header=(not os.path.exists(filename)))

#     del df_user
#     del df_user_unique
    
    

In [None]:
# df_nn = df_user_unique.copy()
# df_nn = df_nn.drop_duplicates()
# df_nn = df_nn[df_nn.interaction == 1].copy()
# df_nn['nearest_neighbors'] = df_nn.product_id.map(dict_all_nn)
# df_nn = df_nn.explode('nearest_neighbors')
# df_nn.drop(['product_id'], axis=1, inplace=True)
# df_nn = df_nn.rename({'nearest_neighbors': 'product_id'}, axis=1)
# df_nn['interaction'] = 0

# to do

- pull closest neighbors to products purchased
- make sure user has not purchased them in the past
- add them into interaction_prior
- BREAK
- include code in here for validation set and ground truth set
- set up code to start training on set
- BREAK
- try including a portion of products randomly sampled from the entire product distribution
- include all previously purchased items as positives
- try speeding up the neighbor search with approximate nearest neighbors (ANN, e.g. Spotify's Annoy)
- calculate the theoretical best MAP score and read up on the NDCG score

In [None]:
# start = time.time()
# closest_neighbor_dict = {}
# for x in user_cohort_purchased_products:
#     if x not in dict_all_nn:
#         closest_neighbor_dict[x] = grab_closest_n_neighbors(2, x, tree)
# end = time.time()
# print (end-start)

In [100]:
# filename = "interaction_prior_20k_rank250_100recs.csv"
# train_data(filename, orders_products_prior, products, user_study)
# train_data(filename, orders_products_prior, products, user_study, True, nn_option)


# Build the stage-2 training file from prior orders. The saved output of this
# cell reports negatives being added by random negative sampling.
# NOTE(review): the filename says "withrns(5)" but the last positional argument
# (n_rns, per the parameter-name comment below) is 10 — confirm which is intended.
filename = "interaction_prior_20k_rank250_100recs_withrns(5).csv"
# Positional argument order, as recorded by the author (train_data is defined earlier in the notebook):
# train_data(filename, orders_products_prior, products, user_study, include_hard_negative, n_hard_negatives, all_positives, include_random_popularity_sampling, n_rns)
train_data(filename, orders_products_prior, products, user_study, False, 1, False, True, 10)

500
124057 negatives were added from random negative sampling.
null values: 0


1000
142996 negatives were added from random negative sampling.
null values: 0


1500
151902 negatives were added from random negative sampling.
null values: 0


2000
177291 negatives were added from random negative sampling.
null values: 0


2500
166384 negatives were added from random negative sampling.
null values: 0


3000
172334 negatives were added from random negative sampling.
null values: 0


3500
172415 negatives were added from random negative sampling.
null values: 0


4000
169328 negatives were added from random negative sampling.
null values: 0


4500
168505 negatives were added from random negative sampling.
null values: 0


5000
172050 negatives were added from random negative sampling.
null values: 0


5500
175938 negatives were added from random negative sampling.
null values: 0


6000
166869 negatives were added from random negative sampling.
null values: 0


6500
172341 negatives were ad

In [12]:
# Build one training file per neighbor count in the list; the count is embedded
# in the output filename and passed as the last positional argument.
# NOTE(review): the trailing `True, nn_option` presumably map to
# include_hard_negative / n_hard_negatives — train_data's signature is not
# shown in this part of the notebook; confirm.
for nn_option in [10]:
    print (nn_option)
    filename = "interaction_prior_20k_rank250_100recs_with_hardnegatives_" + str(nn_option)+"neighbors.csv"
    train_data(filename, orders_products_prior, products, user_study, True, nn_option)

10
500
2788 items in dictionary.
completed neighbor search in 30.52 seconds.
null values: 0


1000
4345 items in dictionary.
completed neighbor search in 18.88 seconds.
null values: 0


1500
5252 items in dictionary.
completed neighbor search in 10.82 seconds.
null values: 0


2000
5923 items in dictionary.
completed neighbor search in 8.06 seconds.
null values: 0


2500
6510 items in dictionary.
completed neighbor search in 7.28 seconds.
null values: 0


3000
6979 items in dictionary.
completed neighbor search in 5.64 seconds.
null values: 0


3500
7344 items in dictionary.
completed neighbor search in 4.36 seconds.
null values: 0


4000
7701 items in dictionary.
completed neighbor search in 4.19 seconds.
null values: 0


4500
8024 items in dictionary.
completed neighbor search in 4.05 seconds.
null values: 0


5000
8289 items in dictionary.
completed neighbor search in 3.17 seconds.
null values: 0


5500
8573 items in dictionary.
completed neighbor search in 3.38 seconds.
null values

In [10]:
# Build the training file for the 2-neighbor variant.
# NOTE(review): the output saved with this cell shows a "..._3neighbors.csv"
# filename, so it appears to come from an earlier version of the cell — re-run
# to refresh it.
filename = "interaction_prior_20k_rank250_100recs_with_hardnegatives_2neighbors.csv"
train_data(filename, orders_products_prior, products, user_study, True, 2)

interaction_prior_20k_rank250_100recs_with_hardnegatives_3neighbors.csv


In [None]:
# Build the validation file from the users' `train` orders (orders_products_train).
# NOTE(review): if val_data appends to an existing CSV, remove any stale copy of
# this file before re-running — confirm against val_data's definition.
filename = "interaction_train_20k_rank250_100recs.csv"
val_data(filename, orders_products_train, products, user_study)

In [None]:
# Write the ground-truth pickle (purchased items, interaction = 1) for the users in user_study.
ground_truth_filename = "cg_interaction_ground_truth_20k_rank250_100recs.pkl"
extract_ground_truth(ground_truth_filename, user_study)

In [None]:
#vector_list = products.features.tolist()
# index = 4
# study_index_vector = vector_list[index]

# print (products.loc[index, 'product_name'])
# #mod_vector_list = vector_list[:index] + vector_list[index+1:]
# # vector_list[index] = np.ones(100) * 20
# tree = spatial.KDTree(vector_list)

# tree_search_indices = tree.query(study_index_vector, k=20)[1]
# print (tree_search_indices)
# products.loc[tree_search_indices]