In [2]:
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict

In [3]:
# load the pickled object stored in prior_df.pickle (used below as a DataFrame)
with open('prior_df.pickle', 'rb') as fh:
    prior_df = pickle.load(fh)

In [76]:
# user-level feature table (exported from SQL, stored as a pickle)
with open('user_df.pickle', 'rb') as fh:
    user_df = pickle.load(fh)

In [51]:
# product-level feature table (exported from SQL, stored as a pickle)
with open('prod_df.pickle', 'rb') as fh:
    prod_df = pickle.load(fh)

In [6]:
# orders table; its eval_set column is dropped before it is merged below
orders = pd.read_csv(
    './instacart_2017_05_01/orders_clean.csv'
)

In [7]:
# order lines of the train set
train_orders = pd.read_csv(
    './instacart_2017_05_01/order_products__train.csv'
)

In [8]:
# attach order-level columns (all of `orders` except eval_set) to each train order line
df_train_orders = pd.merge(
    train_orders,
    orders.drop(columns='eval_set'),
    on='order_id',
)

In [9]:
# user-product features on prior data:
# number of prior order lines per (product_id, user_id) pair
df_user_product = (
    prior_df
    .groupby(['product_id', 'user_id'], as_index=False)['order_id']
    .count()
    .rename(columns={'order_id': 'user_product_total_orders'})
)

In [10]:
# creating random sample of 20% of users from train set - for RAM purposes
import random
def get_smaller_data(df, seed=42, frac=.2):
    """Draw a reproducible random sample of the distinct user ids in ``df``.

    Despite the name, this returns the sampled *ids*, not a filtered frame;
    callers filter their own frames with ``isin``. (The previous version also
    built a filtered copy of ``df`` and threw it away, which only cost time
    and memory.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``user_id`` column.
    seed : int, default 42
        Seed for the ``numpy.random.RandomState`` used for sampling.
    frac : float, default 0.2
        Fraction of distinct users to keep. The sample size is
        ``int(n_unique_users * frac)``, i.e. truncated towards zero.

    Returns
    -------
    numpy.ndarray
        Sampled user ids, drawn without replacement.
    """
    rs = np.random.RandomState(seed)
    train_ids = df['user_id'].unique()
    sample_size = int(train_ids.shape[0] * frac)
    return rs.choice(train_ids, size=sample_size, replace=False)

In [11]:
# ids of the sampled train users (seed spelled out; 42 is also the default)
small_train_users = get_smaller_data(df_train_orders, seed=42)

In [74]:
# keep only the user-product rows that belong to the sampled train users
df_X = df_user_product.loc[
    df_user_product['user_id'].isin(small_train_users)
]

In [75]:
# latest cart = set of product ids in each user's train order lines
train_carts = (
    df_train_orders
    .groupby('user_id', as_index=False)
    .agg({'product_id': (lambda product_ids: set(product_ids))})
    .rename(columns={'product_id': 'latest_cart'})
)

df_X = df_X.merge(train_carts, on='user_id')

# label: 1 when the row's product_id is in that user's latest cart, else 0.
# Built with a plain zip instead of DataFrame.apply(axis=1): apply constructs
# a Series for every row, which is very slow on a frame of this size.
df_X['will_order'] = pd.Series(
    [product_id in cart
     for product_id, cart in zip(df_X['product_id'], df_X['latest_cart'])],
    index=df_X.index,
    dtype=bool,
).astype(int)

In [77]:
# add product-level features to the modeling frame (inner join on product_id)
df_X = pd.merge(df_X, prod_df, on='product_id')

In [78]:
# add user-level features to the modeling frame (inner join on user_id)
df_X = pd.merge(df_X, user_df, on='user_id')

### User-Product Pandas Feature Engineering

In [133]:
# streak length - number of consecutive user orders that contain the product
# num orders since last purchased product
# num days since last purchased product
# group by user_id, product_id, order_number
# def consec_orders(user_group):
#     # get streak length and put into df
#     user_num = user_group['user_id']
#     max_order = user_group['order_number'].max()
#     prod_dict = {}
#     for product_id, group in user_group.groupby('product_id'):
#         prod_dict[product_id] = {'order_number':max(group['order_number']),'streak_length':0,'orders_since_prod':0,'prod_days':0,'days_since_prod':None}
          
#     for order_number, group in user_group.groupby('order_number', sort=False):
#             days = max(group['days_since_prior_order'])
#     continue
            
#         product_list = group['product_id'].unique()
        
#         if group.order_id.unique in train_orders_list:
#             for product_id in prod_dict:
#                 prod_dict[product_id]['prod_days'] +=days
#             continue
                
#         for product_id in prod_dict: 
#             if product_id in product_list:
#                 if prod_dict[product_id]['days_since_prod']==None:
#                     prod_dict[product_id]['days_since_prod'] = prod_dict[product_id]['prod_days']
#                     prod_dict[product_id]['orders_since_prod'] = max_order-order_number
#                 if max_order-order_number-1 == prod_dict[product_id]['streak_length']:
#                     prod_dict[product_id]['streak_length'] += 1
                    
#                 prod_dict[product_id]['prod_days'] = days
#             else:
#                 prod_dict[product_id]['prod_days'] += days
#           #increment days, and loop again
#         days = max(group['days_since_prior_order'])

#     agg_df = pd.DataFrame(prod_dict)
#     agg_df = agg_df.transpose()
#     agg_df = agg_df.reset_index().rename(columns={'index':'product_id'})
#     return agg_df

# aggregating new features per user
# NOTE: abandoned -- this per-user loop was killed because it never finished running.
# user_agg_df = pd.DataFrame()
# for each in users_list:
#         user_df = train_df[train_df['user_id'] == each]
#         agg_df = consec_orders(user_df)
#         agg_df['user_id'] = each
#         user_agg_df = user_agg_df.append(agg_df, ignore_index=True)

In [155]:
# count, per product, how many of the user's latest orders contain it
# (the count is turned into a fraction of the window further down)
def latest_orders(user_group, n_latest=5):
    """Count how many of a user's most recent orders contain each product.

    Parameters
    ----------
    user_group : pandas.DataFrame
        Order lines of a single user; needs ``order_number`` and
        ``product_id`` columns.
    n_latest : int, default 5
        Window size. An order is in the window when its ``order_number`` is
        greater than ``max(order_number) - n_latest`` (order numbers are
        assumed to be integers).

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``product_id`` in ``user_group`` with the columns
        ``product_id`` and ``num_latest_orders``. A product that appears
        several times in the same order is counted once for that order.
    """
    max_order = user_group['order_number'].max()

    # groupby yields the keys sorted, which fixes the row order of the result
    prod_dict = {product_id: {'num_latest_orders': 0}
                 for product_id, _ in user_group.groupby('product_id')}

    for order_number, group in user_group.groupby('order_number', sort=False):
        # skip orders that fall outside the window of the latest n_latest orders
        if order_number <= max_order - n_latest:
            continue
        for product_id in group['product_id'].unique():
            if product_id in prod_dict:
                prod_dict[product_id]['num_latest_orders'] += 1

    agg_df = pd.DataFrame(prod_dict)
    agg_df = agg_df.transpose()
    agg_df = agg_df.reset_index().rename(columns={'index': 'product_id'})
    return agg_df

In [39]:
# per product: how many of the user's latest orders contain it, plus a
# days value taken from days_since_prior_order
def days_since(user_group, n_latest=5):
    """Compute ``num_latest_orders`` and ``days_since_prod`` for each product.

    Parameters
    ----------
    user_group : pandas.DataFrame
        Order lines of a single user; needs ``order_number``, ``product_id``
        and ``days_since_prior_order`` columns.
    n_latest : int, default 5
        Window size. An order is in the window when its ``order_number`` is
        greater than ``max(order_number) - n_latest`` (order numbers are
        assumed to be integers).

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``product_id`` with ``num_latest_orders`` (number
        of window orders containing the product) and ``days_since_prod``.

    Notes
    -----
    ``days_since_prod`` is overwritten by every window order that is
    processed: with that order's ``days_since_prior_order`` when the order
    contains the product, otherwise with the largest ``days_since_prior_order``
    found on *that product's own* rows. Previously the fallback reused a
    variable leaked from the product loop, so every product received the
    maximum of whichever product happened to be grouped last.

    NOTE(review): orders are visited in order of first appearance
    (``sort=False``), so the final value depends on the row order of
    ``user_group`` and is not a true "days since the product was last
    purchased" -- confirm the intended definition before relying on it.
    """
    max_order = user_group['order_number'].max()

    prod_dict = {}
    max_days_by_product = {}
    for product_id, group in user_group.groupby('product_id'):
        max_days_by_product[product_id] = group['days_since_prior_order'].max()
        prod_dict[product_id] = {'num_latest_orders': 0, 'days_since_prod': None}

    for order_number, group in user_group.groupby('order_number', sort=False):
        # skip orders that fall outside the window of the latest n_latest orders
        if order_number <= max_order - n_latest:
            continue

        products_in_order = set(group['product_id'].unique())
        days = group['days_since_prior_order'].max()

        for product_id, stats in prod_dict.items():
            if product_id in products_in_order:
                stats['num_latest_orders'] += 1
                stats['days_since_prod'] = days
            else:
                # fall back to this product's own maximum gap
                stats['days_since_prod'] = max_days_by_product[product_id]

    agg_df = pd.DataFrame(prod_dict)
    agg_df = agg_df.transpose()
    agg_df = agg_df.reset_index().rename(columns={'index': 'product_id'})
    return agg_df

In [41]:
# run days_since on the prior orders of every sampled user and stack the results
# - one groupby over the sampled rows replaces a full boolean scan of prior_df per user
# - per-user frames are collected in a list and concatenated once; growing a
#   frame with DataFrame.append is quadratic and append was removed in pandas 2.0
# - the loop no longer rebinds `user_df`, which holds the user feature table
# Users come out in order of first appearance in prior_df; the result is only
# used in key-based merges.
sampled_prior = prior_df[prior_df['user_id'].isin(small_train_users)]

per_user_frames = []
for uid, user_orders in sampled_prior.groupby('user_id', sort=False):
    user_features = days_since(user_orders)
    user_features['user_id'] = uid
    per_user_frames.append(user_features)

# pd.concat raises on an empty list, so fall back to an empty frame
user_agg_df = (pd.concat(per_user_frames, ignore_index=True)
               if per_user_frames else pd.DataFrame())

In [79]:
# user-product aggregated dataframe:
# mean of add_to_cart_order per (product_id, user_id) pair
user_prod_features = ['user_prod_avg_cart_order']

user_prod_df = (
    prior_df
    .groupby(['product_id', 'user_id'], as_index=False)['add_to_cart_order']
    .mean()
)

user_prod_df.columns = ['product_id', 'user_id', *user_prod_features]
user_prod_df.head()

Unnamed: 0,product_id,user_id,user_prod_avg_cart_order
0,1,138,3.0
1,1,709,20.0
2,1,764,10.5
3,1,777,7.0
4,1,825,2.0


In [80]:
# attach the per-user loop features (inner join on the product/user pair)
user_prod_df = pd.merge(
    user_prod_df,
    user_agg_df,
    on=['product_id', 'user_id'],
)

In [82]:
# add user-product features to the modeling frame (inner join on the user/product pair)
df_X = pd.merge(df_X, user_prod_df, on=['user_id', 'product_id'])

In [85]:
# ratio of the user-product order count to the user's total order count
df_X['user_prod_order_freq'] = (
    df_X['user_product_total_orders'].div(df_X['user_total_orders'])
)

In [86]:
# num_latest_orders as a fraction of the 5-order window it was counted over
df_X['user_prod_latest_perc'] = df_X['num_latest_orders'].div(5)

In [89]:
# drop the raw counts now that the derived ratios exist, and give the days column its final name
df_X = (
    df_X
    .drop(columns=['user_product_total_orders', 'num_latest_orders'])
    .rename(columns={'days_since_prod': 'user_prod_days_since_last'})
)

In [92]:
# preview the first five rows of the modeling frame
df_X.head(5)

Unnamed: 0,product_id,user_id,latest_cart,will_order,prod_total_orders,prod_avg_add_to_cart_order,prod_avg_days_since_prior_order,user_total_orders,user_avg_cartsize,user_total_products,user_avg_days_since_prior_order,user_avg_reorder_ratio,user_prod_avg_cart_order,user_prod_days_since_last,user_prod_order_freq,user_prod_latest_perc
0,1,709,"{45444, 32005, 26893, 8859, 28577, 48036, 1027...",0,1202,6,10.3261,5.0,38.0,135.0,5.708333,0.296875,20.0,8.0,0.2,0.2
1,196,709,"{45444, 32005, 26893, 8859, 28577, 48036, 1027...",0,22920,4,11.2559,5.0,38.0,135.0,5.708333,0.296875,25.0,8.0,0.4,0.4
2,223,709,"{45444, 32005, 26893, 8859, 28577, 48036, 1027...",0,99,5,9.9798,5.0,38.0,135.0,5.708333,0.296875,34.0,8.0,0.2,0.2
3,587,709,"{45444, 32005, 26893, 8859, 28577, 48036, 1027...",0,23,12,12.6957,5.0,38.0,135.0,5.708333,0.296875,25.0,8.0,0.2,0.2
4,955,709,"{45444, 32005, 26893, 8859, 28577, 48036, 1027...",0,143,10,11.3636,5.0,38.0,135.0,5.708333,0.296875,30.0,8.0,0.2,0.2


In [203]:
# with open('df_X.pickle', 'wb') as to_write:
#     pickle.dump(df_X,to_write)