In [1]:
# Instacart market basket analysis                  #Amaresh hiremani 22

# Import statements

import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

%matplotlib inline

# Load the raw CSV files, pinning compact dtypes so the frames stay small in memory

# The prior and train order-line files are read with the same column types
order_products_dtypes = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
}

print('loading product prior')
pprior = pd.read_csv('order_products__prior.csv', dtype=order_products_dtypes)

print('loading product train')
ptrain = pd.read_csv('order_products__train.csv', dtype=order_products_dtypes)

print('loading orders')
orders_dtypes = {
    'order_id': np.int32,
    'user_id': np.int32,
    'eval_set': 'category',
    'order_number': np.int16,
    'order_dow': np.int8,
    'order_hour_of_day': np.int8,
    # float because this column contains missing values (see the null check below)
    'days_since_prior_order': np.float32,
}
orders = pd.read_csv('orders.csv', dtype=orders_dtypes)

print('loading products')
# Only the three id columns listed in usecols are read from products.csv
products_dtypes = {
    'product_id': np.uint16,
    'order_id': np.int32,  # NOTE(review): not among usecols, so presumably unused - confirm
    'aisle_id': np.uint8,
    'department_id': np.uint8,
}
products = pd.read_csv(
    'products.csv',
    dtype=products_dtypes,
    usecols=['product_id', 'aisle_id', 'department_id'],
)
print('loaded completely')



loading product prior
loading product train
loading orders
loading products
loaded completely


In [2]:
# Preprocessing

# Count the missing values in each column of the orders dataframe.
# The result is displayed as the cell output (one count per column).
orders.isnull().sum()

order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [3]:
# A missing days_since_prior_order is treated as a user's first order: there is
# no earlier order to measure from, so the gap is set to 0 and the column is
# then converted to an integer type.
value = 0
orders['days_since_prior_order'] = (
    orders['days_since_prior_order']
    .fillna(value)
    .astype(int)
)
# Re-run the null count to confirm nothing is missing any more
orders.isnull().sum()                                                #Amaresh hiremani 22

order_id                  0
user_id                   0
eval_set                  0
order_number              0
order_dow                 0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64

In [4]:
# Per-user purchase statistics, keyed by user_id

# Index orders by order_id (the column is kept as well) so that order-level
# information can be joined onto every prior order line
print('add order info to priors')
orders = orders.set_index('order_id', drop=False)
pprior = pprior.join(orders, on='order_id', rsuffix='_').drop('order_id_', axis=1)

# Same enrichment for the train order lines
print('add order info to train')
ptrain = ptrain.join(orders, on='order_id', rsuffix='_').drop('order_id_', axis=1)

# Optional: restrict to the first 500 user ids for a quick experiment
#limits = 500
#ptrain = ptrain.loc[ptrain['user_id'] <= limits]
#pprior = pprior.loc[pprior['user_id'] <= limits]

prior_by_user = pprior.groupby('user_id')
user = pd.DataFrame()

# Number of prior order lines (products bought, repeats included) per user
user['total_products'] = prior_by_user.size().astype(np.int16)

# Set of every product_id the user bought in prior orders
user['products_all'] = prior_by_user['product_id'].apply(set)

# Number of distinct products per user
user['total_distinct_items'] = user['products_all'].map(len).astype(np.int16)

# Drop the groupby handle so it does not keep pprior alive after it is deleted later
del prior_by_user

# Mean gap between orders and number of orders, both taken from the orders frame
# NOTE(review): nb_orders counts every row of orders for the user, not only the
# prior ones, while total_products comes from the prior lines only.
orders_by_user = orders.groupby('user_id')
usr = pd.DataFrame()
usr['average_days_between_orders'] = (
    orders_by_user['days_since_prior_order'].mean().astype(np.float32)
)
usr['nb_orders'] = orders_by_user.size().astype(np.int16)
del orders_by_user

# Both frames are indexed by user_id, so a plain index join lines them up
user = user.join(usr)
del usr
user['average_basket'] = (user['total_products'] / user['nb_orders']).astype(np.float32)
print('number of users:', user.shape[0])

add order info to priors
add order info to train
number of users: 206209


In [5]:
# Preview the last rows of the per-user feature table
user.tail()

Unnamed: 0_level_0,total_products,products_all,total_distinct_items,average_days_between_orders,nb_orders,average_basket
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
206205,32,"{20995, 21137, 22035, 21910, 17691, 31404, 210...",24,12.5,4,8.0
206206,285,"{16896, 44033, 18434, 16387, 21508, 45573, 102...",150,3.661765,68,4.191176
206207,223,"{20995, 18441, 45578, 47626, 33806, 22035, 235...",92,13.470589,17,13.117647
206208,677,"{1025, 20995, 47626, 8203, 5133, 38419, 27156,...",198,7.22,50,13.54
206209,129,"{14727, 26503, 22920, 26634, 13838, 11791, 248...",68,17.142857,14,9.214286


In [6]:
# Per-product statistics derived from the prior order lines

print('computing product features')
prior_by_product = pprior.groupby(pprior.product_id)
prods = pd.DataFrame()
# How many prior order lines contain the product
prods['orders'] = prior_by_product.size().astype(np.int32)
# How many of those lines are flagged as reorders
prods['reorders'] = prior_by_product['reordered'].sum().astype(np.float32)
# Share of the product's order lines that are reorders
prods['reorder_rate'] = (prods['reorders'] / prods['orders']).astype(np.float32)
# Release the groupby handle so it does not keep pprior alive after it is deleted later
del prior_by_product

# Attach the statistics to the product catalogue, then index it by product_id
# (column kept) so later lookups can map a product_id straight to its features
products = products.join(prods, on='product_id').set_index('product_id', drop=False)
del prods

computing product features


In [7]:
# Preview the first rows of the product table with its computed features
products.head()

Unnamed: 0_level_0,product_id,aisle_id,department_id,orders,reorders,reorder_rate
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,61,19,1852.0,1136.0,0.613391
2,2,104,13,90.0,12.0,0.133333
3,3,94,7,277.0,203.0,0.732852
4,4,38,1,329.0,147.0,0.446809
5,5,5,13,15.0,9.0,0.6


In [8]:
# Preview the first ten prior order lines after the order info has been joined on
pprior.head(10)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8
1,2,28985,2,1,202279,prior,3,5,9,8
2,2,9327,3,0,202279,prior,3,5,9,8
3,2,45918,4,1,202279,prior,3,5,9,8
4,2,30035,5,0,202279,prior,3,5,9,8
5,2,17794,6,1,202279,prior,3,5,9,8
6,2,40141,7,1,202279,prior,3,5,9,8
7,2,1819,8,1,202279,prior,3,5,9,8
8,2,43668,9,0,202279,prior,3,5,9,8
9,3,33754,1,1,205970,prior,16,5,17,12


In [9]:
# finding the total number of products
#max_products = products['product_id'].max()
#print('max product_id value :',max_products)
#print('max of user_id :',limits)

In [9]:
# Computing userXproduct details: one row per (user, product) pair seen in the prior orders

print('compute userXproduct features this is long...')

# Encode (user_id, product_id) into a single integer key.
# The multiplier assumes every product_id is below 50000 - TODO confirm against products.csv.
# user_id is widened to int64 first: user_id * 50000 does not fit in int32
# for the larger user ids and would otherwise wrap around silently.
pprior['user_product'] = pprior.product_id + pprior.user_id.astype(np.int64) * 50000

# key -> (times ordered, order_number of latest order, order_id of latest order,
#         sum of add_to_cart_order positions)
d = dict()
for row in pprior.itertuples():
    z = row.user_product
    seen = d.get(z)
    if seen is None:
        d[z] = (1, row.order_number, row.order_id, row.add_to_cart_order)
    else:
        count, last_number, last_id, pos_sum = seen
        # The latest order is the one with the highest order_number for the user;
        # order_id is only an identifier and says nothing about time.
        if row.order_number > last_number:
            last_number, last_id = row.order_number, row.order_id
        d[z] = (count + 1, last_number, last_id, pos_sum + row.add_to_cart_order)

print('to dataframe (less memory)')
userXproduct = pd.DataFrame.from_dict(d, orient='index')
del d
userXproduct.columns = ['nb_orders', 'last_order_number', 'last_order_id', 'sum_pos_in_cart']
# last_order_number was only needed to pick the latest order; keep the original three columns
userXproduct = userXproduct[['nb_orders', 'last_order_id', 'sum_pos_in_cart']]
userXproduct = userXproduct.astype({
    'nb_orders': np.int16,
    'last_order_id': np.int32,
    'sum_pos_in_cart': np.int16,
})
print('user X product f', len(userXproduct))

# The prior order lines are no longer needed; free the memory
del pprior

compute userXproduct features this is long...
to dataframe (less memory)
user X product f 13307953


In [10]:
# Preview the first rows of the user-by-product table (index is the encoded user/product key)
userXproduct.head()

Unnamed: 0,nb_orders,last_order_id,sum_pos_in_cart
10113983120,5,2894949,10
10113978985,5,2894949,16
10113959327,1,2,3
10113995918,5,2808715,24
10113980035,3,2808715,14


In [12]:
# Different kinds of evaluation set present in orders

#count = orders.eval_set.value_counts()

#plt.figure(figsize=(10,6))
#sns.barplot(count.index, count.values, alpha=0.8, color='red')
#plt.ylabel('Total Number of occurance', fontsize=12)
#plt.xlabel('Set Type', fontsize=12)
#plt.title('Count of rows in each dataset', fontsize=15)
#plt.xticks(rotation='vertical')
#plt.show() 

In [13]:
# Separate the train and test orders so a model can be fitted on one and applied to the other

print('split orders : train, test')
test_orders = orders.loc[orders.eval_set == 'test']
train_orders = orders.loc[orders.eval_set == 'train']

# Keep only the raw order-line columns of ptrain and index it by (order_id, product_id),
# columns kept, so that membership of a pair can be tested against the index
ptrain = (
    ptrain[['order_id', 'product_id', 'add_to_cart_order', 'reordered']]
    .set_index(['order_id', 'product_id'], drop=False)
)
# Optional: restrict both sets to the first user ids for a quick experiment
#train_orders = train_orders.loc[train_orders['user_id']<= limits]
#test_orders = test_orders.loc[test_orders['user_id']<= limits]

split orders : train, test


In [14]:
### build list of candidate products to reorder, with features ###

def features(selected_orders, labels_given=False):
    """Build the candidate (order, product) table and its model features.

    For every order in ``selected_orders`` one candidate row is created per
    product in the ordering user's ``user.products_all`` set.

    The function reads the notebook-level frames ``user``, ``orders``,
    ``products`` and ``userXproduct`` (and ``ptrain`` when ``labels_given``).
    ``orders`` is looked up by order_id, ``products`` by product_id and
    ``userXproduct`` by the encoded ``user_id * 50000 + product_id`` key,
    all through their index.

    Parameters
    ----------
    selected_orders : pandas.DataFrame
        Orders to build candidates for; needs ``order_id`` and ``user_id`` columns.
    labels_given : bool, default False
        When True, a candidate is labelled 1 if its (order_id, product_id)
        pair is present in ``ptrain.index`` and 0 otherwise.

    Returns
    -------
    (df, labels) : tuple
        ``df`` is the feature frame (one row per candidate); ``labels`` is an
        int8 NumPy array aligned with ``df`` rows, empty when ``labels_given``
        is False.
    """
    print('build candidate list')
    order_list = []
    product_list = []
    labels = []
    for i, row in enumerate(selected_orders.itertuples(), start=1):
        if i % 10000 == 0:
            print('order row', i)
        order_id = row.order_id
        user_id = row.user_id
        user_products = user.products_all[user_id]
        # The set is iterated twice (here and for the labels); it is not
        # modified in between, so both passes yield the same order.
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            labels += [(order_id, product) in ptrain.index for product in user_products]

    df = pd.DataFrame({'order_id': order_list, 'product_id': product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list

    print('user related features')
    df['user_id'] = df.order_id.map(orders.user_id)
    df['user_total_orders'] = df.user_id.map(user.nb_orders)
    df['user_total_items'] = df.user_id.map(user.total_products)
    df['total_distinct_items'] = df.user_id.map(user.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(user.average_days_between_orders)
    df['user_average_basket'] = df.user_id.map(user.average_basket)

    print('order related features')
    df['dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    # NOTE(review): a user whose average gap is 0 yields inf/NaN here.
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders

    print('product related features')
    df['aisle_id'] = df.product_id.map(products.aisle_id)
    df['department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.orders).astype(np.int32)
    df['product_reorders'] = df.product_id.map(products.reorders)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)

    print('user_X_product related features')
    # Same key encoding as the userXproduct index. Widen to int64 first:
    # user_id * 50000 exceeds the int32 range for the larger user ids and an
    # overflowed key would silently miss every userXproduct lookup below.
    df['z'] = df.user_id.astype(np.int64) * 50000 + df.product_id
    df.drop(['user_id'], axis=1, inplace=True)
    df['UP_orders'] = df.z.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
    df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
    # NOTE(review): identical formula to UP_orders_ratio; kept so the column set is unchanged.
    df['UP_reorder_rate'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
    # Hour difference measured around the 24h clock, so 23h vs 1h counts as 2
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
    return (df, labels)

In [15]:
# Build the candidate rows, their features and the 0/1 labels for the train orders
train_x, train_y = features(train_orders, labels_given=True)

# Columns of train_x that are fed to the model (order is significant)
f_to_use = [
    # user level
    'user_total_orders',
    'user_total_items',
    'total_distinct_items',
    'user_average_days_between_orders',
    'user_average_basket',
    # order level
    'order_hour_of_day',
    'days_since_prior_order',
    'days_since_ratio',
    # product level
    'aisle_id',
    'department_id',
    'product_orders',
    'product_reorders',
    'product_reorder_rate',
    # user x product level
    'UP_orders',
    'UP_orders_ratio',
    'UP_average_pos_in_cart',
    'UP_reorder_rate',
    'dow',
    'UP_orders_since_last',
    'UP_delta_hour_vs_last',
]

build candidate list
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
order row 80000
order row 90000
order row 100000
order row 110000
order row 120000
order row 130000
user related features
order related features
product related features
user_X_product related features


In [None]:
# features() returns a (frame, labels) pair; for the test orders no labels are
# requested, so only the feature frame is kept.
test_x = features(test_orders)[0]

# Persist the feature tables so later steps can reload them instead of rebuilding
train_x.to_csv('train_x.csv', index=False)
# train_y is a NumPy array, which has no to_csv; wrap it in a one-column frame
pd.DataFrame({'reordered': train_y}).to_csv('train_y.csv', index=False)
test_x.to_csv('test_x.csv', index=False)