In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.max_columns", 101)

### Picking up where we left off...

In [2]:
# read in pickled DFs generated by query_dfs.py

df_orders = pd.read_pickle("./pickle/df_orders.pickle")
df_train = pd.read_pickle("./pickle/df_train.pickle")
df_prior = pd.read_pickle("./pickle/df_prior.pickle")
df_prod_detail = pd.read_pickle("./pickle/df_prod_detail.pickle")

In [3]:
# read in pickled feature DF generated by feature_engineering_1.ipynb
X = pd.read_pickle("./pickle/X_5.pickle")

In [4]:
X.shape


(8474661, 5)

In [5]:
X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart
0,1,138,2,[42475],0
1,907,138,2,[42475],0
2,1000,138,1,[42475],0


In [6]:
df_orders.sample()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
3040771,2483909,183452,prior,4,0,16,9.0


In [7]:
len(df_train)

1384617

In [8]:
mask = df_orders.eval_set == 'train'
len(df_orders[mask].order_id.unique())

131209

### Features to Add:

Product features
* `prod_pct_reordered`: product percent of prior transactions that were reorders
* `prod_rank` rank in terms of products sold
* some combination of the above, perhaps?

User features
* user num orders
* average items per order

#### Product Features

Let's start with prod_prior_sales

In [9]:
prod_sales = (df_prior.groupby('product_id')
              .agg({"product_id": "count"})['product_id']
              .sort_values(ascending=False))

prod_sales

product_id
24852    472565
13176    379450
21137    264683
21903    241921
47209    213584
          ...  
45893         1
13397         1
30451         1
42464         1
25248         1
Name: product_id, Length: 49677, dtype: int64

In [10]:
# get total sales for each product & add to X

X = (X.merge(prod_sales, left_on='product_id', right_index=True)
    .drop(columns="product_id_x"))
    
X.rename(columns={"product_id_y": "prod_prior_sales"}, inplace=True)
X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,prod_prior_sales
0,1,138,2,[42475],0,1852
55,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,1852
190,1,777,1,"[16797, 43352, 6184]",0,1852


In [11]:
X.shape

(8474661, 6)

In [12]:
# add prod_pct_reordered col
reorders = df_prior.groupby("product_id").agg({"reordered": "sum"})
reorders.head(3)

Unnamed: 0_level_0,reordered
product_id,Unnamed: 1_level_1
1,1136
2,12
3,203


In [13]:
X = (X.merge(reorders, left_on='product_id', right_index=True)
     .rename(columns={"reordered": "prod_prior_reorders"}))
del reorders
X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,prod_prior_sales,prod_prior_reorders
0,1,138,2,[42475],0,1852,1136
55,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,1852,1136
190,1,777,1,"[16797, 43352, 6184]",0,1852,1136


In [14]:
X["prod_pct_reorders"] = X.prod_prior_reorders / X.prod_prior_sales
X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,prod_prior_sales,prod_prior_reorders,prod_pct_reorders
0,1,138,2,[42475],0,1852,1136,0.613391
55,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,1852,1136,0.613391
190,1,777,1,"[16797, 43352, 6184]",0,1852,1136,0.613391


In [15]:
X.shape

(8474661, 8)

#### User Features

In [16]:
user_order_counts = (df_orders.groupby("user_id")
                     .agg({"order_id": "count"})
                     .rename(columns={"order_id": "user_prior_orders"}))
                     
                     
user_order_counts.head(3)

Unnamed: 0_level_0,user_prior_orders
user_id,Unnamed: 1_level_1
1,11
2,15
3,13


In [17]:
X.shape

(8474661, 8)

In [18]:
# add user_total_orders col

X = X.merge(user_order_counts, how='left', left_on='user_id', right_index=True)

del user_order_counts

X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,prod_prior_sales,prod_prior_reorders,prod_pct_reorders,user_prior_orders
0,1,138,2,[42475],0,1852,1136,0.613391,33
55,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,1852,1136,0.613391,6
190,1,777,1,"[16797, 43352, 6184]",0,1852,1136,0.613391,27


In [19]:
user_num_prior_orders = (df_prior[['user_id', 'order_id', 'order_number']]
                         .groupby("user_id")
                         .agg({"order_number": "max"}))

user_num_prior_orders
                

Unnamed: 0_level_0,order_number
user_id,Unnamed: 1_level_1
1,10
2,14
3,12
4,5
5,4
...,...
206205,3
206206,67
206207,16
206208,49


In [20]:
df_prior = df_prior.merge(user_num_prior_orders, on='user_id')
df_prior.head(3)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number_x,order_dow,order_hour_of_day,days_since_prior_order,next_order_num,cart,order_number_y
0,114,24954,1,0,91891,prior,1,0,11,,2,"[24954, 1688, 37371, 5782, 1263, 23763, 24385,...",15
1,114,1688,2,0,91891,prior,1,0,11,,2,"[24954, 1688, 37371, 5782, 1263, 23763, 24385,...",15
2,114,37371,3,0,91891,prior,1,0,11,,2,"[24954, 1688, 37371, 5782, 1263, 23763, 24385,...",15


In [21]:
df_prior.rename(columns={
    "order_number_x": "order_number",
    "order_number_y": "user_prior_orders"
}, inplace=True)

In [23]:
last_prior_carts = (df_prior[['user_id', 'order_number', 'cart']].groupby("user_id", as_index=False)
                    .agg({"order_number": "max"}))

last_prior_carts.head(3)
carts = (last_prior_carts
         .merge(df_prior[['user_id', 'order_number', 'cart']],
                on=["user_id", "order_number"], suffixes=[None, "_last"]))
del last_prior_carts
carts.drop(columns="order_number", inplace=True)
carts.drop_duplicates(subset="user_id", inplace=True)
carts.head(4)


Unnamed: 0,user_id,cart
0,1,"[196, 46149, 39657, 38928, 25133, 10258, 35951..."
9,2,"[24852, 16589, 1559, 19156, 18523, 22825, 2741..."
25,3,"[39190, 18599, 23650, 21903, 47766, 24810]"
31,4,"[26576, 25623, 21573]"


In [24]:
X = (X.merge(carts, how='left', on="user_id", suffixes=[None, "_last"]))
del carts

X.head(3)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,prod_prior_sales,prod_prior_reorders,prod_pct_reorders,user_prior_orders,cart_last
0,1,138,2,[42475],0,1852,1136,0.613391,33,"[46802, 22128, 40199, 21573, 26152, 12341]"
1,1,709,1,"[4852, 3039, 49535, 14177, 28577, 10279, 27388...",0,1852,1136,0.613391,6,"[24161, 6046, 49535, 23760, 43611, 21760, 3604..."
2,1,777,1,"[16797, 43352, 6184]",0,1852,1136,0.613391,27,"[33493, 16797, 43352, 17122, 29223]"


In [25]:
X.drop(columns="user_prior_orders", inplace=True)
X.sample(5)

Unnamed: 0,product_id,user_id,num_orders,cart,in_cart,prod_prior_sales,prod_prior_reorders,prod_pct_reorders,cart_last
2984544,4812,164237,1,"[6873, 38012, 38647, 15572, 17803, 3550, 14462...",0,3218,2279,0.708204,"[30492, 10983, 43080, 45948, 22825, 10749, 131..."
1347298,27156,51719,10,"[11520, 47209, 35815, 14462, 15906, 45948, 219...",0,38001,22352,0.588195,"[9047, 36709, 44882, 11520, 44987, 17803, 1543..."
5616783,7952,20328,1,"[28449, 1766, 16096, 47141, 1481, 48532]",0,5730,3085,0.538394,"[31558, 14104, 39397, 10321, 35742, 31231, 481..."
216860,19468,199660,1,"[24850, 45747, 36929, 39982]",0,5158,1108,0.214812,"[41530, 19138, 32299, 19468, 45747, 14645, 764..."
7825086,39997,179693,1,"[4392, 27642, 23236, 39190, 48116, 8670, 2979,...",0,71,25,0.352113,"[42736, 29418, 23236, 27642, 2979, 432, 26882,..."


In [26]:
X.shape

(8474661, 9)

Nice! Now we can check if each item was or was not in the user's most recent purchase.

We'll call this column `repeat_order`.

In [27]:
mask = (df_prior.order_number)