In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
import itertools

from library.sb_utils import save_file

In [2]:
# Read the processed CSV into a DataFrame, then print its column, dtype
# and memory summary.
df = pd.read_csv(
    filepath_or_buffer='../data/processed/full_data_cleaned.csv'
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33819106 entries, 0 to 33819105
Data columns (total 15 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_by_user_sequence  int64  
 3   order_dow               int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
 6   product_id              int64  
 7   add_to_cart_sequence    int64  
 8   reordered               int64  
 9   product_name            object 
 10  aisle_name              object 
 11  dept_name               object 
 12  aisle_id                int64  
 13  department_id           int64  
 14  eval_set                object 
dtypes: float64(1), int64(10), object(4)
memory usage: 3.8+ GB


In [4]:
# Find just one user to explore with.
# First attempt: rows whose position in the user's own order sequence is
# above 100, followed by a count of the distinct users among those rows.
prolific_users = df.loc[df['order_by_user_sequence'].gt(100)]
prolific_users['user_id'].nunique()

0

In [5]:
# Second attempt: rows that belong to a user's order number 100 exactly,
# followed by a count of the distinct users among those rows.
prolific_users = df.loc[df['order_by_user_sequence'].eq(100)]
prolific_users['user_id'].nunique()

867

In [6]:
# Largest add-to-cart position reached in any of the selected rows.
prolific_users.loc[:, 'add_to_cart_sequence'].max()

47

In [7]:
# Keep the row(s) of prolific_users that sit at the largest
# add_to_cart_sequence value found in that frame. The maximum is computed
# here instead of being typed in by hand (it was 47 when this notebook was
# first run), so the cell stays correct if the upstream data changes.
biggest_cart_position = prolific_users['add_to_cart_sequence'].max()
prolific_user_w_biggest_cart = prolific_users[
    prolific_users['add_to_cart_sequence'] == biggest_cart_position
]
prolific_user_w_biggest_cart

Unnamed: 0,order_id,user_id,order_by_user_sequence,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_sequence,reordered,product_name,aisle_name,dept_name,aisle_id,department_id,eval_set
5263305,1629423,32099,100,1,14,1.0,27966,47,1,Organic Raspberries,packaged vegetables fruits,produce,123,4,train


In [8]:
# Pull every row for user 32099 (the user_id shown in the preceding
# output) and look at seven randomly chosen rows.
practice_user = df.loc[df['user_id'].eq(32099)]
practice_user.sample(n=7)

Unnamed: 0,order_id,user_id,order_by_user_sequence,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_sequence,reordered,product_name,aisle_name,dept_name,aisle_id,department_id,eval_set
5261669,2154990,32099,13,2,15,1.0,48318,15,1,Taffy Dulce de Leche Sandwich Cookies,cookies cakes,snacks,61,19,prior
5262679,2245218,32099,63,4,10,2.0,15303,15,0,Comfort Sheer 60 ct Assorted Posted 5/7/2014 V...,first aid,personal care,118,11,prior
5262065,580425,32099,31,4,9,3.0,33398,17,1,Hand Made Style Flour Tortilla,tortillas flat bread,bakery,128,3,prior
5261515,455159,32099,7,2,8,2.0,27288,10,0,Chocolate Chip Cookie Dough,doughs gelatins bake mixes,pantry,105,13,prior
5261618,20145,32099,11,1,20,1.0,4462,16,1,Pink Lady Apple Kombucha,tea,beverages,94,7,prior
5262768,699779,32099,67,5,10,2.0,40663,24,1,Ricotta Original Cheese,packaged cheese,dairy eggs,21,16,prior
5261446,931900,32099,4,1,20,4.0,16696,11,1,Coke Classic,soft drinks,beverages,77,7,prior


In [9]:
# Tease out whether each item is reordered or not each order.
# Start with the distinct product ids in this user's order number 1.
order1 = (
    practice_user
    .loc[practice_user['order_by_user_sequence'].eq(1), 'product_id']
    .unique()
    .tolist()
)
len(order1)

26

In [10]:
# Distinct product ids in this user's order number 2.
order2 = (
    practice_user
    .loc[practice_user['order_by_user_sequence'].eq(2), 'product_id']
    .unique()
    .tolist()
)
len(order2)

12

In [11]:
# Products that were in order 1 but are absent from order 2, kept in the
# same sequence they have in order1.
only1 = list(filter(lambda product_id: product_id not in order2, order1))
len(only1)

25

In [12]:
# Add rows so that non-reordered items are present in order 2
# Look at one random row of order 1 to see its order-level values.
practice_user.loc[practice_user['order_by_user_sequence'].eq(1)].sample(n=1)

Unnamed: 0,order_id,user_id,order_by_user_sequence,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_sequence,reordered,product_name,aisle_name,dept_name,aisle_id,department_id,eval_set
5261365,2959648,32099,1,0,17,,39581,20,0,French Vanilla Creamer,cream,dairy eggs,53,16,prior


In [13]:
# Look at one random row of order 2 to see its order-level values.
practice_user.loc[practice_user['order_by_user_sequence'].eq(2)].sample(n=1)

Unnamed: 0,order_id,user_id,order_by_user_sequence,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_sequence,reordered,product_name,aisle_name,dept_name,aisle_id,department_id,eval_set
5261396,309518,32099,2,1,15,1.0,41971,12,0,Puff Pastry Shells,frozen breads doughs,frozen,58,1,prior


In [14]:
# Order-level values for order 2, paired with every product id in only1.
# The scalar values are repeated on each row; the product_id list sets the
# number of rows. 'reordered' is 0 on all of these rows.
order2deets = dict(
    order_id=309518,
    user_id=32099,
    order_by_user_sequence=2,
    order_dow=1,
    order_hour_of_day=15,
    days_since_prior_order=1.0,
    product_id=only1,
    reordered=0,
)

order2_new_rows = pd.DataFrame(order2deets)

order2_new_rows.head()

Unnamed: 0,order_id,user_id,order_by_user_sequence,order_dow,order_hour_of_day,days_since_prior_order,product_id,reordered
0,309518,32099,2,1,15,1.0,40285,0
1,309518,32099,2,1,15,1.0,27966,0
2,309518,32099,2,1,15,1.0,34969,0
3,309518,32099,2,1,15,1.0,7419,0
4,309518,32099,2,1,15,1.0,26209,0


In [15]:
# Stack the new order-2 rows underneath the user's existing rows, then
# spot-check ten random rows of the combined frame.
practice_user = pd.concat(objs=[practice_user, order2_new_rows])
practice_user.sample(n=10)

Unnamed: 0,order_id,user_id,order_by_user_sequence,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_sequence,reordered,product_name,aisle_name,dept_name,aisle_id,department_id,eval_set
5262910,2334943,32099,76,6,14,1.0,44987,16.0,1,Heavy Whipping Cream,cream,dairy eggs,53.0,16.0,prior
5262417,780589,32099,48,1,14,1.0,2732,3.0,0,Gluten Free Pretzel Sticks,chips pretzels,snacks,107.0,19.0,prior
5262111,659525,32099,33,1,15,4.0,20175,32.0,1,The Original Vanilla Ice Cream Sandwich,ice cream ice,frozen,37.0,1.0,prior
5263165,1613797,32099,92,4,16,4.0,49105,4.0,1,Classic with Tuna Anchovies & Whitefish Cat Food,cat food care,pets,41.0,8.0,prior
5262242,1960865,32099,39,0,11,2.0,21616,32.0,0,Organic Baby Arugula,packaged vegetables fruits,produce,123.0,4.0,prior
5261515,455159,32099,7,2,8,2.0,27288,10.0,0,Chocolate Chip Cookie Dough,doughs gelatins bake mixes,pantry,105.0,13.0,prior
5263196,2290164,32099,93,0,11,3.0,49105,18.0,1,Classic with Tuna Anchovies & Whitefish Cat Food,cat food care,pets,41.0,8.0,prior
5261813,2690907,32099,18,1,20,4.0,7349,24.0,1,"Bread, Sliced, Extra Sourdough",bread,bakery,112.0,3.0,prior
5261731,429499,32099,16,1,12,1.0,42372,3.0,1,Pineapple on the Bottom Greek Yogurt,yogurt,dairy eggs,120.0,16.0,prior
5261942,3380792,32099,24,6,12,7.0,48705,27.0,1,Parmesan Cheese,packaged cheese,dairy eggs,21.0,16.0,prior


Now find a way to iterate and do this for every order: find the items from order n that were not reordered in order n+1, look up the order-level details ("deets") for order n+1, and add rows to `practice_user` that combine the n+1 deets with the n-only products.

In [16]:
# This person has 100 orders, so I'm working from n
# (order_by_user_sequence) 2-99 and n+1 would be orders 3-100

# Start with creating orders' deets in dictionary form:
#   {order_by_user_sequence: {order-level column: value, ...}, ...}

# The order-level columns are selected by name rather than by position, so
# the lookup does not silently change if the frame's column order changes.
order_level_columns = ['order_id', 'user_id',
                       'order_by_user_sequence', 'order_dow',
                       'order_hour_of_day', 'days_since_prior_order']

orders_deets = (
    practice_user[order_level_columns]
    # Collapse to one row per order; first() keeps the first non-null
    # value of each column within the order.
    .groupby('order_by_user_sequence', as_index=False)
    .first()
    # Key the lookup by the real sequence number (while also keeping it as
    # a column). Numbering the rows 1..k by position instead would point
    # at the wrong order whenever a sequence number is missing.
    .set_index('order_by_user_sequence', drop=False)
    .loc[:, order_level_columns]
    .to_dict(orient='index')
)
orders_deets

{1: {'order_id': 2959648,
  'user_id': 32099,
  'order_by_user_sequence': 1,
  'order_dow': 0,
  'order_hour_of_day': 17,
  'days_since_prior_order': nan},
 2: {'order_id': 309518,
  'user_id': 32099,
  'order_by_user_sequence': 2,
  'order_dow': 1,
  'order_hour_of_day': 15,
  'days_since_prior_order': 1.0},
 3: {'order_id': 2494101,
  'user_id': 32099,
  'order_by_user_sequence': 3,
  'order_dow': 4,
  'order_hour_of_day': 11,
  'days_since_prior_order': 3.0},
 4: {'order_id': 931900,
  'user_id': 32099,
  'order_by_user_sequence': 4,
  'order_dow': 1,
  'order_hour_of_day': 20,
  'days_since_prior_order': 4.0},
 5: {'order_id': 2154511,
  'user_id': 32099,
  'order_by_user_sequence': 5,
  'order_dow': 5,
  'order_hour_of_day': 16,
  'days_since_prior_order': 4.0},
 6: {'order_id': 825019,
  'user_id': 32099,
  'order_by_user_sequence': 6,
  'order_dow': 0,
  'order_hour_of_day': 8,
  'days_since_prior_order': 2.0},
 7: {'order_id': 455159,
  'user_id': 32099,
  'order_by_user_sequen

In [17]:
# Already finished with adding rows to order 2, can drop order 1.
# The None default makes this cell safe to run more than once: a second
# run returns None instead of raising KeyError for the missing key.

orders_deets.pop(1, None)

{'order_id': 2959648,
 'user_id': 32099,
 'order_by_user_sequence': 1,
 'order_dow': 0,
 'order_hour_of_day': 17,
 'days_since_prior_order': nan}

In [18]:
# Display the lookup again to confirm the entry for order 1 is gone and
# the remaining keys start at order 2.
orders_deets

{2: {'order_id': 309518,
  'user_id': 32099,
  'order_by_user_sequence': 2,
  'order_dow': 1,
  'order_hour_of_day': 15,
  'days_since_prior_order': 1.0},
 3: {'order_id': 2494101,
  'user_id': 32099,
  'order_by_user_sequence': 3,
  'order_dow': 4,
  'order_hour_of_day': 11,
  'days_since_prior_order': 3.0},
 4: {'order_id': 931900,
  'user_id': 32099,
  'order_by_user_sequence': 4,
  'order_dow': 1,
  'order_hour_of_day': 20,
  'days_since_prior_order': 4.0},
 5: {'order_id': 2154511,
  'user_id': 32099,
  'order_by_user_sequence': 5,
  'order_dow': 5,
  'order_hour_of_day': 16,
  'days_since_prior_order': 4.0},
 6: {'order_id': 825019,
  'user_id': 32099,
  'order_by_user_sequence': 6,
  'order_dow': 0,
  'order_hour_of_day': 8,
  'days_since_prior_order': 2.0},
 7: {'order_id': 455159,
  'user_id': 32099,
  'order_by_user_sequence': 7,
  'order_dow': 2,
  'order_hour_of_day': 8,
  'days_since_prior_order': 2.0},
 8: {'order_id': 1116627,
  'user_id': 32099,
  'order_by_user_sequenc

In [19]:
# Now iterate over practice_user df to find un-reordered items for
# each order and add them as new rows.
#
# How this cell works:
#   * The new rows are collected in a list and concatenated once at the
#     end; concatenating inside the loop re-copies the growing frame on
#     every pass.
#   * Each order's details are copied before 'product_id' is added, so the
#     orders_deets lookup itself is left unmodified.
#   * Membership is tested against a set rather than by scanning a list.
#   * The last order number is read from the data rather than typed in.
#
# NOTE(review): rows appended to order n take part in the comparison with
# order n+1, so a product keeps being carried forward until an order
# contains it again. That is what the earlier version of this loop did
# too, and it is preserved here -- confirm it is the intended definition
# of "not reordered".

# Distinct product ids per order as the frame stands right now (order 2
# already includes the rows appended earlier), in first-appearance order.
products_by_order = {
    sequence: product_ids.unique().tolist()
    for sequence, product_ids
    in practice_user.groupby('order_by_user_sequence')['product_id']
}
final_order = int(practice_user['order_by_user_sequence'].max())

new_row_frames = []
products_in_n = products_by_order.get(2, [])
for n in range(2, final_order):
    # Get items from order n not reordered in order n+1
    products_in_n1 = products_by_order.get(n + 1, [])
    purchased_in_n1 = set(products_in_n1)
    only_n = [x for x in products_in_n if x not in purchased_in_n1]
    if only_n:
        # Copy the n+1 deets and attach the product ids from order n; the
        # scalar deets repeat on every row of the resulting frame.
        order_n1_deets = {**orders_deets[n + 1], 'product_id': only_n}
        new_row_frames.append(pd.DataFrame(order_n1_deets))
    # Order n+1 now consists of its own products followed by the ones just
    # carried forward; that is the "order n" of the next pass.
    products_in_n = products_in_n1 + only_n

# Add all new rows to practice_user df in a single concatenation
practice_user = pd.concat([practice_user, *new_row_frames])

practice_user.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34021 entries, 5261360 to 471
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   order_id                34021 non-null  int64  
 1   user_id                 34021 non-null  int64  
 2   order_by_user_sequence  34021 non-null  int64  
 3   order_dow               34021 non-null  int64  
 4   order_hour_of_day       34021 non-null  int64  
 5   days_since_prior_order  33995 non-null  float64
 6   product_id              34021 non-null  int64  
 7   add_to_cart_sequence    1992 non-null   float64
 8   reordered               2017 non-null   float64
 9   product_name            1992 non-null   object 
 10  aisle_name              1992 non-null   object 
 11  dept_name               1992 non-null   object 
 12  aisle_id                1992 non-null   float64
 13  department_id           1992 non-null   float64
 14  eval_set                1992 non-n

In [20]:
# Inspect every row now attached to order 3, including the appended ones.
practice_user.loc[practice_user['order_by_user_sequence'].eq(3)]

Unnamed: 0,order_id,user_id,order_by_user_sequence,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_sequence,reordered,product_name,aisle_name,dept_name,aisle_id,department_id,eval_set
5261398,2494101,32099,3,4,11,3.0,27966,14.0,1.0,Organic Raspberries,packaged vegetables fruits,produce,123.0,4.0,prior
5261399,2494101,32099,3,4,11,3.0,23909,20.0,0.0,2% Reduced Fat Milk,milk,dairy eggs,84.0,16.0,prior
5261400,2494101,32099,3,4,11,3.0,45698,38.0,0.0,One Ply Choose A Size Big Roll Paper Towel Rolls,paper goods,household,54.0,17.0,prior
5261401,2494101,32099,3,4,11,3.0,31717,27.0,0.0,Organic Cilantro,fresh herbs,produce,16.0,4.0,prior
5261402,2494101,32099,3,4,11,3.0,33198,45.0,0.0,Sparkling Natural Mineral Water,water seltzer sparkling water,beverages,115.0,7.0,prior
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,2494101,32099,3,4,11,3.0,18435,,,,,,,,
26,2494101,32099,3,4,11,3.0,29603,,,,,,,,
27,2494101,32099,3,4,11,3.0,47403,,,,,,,,
28,2494101,32099,3,4,11,3.0,34208,,,,,,,,


In [21]:
# Deal with all the NaNs I just made.
# add_to_cart_sequence can stay for now: NaN is accurate for these new rows
# All new rows can have reordered value of 0

# Keep every existing 'reordered' value and put 0 wherever it is missing.
practice_user['reordered'] = practice_user['reordered'].where(
    practice_user['reordered'].notna(), 0
)
practice_user.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34021 entries, 5261360 to 471
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   order_id                34021 non-null  int64  
 1   user_id                 34021 non-null  int64  
 2   order_by_user_sequence  34021 non-null  int64  
 3   order_dow               34021 non-null  int64  
 4   order_hour_of_day       34021 non-null  int64  
 5   days_since_prior_order  33995 non-null  float64
 6   product_id              34021 non-null  int64  
 7   add_to_cart_sequence    1992 non-null   float64
 8   reordered               34021 non-null  float64
 9   product_name            1992 non-null   object 
 10  aisle_name              1992 non-null   object 
 11  dept_name               1992 non-null   object 
 12  aisle_id                1992 non-null   float64
 13  department_id           1992 non-null   float64
 14  eval_set                1992 non-n

In [None]:
# Remaining rows can be filled based on the product_id column value
# Need to get dictionaries for department_id:dept_name, aisle_id:aisle_name,
# department_id:aisle_id, aisle_id:product_id, product_id:product_name

# Build the product_id -> product_name lookup from only the two columns
# involved. Calling to_dict() on the whole re-indexed frame would build a
# dictionary for every column across every row just to keep one of them.
# keep='last' gives the same value per key as that approach did, where a
# later duplicate key overwrote an earlier one.
prod_id_name_dict = (
    df[['product_id', 'product_name']]
    .drop_duplicates(subset='product_id', keep='last')
    .set_index('product_id')['product_name']
    .to_dict()
)
# Preview a few entries rather than echoing one entry per product.
dict(itertools.islice(prod_id_name_dict.items(), 5))

In [None]:
# aisle_id -> aisle_name lookup, built from only the two columns involved
# instead of converting every column of the full frame to a dictionary.
# keep='last' gives the same value per key as the whole-frame to_dict()
# did, where a later duplicate key overwrote an earlier one.
aisle_id_name_dict = (
    df[['aisle_id', 'aisle_name']]
    .drop_duplicates(subset='aisle_id', keep='last')
    .set_index('aisle_id')['aisle_name']
    .to_dict()
)
aisle_id_name_dict

In [None]:
# department_id -> dept_name lookup.
# Fixes the method name (DataFrame has to_dict, not todict) and builds the
# lookup from only the two columns involved instead of converting every
# column of the full frame. keep='last' gives the same value per key that
# a whole-frame to_dict() would, where a later duplicate key overwrites an
# earlier one.
dept_id_name_dict = (
    df[['department_id', 'dept_name']]
    .drop_duplicates(subset='department_id', keep='last')
    .set_index('department_id')['dept_name']
    .to_dict()
)
dept_id_name_dict

In [None]:
# aisle_id -> product_id lookup, built from only the two columns involved
# instead of converting every column of the full frame to a dictionary.
# keep='last' gives the same value per key as the whole-frame to_dict()
# did, where a later duplicate key overwrote an earlier one.
# NOTE(review): a dictionary holds one value per key, so this keeps a
# single product_id per aisle_id. If aisles contain several products, a
# product_id -> aisle_id lookup is probably what the fill step needs --
# confirm before using this.
aisle_prod_dict = (
    df[['aisle_id', 'product_id']]
    .drop_duplicates(subset='aisle_id', keep='last')
    .set_index('aisle_id')['product_id']
    .to_dict()
)
aisle_prod_dict

In [None]:
# department_id -> aisle_id lookup, built from only the two columns
# involved instead of converting every column of the full frame.
# keep='last' gives the same value per key as the whole-frame to_dict()
# did, where a later duplicate key overwrote an earlier one.
# NOTE(review): a dictionary holds one value per key, so this keeps a
# single aisle_id per department_id -- confirm that is what is wanted.
dept_aisle_dict = (
    df[['department_id', 'aisle_id']]
    .drop_duplicates(subset='department_id', keep='last')
    .set_index('department_id')['aisle_id']
    .to_dict()
)
# Display the dictionary built in this cell (the cell previously referred
# to aisle_name_dict, a name that is never defined).
dept_aisle_dict

Next: cluster items in order to have fewer products to work with. 

Sometime: Other curriculum-recommended steps to review in case it sparks anything I really want to be exploring: sns.lmplot, sns.catplot, more in the eda cheat sheet. 