This notebook builds on EDA done here: https://github.com/fractaldatalearning/Capstone2/blob/main/notebooks/eda3_1user_modeling.ipynb

In [1]:
import pandas as pd
import numpy as np
import os
import random

import matplotlib.pyplot as plt
import seaborn as sns

import collections

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier

from sklearn.tree import export_graphviz
from six import StringIO  
from IPython.display import Image  
import pydotplus

from library.sb_utils import save_file
import json

In [2]:
# import the original full df, drop  useless/redundant columns, fillna
df = pd.read_csv('../data/processed/full_data_cleaned.csv')
df = df.drop(columns = ['product_id', 'aisle_id', 'department_id', 'eval_set']).copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33819106 entries, 0 to 33819105
Data columns (total 11 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_by_user_sequence  int64  
 3   order_dow               int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
 6   add_to_cart_sequence    int64  
 7   reordered               int64  
 8   product_name            object 
 9   aisle_name              object 
 10  dept_name               object 
dtypes: float64(1), int64(7), object(3)
memory usage: 2.8+ GB


Decide what chunk of data to work with for the remainder of the project. Randomly choose users of some quantity to leave me with a df sized to function with the computer. Don't start out separating it into train/test split. My intuition is that cross-row calculations don't count as leakage and negatively impact modeling if I'm adding data that has to do with past orders. If this logic turns out to be inappropriate, I can just come back and split the set into separate users (or into certain orders per user) and re-run any subsequent code. 

In [3]:
# How many total users are there?
len(df['user_id'].unique())

206209

In [None]:
# Deal with null values
df['days_since_prior_order'] = df['days_since_prior_order'].fillna(-1)
df.isnull().any()

In [4]:
# After playing around, I found the computer was able to handle adding 
# rows to a df of appx. 1k users. Randomly select 0.5% of users.

all_users = set(df['user_id'].unique()
users1 = random.sample(list(all_users)), 1031)
df1 = df.loc[df['user_id'].isin(users), :].copy()
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 164808 entries, 14274 to 33814614
Data columns (total 11 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   order_id                164808 non-null  int64  
 1   user_id                 164808 non-null  int64  
 2   order_by_user_sequence  164808 non-null  int64  
 3   order_dow               164808 non-null  int64  
 4   order_hour_of_day       164808 non-null  int64  
 5   days_since_prior_order  154152 non-null  float64
 6   add_to_cart_sequence    164808 non-null  int64  
 7   reordered               164808 non-null  int64  
 8   product_name            164808 non-null  object 
 9   aisle_name              164808 non-null  object 
 10  dept_name               164808 non-null  object 
dtypes: float64(1), int64(7), object(3)
memory usage: 15.1+ MB


In [None]:
# Try going back to repeat and then concatenate to end up with 1-2% 
# of users for further analysis. Everything that comes after row 
# creation should go a bit faster so a larger df should work fine. 

not_users1 = all_users - set(users1)
users2 = random.sample(list(not_users1), 1031)  
df2 = df.loc[df['user_id'].isin(users2), :].copy()

not_users1or2 = not_users1 - users2
users3 = random.sample(list(not_users1or2), 1031)  
df3 = df.loc[df['user_id'].isin(users3), :].copy()

not_users1to3 = not_users1or2 - users3
users4 = random.sample(list(not_users1to3), 1031)  
df4 = df.loc[df['user_id'].isin(users4), :].copy()

In [None]:
# Renamed this df, df1 while the following cell was running.
# Go back and change the value there. 
# Then, repeat with some more users.


In [6]:
for user in users:
    # Work with 1 user at a time
    rows_to_work_w = df.loc[df['user_id']==user,:].copy()
    for n in range(2,101):
        
        # Get items from prior order not yet in order n
        prior_order_items = set(rows_to_work_w[rows_to_work_w[
            'order_by_user_sequence']==(n-1)]['product_name'].
                                unique().tolist())
        order_n_items = set(df[df['order_by_user_sequence']==n][
            'product_name'].unique().tolist())
        not_yet_in_n = prior_order_items-order_n_items
        
        # Specify new rows as copies of rows from prior order...
        new_rows = rows_to_work_w.loc[rows_to_work_w[
            'order_by_user_sequence']==(n-1),:].copy()
        # ... where the product was not ordered in order n
        new_rows = new_rows.loc[new_rows['product_name'].isin(
            not_yet_in_n),:].copy()
        
        # Change value of add_to_cart_sequence and reordered to 0
        new_rows['add_to_cart_sequence'] = new_rows[
            'add_to_cart_sequence'].replace(df[
            'add_to_cart_sequence'].unique().tolist(), 0)
        new_rows['reordered'] = new_rows['reordered'].replace(1,0)
        
        # Now change value of order_by_user_sequence from prior to n
        new_rows['order_by_user_sequence'] = new_rows[
            'order_by_user_sequence'].replace((n-1),n)

        # Add these rows to df so they're there when loop goes to 
        # next order_by_user_sequence value of n & 
        # get duplicated to every order thereafter
        df = pd.concat([df, new_rows]) 

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 217050 entries, 14274 to 15066991
Data columns (total 11 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   order_id                217050 non-null  int64  
 1   user_id                 217050 non-null  int64  
 2   order_by_user_sequence  217050 non-null  int64  
 3   order_dow               217050 non-null  int64  
 4   order_hour_of_day       217050 non-null  int64  
 5   days_since_prior_order  217050 non-null  float64
 6   add_to_cart_sequence    217050 non-null  int64  
 7   reordered               217050 non-null  int64  
 8   product_name            217050 non-null  object 
 9   aisle_name              217050 non-null  object 
 10  dept_name               217050 non-null  object 
dtypes: float64(1), int64(7), object(3)
memory usage: 19.9+ MB


In [7]:
df.sample(10)

Unnamed: 0,order_id,user_id,order_by_user_sequence,order_dow,order_hour_of_day,days_since_prior_order,add_to_cart_sequence,reordered,product_name,aisle_name,dept_name
2703009,525117,16594,16,6,14,30.0,0,0,Pepperoni Pizza Frozen Sandwiches,frozen appetizers sides,frozen
23857814,2641094,145325,12,1,18,16.0,0,0,Fat Free Small Curd Cottage Cheese,other creams cheeses,dairy eggs
13835019,1914486,83994,4,6,12,7.0,2,1,Creamy Almond Butter,spreads,pantry
7882752,3273099,47971,8,0,10,11.0,0,0,Crunchy Chickpeas Falafel,fruit vegetable snacks,snacks
21445475,1314492,130576,3,5,22,26.0,8,0,European Cucumber,fresh vegetables,produce
3885341,396038,23663,5,6,15,6.0,10,1,Fettuccine,fresh pasta,dry goods pasta
19371536,246843,117568,8,1,8,5.0,0,0,Butter with Canola Oil Spread,butter,dairy eggs
28910759,2821868,176198,8,4,15,10.0,6,1,Large Lemon,fresh fruits,produce
2985702,747058,18286,45,6,15,6.0,0,0,Organic Thompson Seedless Raisins,nuts seeds dried fruit,snacks
24593056,3395611,149851,4,6,12,30.0,11,1,Yellow Corn Organic Tortillas,tortillas flat bread,bakery


In [None]:
# This took hours to create. Immediately save as a file and 
# work with that file in a new notebook so I have the ability to 
# restart the kernel without having it take forever to run again.

In [10]:
users_incorrect_deets = df
datapath = '../data/processed'
save_file(users_incorrect_deets, 'users_incorrect_deets.csv', datapath)

Writing file.  "../data/processed/users_incorrect_deets.csv"


{(109, 1): {'order_id': 1656765,
  'user_id': 109,
  'order_by_user_sequence': 1,
  'order_dow': 1,
  'order_hour_of_day': 13,
  'days_since_prior_order': -1.0},
 (109, 2): {'order_id': 304607,
  'user_id': 109,
  'order_by_user_sequence': 2,
  'order_dow': 1,
  'order_hour_of_day': 12,
  'days_since_prior_order': 30.0},
 (109, 3): {'order_id': 1403849,
  'user_id': 109,
  'order_by_user_sequence': 3,
  'order_dow': 3,
  'order_hour_of_day': 17,
  'days_since_prior_order': 9.0},
 (109, 4): {'order_id': 659764,
  'user_id': 109,
  'order_by_user_sequence': 4,
  'order_dow': 2,
  'order_hour_of_day': 5,
  'days_since_prior_order': 20.0},
 (109, 5): {'order_id': 3116901,
  'user_id': 109,
  'order_by_user_sequence': 5,
  'order_dow': 0,
  'order_hour_of_day': 7,
  'days_since_prior_order': 26.0},
 (109, 6): {'order_id': 1382150,
  'user_id': 109,
  'order_by_user_sequence': 6,
  'order_dow': 1,
  'order_hour_of_day': 9,
  'days_since_prior_order': 15.0},
 (109, 7): {'order_id': 1997693,
 

In [9]:
# Create reorders_so_far column.
# Code I used with the practice user:

previous_round_items = set(df[df['reorders_so_far']==3][
    'product_name'])

grouped_by_product = df[df['reorders_so_far']==3].groupby(
    'product_name')['order_by_user_sequence']
df['keep'] = df.assign(min=grouped_by_product.transform(min))['min']

#for n in range (4,47):
    for order in range(7,101):
        items_this_order = set(products_reordered_each_order[order])
        reordered_this_iteration = set(
            previous_round_items.intersection(items_this_order))
        rows_to_change = df.loc[(df.loc[
        :,'order_by_user_sequence']==order) & (df.loc[
        :, 'reorders_so_far']==n-1) & (df['order_by_user_sequence']!=
        df['keep']) & (df.loc[:,'product_name'].isin(
        reordered_this_iteration))]
        df.loc[rows_to_change.index, 'reorders_so_far'] = n
    previous_round_items = set(df[df['reorders_so_far']==n][
        'product_name'])
    grouped_by_product = df[df['reorders_so_far']==n].groupby(
        'product_name')['order_by_user_sequence']
    df['keep'] = df.assign(min=grouped_by_product.transform(min))[
        'min']

IndentationError: unexpected indent (1973831926.py, line 12)

In [None]:
# Create past_orders column & delete reorders_so_far
# Code I used to create past_orders with one user:

items_already_ordered_ntimes = set(df[df['past_orders']==2][
    'product_name'])
when_items_first_ordered = list(df.groupby('product_name')[
    'order_by_user_sequence'].idxmin())

#for n in range(2,47): 
    rows_npast = df[df['product_name'].isin(
    items_already_ordered_ntimes)]
    rows_npast = rows_npast[rows_npast['past_orders']==0]
    rows_npast = rows_npast.drop(when_items_first_ordered, axis=0, 
                             errors='ignore')
    when_reordered_ntimes = df[df['reorders_so_far']==n].set_index(
    'product_name').to_dict()['order_by_user_sequence']
    for prod, order in when_reordered_ntimes.items():
        ind_to_delete = rows_npast[(rows_npast['product_name']==prod) &
                                   (rows_npast[
                                       'order_by_user_sequence']
                                    >order)].index
        rows_npast = rows_npast.drop(ind_to_delete)
    df.loc[np.array(rows_npast.index),'past_orders'] = n
    items_already_ordered_ntimes = set(df[df['past_orders']==n][
        'product_name'])

In [None]:
# Engineer columns for product keywords

In [None]:
# Change format of dow, hour columns

In [None]:
# Save work done so far as new csv file

In [None]:
# Move to new notebook for encoding & standardizing remaining features