This notebook builds on EDA done here: https://github.com/fractaldatalearning/Capstone2/blob/main/notebooks/eda3_1user_modeling.ipynb

In [20]:
import pandas as pd
import numpy as np
import os
import random

import matplotlib.pyplot as plt
import seaborn as sns

import collections

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier

from sklearn.tree import export_graphviz
from six import StringIO  
from IPython.display import Image  
import pydotplus

from library.sb_utils import save_file
import json

In [3]:
# Load the full cleaned dataset, then discard the columns that are
# useless/redundant for this notebook.
df = (
    pd.read_csv('../data/processed/full_data_cleaned.csv')
    .drop(columns=['product_id', 'aisle_id', 'department_id', 'eval_set'])
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33819106 entries, 0 to 33819105
Data columns (total 15 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_by_user_sequence  int64  
 3   order_dow               int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
 6   product_id              int64  
 7   add_to_cart_sequence    int64  
 8   reordered               int64  
 9   product_name            object 
 10  aisle_name              object 
 11  dept_name               object 
 12  aisle_id                int64  
 13  department_id           int64  
 14  eval_set                object 
dtypes: float64(1), int64(10), object(4)
memory usage: 3.8+ GB


In [4]:
# Get dictionaries connecting product-aisle-dept

DICT_DIR = '../data/processed/dicts'


def load_json_dict(file_name, dict_dir=DICT_DIR):
    """Load one JSON-encoded lookup dictionary from disk.

    Parameters
    ----------
    file_name : str
        Name of the file inside ``dict_dir`` (e.g. ``'dept_id_name_dict.txt'``).
    dict_dir : str, optional
        Directory holding the dictionary files.

    Returns
    -------
    dict
        The decoded JSON object. Keys are strings at this point because JSON
        object keys are always strings.

    Raises
    ------
    FileNotFoundError
        If the file does not exist.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default encoding.
    with open(dict_dir + '/' + file_name, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


# NOTE(review): the contents are inferred from the file names (aisle->dept,
# product->aisle, id->name lookups); only dd_dict is displayed here to confirm.
ad_dict = load_json_dict('aisle_dept_dict.txt')
pa_dict = load_json_dict('prod_aisle_dict.txt')
dd_dict = load_json_dict('dept_id_name_dict.txt')
aa_dict = load_json_dict('aisle_id_name_dict.txt')
pp_dict = load_json_dict('prod_id_name_dict.txt')

dd_dict

{'7': 'beverages',
 '16': 'dairy eggs',
 '19': 'snacks',
 '17': 'household',
 '4': 'produce',
 '14': 'breakfast',
 '13': 'pantry',
 '20': 'deli',
 '1': 'frozen',
 '11': 'personal care',
 '12': 'meat seafood',
 '6': 'international',
 '3': 'bakery',
 '15': 'canned goods',
 '9': 'dry goods pasta',
 '5': 'alcohol',
 '8': 'pets',
 '18': 'babies',
 '2': 'other',
 '21': 'missing',
 '10': 'bulk'}

In [5]:
# The dictionaries come back from JSON with str keys; rebuild each one with
# int keys, leaving the values untouched.

pp_dict = dict(zip(map(int, pp_dict), pp_dict.values()))
aa_dict = dict(zip(map(int, aa_dict), aa_dict.values()))
dd_dict = dict(zip(map(int, dd_dict), dd_dict.values()))
pa_dict = dict(zip(map(int, pa_dict), pa_dict.values()))
ad_dict = dict(zip(map(int, ad_dict), ad_dict.values()))

dd_dict

{7: 'beverages',
 16: 'dairy eggs',
 19: 'snacks',
 17: 'household',
 4: 'produce',
 14: 'breakfast',
 13: 'pantry',
 20: 'deli',
 1: 'frozen',
 11: 'personal care',
 12: 'meat seafood',
 6: 'international',
 3: 'bakery',
 15: 'canned goods',
 9: 'dry goods pasta',
 5: 'alcohol',
 8: 'pets',
 18: 'babies',
 2: 'other',
 21: 'missing',
 10: 'bulk'}

Decide what chunk of data to work with for the remainder of the project. Randomly choose enough users to leave a df small enough to work with on this computer. Don't start out by separating it into a train/test split. My intuition is that cross-row calculations don't count as leakage, and won't negatively impact modeling, as long as the data I'm adding only has to do with past orders. If this logic turns out to be inappropriate, I can come back, split the set by user (or by certain orders per user), and re-run any subsequent code.

In [18]:
# Count the distinct users present in the data.
df['user_id'].nunique(dropna=False)

206209

In [22]:
# 1% (appx. 2k) users seems like a reasonable place to start. Just see what happens after I add rows.
# Randomly select 1% of users, with a fixed seed so the same users are chosen
# on every Restart & Run All.
SAMPLE_FRACTION = 0.01
SAMPLE_SEED = 42

# Sort the ids so the draw depends only on the seed and the set of users,
# not on the order in which users happen to appear in the file.
all_user_ids = sorted(df['user_id'].unique().tolist())

# Derive the sample size from the data instead of hard-coding it; the min/max
# guards keep at least one user when any exist and never ask for more than exist.
n_sample_users = min(len(all_user_ids),
                     max(1, round(len(all_user_ids) * SAMPLE_FRACTION)))

users = random.Random(SAMPLE_SEED).sample(all_user_ids, n_sample_users)

# .copy() makes the subset an independent frame, so columns added in later
# cells don't trigger SettingWithCopyWarning.
# NOTE: this overwrites df, so re-running this cell alone samples 1% of the
# already-sampled users; re-run from the load cell to start over.
df = df.loc[df['user_id'].isin(users), :].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 335605 entries, 32144 to 33787479
Data columns (total 11 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   order_id                335605 non-null  int64  
 1   user_id                 335605 non-null  int64  
 2   order_by_user_sequence  335605 non-null  int64  
 3   order_dow               335605 non-null  int64  
 4   order_hour_of_day       335605 non-null  int64  
 5   days_since_prior_order  314799 non-null  float64
 6   add_to_cart_sequence    335605 non-null  int64  
 7   reordered               335605 non-null  int64  
 8   product_name            335605 non-null  object 
 9   aisle_name              335605 non-null  object 
 10  dept_name               335605 non-null  object 
dtypes: float64(1), int64(7), object(3)
memory usage: 30.7+ MB


In [None]:
# Add rows so that every order contains every product ever ordered, with new rows as non-orders.

# Code from when I did it for just 1 user, wrapped in a function so it can be
# reused per user.
# NOTE(review): practice_user and orders_deets are not defined in the cells
# shown in this notebook, and the load cell drops 'product_id' from df; pass
# product_col='product_name' (or keep the id column) when adapting this to df.
def add_unordered_product_rows(user_orders, order_details, first_order=2,
                               last_order=None, product_col='product_id'):
    """Carry products forward from each order into the next one as new rows.

    For each order number n in ``range(first_order, last_order)``, every
    product that appears in order n but not in order n+1 gets a new row in
    order n+1. Rows added to order n+1 are seen when n+1 is processed, so a
    product keeps propagating to all later orders.

    Parameters
    ----------
    user_orders : pandas.DataFrame
        One user's rows; must contain 'order_by_user_sequence' and
        ``product_col``.
    order_details : dict
        Maps an order sequence number to a dict of column values used to build
        the new rows for that order. Presumably one scalar per order-level
        column -- TODO confirm against the code that builds it. The stored
        dicts are not modified.
    first_order : int, optional
        First order number n to carry products forward from. The original
        single-user code started at 2; the reason is not visible here.
    last_order : int or None, optional
        Exclusive upper bound for n. Defaults to the highest order sequence
        number present, so the last order is only ever a target.
    product_col : str, optional
        Name of the column identifying a product.

    Returns
    -------
    pandas.DataFrame
        ``user_orders`` with the new rows appended. The index is not reset,
        so appended rows may repeat index labels.
    """
    if user_orders.empty:
        return user_orders
    if last_order is None:
        last_order = int(user_orders['order_by_user_sequence'].max())

    for n in range(first_order, last_order):
        order_seq = user_orders['order_by_user_sequence']
        # Get items from order n not reordered in order n+1
        order_n = user_orders.loc[order_seq == n, product_col].unique().tolist()
        order_n1 = set(user_orders.loc[order_seq == n + 1, product_col])
        only_n = [x for x in order_n if x not in order_n1]

        # Get n1 deets from the big deets dict
        order_n1_deets = order_details.get(n + 1)
        # Nothing to add, or no details recorded for order n+1 (previously an
        # AttributeError on None): skip this order.
        if not only_n or order_n1_deets is None:
            continue

        # Build the new-row data on a copy so the dicts stored in
        # order_details are not mutated.
        new_row_data = dict(order_n1_deets)
        new_row_data[product_col] = only_n
        # Turn dict into df of new rows
        order_n1_new_rows = pd.DataFrame.from_dict(new_row_data)
        # Append inside the loop on purpose: the next iteration must see
        # these rows as part of order n+1.
        user_orders = pd.concat([user_orders, order_n1_new_rows])

    return user_orders


practice_user = add_unordered_product_rows(practice_user, orders_deets)


In [None]:
# Create reorders_so_far column

In [None]:
# Create past_orders column & delete reorders_so_far

In [None]:
# Engineer columns for product keywords

In [None]:
# Change format of dow, hour columns

In [None]:
# Save work done so far as new csv file

In [None]:
# Move to new notebook for encoding & standardizing remaining features