This notebook builds on EDA done here: https://github.com/fractaldatalearning/Capstone2/blob/main/notebooks/eda3_1user_modeling.ipynb

In [1]:
import pandas as pd
import numpy as np
import os
import random

import matplotlib.pyplot as plt
import seaborn as sns

import collections

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier

from sklearn.tree import export_graphviz
from six import StringIO  
from IPython.display import Image  
import pydotplus

from library.sb_utils import save_file
import json

In [2]:
# Load the cleaned full dataset, then discard the id / eval_set columns
# that are not needed for the rest of this notebook.
unneeded_columns = ['product_id', 'aisle_id', 'department_id', 'eval_set']
df = (
    pd.read_csv('../data/processed/full_data_cleaned.csv')
    .drop(columns=unneeded_columns)
    .copy()
)
df.info()

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Get dictionaries connecting product-aisle-dept


def load_json_dict(file_name, dict_dir='../data/processed/dicts'):
    """Load one JSON lookup dictionary and return it as a dict.

    Parameters
    ----------
    file_name : str
        Name of the JSON file inside ``dict_dir``.
    dict_dir : str
        Directory that holds the saved dictionaries.

    Returns
    -------
    dict
        The decoded JSON object. Keys are strings at this point because
        JSON object keys are always strings.
    """
    # Explicit encoding so the result does not depend on the platform's
    # default locale.
    with open(os.path.join(dict_dir, file_name), 'r',
              encoding='utf-8') as dict_file:
        return json.load(dict_file)


# What each dictionary maps is inferred from its file name only --
# TODO confirm against the notebook that wrote these files.
ad_dict = load_json_dict('aisle_dept_dict.txt')      # presumably aisle -> dept
pa_dict = load_json_dict('prod_aisle_dict.txt')      # presumably product -> aisle
dd_dict = load_json_dict('dept_id_name_dict.txt')    # presumably dept id -> name
aa_dict = load_json_dict('aisle_id_name_dict.txt')   # presumably aisle id -> name
pp_dict = load_json_dict('prod_id_name_dict.txt')    # presumably product id -> name

dd_dict

In [None]:
# The dictionaries were loaded from JSON, so every key is a str.
# Rebuild each one with int keys; values are left untouched.

pp_dict, aa_dict, dd_dict, pa_dict, ad_dict = (
    {int(key): value for key, value in lookup.items()}
    for lookup in (pp_dict, aa_dict, dd_dict, pa_dict, ad_dict)
)

dd_dict

Decide what chunk of data to work with for the remainder of the project. Randomly choose a number of users small enough that the resulting df is workable on this computer. Don't split into train/test sets yet: my intuition is that cross-row calculations based only on a user's past orders don't count as leakage and won't negatively impact modeling. If this logic turns out to be inappropriate, I can come back, split the set by user (or by certain orders per user), and re-run any subsequent code.

In [None]:
# Count the distinct users in the full dataset
# (dropna=False so a missing user_id would still be counted, as before).
df['user_id'].nunique(dropna=False)

In [None]:
# Keep only a random subset of users so the df stays small enough for
# the row-adding steps below. Raise N_USERS to work with more users.
N_USERS = 1        # number of users to keep
SAMPLE_SEED = 42   # fixed seed so Restart & Run All picks the same users

# Sort the ids so the population order (and therefore the seeded draw)
# is the same on every run.
all_user_ids = sorted(df['user_id'].unique().tolist())
users = random.Random(SAMPLE_SEED).sample(all_user_ids, N_USERS)

# NOTE: this overwrites df, so re-running the cell samples from the
# already-reduced frame; reload df first to draw from all users again.
df = df.loc[df['user_id'].isin(users), :].copy()
df.info()

In [None]:
# Replace missing days_since_prior_order values with the sentinel -1,
# then report per column whether any nulls remain.
days_since_prior = df['days_since_prior_order']
df['days_since_prior_order'] = days_since_prior.fillna(-1)
df.isna().any()

In [None]:
# Build a lookup of order-level details keyed by
# (user_id, order_by_user_sequence); each value is a dict holding the
# first row's values for that order.

detail_columns = ['order_id', 'user_id', 'order_by_user_sequence',
                  'order_dow', 'order_hour_of_day',
                  'days_since_prior_order']

order_deets = (
    df[detail_columns]
    .reset_index(drop=True)
    # Duplicate the two key columns so they survive as values after
    # being consumed by the groupby.
    .assign(user_index=lambda frame: frame['user_id'],
            order_index=lambda frame: frame['order_by_user_sequence'])
    .groupby(['user_index', 'order_index'])
    .first()
    .to_dict(orient='index')
)

order_deets

In [None]:
# For every order after a user's first, add a "not ordered" row for each
# product the user has had in an earlier order but left out of this one,
# so each order ends up listing every product seen so far.

# Columns that describe the order itself; cloned rows must take these
# from the order they are added to, not from the order they came from.
ORDER_DETAIL_COLUMNS = ['order_id', 'order_dow', 'order_hour_of_day',
                        'days_since_prior_order']


def build_skipped_product_rows(user_rows):
    """Build the "not ordered" rows for a single user.

    Parameters
    ----------
    user_rows : pandas.DataFrame
        All rows of one user. Must contain 'order_by_user_sequence',
        'product_name', 'add_to_cart_sequence', 'reordered' and the
        ORDER_DETAIL_COLUMNS.

    Returns
    -------
    list of pandas.DataFrame
        One non-empty frame per order that skipped at least one
        previously seen product. Each row is a copy of the product's row
        from the preceding order with add_to_cart_sequence and reordered
        set to 0 and the order columns set to the current order's values.
    """
    frames = []
    prior_rows = None
    # groupby iterates the orders in ascending sequence and only over
    # orders that exist, so users with fewer orders (or gaps in the
    # sequence) no longer hit an IndexError.
    for n, order_n_rows in user_rows.groupby('order_by_user_sequence'):
        if prior_rows is None:
            # First order: nothing earlier to carry forward.
            prior_rows = order_n_rows
            continue

        in_order_n = prior_rows['product_name'].isin(
            order_n_rows['product_name'])
        skipped = prior_rows.loc[~in_order_n, :].copy()
        if skipped.empty:
            prior_rows = order_n_rows
            continue

        skipped['add_to_cart_sequence'] = 0
        skipped['reordered'] = 0
        for column in ORDER_DETAIL_COLUMNS:
            # Take the values from this user's order n.
            skipped[column] = order_n_rows[column].iloc[0]
        skipped['order_by_user_sequence'] = n
        frames.append(skipped)

        # Carry real and cloned rows forward so a skipped product keeps
        # being added to every later order, not just the next one.
        prior_rows = pd.concat([order_n_rows, skipped])
    return frames


skipped_row_frames = []
for user in users:
    skipped_row_frames.extend(
        build_skipped_product_rows(df.loc[df['user_id'] == user, :]))

# Concatenate once (not inside the loop) and renumber the index: cloned
# rows would otherwise share index labels with the rows they were copied
# from, which breaks later label-based .loc / .drop calls.
df = pd.concat([df, *skipped_row_frames], ignore_index=True)

df.info()

In [None]:
# Create reorders_so_far column.
# Code I used with the practice user:
#
# NOTE(review): this cell is a sketch copied from the practice-user work
# and cannot run as written in this notebook:
#   * the outer `for n` header below is commented out while its body is
#     still indented, which is an IndentationError;
#   * `n` and `products_reordered_each_order` are not defined in any
#     visible cell;
#   * the 'reorders_so_far' column is read here but is not created in
#     any visible cell -- presumably rounds 0-3 were set up beforehand;
#     TODO confirm and port that setup.
# The ranges (4,47) and (7,101) look specific to the practice user --
# TODO confirm before reusing.

# Product names that appear in rows currently marked reorders_so_far == 3.
previous_round_items = set(df[df['reorders_so_far']==3][
    'product_name'])

# Among the reorders_so_far == 3 rows, find each product's smallest
# order_by_user_sequence and store it in 'keep'. Rows outside that
# subset have no transform value, so index alignment leaves them NaN.
grouped_by_product = df[df['reorders_so_far']==3].groupby(
    'product_name')['order_by_user_sequence']
df['keep'] = df.assign(min=grouped_by_product.transform(min))['min']

#for n in range (4,47):
    for order in range(7,101):
        # Products in this order that were also in the previous round.
        items_this_order = set(products_reordered_each_order[order])
        reordered_this_iteration = set(
            previous_round_items.intersection(items_this_order))
        # Rows of this order still at n-1, whose order is not the 'keep'
        # order for the product, and whose product is in the
        # intersection above.
        rows_to_change = df.loc[(df.loc[
        :,'order_by_user_sequence']==order) & (df.loc[
        :, 'reorders_so_far']==n-1) & (df['order_by_user_sequence']!=
        df['keep']) & (df.loc[:,'product_name'].isin(
        reordered_this_iteration))]
        # Promote those rows from n-1 to n.
        df.loc[rows_to_change.index, 'reorders_so_far'] = n
    # Prepare the next round: recompute the product set and the 'keep'
    # column from the rows now marked reorders_so_far == n.
    previous_round_items = set(df[df['reorders_so_far']==n][
        'product_name'])
    grouped_by_product = df[df['reorders_so_far']==n].groupby(
        'product_name')['order_by_user_sequence']
    df['keep'] = df.assign(min=grouped_by_product.transform(min))[
        'min']

In [None]:
# Create past_orders column & delete reorders_so_far
# Code I used to create past_orders with one user:
#
# NOTE(review): like the cell above, this is a one-user sketch that
# cannot run as written:
#   * the `for n` header below is commented out while its body is still
#     indented, which is an IndentationError;
#   * `n` is not defined;
#   * the 'past_orders' column is read here but not created in any
#     visible cell;
#   * nothing in this cell deletes 'reorders_so_far', despite the title.
# The groupby is on product_name only, so with more than one user the
# "first order" is taken across users -- TODO confirm whether user_id
# should be part of the grouping.

# Product names that appear in rows currently marked past_orders == 2.
items_already_ordered_ntimes = set(df[df['past_orders']==2][
    'product_name'])
# Index labels of the row with the smallest order_by_user_sequence for
# each product.
when_items_first_ordered = list(df.groupby('product_name')[
    'order_by_user_sequence'].idxmin())

#for n in range(2,47): 
    # Candidate rows: products from the previous round whose past_orders
    # is still 0, excluding each product's first-order row (labels that
    # are absent are ignored).
    rows_npast = df[df['product_name'].isin(
    items_already_ordered_ntimes)]
    rows_npast = rows_npast[rows_npast['past_orders']==0]
    rows_npast = rows_npast.drop(when_items_first_ordered, axis=0, 
                             errors='ignore')
    # product_name -> order_by_user_sequence taken from the rows where
    # reorders_so_far == n.
    when_reordered_ntimes = df[df['reorders_so_far']==n].set_index(
    'product_name').to_dict()['order_by_user_sequence']
    for prod, order in when_reordered_ntimes.items():
        # Drop this product's candidate rows that come after that order.
        ind_to_delete = rows_npast[(rows_npast['product_name']==prod) &
                                   (rows_npast[
                                       'order_by_user_sequence']
                                    >order)].index
        rows_npast = rows_npast.drop(ind_to_delete)
    # Whatever candidates remain get past_orders = n.
    df.loc[np.array(rows_npast.index),'past_orders'] = n
    # Prepare the next round from the rows now marked past_orders == n.
    items_already_ordered_ntimes = set(df[df['past_orders']==n][
        'product_name'])

In [None]:
# Engineer columns for product keywords

In [None]:
# Change format of dow, hour columns

In [None]:
# Save work done so far as new csv file

In [None]:
# Move to new notebook for encoding & standardizing remaining features