# 4.10 Part 2: Instacart

### This script contains the following points:
#### 01. Import libraries and data (incl. security implications/PII)
#### 02. Consistency Check
#### 03. Create Additional Customer Profiles/Flags
#### 04. Export data (df saved as instacart_excluded_rev.pkl)

# 01 Import libraries & data

In [1]:
#import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import scipy

In [2]:
#create data path
# The project root can be overridden with the INSTACART_PROJECT_DIR environment
# variable so the notebook can run on another machine without editing this cell.
# When the variable is unset or empty, the original local folder is used.
path = os.environ.get('INSTACART_PROJECT_DIR') or r'C:\Users\fa_an\OneDrive\CareerFoundry\Tasks\Data Analytics Immersion\Tasks 4.1-4.10\02_2024 Instacart Basket Analysis'

In [3]:
# Load the prepared Instacart frame (instacart_excluded.pkl) into df
source_file = os.path.join(path, '02 Data', 'Prepared Data', 'instacart_excluded.pkl')
df = pd.read_pickle(source_file)

In [4]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

# 02 Consistency Check

In [5]:
# Summarise the index range, column names and dtypes before making any changes
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30964564 entries, 0 to 32380882
Data columns (total 37 columns):
 #   Column                      Dtype   
---  ------                      -----   
 0   order_id                    int32   
 1   user_id                     int32   
 2   order_number                int8    
 3   orders_day_of_week          category
 4   order_hour_of_day           int8    
 5   days_since_prior_order      float16 
 6   product_id                  int32   
 7   add_to_cart_order           uint8   
 8   reordered                   int8    
 9   product_name                object  
 10  department_id               int8    
 11  prices                      float32 
 12  price_label                 object  
 13  busiest_days                object  
 14  max_order                   int8    
 15  loyalty_flag                object  
 16  mean_product_price          float32 
 17  spending_flag               object  
 18  median_days_between_orders  float16 
 19  ord

In [6]:
# Preview the first 20 rows to eyeball the values in each column
df.head(20)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,department_id,prices,price_label,busiest_days,max_order,loyalty_flag,mean_product_price,spending_flag,median_days_between_orders,order_frequency_flag,Gender,State,Age,total_dependants,family_status,income,sum_order,department,Region,exclusion_flag,Age_Range,Income_Range,pet_items,baby_items,snacks,frozen,alcohol
0,2539329,1,1,Monday,8,,196,1,0,Soda,7,9.0,Mid-range product,Regularly busy,10,New Customer,6.367796,Low Spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,31.6,beverages,South,0,1.0,1.0,0,0,1,0,0
1,2398795,1,2,Tuesday,7,15.0,196,1,1,Soda,7,9.0,Mid-range product,Slowest days,10,New Customer,6.367796,Low Spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,35.400002,beverages,South,0,1.0,1.0,0,0,1,0,0
2,473747,1,3,Tuesday,12,21.0,196,1,1,Soda,7,9.0,Mid-range product,Slowest days,10,New Customer,6.367796,Low Spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,28.200001,beverages,South,0,1.0,1.0,0,0,1,0,0
3,2254736,1,4,Wednesday,7,29.0,196,1,1,Soda,7,9.0,Mid-range product,Slowest days,10,New Customer,6.367796,Low Spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,26.0,beverages,South,0,1.0,1.0,0,0,1,0,0
4,431534,1,5,Wednesday,15,28.0,196,1,1,Soda,7,9.0,Mid-range product,Slowest days,10,New Customer,6.367796,Low Spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,57.0,beverages,South,0,1.0,1.0,0,0,1,0,0
5,3367565,1,6,Monday,7,19.0,196,1,1,Soda,7,9.0,Mid-range product,Regularly busy,10,New Customer,6.367796,Low Spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,25.0,beverages,South,0,1.0,1.0,0,0,1,0,0
6,550135,1,7,Sunday,9,20.0,196,1,1,Soda,7,9.0,Mid-range product,Busiest days,10,New Customer,6.367796,Low Spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,29.0,beverages,South,0,1.0,1.0,0,0,1,0,0
7,3108588,1,8,Sunday,14,14.0,196,2,1,Soda,7,9.0,Mid-range product,Busiest days,10,New Customer,6.367796,Low Spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,40.200001,beverages,South,0,1.0,1.0,0,0,1,0,0
8,2295261,1,9,Sunday,16,0.0,196,4,1,Soda,7,9.0,Mid-range product,Busiest days,10,New Customer,6.367796,Low Spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,40.200001,beverages,South,0,1.0,1.0,0,0,1,0,0
9,2550362,1,10,Wednesday,8,30.0,196,1,1,Soda,7,9.0,Mid-range product,Slowest days,10,New Customer,6.367796,Low Spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,63.099998,beverages,South,0,1.0,1.0,0,0,1,0,0


In [7]:
# Cast the listed code/flag columns to int8 (smaller dtype than they were loaded with)
int8_columns = ['Age_Range', 'pet_items', 'baby_items', 'frozen', 'snacks', 'alcohol']
df[int8_columns] = df[int8_columns].astype('int8')

In [8]:
# Re-check the dtypes after the int8 conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30964564 entries, 0 to 32380882
Data columns (total 37 columns):
 #   Column                      Dtype   
---  ------                      -----   
 0   order_id                    int32   
 1   user_id                     int32   
 2   order_number                int8    
 3   orders_day_of_week          category
 4   order_hour_of_day           int8    
 5   days_since_prior_order      float16 
 6   product_id                  int32   
 7   add_to_cart_order           uint8   
 8   reordered                   int8    
 9   product_name                object  
 10  department_id               int8    
 11  prices                      float32 
 12  price_label                 object  
 13  busiest_days                object  
 14  max_order                   int8    
 15  loyalty_flag                object  
 16  mean_product_price          float32 
 17  spending_flag               object  
 18  median_days_between_orders  float16 
 19  ord

# 03 Create Additional Customer Profiles/Flags
This section builds customer profile flags, including “Single male and female” and “Young parent.”

In [9]:
#let's create a profile for Single_MF (male/female) Column.

In [10]:
# Single_MF code 0: rows where the customer is male and family_status is 'single'
single_male_mask = df['Gender'].eq('Male') & df['family_status'].eq('single')
df.loc[single_male_mask, 'Single_MF'] = 0

In [11]:
# Single_MF code 1: rows where the customer is female and family_status is 'single'
single_female_mask = df['Gender'].eq('Female') & df['family_status'].eq('single')
df.loc[single_female_mask, 'Single_MF'] = 1

In [12]:
#For 2=Other, doesn't fit rules above
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df['Single_MF'] selection: the in-place form is chained assignment, which
# raises a FutureWarning in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Single_MF'] = df['Single_MF'].fillna(2)

In [13]:
# Check the row count for each Single_MF code, including any NaN left unfilled
df['Single_MF'].value_counts(dropna = False)

Single_MF
2.0    25870154
0.0     2573256
1.0     2521154
Name: count, dtype: int64

In [14]:
# Rows per user_id among single males (Single_MF == 0); the length of the
# resulting Series is the number of distinct single-male customers
single_male_counts = df.loc[df['Single_MF'].eq(0)].groupby('user_id').size()

# Same per-user row counts for single females (Single_MF == 1)
single_female_counts = df.loc[df['Single_MF'].eq(1)].groupby('user_id').size()

# Show both Series, one after the other
print(single_male_counts, single_female_counts, sep='\n')

user_id
12         74
28        181
34         32
38        193
48        113
         ... 
206119     51
206158    194
206165    559
206171     23
206188     63
Length: 13587, dtype: int64
user_id
30         11
56        145
57         72
75        311
79         93
         ... 
206126    524
206146    236
206160    175
206175     49
206193    343
Length: 13309, dtype: int64


In [15]:
#13,587 single males (8.3% of the customer base) and 13,309 single females (8.2% of the base). Together, singles (male & female) make up approx. 16.5% of the overall customer base.

In [16]:
#Create a profile for Adult Parent

In [17]:
# Set 'Adult_Parent' to 1 for rows where the customer is married, has
# Age_Range code 1 and has at least one dependant
df.loc[(df['family_status'] == 'married') &
       (df['Age_Range'] == 1) &
       (df['total_dependants'] > 0), 'Adult_Parent'] = 1

# Every other row gets 0. The filled column is assigned back rather than using
# fillna(inplace=True) on the df['Adult_Parent'] selection: the in-place form is
# chained assignment, which raises a FutureWarning in pandas 2.x and leaves df
# unchanged under Copy-on-Write.
df['Adult_Parent'] = df['Adult_Parent'].fillna(0)

# Verify the count of each value in the 'Adult_Parent' column
df['Adult_Parent'].value_counts(dropna=False)

Adult_Parent
0.0    22633179
1.0     8331385
Name: count, dtype: int64

In [18]:
# Rows per user_id among Adult_Parent == 1; the length of the resulting Series
# is the number of distinct customers carrying the flag
unique_customers_with_condition = (
    df.loc[df['Adult_Parent'].eq(1)]
    .groupby('user_id')
    .size()
)

# Show the per-user row counts
print(unique_customers_with_condition)

user_id
1          59
3          88
4          18
7         206
10        143
         ... 
206190     54
206195     67
206197    181
206198     41
206206    285
Length: 43479, dtype: int64


In [19]:
#Adult Parents number 43,479 customers, which is approx. 27% of the total customer base.

In [20]:
#Create a profile for Married v Others

In [21]:
# Set 'Married' to 1 for rows where family_status is 'married'
df.loc[(df['family_status'] == 'married'), 'Married'] = 1

#For 0=Other, doesn't fit rules above
# The filled column is assigned back rather than using fillna(inplace=True) on
# the df['Married'] selection: the in-place form is chained assignment, which
# raises a FutureWarning in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['Married'] = df['Married'].fillna(0)

# Verify the count of each value in the 'Married' column
df['Married'].value_counts(dropna=False)

Married
1.0    21743711
0.0     9220853
Name: count, dtype: int64

In [22]:
# Rows per user_id among Married == 1; the length of the resulting Series is
# the number of distinct married customers
Married_condition = (
    df.loc[df['Married'].eq(1)]
    .groupby('user_id')
    .size()
)

# Show the per-user row counts for married customers
print(Married_condition)

user_id
1          59
2         195
3          88
4          18
7         206
         ... 
206203    119
206206    285
206207    223
206208    677
206209    129
Length: 114296, dtype: int64


In [None]:
#There are 114,296 Married customers, who make up approx. 70% of the overall customer base and 70% of overall revenue.

In [24]:
# Confirm the new profile columns were added (column count and dtypes)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30964564 entries, 0 to 32380882
Data columns (total 40 columns):
 #   Column                      Dtype   
---  ------                      -----   
 0   order_id                    int32   
 1   user_id                     int32   
 2   order_number                int8    
 3   orders_day_of_week          category
 4   order_hour_of_day           int8    
 5   days_since_prior_order      float16 
 6   product_id                  int32   
 7   add_to_cart_order           uint8   
 8   reordered                   int8    
 9   product_name                object  
 10  department_id               int8    
 11  prices                      float32 
 12  price_label                 object  
 13  busiest_days                object  
 14  max_order                   int8    
 15  loyalty_flag                object  
 16  mean_product_price          float32 
 17  spending_flag               object  
 18  median_days_between_orders  float16 
 19  ord

# 04 Export Data

In [25]:
# Save df, including the new profile columns, as instacart_excluded_rev.pkl in Prepared Data
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'instacart_excluded_rev.pkl')
df.to_pickle(export_file)

In [26]:
#4.10 Part 3 will include visualizations