In [None]:
# This notebook is to perform aggregated data analyses over the dataset and 
# to derive new flag columns based on different conditions

In [None]:
# Content List
#
# 01. Importing libraries (Task 1)
# 02. Importing data (Task 1)
# 03. Grouping data with pandas
# 03.01. Analyzing average number of orders per user for each department
# Aggregating data with agg()
# Revised analysis
# 03.02. Creating a loyalty flag using loc (Task 4)
# Aggregating data with transform()
# Deriving columns with loc()
# 04. Analyzing average number of orders per user for each department - Entire Dataframe (Task 2, 3)
# 04.01. Using the approach instructed in Exercise (Task 2, 3)
# 04.02. Using revised approach (Task 2, 3 - revised approach)
# 05. Analyzing loyalty categories (Task 5)
# 06. Creating spending flag (Task 6)
# 07. Creating frequency flag (Task 7)
# 08. Exporting data (Task 9)

# 01. Importing libraries (Task 1)

In [1]:
# Task 1: Import libraries

import pandas as pd
import numpy as np
import os

# 02. Importing data (Task 1)

In [2]:
# Task 1: Define default path
# The project root can be overridden through the INSTACART_PROJECT_PATH
# environment variable so the notebook can run on another machine; when the
# variable is not set, the original local folder is used unchanged.

path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'/Users/bladael/Documents/Learning/CareerFoundry_DA/Data Immersion/Achievement 4/06-2023 Instacart Basket Analysis',
)

In [3]:
# Task 1: Load the merged orders/products dataframe from the pickle stored
# under '02 Data/Prepared Data' in the project folder

ords_prods_merge = pd.read_pickle(
    os.path.join(
        path,
        '02 Data',
        'Prepared Data',
        'orders_products_merged_with_busy_analyses.pkl',
    )
)

In [12]:
# Working subset: the first one million rows (by position) of the merged dataframe

df = ords_prods_merge.iloc[:1000000]

In [13]:
# Check subset dimension
df.shape

(1000000, 17)

In [14]:
# Check subset head
df.head()

Unnamed: 0,order_id,user_id,order_sequence_number,order_days_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,Busiest_Days,busiest_period_of_day
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Normal day,Average orders
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest day,Average orders
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest day,Most orders
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest day,Average orders
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest day,Most orders


# 03. Grouping data with pandas

In [15]:
# Attempt to group subset by 'product_name'
# No aggregation is applied here, so the cell only displays the
# DataFrameGroupBy object itself rather than any grouped values

df.groupby('product_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x124ada890>

## 03.01. Analyzing average number of orders per user for each department

## Aggregating data with agg()

In [16]:
# Group the subset by 'department_id' and take the mean of
# 'order_sequence_number' within each group

df.groupby('department_id').agg(
    {'order_sequence_number': ['mean']}
)

Unnamed: 0_level_0,order_sequence_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
4,18.82578
7,17.472355
13,17.993423
14,19.246334
16,19.463012
17,11.294069
19,19.305237
20,17.599636


In [41]:
# Same aggregation without agg(): select the column from the grouped data
# and call mean() on it directly (returns a Series instead of a dataframe)

(
    df
    .groupby('department_id')['order_sequence_number']
    .mean()
)

department_id
4     18.825780
7     17.472355
13    17.993423
14    19.246334
16    19.463012
17    11.294069
19    19.305237
20    17.599636
Name: order_sequence_number, dtype: float64

In [43]:
# Apply several aggregate functions at once: mean, min and max of
# 'order_sequence_number' for each department

df.groupby('department_id').agg(
    {'order_sequence_number': ['mean', 'min', 'max']}
)

Unnamed: 0_level_0,order_sequence_number,order_sequence_number,order_sequence_number
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
4,18.82578,1,99
7,17.472355,1,99
13,17.993423,1,99
14,19.246334,1,99
16,19.463012,1,99
17,11.294069,1,98
19,19.305237,1,99
20,17.599636,1,99


CP: The Exercise refers to the result above as the "average number of orders per user for each department ID," but I believe this is misleading. The 'order_number' column (renamed 'order_sequence_number' in my dataframe) contains ordinal values, not counts, so it cannot meaningfully be used for this computation. The column simply records the 'n'th order that a customer placed. In order to really obtain the "average number of orders per user for each department ID", (1) count how many times each department appears in this dataframe and (2) divide those figures by the total number of users.

## Revised analysis

In [28]:
# Side tracking: my revised approach to obtain "the average number of orders per user for each department ID"
# Check dataframe info (columns, non-null counts and dtypes of the subset)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Data columns (total 17 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   order_id                1000000 non-null  int64  
 1   user_id                 1000000 non-null  int64  
 2   order_sequence_number   1000000 non-null  int64  
 3   order_days_of_week      1000000 non-null  int64  
 4   order_hour_of_day       1000000 non-null  int64  
 5   days_since_prior_order  1000000 non-null  float64
 6   product_id              1000000 non-null  int64  
 7   add_to_cart_order       1000000 non-null  int64  
 8   reordered               1000000 non-null  int64  
 9   product_name            1000000 non-null  object 
 10  aisle_id                1000000 non-null  int64  
 11  department_id           1000000 non-null  int64  
 12  prices                  1000000 non-null  float64
 13  price_range_loc         1000000 non-null  object 
 14  bus

In [29]:
# Side tracking: my revised approach to obtain "the average number of orders per user for each department ID"
# Count how many times each department appears in the subset (missing values included)
# NOTE(review): value_counts() tallies dataframe rows; the rows carry 'product_id' and
# 'add_to_cart_order', so these look like product line items rather than distinct orders - confirm

df['department_id'].value_counts(dropna = False)

4     611084
16    169624
20    108866
7      44710
19     38095
13     22656
14      3751
17      1214
Name: department_id, dtype: int64

In [34]:
# Side tracking: my revised approach to obtain "the average number of orders per user for each department ID"
# Number of distinct customer IDs in the subset

df['user_id'].nunique()

124698

In [35]:
# Side tracking: my revised approach to obtain "the average number of orders per user for each department ID"
# Convert the per-department value counts into a dataframe with the
# columns 'department_id' and 'order_counts'

df_dep_avg_ord = (
    df['department_id']
    .value_counts(dropna=False)
    .rename_axis('department_id')
    .reset_index(name='order_counts')
)



In [36]:
df_dep_avg_ord

Unnamed: 0,department_id,order_counts
0,4,611084
1,16,169624
2,20,108866
3,7,44710
4,19,38095
5,13,22656
6,14,3751
7,17,1214


In [37]:
# Side tracking: my revised approach to obtain "the average number of orders per user for each department ID"
# Divide each department's count by the number of distinct users in the subset
# and store the result in a new 'avg_order_per_user' column

n_users_subset = df['user_id'].nunique()
df_dep_avg_ord['avg_order_per_user'] = df_dep_avg_ord['order_counts'] / n_users_subset

In [38]:
# Side tracking: my revised approach to obtain "the average number of orders per user for each department ID"
# Result check: display the per-department counts with the new average column

df_dep_avg_ord

Unnamed: 0,department_id,order_counts,avg_order_per_user
0,4,611084,4.900512
1,16,169624,1.360278
2,20,108866,0.873037
3,7,44710,0.358546
4,19,38095,0.305498
5,13,22656,0.181687
6,14,3751,0.030081
7,17,1214,0.009736


## 03.02. Creating a loyalty flag using loc (Task 4)

## Aggregating data with transform()

In [48]:
# Create new column with the highest order number reached by each customer
# transform() repeats the per-user maximum on every row belonging to that user.
# The aggregation is given by name ('max') rather than as np.max: recent pandas
# versions emit a FutureWarning for NumPy callables and recommend the string
# alias to keep the current behaviour.

ords_prods_merge['max_order'] = ords_prods_merge.groupby(['user_id'])['order_sequence_number'].transform('max')

In [50]:
# Check head of the appended dataframe

ords_prods_merge.head(15)

Unnamed: 0,order_id,user_id,order_sequence_number,order_days_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,Busiest_Days,busiest_period_of_day,max_order
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Normal day,Average orders,10
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest day,Average orders,10
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest day,Most orders,10
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest day,Average orders,10
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest day,Most orders,10
5,3367565,1,6,2,7,19.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Normal day,Average orders,10
6,550135,1,7,1,9,20.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Busiest day,Most orders,10
7,3108588,1,8,1,14,14.0,196,2,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Busiest day,Most orders,10
8,2295261,1,9,1,16,0.0,196,4,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Busiest day,Most orders,10
9,2550362,1,10,4,8,30.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest day,Average orders,10


In [51]:
# Check head of the appended dataframe for the first 100 rows

ords_prods_merge.head(100)

Unnamed: 0,order_id,user_id,order_sequence_number,order_days_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,Busiest_Days,busiest_period_of_day,max_order
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Normal day,Average orders,10
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest day,Average orders,10
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest day,Most orders,10
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest day,Average orders,10
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest day,Most orders,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3226575,360,1,5,12,0.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Normal day,Most orders,3
96,1469869,377,3,5,17,3.0,196,9,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Normal day,Average orders,3
97,1927023,387,2,4,10,22.0,196,3,0,Soda,77,7,9.0,Mid-range product,Least busy,Slowest day,Most orders,8
98,858092,420,4,1,19,30.0,196,2,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Busiest day,Average orders,22


In [53]:
# Change pandas display option as the code above did not display full 100 rows
# A finite limit of 100 is enough for head(100) to be shown without truncation.
# None (no limit at all) is avoided on purpose: with the full multi-million-row
# dataframe in memory, accidentally displaying it would try to render every row.

pd.options.display.max_rows = 100

In [54]:
# Re-run head of the appended dataframe for the first 100 rows

ords_prods_merge.head(100)

Unnamed: 0,order_id,user_id,order_sequence_number,order_days_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,Busiest_Days,busiest_period_of_day,max_order
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Normal day,Average orders,10
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest day,Average orders,10
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest day,Most orders,10
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest day,Average orders,10
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest day,Most orders,10
5,3367565,1,6,2,7,19.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Normal day,Average orders,10
6,550135,1,7,1,9,20.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Busiest day,Most orders,10
7,3108588,1,8,1,14,14.0,196,2,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Busiest day,Most orders,10
8,2295261,1,9,1,16,0.0,196,4,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Busiest day,Most orders,10
9,2550362,1,10,4,8,30.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest day,Average orders,10


## Deriving columns with loc()

In [63]:
# Customers whose highest order number is above 40 are flagged as loyal
loyal_mask = ords_prods_merge['max_order'].gt(40)
ords_prods_merge.loc[loyal_mask, 'loyalty_flag'] = 'Loyal customer'

In [64]:
# Highest order number above 10 and up to (and including) 40 -> regular customer
regular_loyalty_mask = ords_prods_merge['max_order'].gt(10) & ords_prods_merge['max_order'].le(40)
ords_prods_merge.loc[regular_loyalty_mask, 'loyalty_flag'] = 'Regular customer'

In [67]:
# Highest order number of 10 or less -> new customer
new_customer_mask = ords_prods_merge['max_order'].le(10)
ords_prods_merge.loc[new_customer_mask, 'loyalty_flag'] = 'New customer'

In [68]:
# Check appended dataframe head

ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_sequence_number,order_days_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,Busiest_Days,busiest_period_of_day,max_order,loyalty_flag
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Normal day,Average orders,10,New customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest day,Average orders,10,New customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest day,Most orders,10,New customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest day,Average orders,10,New customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest day,Most orders,10,New customer


In [66]:
# Loyalty flag value counts
# Counts dataframe rows per flag (not distinct customers); dropna = False would
# also list rows that received no flag

ords_prods_merge['loyalty_flag'].value_counts(dropna = False)

Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: loyalty_flag, dtype: int64

In [71]:
# Inspect the first 60 rows, restricted to the user, loyalty flag and
# order number columns

loyalty_check_cols = ['user_id', 'loyalty_flag', 'order_sequence_number']
ords_prods_merge[loyalty_check_cols].head(60)

Unnamed: 0,user_id,loyalty_flag,order_sequence_number
0,1,New customer,1
1,1,New customer,2
2,1,New customer,3
3,1,New customer,4
4,1,New customer,5
5,1,New customer,6
6,1,New customer,7
7,1,New customer,8
8,1,New customer,9
9,1,New customer,10


 # 04. Analyzing average number of orders per user for each department - Entire Dataframe (Task 2, 3)

## 04.01. Using the approach instructed in Exercise (Task 2, 3)

In [74]:
# Task 3: Mean of 'order_sequence_number' per department, computed with agg()
# on the entire dataframe

ords_prods_merge.groupby('department_id').agg(
    {'order_sequence_number': ['mean']}
)

Unnamed: 0_level_0,order_sequence_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
1,15.457838
2,17.27792
3,17.170395
4,17.811403
5,15.215751
6,16.439806
7,17.225802
8,15.34065
9,15.895474
10,20.197148


In [75]:
# Task 3: Repeat the same aggregation on the one-million-row subset so the
# two results can be compared

df.groupby('department_id').agg(
    {'order_sequence_number': ['mean']}
)

Unnamed: 0_level_0,order_sequence_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
4,18.82578
7,17.472355
13,17.993423
14,19.246334
16,19.463012
17,11.294069
19,19.305237
20,17.599636


CP: Unlike the analysis of the first 1 mil rows, where Department 16 had the highest number and Department 17 had the lowest number, the full data shows Department 10 with the highest and Department 5 with the lowest numbers. I still don't agree that the figures computed here are the average number of orders per user for each department. Refer to the steps below for the result based on my revised approach.

## 04.02. Using revised approach (Task 2, 3 - revised approach)

In [76]:
# Task 2: Check dataframe info

ords_prods_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 19 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_sequence_number   int64  
 3   order_days_of_week      int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
 6   product_id              int64  
 7   add_to_cart_order       int64  
 8   reordered               int64  
 9   product_name            object 
 10  aisle_id                int64  
 11  department_id           int64  
 12  prices                  float64
 13  price_range_loc         object 
 14  busiest_day             object 
 15  Busiest_Days            object 
 16  busiest_period_of_day   object 
 17  max_order               int64  
 18  loyalty_flag            object 
dtypes: float64(2), int64(11), object(6)
memory usage: 4.8+ GB


In [77]:
# Task 2: Count how many times each department appears in the full dataframe (missing values included)
# NOTE(review): value_counts() tallies dataframe rows; the rows carry 'product_id' and
# 'add_to_cart_order', so these look like product line items rather than distinct orders - confirm

ords_prods_merge['department_id'].value_counts(dropna = False)

4     9479291
16    5398747
19    2887550
7     2688123
1     2234743
13    1875369
3     1172428
15    1068058
20    1051249
9      866627
17     738666
12     708927
14     703033
11     447572
18     423802
6      269253
5      153696
8       97716
21      69145
2       36291
10      34573
Name: department_id, dtype: int64

In [78]:
# Task 2: Number of distinct customer IDs in the full dataframe

ords_prods_merge['user_id'].nunique()

206209

In [79]:
# Task 2: Convert the per-department value counts of the full dataframe into
# a dataframe with the columns 'department_id' and 'order_counts'

df_dep_avg_ord_all = (
    ords_prods_merge['department_id']
    .value_counts(dropna=False)
    .rename_axis('department_id')
    .reset_index(name='order_counts')
)


In [80]:
df_dep_avg_ord_all

Unnamed: 0,department_id,order_counts
0,4,9479291
1,16,5398747
2,19,2887550
3,7,2688123
4,1,2234743
5,13,1875369
6,3,1172428
7,15,1068058
8,20,1051249
9,9,866627


In [82]:
# Task 2: Divide each department's count by the number of distinct users in
# the full dataframe and store the result in 'avg_order_per_user'

n_users_all = ords_prods_merge['user_id'].nunique()
df_dep_avg_ord_all['avg_order_per_user'] = df_dep_avg_ord_all['order_counts'] / n_users_all

In [83]:
# Task 2: Result check 

df_dep_avg_ord_all

Unnamed: 0,department_id,order_counts,avg_order_per_user
0,4,9479291,45.969337
1,16,5398747,26.180947
2,19,2887550,14.003026
3,7,2688123,13.035915
4,1,2234743,10.837272
5,13,1875369,9.094506
6,3,1172428,5.68563
7,15,1068058,5.179493
8,20,1051249,5.097978
9,9,866627,4.202663


In [84]:
# Task 3: Review the same analysis result for the first 1 mil rows only for comparison

df_dep_avg_ord

Unnamed: 0,department_id,order_counts,avg_order_per_user
0,4,611084,4.900512
1,16,169624,1.360278
2,20,108866,0.873037
3,7,44710,0.358546
4,19,38095,0.305498
5,13,22656,0.181687
6,14,3751,0.030081
7,17,1214,0.009736


CP: Unlike the analysis of the first 1 mil rows, where Department 17 had the lowest average number of orders per user, the full data shows that Department 10 had the lowest average number of orders per user; however, the department with the highest number of orders per user is the same in the limited dataframe and the full dataframe: Department 4.

In [119]:
# Task 3: Read the original departments file and transpose it so the
# department IDs can be looked up against their names

departments_csv = os.path.join(path, '02 Data', 'Original Data', 'departments.csv')
df_dep = pd.read_csv(departments_csv).transpose()

In [120]:
df_dep

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


# 05. Analyzing loyalty categories (Task 5)

In [86]:
# Mean, minimum and maximum of 'prices' for each loyalty category

ords_prods_merge.groupby('loyalty_flag').agg(
    {'prices': ['mean', 'min', 'max']}
)

Unnamed: 0_level_0,prices,prices,prices
Unnamed: 0_level_1,mean,min,max
loyalty_flag,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Loyal customer,10.386336,1.0,99999.0
New customer,13.29467,1.0,99999.0
Regular customer,12.495717,1.0,99999.0


In [None]:
# The maximum price above is $99,999 in every loyalty category, which looks abnormal:
# these are groceries, so no product should be priced anywhere near that high.
# Select the rows carrying a price of $99,999 to inspect them

price_is_99999 = ords_prods_merge['prices'] == 99999
ords_prods_merge[price_is_99999]

Unnamed: 0,order_id,user_id,order_sequence_number,order_days_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,Busiest_Days,busiest_period_of_day,max_order,loyalty_flag
29165516,183964,873,3,0,10,7.0,33664,11,0,2 % Reduced Fat Milk,84,16,99999.0,High-range product,Busiest day,Busiest day,Most orders,8,New customer
29165517,1851256,873,4,6,12,13.0,33664,8,1,2 % Reduced Fat Milk,84,16,99999.0,High-range product,Regularly busy,Normal day,Most orders,8,New customer
29165518,1915696,1893,1,5,17,0.0,33664,10,0,2 % Reduced Fat Milk,84,16,99999.0,High-range product,Regularly busy,Normal day,Average orders,6,New customer
29165519,2763293,1893,2,4,16,13.0,33664,6,1,2 % Reduced Fat Milk,84,16,99999.0,High-range product,Least busy,Slowest day,Most orders,6,New customer
29165520,2564805,1893,4,1,17,30.0,33664,3,1,2 % Reduced Fat Milk,84,16,99999.0,High-range product,Regularly busy,Busiest day,Average orders,6,New customer
29165521,949012,3339,1,1,14,0.0,33664,8,0,2 % Reduced Fat Milk,84,16,99999.0,High-range product,Regularly busy,Busiest day,Most orders,6,New customer
29165522,420057,3339,2,0,11,13.0,33664,29,1,2 % Reduced Fat Milk,84,16,99999.0,High-range product,Busiest day,Busiest day,Most orders,6,New customer
29165523,1511997,3339,3,1,18,8.0,33664,14,1,2 % Reduced Fat Milk,84,16,99999.0,High-range product,Regularly busy,Busiest day,Average orders,6,New customer
29165524,1153448,3339,4,4,16,10.0,33664,1,1,2 % Reduced Fat Milk,84,16,99999.0,High-range product,Least busy,Slowest day,Most orders,6,New customer
29165525,2669259,3339,5,4,14,21.0,33664,6,1,2 % Reduced Fat Milk,84,16,99999.0,High-range product,Least busy,Slowest day,Most orders,6,New customer


CP: 2 % Reduced Fat Milk appears to be priced at $99,999. Not sure whether this is an error caused during data manipulation or whether original data contains this error.

In [88]:
# Load the original 'products' data so the suspicious price can be verified
# against its source

products_csv = os.path.join(path, '02 Data', 'Original Data', 'products.csv')
df_prods = pd.read_csv(products_csv)

In [93]:
# Look up product_id 33664 ('2 % Reduced Fat Milk') in the original products
# data to see how it is priced there

df_prods.loc[df_prods['product_id'] == 33664]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33666,33664,2 % Reduced Fat Milk,84,16,99999.0


CP: It appears that the original data contains a pricing error. The data will still be used for analysis, as the true pricing information cannot be obtained for this product; however, I will make a note of this error as a disclaimer for future analyses.

# 06. Creating spending flag (Task 6)

In [94]:
# Task 6: Create different "spending flags for each user based on the average price across all their orders"
# Steps: (1) Obtain the spending total by user_id (2) Obtain the total number of unique orders by user_id, (3) divide (1) by (2)

# Step (1)
# transform() repeats each user's summed 'prices' on every row of that user.
# The aggregation is given by name ('sum') rather than as np.sum: recent pandas
# versions emit a FutureWarning for NumPy callables and recommend the string alias.
# NOTE(review): the rows priced at 99999 examined in section 05 are still in the
# dataframe, so they are included in these totals.

ords_prods_merge['total_ord_spend'] = ords_prods_merge.groupby(['user_id'])['prices'].transform('sum')

In [96]:
# Task 6: Create different "spending flags for each user based on the average price across all their orders"
# Steps: (1) Obtain the spending total by user_id (2) Obtain the total number of unique orders by user_id, (3) divide (1) by (2)

# Step (2): number of distinct order_id values per user, repeated on each of the user's rows

orders_by_user = ords_prods_merge.groupby(['user_id'])['order_id']
ords_prods_merge['total_ord_count'] = orders_by_user.transform('nunique')

In [98]:
# Task 6: Create different "spending flags for each user based on the average price across all their orders"
# Steps: (1) Obtain the spending total by user_id (2) Obtain the total number of unique orders by user_id, (3) divide (1) by (2)

# Step (3): total spend divided by the number of distinct orders
# NOTE(review): this is the average spend per order; the task wording ("average price
# across all their orders") may instead intend the mean item price - confirm

ords_prods_merge['avg_ord_spend'] = ords_prods_merge['total_ord_spend'].div(
    ords_prods_merge['total_ord_count']
)

In [100]:
# Task 6: Spot-check the three new columns by displaying every row of user_id 1

ords_prods_merge.loc[ords_prods_merge['user_id'] == 1]

Unnamed: 0,order_id,user_id,order_sequence_number,order_days_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,prices,price_range_loc,busiest_day,Busiest_Days,busiest_period_of_day,max_order,loyalty_flag,total_ord_spend,total_ord_count,avg_ord_spend
0,2539329,1,1,2,8,0.0,196,1,0,Soda,...,9.0,Mid-range product,Regularly busy,Normal day,Average orders,10,New customer,375.7,10,37.57
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,9.0,Mid-range product,Regularly busy,Slowest day,Average orders,10,New customer,375.7,10,37.57
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,9.0,Mid-range product,Regularly busy,Slowest day,Most orders,10,New customer,375.7,10,37.57
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,9.0,Mid-range product,Least busy,Slowest day,Average orders,10,New customer,375.7,10,37.57
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,9.0,Mid-range product,Least busy,Slowest day,Most orders,10,New customer,375.7,10,37.57
5,3367565,1,6,2,7,19.0,196,1,1,Soda,...,9.0,Mid-range product,Regularly busy,Normal day,Average orders,10,New customer,375.7,10,37.57
6,550135,1,7,1,9,20.0,196,1,1,Soda,...,9.0,Mid-range product,Regularly busy,Busiest day,Most orders,10,New customer,375.7,10,37.57
7,3108588,1,8,1,14,14.0,196,2,1,Soda,...,9.0,Mid-range product,Regularly busy,Busiest day,Most orders,10,New customer,375.7,10,37.57
8,2295261,1,9,1,16,0.0,196,4,1,Soda,...,9.0,Mid-range product,Regularly busy,Busiest day,Most orders,10,New customer,375.7,10,37.57
9,2550362,1,10,4,8,30.0,196,1,1,Soda,...,9.0,Mid-range product,Least busy,Slowest day,Average orders,10,New customer,375.7,10,37.57


In [104]:
# Task 6: The sample check of one user_id suggests the logic was applied correctly
# Create the spending flag with loc(): average spend per order below 10 -> low spender

low_spender_mask = ords_prods_merge['avg_ord_spend'].lt(10)
ords_prods_merge.loc[low_spender_mask, 'spending_flag'] = 'Low spender'

In [105]:
# Task 6: Continue the spending flag with loc(): average spend per order of
# 10 or more -> high spender

high_spender_mask = ords_prods_merge['avg_ord_spend'].ge(10)
ords_prods_merge.loc[high_spender_mask, 'spending_flag'] = 'High spender'

In [106]:
# Task 6: Check head to verify the new column creation

ords_prods_merge.head(100)

Unnamed: 0,order_id,user_id,order_sequence_number,order_days_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,price_range_loc,busiest_day,Busiest_Days,busiest_period_of_day,max_order,loyalty_flag,total_ord_spend,total_ord_count,avg_ord_spend,spending_flag
0,2539329,1,1,2,8,0.0,196,1,0,Soda,...,Mid-range product,Regularly busy,Normal day,Average orders,10,New customer,375.7,10,37.57,High spender
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Mid-range product,Regularly busy,Slowest day,Average orders,10,New customer,375.7,10,37.57,High spender
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Mid-range product,Regularly busy,Slowest day,Most orders,10,New customer,375.7,10,37.57,High spender
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Mid-range product,Least busy,Slowest day,Average orders,10,New customer,375.7,10,37.57,High spender
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Mid-range product,Least busy,Slowest day,Most orders,10,New customer,375.7,10,37.57,High spender
5,3367565,1,6,2,7,19.0,196,1,1,Soda,...,Mid-range product,Regularly busy,Normal day,Average orders,10,New customer,375.7,10,37.57,High spender
6,550135,1,7,1,9,20.0,196,1,1,Soda,...,Mid-range product,Regularly busy,Busiest day,Most orders,10,New customer,375.7,10,37.57,High spender
7,3108588,1,8,1,14,14.0,196,2,1,Soda,...,Mid-range product,Regularly busy,Busiest day,Most orders,10,New customer,375.7,10,37.57,High spender
8,2295261,1,9,1,16,0.0,196,4,1,Soda,...,Mid-range product,Regularly busy,Busiest day,Most orders,10,New customer,375.7,10,37.57,High spender
9,2550362,1,10,4,8,30.0,196,1,1,Soda,...,Mid-range product,Least busy,Slowest day,Average orders,10,New customer,375.7,10,37.57,High spender


In [107]:
# Task 6: Value counts for additional check
# Counts dataframe rows per flag (not distinct customers); dropna = False would
# also list rows that received no flag

ords_prods_merge['spending_flag'].value_counts(dropna = False)

High spender    32369132
Low spender        35727
Name: spending_flag, dtype: int64

# 07. Creating frequency flag (Task 7)

In [109]:
# Task 7: Calculate each user's median value of "days_since_prior_order"
# transform() repeats the per-user median on every row of that user.
# The aggregation is given by name ('median') rather than as np.median: recent
# pandas versions emit a FutureWarning for NumPy callables and recommend the
# string alias to keep the current behaviour.

ords_prods_merge['median_frequency'] = ords_prods_merge.groupby(['user_id'])['days_since_prior_order'].transform('median')

In [110]:
# Task 7: Check head to verify the new column creation

ords_prods_merge.head(100)

Unnamed: 0,order_id,user_id,order_sequence_number,order_days_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,busiest_day,Busiest_Days,busiest_period_of_day,max_order,loyalty_flag,total_ord_spend,total_ord_count,avg_ord_spend,spending_flag,median_frequency
0,2539329,1,1,2,8,0.0,196,1,0,Soda,...,Regularly busy,Normal day,Average orders,10,New customer,375.7,10,37.57,High spender,20.0
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Regularly busy,Slowest day,Average orders,10,New customer,375.7,10,37.57,High spender,20.0
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Regularly busy,Slowest day,Most orders,10,New customer,375.7,10,37.57,High spender,20.0
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Least busy,Slowest day,Average orders,10,New customer,375.7,10,37.57,High spender,20.0
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Least busy,Slowest day,Most orders,10,New customer,375.7,10,37.57,High spender,20.0
5,3367565,1,6,2,7,19.0,196,1,1,Soda,...,Regularly busy,Normal day,Average orders,10,New customer,375.7,10,37.57,High spender,20.0
6,550135,1,7,1,9,20.0,196,1,1,Soda,...,Regularly busy,Busiest day,Most orders,10,New customer,375.7,10,37.57,High spender,20.0
7,3108588,1,8,1,14,14.0,196,2,1,Soda,...,Regularly busy,Busiest day,Most orders,10,New customer,375.7,10,37.57,High spender,20.0
8,2295261,1,9,1,16,0.0,196,4,1,Soda,...,Regularly busy,Busiest day,Most orders,10,New customer,375.7,10,37.57,High spender,20.0
9,2550362,1,10,4,8,30.0,196,1,1,Soda,...,Least busy,Slowest day,Average orders,10,New customer,375.7,10,37.57,High spender,20.0


In [112]:
# Task 7: The sample check of one user_id suggests the logic was applied correctly
# Create the frequency flag with loc(): median above 20 days -> non-frequent customer

non_frequent_mask = ords_prods_merge['median_frequency'].gt(20)
ords_prods_merge.loc[non_frequent_mask, 'frequency_flag'] = 'Non-frequent customer'

In [113]:
# Task 7: Frequency flag with loc(): median above 10 and up to (and including)
# 20 days -> regular customer

regular_frequency_mask = ords_prods_merge['median_frequency'].le(20) & ords_prods_merge['median_frequency'].gt(10)
ords_prods_merge.loc[regular_frequency_mask, 'frequency_flag'] = 'Regular customer'

In [114]:
# Task 7: Frequency flag with loc(): median of 10 days or less -> frequent customer

frequent_mask = ords_prods_merge['median_frequency'].le(10)
ords_prods_merge.loc[frequent_mask, 'frequency_flag'] = 'Frequent customer'

In [115]:
# Task 7: Check head to verify the new column creation

ords_prods_merge.head(100)

Unnamed: 0,order_id,user_id,order_sequence_number,order_days_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,Busiest_Days,busiest_period_of_day,max_order,loyalty_flag,total_ord_spend,total_ord_count,avg_ord_spend,spending_flag,median_frequency,frequency_flag
0,2539329,1,1,2,8,0.0,196,1,0,Soda,...,Normal day,Average orders,10,New customer,375.7,10,37.57,High spender,20.0,Regular customer
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Slowest day,Average orders,10,New customer,375.7,10,37.57,High spender,20.0,Regular customer
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Slowest day,Most orders,10,New customer,375.7,10,37.57,High spender,20.0,Regular customer
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Slowest day,Average orders,10,New customer,375.7,10,37.57,High spender,20.0,Regular customer
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Slowest day,Most orders,10,New customer,375.7,10,37.57,High spender,20.0,Regular customer
5,3367565,1,6,2,7,19.0,196,1,1,Soda,...,Normal day,Average orders,10,New customer,375.7,10,37.57,High spender,20.0,Regular customer
6,550135,1,7,1,9,20.0,196,1,1,Soda,...,Busiest day,Most orders,10,New customer,375.7,10,37.57,High spender,20.0,Regular customer
7,3108588,1,8,1,14,14.0,196,2,1,Soda,...,Busiest day,Most orders,10,New customer,375.7,10,37.57,High spender,20.0,Regular customer
8,2295261,1,9,1,16,0.0,196,4,1,Soda,...,Busiest day,Most orders,10,New customer,375.7,10,37.57,High spender,20.0,Regular customer
9,2550362,1,10,4,8,30.0,196,1,1,Soda,...,Slowest day,Average orders,10,New customer,375.7,10,37.57,High spender,20.0,Regular customer


In [125]:
# Task 7: Number of distinct user_id values within each frequency flag

(
    ords_prods_merge
    .groupby('frequency_flag')['user_id']
    .nunique(dropna=False)
)

frequency_flag
Frequent customer        103446
Non-frequent customer     43279
Regular customer          59484
Name: user_id, dtype: int64

In [130]:
# Task 7: Check total unique user_id for verification
# The per-flag user counts displayed above should add up to this total

ords_prods_merge['user_id'].nunique(dropna = False)

206209

# 08. Exporting data (Task 9)

In [116]:
# Save the dataframe, including the new aggregate and flag columns, as a
# pickle under '02 Data/Prepared Data'

export_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_aggregate_data_analyses.pkl')
ords_prods_merge.to_pickle(export_file)