### 01. Environment Setup

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
#Project root folder. The location can be overridden with the INSTACART_PROJECT_DIR
#environment variable so the notebook also runs on machines where the default
#(user-specific) folder below does not exist; without the variable the default is used.
path = os.environ.get('INSTACART_PROJECT_DIR', r'/Users/Cel/Documents/Data Analytics/09-2023 Instacart Basket Analysis')

In [9]:
#Load the pickled dataframe 'orders_products_derived_columns.pkl' from the Prepared Data folder
#NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source
ords_prods_merged = pd.read_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_derived_columns.pkl')
)

In [10]:
#Take the first 1,000,000 rows (by position) as a smaller working subset
df = ords_prods_merged.iloc[:1_000_000]

In [11]:
#Check shape (rows, columns) of the subset
df.shape

(1000000, 18)

In [12]:
#Preview the first rows to check which columns are available
df.head()

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_period_of_day,busiest_day,two_busiest_days
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,Mid-range product,Average orders,Regularly busy,Regularly busy
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Average orders,Regularly busy,Least busy days
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Most orders,Regularly busy,Least busy days
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Average orders,Least busy,Least busy days
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Most orders,Least busy,Least busy days


### 02. Single Aggregations

In [13]:
#Group the subset by product_name; with no aggregation applied, the cell only
#displays the resulting DataFrameGroupBy object rather than any computed values
df.groupby('product_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x195037a50>

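Calling `groupby()` on its own only returns a `DataFrameGroupBy` object; an aggregation (for example `agg()`) has to be applied to the groups to produce a result.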

In [15]:
#Split the data into department_id groups, then compute the mean of
#number_of_orders within each group
(
    df
    .groupby('department_id')
    .agg({'number_of_orders': ['mean']})
)

Unnamed: 0_level_0,number_of_orders
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
4,18.82578
7,17.472355
13,17.993423
14,19.246334
16,19.463012
17,11.294069
19,19.305237
20,17.599636


### 03. Multiple Aggregations

In [16]:
#Several statistics can be requested at once by listing them for the column:
#here the mean, minimum and maximum of number_of_orders per department_id
(
    df
    .groupby('department_id')
    .agg({'number_of_orders': ['mean', 'min', 'max']})
)

Unnamed: 0_level_0,number_of_orders,number_of_orders,number_of_orders
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
4,18.82578,1,99
7,17.472355,1,99
13,17.993423,1,99
14,19.246334,1,99
16,19.463012,1,99
17,11.294069,1,98
19,19.305237,1,99
20,17.599636,1,99


### 04. Aggregate Data with transform() function

In [19]:
#Create new column 'max_order' holding the maximum 'number_of_orders' value per user_id.
#transform() returns one value for every original row, so the result lines up with the dataframe.
#The string alias 'max' is used instead of np.max: passing NumPy reduction callables to
#groupby transform/agg is deprecated in pandas (FutureWarning since 2.1) and 'max' keeps the
#current behaviour.
ords_prods_merged['max_order'] = ords_prods_merged.groupby(['user_id'])['number_of_orders'].transform('max')

In [22]:
#Preview the first 100 rows to check the 'max_order' column
ords_prods_merged.head(100)

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_period_of_day,busiest_day,two_busiest_days,max_order
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,Mid-range product,Average orders,Regularly busy,Regularly busy,10
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Average orders,Regularly busy,Least busy days,10
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Most orders,Regularly busy,Least busy days,10
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Average orders,Least busy,Least busy days,10
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Most orders,Least busy,Least busy days,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,3226575,360,1,5,12,,196,1,0,Soda,77,7,9.0,both,Mid-range product,Most orders,Regularly busy,Regularly busy,3
96,1469869,377,3,5,17,3.0,196,9,0,Soda,77,7,9.0,both,Mid-range product,Average orders,Regularly busy,Regularly busy,3
97,1927023,387,2,4,10,22.0,196,3,0,Soda,77,7,9.0,both,Mid-range product,Most orders,Least busy,Least busy days,8
98,858092,420,4,1,19,30.0,196,2,0,Soda,77,7,9.0,both,Mid-range product,Average orders,Regularly busy,Busiest days,22


In [23]:
#Lift the pandas row-display limit so that long previews (such as head(100)) are shown
#in full instead of being truncated; this applies to every output displayed afterwards
pd.set_option('display.max_rows', None)

### 05. Derive new column from 'max_order'

In [28]:
#Assign a loyalty value based on 'max_order': more than 40 -> 'Loyal customer'
ords_prods_merged.loc[ords_prods_merged['max_order'].gt(40), 'loyalty_flag'] = 'Loyal customer'

In [26]:
#'max_order' above 10 and up to (and including) 40 -> 'Regular customer'
ords_prods_merged.loc[ords_prods_merged['max_order'].between(10, 40, inclusive='right'), 'loyalty_flag'] = 'Regular customer'

In [27]:
#'max_order' of 10 or less -> 'New customer'
ords_prods_merged.loc[ords_prods_merged['max_order'].le(10), 'loyalty_flag'] = 'New customer'

In [31]:
#Check frequency counts of each loyalty_flag value
#(value_counts() leaves out missing values by default, so unflagged rows would not be listed)
ords_prods_merged['loyalty_flag'].value_counts()

loyalty_flag
Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: count, dtype: int64

In [39]:
#Display the first 60 rows of the user_id, loyalty_flag and number_of_orders columns
#to spot-check the assigned flags
ords_prods_merged[['user_id', 'loyalty_flag', 'number_of_orders']].head(60)

Unnamed: 0,user_id,loyalty_flag,number_of_orders
0,1,New customer,1
1,1,New customer,2
2,1,New customer,3
3,1,New customer,4
4,1,New customer,5
5,1,New customer,6
6,1,New customer,7
7,1,New customer,8
8,1,New customer,9
9,1,New customer,10


### 06. Export file

In [37]:
#Export ords_prods_merged (now including 'max_order' and 'loyalty_flag') to pickle
ords_prods_merged.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_grouped_data.pkl')
)