## 4.8 Grouping Data & Aggregating Variables

### This script contains the following points:
#### 1. Import libraries and data
#### 2. Grouping Data
#### 3. Grouping Data w/ Pandas
#### 4. Aggregating Data w/ agg()
#### 5. Aggregating Data w/ transform()
#### 6. Deriving Columns w/ loc()
#### 7. Task

# 1. Import Libraries & Data

In [1]:
#import libraries
import pandas as pd
import numpy as np
import os

In [2]:
#create data path
#the project root can be overridden with the INSTACART_PROJECT_PATH environment variable
#so the notebook is not tied to one machine; when the variable is unset the original
#local folder is used, so existing behaviour is unchanged
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\fa_an\OneDrive\CareerFoundry\Tasks\Data Analytics Immersion\Tasks 4.1-4.10\02_2024 Instacart Basket Analysis'
)

In [3]:
#import data - read the prepared ords_prods_merge_derived.pkl into ords_prods_merge
ords_prods_merge = pd.read_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge_derived.pkl')
)

In [4]:
#create a subset: the first 1,000,000 rows of ords_prods_merge, taken by position
df = ords_prods_merge.iloc[:1_000_000]

In [5]:
#check shape of the subset as a (rows, columns) tuple
df.shape

(1000000, 19)

In [6]:
#preview the first five rows of the subset
df.head(n=5)

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_or_repeat_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_label,busiest_day,busiest_days,busiest_period_of_day
0,2539329,1,prior,1,2,8,,First Order,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average Orders
1,2398795,1,prior,2,3,7,15.0,Repeat Order,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest days,Average Orders
2,473747,1,prior,3,3,12,21.0,Repeat Order,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest days,Most Orders
3,2254736,1,prior,4,4,7,29.0,Repeat Order,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest days,Average Orders
4,431534,1,prior,5,4,15,28.0,Repeat Order,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest days,Most Orders


# 3. Grouping Data w/ Pandas

In [7]:
#use the groupby() function on product_name; with no aggregation applied,
#the cell only displays the DataFrameGroupBy object itself
df.groupby(by='product_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000017ED5944610>

# 4. Aggregating Data w/ agg()

In [8]:
#performing a single aggregation with agg(): mean order_number per department_id
df.groupby('department_id')[['order_number']].agg(['mean'])

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
4,18.82578
7,17.472355
13,17.993423
14,19.246334
16,19.463012
17,11.294069
19,19.305237
20,17.599636


In [9]:
#aggregating without the use of agg(): select the column with square brackets
#and call mean() directly on the grouped column (returns a Series)
df.groupby('department_id')['order_number'].mean()

department_id
4     18.825780
7     17.472355
13    17.993423
14    19.246334
16    19.463012
17    11.294069
19    19.305237
20    17.599636
Name: order_number, dtype: float64

In [10]:
#dot notation vs square brackets: aggregating w/out the use of agg()
#the column is selected as an attribute (.order_number) instead of ['order_number']
df.groupby('department_id').order_number.mean()

department_id
4     18.825780
7     17.472355
13    17.993423
14    19.246334
16    19.463012
17    11.294069
19    19.305237
20    17.599636
Name: order_number, dtype: float64

In [11]:
#performing multiple aggregations with agg(): mean, min and max of order_number per department_id
df.groupby('department_id')[['order_number']].agg(['mean', 'min', 'max'])

Unnamed: 0_level_0,order_number,order_number,order_number
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
4,18.82578,1,99
7,17.472355,1,99
13,17.993423,1,99
14,19.246334,1,99
16,19.463012,1,99
17,11.294069,1,98
19,19.305237,1,99
20,17.599636,1,99


# 5. Aggregating Data w/ transform()

In [12]:
#create a customer loyalty flag: first derive max_order with transform(), then assign the flag with loc

In [20]:
#max orders per user: group the rows by 'user_id' and write each group's
#highest 'order_number' back onto every row of that user
ords_prods_merge['max_order'] = (
    ords_prods_merge
    .groupby('user_id')['order_number']
    .transform('max')
)

In [14]:
#preview the first five rows, now including the max_order column
ords_prods_merge.head(n=5)

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_or_repeat_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_label,busiest_day,busiest_days,busiest_period_of_day,max_order
0,2539329,1,prior,1,2,8,,First Order,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average Orders,10
1,2398795,1,prior,2,3,7,15.0,Repeat Order,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest days,Average Orders,10
2,473747,1,prior,3,3,12,21.0,Repeat Order,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest days,Most Orders,10
3,2254736,1,prior,4,4,7,29.0,Repeat Order,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest days,Average Orders,10
4,431534,1,prior,5,4,15,28.0,Repeat Order,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest days,Most Orders,10


In [19]:
#check values of the new max_order column across the first 100 rows
ords_prods_merge.head(n=100)

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_or_repeat_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_label,busiest_day,busiest_days,busiest_period_of_day,max_order
0,2539329,1,prior,1,2,8,,First Order,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average Orders,10
1,2398795,1,prior,2,3,7,15.0,Repeat Order,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest days,Average Orders,10
2,473747,1,prior,3,3,12,21.0,Repeat Order,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest days,Most Orders,10
3,2254736,1,prior,4,4,7,29.0,Repeat Order,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest days,Average Orders,10
4,431534,1,prior,5,4,15,28.0,Repeat Order,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest days,Most Orders,10
5,3367565,1,prior,6,2,7,19.0,Repeat Order,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Regularly busy,Average Orders,10
6,550135,1,prior,7,1,9,20.0,Repeat Order,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Busiest days,Most Orders,10
7,3108588,1,prior,8,1,14,14.0,Repeat Order,196,2,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Busiest days,Most Orders,10
8,2295261,1,prior,9,1,16,0.0,Repeat Order,196,4,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Busiest days,Most Orders,10
9,2550362,1,prior,10,4,8,30.0,Repeat Order,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest days,Average Orders,10


# 6. Deriving Columns w/ loc()

In [21]:
#create a "loyalty" flag based on the max_order column:
#rows whose max_order is greater than 40 are labelled 'Loyal Customer'
ords_prods_merge.loc[ords_prods_merge['max_order'].gt(40), 'loyalty_flag'] = 'Loyal Customer'

In [22]:
#rows whose max_order is greater than 10 and at most 40 are labelled 'Regular Customer'
ords_prods_merge.loc[ords_prods_merge['max_order'].between(10, 40, inclusive='right'), 'loyalty_flag'] = 'Regular Customer'


In [24]:
#rows whose max_order is 10 or fewer are labelled 'New Customer'
ords_prods_merge.loc[ords_prods_merge['max_order'].le(10), 'loyalty_flag'] = 'New Customer'

In [25]:
#check frequency of new column 'loyalty_flag'; dropna=False also counts rows left without a flag
ords_prods_merge.loyalty_flag.value_counts(dropna=False)

loyalty_flag
Regular Customer    15876776
Loyal Customer      10284093
New Customer         6243990
Name: count, dtype: int64

In [26]:
#first 60 rows, restricted to the user_id, loyalty_flag and order_number columns
ords_prods_merge.loc[:, ['user_id', 'loyalty_flag', 'order_number']].head(60)

Unnamed: 0,user_id,loyalty_flag,order_number
0,1,New Customer,1
1,1,New Customer,2
2,1,New Customer,3
3,1,New Customer,4
4,1,New Customer,5
5,1,New Customer,6
6,1,New Customer,7
7,1,New Customer,8
8,1,New Customer,9
9,1,New Customer,10
