## Importing libraries

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

## Importing data

In [3]:
# Define path variable
# Root folder of the project. Setting the INSTACART_PATH environment variable
# overrides the default below, so the notebook can run on another machine
# without editing this cell.

path = os.environ.get('INSTACART_PATH', r'C:\Users\fiyin\OneDrive\Documents\InstaCart Basket Analysis')

In [4]:
# Import orders_products_merged.pkl (a pickled DataFrame, not a CSV)

ords_prods_merged = pd.read_pickle(os.path.join(path, '02. Data', 'Prepared data', 'orders_products_merged.pkl'))

In [5]:
# Take the first one million rows as a smaller working subset

df = ords_prods_merged.iloc[:1_000_000]

In [6]:
# Check the dimensions (rows, columns) of the subset

df.shape

(1000000, 16)

In [7]:
# Preview the first rows of the subset

df.head()

Unnamed: 0,order_id,user_id,number_of_items_ordered,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Average orders
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Slowest days,Average orders
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Slowest days,Most orders
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Slowest days,Average orders
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Slowest days,Most orders


## Grouping and aggregating data

In [8]:
# Group the subset by product name. With no aggregation applied, this only
# returns a DataFrameGroupBy object rather than a result table.

df.groupby('product_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001B1DE4F4A60>

### Aggregating data with agg()

In [9]:
# Mean of 'number_of_items_ordered' per department, computed with agg()

df.groupby('department_id').agg({'number_of_items_ordered': ['mean']})

Unnamed: 0_level_0,number_of_items_ordered
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
4,18.82578
7,17.472355
13,17.993423
14,19.246334
16,19.463012
17,11.294069
19,19.305237
20,17.599636


In [10]:
# Without agg() function: select the column with brackets and call mean()
# directly on the grouped column (returns a Series instead of a DataFrame)

df.groupby('department_id')['number_of_items_ordered'].mean()

department_id
4     18.825780
7     17.472355
13    17.993423
14    19.246334
16    19.463012
17    11.294069
19    19.305237
20    17.599636
Name: number_of_items_ordered, dtype: float64

In [11]:
# Different method, same results: the column is selected with attribute
# (dot) access instead of brackets

df.groupby('department_id').number_of_items_ordered.mean()

department_id
4     18.825780
7     17.472355
13    17.993423
14    19.246334
16    19.463012
17    11.294069
19    19.305237
20    17.599636
Name: number_of_items_ordered, dtype: float64

### Performing multiple aggregations

In [12]:
# Mean, minimum and maximum of 'number_of_items_ordered' per department,
# requested in a single agg() call

df.groupby('department_id').agg({'number_of_items_ordered': ['mean', 'min', 'max']})

Unnamed: 0_level_0,number_of_items_ordered,number_of_items_ordered,number_of_items_ordered
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
4,18.82578,1,99
7,17.472355,1,99
13,17.993423,1,99
14,19.246334,1,99
16,19.463012,1,99
17,11.294069,1,98
19,19.305237,1,99
20,17.599636,1,99


### Aggregating data with transform()

In [14]:
# Create new column 'max_order' holding, on every row, the highest
# 'number_of_items_ordered' value found for that row's user.
# The aggregation is passed by name ('max') rather than as np.max: recent
# pandas versions warn when given the NumPy callable, and the string form
# selects the same built-in group-wise maximum.
# NOTE(review): despite its name, 'number_of_items_ordered' appears to hold the
# per-user order sequence number (1, 2, 3, ... in the previews) - confirm.
ords_prods_merged['max_order'] = ords_prods_merged.groupby(['user_id'])['number_of_items_ordered'].transform('max')

In [15]:
# Check output: preview the first rows to confirm 'max_order' was added

ords_prods_merged.head()

Unnamed: 0,order_id,user_id,number_of_items_ordered,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range,busiest_days,busiest_period_of_day,max_order
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Average orders,10
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Slowest days,Average orders,10
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Slowest days,Most orders,10
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Slowest days,Average orders,10
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Slowest days,Most orders,10


In [21]:
# Lift the row limit so DataFrame/Series output is never truncated

pd.set_option('display.max_rows', None)

#### Setting loyalty flags

In [17]:
# Users whose highest order number is above 40 are flagged as loyal customers
ords_prods_merged.loc[ords_prods_merged['max_order'].gt(40), 'loyalty_flag'] = 'Loyal customer'

In [18]:
# Users whose highest order number is above 10 and at most 40 are regular customers
ords_prods_merged.loc[ords_prods_merged['max_order'].gt(10) & ords_prods_merged['max_order'].le(40), 'loyalty_flag'] = 'Regular customer'

In [19]:
# Users whose highest order number is 10 or lower are flagged as new customers
ords_prods_merged.loc[ords_prods_merged['max_order'].le(10), 'loyalty_flag'] = 'New customer'

In [20]:
# Count rows per loyalty flag; dropna=False keeps NaN in the tally so any
# rows left without a flag would show up here

ords_prods_merged['loyalty_flag'].value_counts(dropna = False)

Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: loyalty_flag, dtype: int64

In [21]:
# Check output: show the user, assigned flag and order column side by side
# for the first 60 rows

ords_prods_merged[['user_id', 'loyalty_flag', 'number_of_items_ordered']].head(60)

Unnamed: 0,user_id,loyalty_flag,number_of_items_ordered
0,1,New customer,1
1,1,New customer,2
2,1,New customer,3
3,1,New customer,4
4,1,New customer,5
5,1,New customer,6
6,1,New customer,7
7,1,New customer,8
8,1,New customer,9
9,1,New customer,10


## Exporting data

In [22]:
# Save the frame, now including 'max_order' and 'loyalty_flag', as a pickle.
# NOTE(review): this is the same file the notebook loads at the top, so the
# input data is overwritten in place - confirm that is intended.
ords_prods_merged.to_pickle(os.path.join(path, '02. Data', 'Prepared data', 'orders_products_merged.pkl'))