### Contents:
    01 Importing libraries and data
    02 Aggregation practice
        a using a subset
        b groupby() and agg() functions
        c transform function with subset
        d transform function with whole df
        e variable derivation with loc[]
    03 Exporting

# Exercise 4.8

## 01 Importing libraries and data

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
path = r'/Users/Emily/Documents/CF Data Analysis Program/Immersion 4/Instacart Basket Analysis'

In [3]:
ords_prods = pd.read_pickle(os.path.join(path, '02 Data', 'prepared data', 'orders_products_merged_2.pkl'))

## 02 Aggregation practice

### a) Select a slice, check data

In [4]:
# slice of top 1M records
df = ords_prods[:1000000]

In [5]:
# df checks part 1
df.shape

(1000000, 19)

In [6]:
# df checks part 2
df.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range,busiest_day,busiest_days,busiest_period_of_day
0,2539329.0,1.0,1.0,2.0,8.0,,True,196,1.0,0.0,Soda,77.0,7.0,9.0,both,Mid-range,Regularly busy,Regularly busy,average orders
1,2398795.0,1.0,2.0,3.0,7.0,15.0,False,196,1.0,1.0,Soda,77.0,7.0,9.0,both,Mid-range,Regularly busy,Least busy days,average orders
2,473747.0,1.0,3.0,3.0,12.0,21.0,False,196,1.0,1.0,Soda,77.0,7.0,9.0,both,Mid-range,Regularly busy,Least busy days,most orders
3,2254736.0,1.0,4.0,4.0,7.0,29.0,False,196,1.0,1.0,Soda,77.0,7.0,9.0,both,Mid-range,Least busy,Least busy days,average orders
4,431534.0,1.0,5.0,4.0,15.0,28.0,False,196,1.0,1.0,Soda,77.0,7.0,9.0,both,Mid-range,Least busy,Least busy days,most orders


### b) Groupby and agg functions

In [7]:
# group by departments and then aggregate order numbers by average/mean...?
# this doesn't make sense logically to take the mean of the order_number....
# order_number is a sequential number unique to the customer (order #1, order #2)
# so it only makes sense to use 'count' with this variable
# and even then, you'd have to count unique within each customer, because the order_number is 
# included for each item in each order.
df.groupby('department_id').agg({'order_number': ['mean']})

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
4.0,18.82578
7.0,17.472355
13.0,17.993423
14.0,19.246334
16.0,19.463012
17.0,11.294069
19.0,19.305237
20.0,17.599636


In [8]:
# how many items were ordered from each department
# this is the only thing that makes sense to do with the order_number column
# but really, a count could be accomplished in this example by using any column
df.groupby('department_id').agg({'order_number': ['count']}).sort_values([('order_number','count')], ascending = False)

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,count
department_id,Unnamed: 1_level_2
4.0,611084
16.0,169624
20.0,108866
7.0,44710
19.0,38095
13.0,22656
14.0,3751
17.0,1214


In [9]:
# the same thing, but without using .agg function
df.groupby('department_id')['order_number'].count().sort_values(ascending = False)

department_id
4.0     611084
16.0    169624
20.0    108866
7.0      44710
19.0     38095
13.0     22656
14.0      3751
17.0      1214
Name: order_number, dtype: int64

In [10]:
# what's the average price that's being paid for items in each department?
# when using the prices column, it makes sense to use the mean
df.groupby('department_id')['prices'].mean().sort_values(ascending = False)

department_id
7.0     9.845439
20.0    9.794003
4.0     8.850788
19.0    4.589048
14.0    4.000000
16.0    3.725790
13.0    3.596284
17.0    1.000000
Name: prices, dtype: float64

In [11]:
# min, mean, and max of prices by department sorted on mean
df.groupby('department_id').agg({'prices': ['mean', 'min', 'max']}).sort_values([('prices','mean')],ascending = False)

Unnamed: 0_level_0,prices,prices,prices
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
7.0,9.845439,9.0,13.4
20.0,9.794003,2.3,14.8
4.0,8.850788,2.7,14.0
19.0,4.589048,2.7,6.8
14.0,4.0,4.0,4.0
16.0,3.72579,1.3,12.6
13.0,3.596284,3.2,10.9
17.0,1.0,1.0,1.0


### c) Transform function (using slice to practice)

In [15]:
# this is throwing an error that is still allowing the code to execute, but it's not happy about it
# seems like chained indexing isn't effecient in the long run
df['max_order'] = df.groupby(['user_id'])['order_number'].transform(np.max)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['max_order'] = df.groupby(['user_id'])['order_number'].transform(np.max)


In [36]:
# Another way to accomplish the same thing, but in separate steps
# there's probably/definitely a way to do this using loc[], but I can't figure it out
most_orders = df.groupby('user_id')['order_number'].max()

In [45]:
# it's originally saved as a series, but in order to merge, we need it to be in a df
most_orders2 = pd.DataFrame(most_orders)
most_orders2 = most_orders2.rename(columns = {'order_number':'most_orders'})

In [48]:
# left merge on user_id to add the new column
df = df.merge(most_orders2, how = 'left', on = 'user_id')

In [50]:
# columms of max_orders and most_orders match, so that's good
df.head(15)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,...,aisle_id,department_id,prices,_merge,price_range,busiest_day,busiest_days,busiest_period_of_day,max_order,most_orders
0,2539329.0,1.0,1.0,2.0,8.0,,True,196,1.0,0.0,...,77.0,7.0,9.0,both,Mid-range,Regularly busy,Regularly busy,average orders,10.0,10.0
1,2398795.0,1.0,2.0,3.0,7.0,15.0,False,196,1.0,1.0,...,77.0,7.0,9.0,both,Mid-range,Regularly busy,Least busy days,average orders,10.0,10.0
2,473747.0,1.0,3.0,3.0,12.0,21.0,False,196,1.0,1.0,...,77.0,7.0,9.0,both,Mid-range,Regularly busy,Least busy days,most orders,10.0,10.0
3,2254736.0,1.0,4.0,4.0,7.0,29.0,False,196,1.0,1.0,...,77.0,7.0,9.0,both,Mid-range,Least busy,Least busy days,average orders,10.0,10.0
4,431534.0,1.0,5.0,4.0,15.0,28.0,False,196,1.0,1.0,...,77.0,7.0,9.0,both,Mid-range,Least busy,Least busy days,most orders,10.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,2836489.0,164627.0,14.0,0.0,15.0,6.0,False,30489,2.0,0.0,...,67.0,20.0,7.5,both,Mid-range,Busiest day,Busiest days,most orders,23.0,23.0
999996,1843600.0,164632.0,5.0,1.0,19.0,9.0,False,30489,2.0,0.0,...,67.0,20.0,7.5,both,Mid-range,Regularly busy,Busiest days,average orders,31.0,31.0
999997,733106.0,164632.0,9.0,2.0,22.0,10.0,False,30489,3.0,1.0,...,67.0,20.0,7.5,both,Mid-range,Regularly busy,Regularly busy,fewest orders,31.0,31.0
999998,1650124.0,164632.0,17.0,6.0,17.0,13.0,False,30489,1.0,1.0,...,67.0,20.0,7.5,both,Mid-range,Regularly busy,Regularly busy,average orders,31.0,31.0


### d) Transform function (using merged df)

In [4]:
# time to do this on the full ords_prods
# this time it doesn't give an error!
ords_prods['max_order'] = ords_prods.groupby(['user_id'])['order_number'].transform(np.max)

In [5]:
# check to make sure it worked
ords_prods.head(15)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range,busiest_day,busiest_days,busiest_period_of_day,max_order
0,2539329.0,1.0,1.0,2.0,8.0,,True,196,1.0,0.0,Soda,77.0,7.0,9.0,both,Mid-range,Regularly busy,Regularly busy,average orders,10.0
1,2398795.0,1.0,2.0,3.0,7.0,15.0,False,196,1.0,1.0,Soda,77.0,7.0,9.0,both,Mid-range,Regularly busy,Least busy days,average orders,10.0
2,473747.0,1.0,3.0,3.0,12.0,21.0,False,196,1.0,1.0,Soda,77.0,7.0,9.0,both,Mid-range,Regularly busy,Least busy days,most orders,10.0
3,2254736.0,1.0,4.0,4.0,7.0,29.0,False,196,1.0,1.0,Soda,77.0,7.0,9.0,both,Mid-range,Least busy,Least busy days,average orders,10.0
4,431534.0,1.0,5.0,4.0,15.0,28.0,False,196,1.0,1.0,Soda,77.0,7.0,9.0,both,Mid-range,Least busy,Least busy days,most orders,10.0
5,3367565.0,1.0,6.0,2.0,7.0,19.0,False,196,1.0,1.0,Soda,77.0,7.0,9.0,both,Mid-range,Regularly busy,Regularly busy,average orders,10.0
6,550135.0,1.0,7.0,1.0,9.0,20.0,False,196,1.0,1.0,Soda,77.0,7.0,9.0,both,Mid-range,Regularly busy,Busiest days,average orders,10.0
7,3108588.0,1.0,8.0,1.0,14.0,14.0,False,196,2.0,1.0,Soda,77.0,7.0,9.0,both,Mid-range,Regularly busy,Busiest days,most orders,10.0
8,2295261.0,1.0,9.0,1.0,16.0,0.0,False,196,4.0,1.0,Soda,77.0,7.0,9.0,both,Mid-range,Regularly busy,Busiest days,average orders,10.0
9,2550362.0,1.0,10.0,4.0,8.0,30.0,False,196,1.0,1.0,Soda,77.0,7.0,9.0,both,Mid-range,Least busy,Least busy days,average orders,10.0


### e) Variable derivation with loc

In [6]:
ords_prods.loc[ords_prods['max_order'] > 40, 'loyalty_flag'] = 'loyal customer'

In [8]:
ords_prods.loc[(ords_prods['max_order'] <= 40) & (ords_prods['max_order'] > 10), 'loyalty_flag'] = 'regular customer'

In [9]:
ords_prods.loc[ords_prods['max_order'] <= 10, 'loyalty_flag'] = 'new customer'

In [10]:
# most customers are regular
# there are still some products that were never purchased on this big list
# because I did an outer merge
ords_prods['loyalty_flag'].value_counts(dropna = False)

regular customer    15891507
loyal customer      10294027
new customer         6249525
NaN                       11
Name: loyalty_flag, dtype: int64

## Export

In [11]:
ords_prods.to_pickle(os.path.join(path, '02 Data', 'prepared data', 'orders_products_merged_3.pkl'))