# 4.8 Grouping data & aggregating variables

Table of Contents

#### 1. Import the data sets into Jupyter
#### 2. Create a subset of the dataframe
#### 3. Check the shape of the newly created dataframe
#### 4. Grouping data
#### 5. Create a column to generate the maximum orders for each user
#### 6. Create a loyalty flag using loc
#### 7. Find the aggregated mean of the 'order_number' column grouped by 'department_id'
#### 8. Look at spending habits of loyalty groups
#### 9. Determine order frequency of each user
#### 10. Quality check for the 'prices' column
#### 11. Export updated dataframe

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

### 1. Import the data sets into Jupyter

In [2]:
# Tell Python to remember a main folder path
path = r'/Users/giadairene/Documents/CareerFoundry Data Analytics/Data Analytics Immersion/Achievement 4/Instacart Basket Analysis'

In [3]:
# Import dataset ords_prods_merge_derived.pkl
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge_derived.pkl'))

### 2. Create a subset of the dataframe

In [4]:
# Create a subset of the dataframe
df = ords_prods_merge[:1000000]

### 3. Check the shape of the newly created dataframe

In [5]:
df.head(10)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,add_to_cart_order,reordered,_merge,price_range_loc,busiest_day,busiest_period_of_day
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,False,5,0,both,Mid-range product,Regularly busy,Most orders
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,False,1,1,both,Mid-range product,Regularly busy,Average orders
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,False,20,0,both,Mid-range product,Busiest days,Average orders
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,True,10,0,both,Mid-range product,Least busy,Most orders
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,False,11,1,both,Mid-range product,Least busy,Average orders
5,1,Chocolate Sandwich Cookies,61,19,5.8,1701441,777,16,1,7,26.0,False,7,0,both,Mid-range product,Busiest days,Average orders
6,1,Chocolate Sandwich Cookies,61,19,5.8,1871483,825,3,2,14,30.0,False,2,0,both,Mid-range product,Regularly busy,Most orders
7,1,Chocolate Sandwich Cookies,61,19,5.8,1290456,910,12,3,10,30.0,False,1,0,both,Mid-range product,Least busy,Most orders
8,1,Chocolate Sandwich Cookies,61,19,5.8,369558,1052,10,1,20,19.0,False,1,0,both,Mid-range product,Busiest days,Average orders
9,1,Chocolate Sandwich Cookies,61,19,5.8,589712,1052,15,1,12,15.0,False,2,1,both,Mid-range product,Busiest days,Most orders


In [6]:
df.shape

(1000000, 18)

### 4. Grouping data

In [7]:
# Split the data into groups based on 'department_id'

df = df.groupby('department_id')

In [8]:
# Check the output 

df.head(10)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,add_to_cart_order,reordered,_merge,price_range_loc,busiest_day,busiest_period_of_day
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,False,5,0,both,Mid-range product,Regularly busy,Most orders
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,False,1,1,both,Mid-range product,Regularly busy,Average orders
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,False,20,0,both,Mid-range product,Busiest days,Average orders
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,True,10,0,both,Mid-range product,Least busy,Most orders
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,False,11,1,both,Mid-range product,Least busy,Average orders
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323488,503,Wild Rice Blend,68,10,10.8,3320117,14236,3,1,14,21.0,False,6,1,both,Mid-range product,Busiest days,Most orders
323489,503,Wild Rice Blend,68,10,10.8,1365415,18847,14,0,15,23.0,False,9,0,both,Mid-range product,Busiest days,Most orders
323490,503,Wild Rice Blend,68,10,10.8,1679007,22364,8,5,15,3.0,False,9,0,both,Mid-range product,Regularly busy,Most orders
323491,503,Wild Rice Blend,68,10,10.8,3356324,22858,6,0,10,5.0,False,7,0,both,Mid-range product,Busiest days,Most orders


In [9]:
# Apply the agg() function to each group to obtain the mean values for the 'order_number' column

df.agg({'order_number': ['mean']})

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
1,15.577493
2,17.320781
3,16.084944
4,17.530458
5,14.763075
6,16.658449
7,17.03159
8,15.076662
9,15.44758
10,18.681852


In [10]:
df['order_number'].mean()

department_id
1     15.577493
2     17.320781
3     16.084944
4     17.530458
5     14.763075
6     16.658449
7     17.031590
8     15.076662
9     15.447580
10    18.681852
11    15.447411
12    14.327957
13    16.548642
14    16.960241
15    16.121948
16    17.803851
17    15.593633
18    19.674252
19    16.899756
20    16.255442
21    25.535479
Name: order_number, dtype: float64

In [11]:
df.order_number.mean()

department_id
1     15.577493
2     17.320781
3     16.084944
4     17.530458
5     14.763075
6     16.658449
7     17.031590
8     15.076662
9     15.447580
10    18.681852
11    15.447411
12    14.327957
13    16.548642
14    16.960241
15    16.121948
16    17.803851
17    15.593633
18    19.674252
19    16.899756
20    16.255442
21    25.535479
Name: order_number, dtype: float64

In [12]:
# Produce multiple statistics at the same time

df.agg({'order_number': ['mean', 'min', 'max']})

Unnamed: 0_level_0,order_number,order_number,order_number
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,15.577493,1,99
2,17.320781,1,96
3,16.084944,1,99
4,17.530458,1,99
5,14.763075,1,99
6,16.658449,1,99
7,17.03159,1,99
8,15.076662,1,98
9,15.44758,1,99
10,18.681852,1,99


### 5. Create a column to generate the maximum orders for each user

In [13]:
ords_prods_merge['max_order'] = ords_prods_merge.groupby(['user_id'])['order_number'].transform('max')

In [14]:
# Check the output

ords_prods_merge.head(15) # Data doesn´t seem to be grouped by the 'user_id' column

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,add_to_cart_order,reordered,_merge,price_range_loc,busiest_day,busiest_period_of_day,max_order
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,False,5,0,both,Mid-range product,Regularly busy,Most orders,32
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,False,1,1,both,Mid-range product,Regularly busy,Average orders,32
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,False,20,0,both,Mid-range product,Busiest days,Average orders,5
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,True,10,0,both,Mid-range product,Least busy,Most orders,3
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,False,11,1,both,Mid-range product,Least busy,Average orders,3
5,1,Chocolate Sandwich Cookies,61,19,5.8,1701441,777,16,1,7,26.0,False,7,0,both,Mid-range product,Busiest days,Average orders,26
6,1,Chocolate Sandwich Cookies,61,19,5.8,1871483,825,3,2,14,30.0,False,2,0,both,Mid-range product,Regularly busy,Most orders,9
7,1,Chocolate Sandwich Cookies,61,19,5.8,1290456,910,12,3,10,30.0,False,1,0,both,Mid-range product,Least busy,Most orders,12
8,1,Chocolate Sandwich Cookies,61,19,5.8,369558,1052,10,1,20,19.0,False,1,0,both,Mid-range product,Busiest days,Average orders,20
9,1,Chocolate Sandwich Cookies,61,19,5.8,589712,1052,15,1,12,15.0,False,2,1,both,Mid-range product,Busiest days,Most orders,20


In [15]:
ords_prods_merge[['order_id', 'user_id', 'order_number', 'max_order']][ords_prods_merge['user_id']==1]

Unnamed: 0,order_id,user_id,order_number,max_order
92368,2539329,1,1,10
92369,2398795,1,2,10
92370,473747,1,3,10
92371,2254736,1,4,10
92372,431534,1,5,10
92373,3367565,1,6,10
92374,550135,1,7,10
92375,3108588,1,8,10
92376,2295261,1,9,10
92377,2550362,1,10,10


In [16]:
ords_prods_merge[['order_id', 'user_id', 'order_number', 'max_order']][ords_prods_merge['user_id']==15]

Unnamed: 0,order_id,user_id,order_number,max_order
92378,2968173,15,15,22
92379,1870022,15,17,22
92380,1911383,15,18,22
92381,2715276,15,21,22
92382,487368,15,22,22
...,...,...,...,...
27546087,3135826,15,2,22
28007052,2570697,15,10,22
31447948,2685110,15,7,22
31447949,2883598,15,12,22


### 6. Create a loyalty flag using loc

In [17]:
ords_prods_merge.loc[ords_prods_merge['max_order'] > 40, 'loyalty_flag'] = 'Loyal customer'

In [18]:
ords_prods_merge.loc[(ords_prods_merge['max_order'] <= 40) & (ords_prods_merge['max_order'] > 10), 'loyalty_flag'] = 'Regular customer'

In [19]:
ords_prods_merge.loc[ords_prods_merge['max_order'] <= 10, 'loyalty_flag'] = 'New customer'

In [20]:
ords_prods_merge['loyalty_flag'].value_counts(dropna = False)

loyalty_flag
Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: count, dtype: int64

In [21]:
ords_prods_merge.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,add_to_cart_order,reordered,_merge,price_range_loc,busiest_day,busiest_period_of_day,max_order,loyalty_flag
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,False,5,0,both,Mid-range product,Regularly busy,Most orders,32,Regular customer
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,False,1,1,both,Mid-range product,Regularly busy,Average orders,32,Regular customer
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,False,20,0,both,Mid-range product,Busiest days,Average orders,5,New customer
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,True,10,0,both,Mid-range product,Least busy,Most orders,3,New customer
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,False,11,1,both,Mid-range product,Least busy,Average orders,3,New customer


### 7. Find the aggregated mean of the 'order_number' column grouped by 'department_id'

In [22]:
ords_prods_merge.groupby('department_id')['order_number'].mean()

department_id
1     15.457838
2     17.277920
3     17.170395
4     17.811403
5     15.215751
6     16.439806
7     17.225802
8     15.340650
9     15.895474
10    20.197148
11    16.170638
12    15.887671
13    16.583536
14    16.773669
15    16.165037
16    17.665606
17    15.694469
18    19.310397
19    17.177343
20    16.473447
21    22.902379
Name: order_number, dtype: float64

In [23]:
df['order_number'].mean()

department_id
1     15.577493
2     17.320781
3     16.084944
4     17.530458
5     14.763075
6     16.658449
7     17.031590
8     15.076662
9     15.447580
10    18.681852
11    15.447411
12    14.327957
13    16.548642
14    16.960241
15    16.121948
16    17.803851
17    15.593633
18    19.674252
19    16.899756
20    16.255442
21    25.535479
Name: order_number, dtype: float64

#### Overall the aggregated mean values seem to be slightly higher for each department when looking at the entire data frame.

### 8.  Look at spending habits of loyalty groups

#### Question 5

The marketing team at Instacart wants to know whether there’s a difference between the spending habits of the three types of customers you identified. Use the loyalty flag you created and check the basic statistics of the product prices for each loyalty category (Loyal Customer, Regular Customer, and New Customer). What you’re trying to determine is whether the prices of products purchased by loyal customers differ from those purchased by regular or new customers.

In [24]:
ords_prods_merge.groupby('loyalty_flag').agg({'prices': ['mean', 'min', 'max']})

Unnamed: 0_level_0,prices,prices,prices
Unnamed: 0_level_1,mean,min,max
loyalty_flag,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Loyal customer,10.386336,1.0,99999.0
New customer,13.29467,1.0,99999.0
Regular customer,12.495717,1.0,99999.0


#### The average price of a product purchased by a new customer is about 3 US dollars more than that of a loyal customer.

#### Question 6

The team now wants to target different types of spenders in their marketing campaigns. This can be achieved by looking at the prices of the items people are buying. Create a spending flag for each user based on the average price across all their orders using the following criteria: if the mean of the prices of products purchased by a user is lower than 10, then flag them as a “Low spender.” If the mean of the prices of products purchased by a user is higher than or equal to 10, then flag them as a “High spender.”

In [25]:
# Use transform() to determine the average price across all products purchased by each user

ords_prods_merge['average_product_price'] = ords_prods_merge.groupby(['user_id'])['prices'].transform('mean')

In [26]:
# View relevant columns

ords_prods_merge[['user_id', 'average_product_price']].head(15) # Again the data isn´t grouped by 'user_id'

Unnamed: 0,user_id,average_product_price
0,138,6.935811
1,138,6.935811
2,709,7.930208
3,764,4.972414
4,764,4.972414
5,777,6.935398
6,825,5.957576
7,910,6.68
8,1052,7.1625
9,1052,7.1625


In [27]:
# Create spending_flag column with given criteria using loc() function

ords_prods_merge.loc[ords_prods_merge['average_product_price'] >= 10, 'spending_flag'] = 'High spender'

In [28]:
ords_prods_merge.loc[ords_prods_merge['average_product_price'] < 10, 'spending_flag'] = 'Low spender'

In [29]:
# Check frequency of new spending_flag column

ords_prods_merge['spending_flag'].value_counts(dropna = False)

spending_flag
Low spender     31770614
High spender      634245
Name: count, dtype: int64

In [30]:
# View relevant columns of updated dataframe

ords_prods_merge[['user_id', 'average_product_price', 'spending_flag']][ords_prods_merge['user_id']==1]

Unnamed: 0,user_id,average_product_price,spending_flag
92368,1,6.367797,Low spender
92369,1,6.367797,Low spender
92370,1,6.367797,Low spender
92371,1,6.367797,Low spender
92372,1,6.367797,Low spender
92373,1,6.367797,Low spender
92374,1,6.367797,Low spender
92375,1,6.367797,Low spender
92376,1,6.367797,Low spender
92377,1,6.367797,Low spender


### 9. Determine order frequency of each user

#### Question 7

In order to send relevant notifications to users within the app (for instance, asking users if they want to buy the same item again), the Instacart team wants you to determine frequent versus non-frequent customers. Create an order frequency flag that marks the regularity of a user’s ordering behavior according to the median in the “days_since_prior_order” column. The criteria for the flag should be as follows:
If the median of “days_since_prior_order” is higher than 20, then the customer should be labeled a “Non-frequent customer.”
If the median is higher than 10 and lower than or equal to 20, then the customer should be labeled a “Regular customer.”
If the median is lower than or equal to 10, then the customer should be labeled a “Frequent customer.”

In [31]:
# Use transform() to determine the median number of days between orders for each user

ords_prods_merge['median_days_between_orders'] = ords_prods_merge.groupby(['user_id'])['days_since_prior_order'].transform('median')

In [32]:
# View relevant columns

ords_prods_merge[['user_id', 'median_days_between_orders']][ords_prods_merge['user_id']==1]

Unnamed: 0,user_id,median_days_between_orders
92368,1,20.5
92369,1,20.5
92370,1,20.5
92371,1,20.5
92372,1,20.5
92373,1,20.5
92374,1,20.5
92375,1,20.5
92376,1,20.5
92377,1,20.5


In [33]:
# Create frequency_flag column based on given criteria using loc() function

ords_prods_merge.loc[ords_prods_merge['median_days_between_orders'] > 20, 'frequency_flag'] = 'Non-frequent customer'

In [34]:
ords_prods_merge.loc[(ords_prods_merge['median_days_between_orders'] <= 20) & (ords_prods_merge['median_days_between_orders'] > 10) , 'frequency_flag'] = 'Regular customer'

In [35]:
ords_prods_merge.loc[ords_prods_merge['median_days_between_orders'] <= 10, 'frequency_flag'] = 'Frequent customer'

In [36]:
# Check frequency of new frequency_flag column

ords_prods_merge['frequency_flag'].value_counts(dropna = False)

frequency_flag
Frequent customer        21559853
Regular customer          7208564
Non-frequent customer     3636437
nan                             5
Name: count, dtype: int64

In [37]:
# Check nan values

ords_prods_merge[['user_id', 'days_since_prior_order', 'median_days_between_orders', 'frequency_flag']][ords_prods_merge['frequency_flag']=='nan']

Unnamed: 0,user_id,days_since_prior_order,median_days_between_orders,frequency_flag
6234909,159838,,,
12947653,159838,,,
13839012,159838,,,
14758536,159838,,,
21673807,159838,,,


#### A possible explanation for these 'nan' values is that customer 159838 has only made one order on the platform.

In [38]:
# View relevant columns of updated dataframe

ords_prods_merge[['user_id', 'days_since_prior_order', 'median_days_between_orders', 'frequency_flag']][ords_prods_merge['user_id']==1]

Unnamed: 0,user_id,days_since_prior_order,median_days_between_orders,frequency_flag
92368,1,,20.5,Non-frequent customer
92369,1,15.0,20.5,Non-frequent customer
92370,1,21.0,20.5,Non-frequent customer
92371,1,29.0,20.5,Non-frequent customer
92372,1,28.0,20.5,Non-frequent customer
92373,1,19.0,20.5,Non-frequent customer
92374,1,20.0,20.5,Non-frequent customer
92375,1,14.0,20.5,Non-frequent customer
92376,1,0.0,20.5,Non-frequent customer
92377,1,30.0,20.5,Non-frequent customer


In [39]:
# Check entire dataframe

ords_prods_merge.head(15)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,_merge,price_range_loc,busiest_day,busiest_period_of_day,max_order,loyalty_flag,average_product_price,spending_flag,median_days_between_orders,frequency_flag
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,...,both,Mid-range product,Regularly busy,Most orders,32,Regular customer,6.935811,Low spender,8.0,Frequent customer
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,...,both,Mid-range product,Regularly busy,Average orders,32,Regular customer,6.935811,Low spender,8.0,Frequent customer
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,...,both,Mid-range product,Busiest days,Average orders,5,New customer,7.930208,Low spender,8.0,Frequent customer
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,...,both,Mid-range product,Least busy,Most orders,3,New customer,4.972414,Low spender,9.0,Frequent customer
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,...,both,Mid-range product,Least busy,Average orders,3,New customer,4.972414,Low spender,9.0,Frequent customer
5,1,Chocolate Sandwich Cookies,61,19,5.8,1701441,777,16,1,7,...,both,Mid-range product,Busiest days,Average orders,26,Regular customer,6.935398,Low spender,11.0,Regular customer
6,1,Chocolate Sandwich Cookies,61,19,5.8,1871483,825,3,2,14,...,both,Mid-range product,Regularly busy,Most orders,9,New customer,5.957576,Low spender,20.0,Regular customer
7,1,Chocolate Sandwich Cookies,61,19,5.8,1290456,910,12,3,10,...,both,Mid-range product,Least busy,Most orders,12,Regular customer,6.68,Low spender,6.0,Frequent customer
8,1,Chocolate Sandwich Cookies,61,19,5.8,369558,1052,10,1,20,...,both,Mid-range product,Busiest days,Average orders,20,Regular customer,7.1625,Low spender,10.0,Frequent customer
9,1,Chocolate Sandwich Cookies,61,19,5.8,589712,1052,15,1,12,...,both,Mid-range product,Busiest days,Most orders,20,Regular customer,7.1625,Low spender,10.0,Frequent customer


### 10. Quality check for the 'prices' column

In [42]:
# Check the column statistics
ords_prods_merge['prices'].describe()

count    3.240486e+07
mean     1.198023e+01
std      4.956554e+02
min      1.000000e+00
25%      4.200000e+00
50%      7.400000e+00
75%      1.130000e+01
max      9.999900e+04
Name: prices, dtype: float64

In [43]:
ords_prods_merge['prices'].mean()

11.980225638383468

In [44]:
ords_prods_merge['prices'].median()

7.4

In [45]:
ords_prods_merge['prices'].max()

99999.0

In [47]:
# Look for outliers

ords_prods_merge.loc[ords_prods_merge['prices'] > 100]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,_merge,price_range_loc,busiest_day,busiest_period_of_day,max_order,loyalty_flag,average_product_price,spending_flag,median_days_between_orders,frequency_flag
13100147,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,912404,17,12,2,14,...,both,High-range product,Regularly busy,Most orders,40,Regular customer,108.648299,High spender,5.0,Frequent customer
13100148,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,603376,17,22,6,16,...,both,High-range product,Regularly busy,Average orders,40,Regular customer,108.648299,High spender,5.0,Frequent customer
13100149,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,3264360,135,2,2,21,...,both,High-range product,Regularly busy,Average orders,4,New customer,1154.792308,High spender,12.0,Regular customer
13100150,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,892534,135,3,0,8,...,both,High-range product,Busiest days,Most orders,4,New customer,1154.792308,High spender,12.0,Regular customer
13100151,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0,229704,342,8,1,19,...,both,High-range product,Busiest days,Average orders,16,Regular customer,114.426619,High spender,23.0,Non-frequent customer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21786876,33664,2 % Reduced Fat Milk,84,16,99999.0,2249946,204099,29,0,8,...,both,High-range product,Busiest days,Most orders,39,Regular customer,1106.743956,High spender,4.0,Frequent customer
21786877,33664,2 % Reduced Fat Milk,84,16,99999.0,2363282,204099,31,0,9,...,both,High-range product,Busiest days,Most orders,39,Regular customer,1106.743956,High spender,4.0,Frequent customer
21786878,33664,2 % Reduced Fat Milk,84,16,99999.0,3181945,204395,13,3,15,...,both,High-range product,Least busy,Most orders,15,Regular customer,451.153540,High spender,5.0,Frequent customer
21786879,33664,2 % Reduced Fat Milk,84,16,99999.0,2486215,205227,7,3,20,...,both,High-range product,Least busy,Average orders,12,Regular customer,1178.381871,High spender,12.0,Regular customer


In [48]:
# Transform the outliers into missing values

ords_prods_merge.loc[ords_prods_merge['prices'] > 100, 'prices'] = np.nan

In [49]:
# Check the output

ords_prods_merge['prices'].max()

25.0

### 11. Export updated dataframe

In [50]:
# Export derived data frame as pickle file
ords_prods_merge.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_grouped.pkl'))