# Importing Libraries

In [1]:
import os
import pandas as pd
import numpy as np

# Importing Data

In [2]:
path = r'/Users/docopeland/04 Instacart Basket Analysis'
derived = pd.read_pickle(os.path.join(path,'02 Data','prepared data','merged_derived_vars.pkl'))

# Grouping Data & Deriving New Variables

In [3]:
derived.dtypes

order_id                     int64
user_id                      int64
total_customer_orders        int64
order_day_of_week            int64
order_hour_of_day            int64
days_since_prior_order     float64
product_id                   int64
add_to_cart_order            int64
reordered                    int64
_merge                    category
product_name                object
aisle_id                     int64
department_id                int64
prices                     float64
price_range_loc             object
busiest_day                 object
busiest_2_days              object
busiest_period_of_day       object
dtype: object

In [4]:
derived.columns

Index(['order_id', 'user_id', 'total_customer_orders', 'order_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', '_merge', 'product_name', 'aisle_id',
       'department_id', 'prices', 'price_range_loc', 'busiest_day',
       'busiest_2_days', 'busiest_period_of_day'],
      dtype='object')

## Create a loyalty_flag

In [5]:
#create a max_order column with the max number of orders by user
derived['max_order'] = derived.groupby('user_id')['total_customer_orders'].transform(np.max)

In [6]:
#create a loyalty_flag based on the following logic:
#if the max_order is less than or equal to 10, new customer
#if the max_order is greater 10 and less than 40, regular customer
#if the max_order is greater than 40, loyal customer

In [7]:
#new customer
derived.loc[derived['max_order'] <= 10, 'loyalty_flag'] = 'New customer'

In [8]:
#regular customer
derived.loc[(derived['max_order'] > 10) & (derived['max_order'] <= 40), 'loyalty_flag'] = 'Regular customer'

In [9]:
#loyal customer
derived.loc[derived['max_order'] > 40, 'loyalty_flag'] = 'Loyal customer'

In [10]:
#checking the values
derived['loyalty_flag'].value_counts()

Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: loyalty_flag, dtype: int64

## Create a spender flag

In [11]:
#create an user_avg column with the mean of the prices by user
derived['user_avg'] = derived.groupby('user_id')['prices'].transform(np.mean)

In [12]:
#create a spender flag based on the following logic:
#if the user_avg is less than 10, low spender
#if the user_avg is greater or equal to 10, high spender

In [13]:
#low spender
derived.loc[derived['user_avg'] < 10, 'spender_flag'] = 'Low spender'

In [14]:
#high spender
derived.loc[derived['user_avg'] >= 10, 'spender_flag'] = 'High spender'

In [15]:
#checking the values
derived['spender_flag'].value_counts()

Low spender     31770614
High spender      634245
Name: spender_flag, dtype: int64

## Create a frequency flag

In [16]:
#create a regularity column with the median of days_since_prior_order by user
derived['regularity'] = derived.groupby('user_id')['days_since_prior_order'].transform(np.median)

In [17]:
#create a frequency flag based on the following logic:
#if the regularity is less than or equal to 10, frequent customer
#if the regularity is greater than 10 and less than equal to 20, regular customer
#if the regularity is greater than 20, non-frequent customer

In [18]:
#frequent customer
derived.loc[derived['regularity'] <= 10, 'frequency_flag'] = 'Frequent customer'

In [19]:
#regular customer
derived.loc[(derived['regularity'] > 10) & (derived['regularity'] <= 20), 'frequency_flag'] = 'Regular customer'

In [20]:
#non-frequent
derived.loc[derived['regularity'] > 20, 'frequency_flag'] = 'Non-frequent customer'

In [21]:
#checking the values
derived['frequency_flag'].value_counts()

Frequent customer        21559853
Regular customer          7208564
Non-frequent customer     3636437
Name: frequency_flag, dtype: int64

# Exporting Data

In [22]:
#exporting df as a pickle
derived.to_pickle(os.path.join(path,'02 Data','prepared data','merged_group_derived.pkl'))