# Table of Contents

0.1 Importing Libraries

0.2 Importing Data

0.3 Exploring Original Df

0.4 Create new columns

    0.4.1  region
    
    0.4.2  low_activity_flag
    
    0.4.3  total_spent_in_order
    
    0.4.4  total_spent_in_lifetime
    
    0.4.5  total_items_in_lifetime
    
    0.4.6  avg_items_per_order_by_user

0.5 Confirming Final Df 

0.6 Exporting Final Df

### 0.1 Importing Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

### 0.2 Importing Data

In [2]:
# Identify the file pathway to data files
path = r'C:\Users\CJ\Documents\_CJ-Stuff\Career Foundry\Data Immersion\Ach 4 - Python\2023-03 Instacart Basket Analysis'

In [3]:
# Import data
df = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_all.pkl'))

### 0.3 Exploring Original Df

In [4]:
df.shape

(32434489, 33)

In [5]:
df.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order_of_new_customer,product_id,add_to_cart_order,reordered,...,spender_type,median_days_since_ordering,frequent_shopper,gender,state,age,date_joined,n_dependants,fam_status,income
0,2539329,1,1,2,8,,True,196,1,0,...,Low spender,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423
1,2539329,1,1,2,8,,True,14084,2,0,...,Low spender,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423
2,2539329,1,1,2,8,,True,12427,3,0,...,Low spender,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423
3,2539329,1,1,2,8,,True,26088,4,0,...,Low spender,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423
4,2539329,1,1,2,8,,True,26405,5,0,...,Low spender,20.0,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 33 columns):
 #   Column                       Dtype         
---  ------                       -----         
 0   order_id                     int64         
 1   user_id                      int64         
 2   order_number                 int16         
 3   order_day_of_week            int8          
 4   order_hour_of_day            int8          
 5   days_since_prior_order       float16       
 6   first_order_of_new_customer  bool          
 7   product_id                   int32         
 8   add_to_cart_order            int16         
 9   reordered                    int8          
 10  product_name                 object        
 11  aisle_id                     float16       
 12  department_id                float16       
 13  price                        float32       
 14  product_info_available       bool          
 15  price_available              bool          
 16

In [7]:
# Checking for nulls
df.isnull().sum()

order_id                             0
user_id                              0
order_number                         0
order_day_of_week                    0
order_hour_of_day                    0
days_since_prior_order         2078068
first_order_of_new_customer          0
product_id                           0
add_to_cart_order                    0
reordered                            0
product_name                     30770
aisle_id                         30770
department_id                    30770
price                            35897
product_info_available               0
price_available                      0
price_range                      35897
busiest_day                          0
busiest_days                         0
busiest_period_of_day                0
max_order                            0
loyalty_flag                         0
mean_spending                        0
spender_type                         0
median_days_since_ordering           0
frequent_shopper         

In [8]:
# The null values for days_since_prior_order 
# are flagged by first_order of new customer
df['first_order_of_new_customer'].value_counts()

False    30356421
True      2078068
Name: first_order_of_new_customer, dtype: int64

In [9]:
# The null values for price
# are flagged by price_available
df['price_available'].value_counts()

True     32398592
False       35897
Name: price_available, dtype: int64

In [10]:
# The null values for product_name, aisle_id, and department_id 
# are flagged by product_info_available
df['product_info_available'].value_counts()

True     32403719
False       30770
Name: product_info_available, dtype: int64

All null/NaN values are appropriately flagged.

###     0.4.1 region

In [11]:
# Create lists for each region

In [12]:
northeast = ['Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island', 'Connecticut', 'New York', 'Pennsylvania', 'New Jersey']

In [13]:
midwest = ['Wisconsin', 'Michigan', 'Illinois', 'Indiana', 'Ohio', 'North Dakota', 'South Dakota', 'Nebraska', 'Kansas', 'Minnesota', 'Iowa', 'Missouri']

In [14]:
south = ['Delaware', 'Maryland', 'District of Columbia', 'Virginia', 'West Virginia', 'North Carolina', 'South Carolina', 'Georgia', 'Florida', 'Kentucky', 'Tennessee', 'Mississippi', 'Alabama', 'Oklahoma', 'Texas', 'Arkansas', 'Louisiana']

In [15]:
west = ['Idaho', 'Montana', 'Wyoming', 'Nevada', 'Utah', 'Colorado', 'Arizona', 'New Mexico', 'Alaska', 'Washington', 'Oregon', 'California', 'Hawaii']

In [16]:
# Assign a region to go with each state value

result = []

for value in df["state"]:
  if value in northeast:
    result.append("Northeast")
  elif value in midwest:
    result.append("Midwest")
  elif value in south:
    result.append("South")
  elif value in west:
    result.append("West")
  else:
    result.append("Unknown region")

In [17]:
# Explore result of loop
result

['South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'South',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'West',
 'Wes

In [18]:
# Append the results of the loop to the df
df['region'] = result

In [19]:
# Confirming the data in 'region'
df['region'].value_counts(dropna = False)

South        10801796
West          8300481
Midwest       7603861
Northeast     5728351
Name: region, dtype: int64

In [20]:
df[['state', 'region']].head()

Unnamed: 0,state,region
0,Alabama,South
1,Alabama,South
2,Alabama,South
3,Alabama,South
4,Alabama,South


In [21]:
df.shape

(32434489, 34)

In [22]:
# Reclaiming memory
del result

### 0.4.2  low_activity_flag

In [23]:
# Calculating the number of low_activity rows
len(df[df['max_order']<5])

1441523

In [24]:
# Creating a column 'low_activity_flag' with values based 
# on if max_order is < 5, then TRUE
#    else FALSE
df['low_activity_flag'] = df['max_order']<5

In [25]:
# Checking the data
df['low_activity_flag'].value_counts(dropna = False)

False    30992966
True      1441523
Name: low_activity_flag, dtype: int64

In [26]:
df[['max_order', 'low_activity_flag']].head()

Unnamed: 0,max_order,low_activity_flag
0,10,False
1,10,False
2,10,False
3,10,False
4,10,False


In [27]:
df.shape

(32434489, 35)

In [28]:
# Per customer request (CF instructions), we are exporting a copy of the df
# with only data from customers with >=5 orders

df_subset = df[df['low_activity_flag'] == False]

In [29]:
# Checking subset
df_subset.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order_of_new_customer,product_id,add_to_cart_order,reordered,...,frequent_shopper,gender,state,age,date_joined,n_dependants,fam_status,income,region,low_activity_flag
0,2539329,1,1,2,8,,True,196,1,0,...,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,South,False
1,2539329,1,1,2,8,,True,14084,2,0,...,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,South,False
2,2539329,1,1,2,8,,True,12427,3,0,...,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,South,False
3,2539329,1,1,2,8,,True,26088,4,0,...,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,South,False
4,2539329,1,1,2,8,,True,26405,5,0,...,Regular customer,Female,Alabama,31,2019-02-17,3,married,40423,South,False


In [30]:
# Confirming subset has correct number of rows
df_subset.shape

(30992966, 35)

In [31]:
#Exporting subset df
df_subset.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_5_plus.pkl'))

In [32]:
# Reclaiming memory
del df_subset

### 0.4.3  total_spent_in_order

In [33]:
# Create a new column 'total_spent_in_order'
# By first grouping data by order_id, then
# appling the transform() function to the prices column
# with the np.sum argument
df['total_spent_in_order'] = df.groupby(['order_id'])['price'].transform(np.sum)

In [34]:
# Confirming the column was created correctly
df[['order_id', 'price', 'total_spent_in_order']].head(20)

Unnamed: 0,order_id,price,total_spent_in_order
0,2539329,9.0,31.6
1,2539329,12.5,31.6
2,2539329,4.4,31.6
3,2539329,4.7,31.6
4,2539329,1.0,31.6
5,2398795,9.0,35.400002
6,2398795,3.0,35.400002
7,2398795,4.4,35.400002
8,2398795,10.3,35.400002
9,2398795,4.7,35.400002


In [35]:
df.shape

(32434489, 36)

### 0.4.4 total_spent_in_lifetime

In [36]:
# Create a new column 'total_spent_in_lifetime'
# By first grouping data by user_id, then
# appling the transform() function to the prices column
# with the np.sum argument
df['total_spent_in_lifetime'] = df.groupby(['user_id'])['price'].transform(np.sum)

In [37]:
# Confirming the column was created correctly
df[['user_id', 'order_id', 'price', 'total_spent_in_lifetime']].head(100)

Unnamed: 0,user_id,order_id,price,total_spent_in_lifetime
0,1,2539329,9.0,375.700012
1,1,2539329,12.5,375.700012
2,1,2539329,4.4,375.700012
3,1,2539329,4.7,375.700012
4,1,2539329,1.0,375.700012
...,...,...,...,...
95,2,738281,5.9,1465.599976
96,2,1673511,14.0,1465.599976
97,2,1673511,2.9,1465.599976
98,2,1673511,1.6,1465.599976


In [38]:
df.shape

(32434489, 37)

###     0.4.5  total_items_in_lifetime

In [39]:
# Identifying the number of unique users
df['user_id'].nunique()

206209

In [40]:
# Confirming the range of user_ids
df['user_id'].min()

1

In [41]:
# Confirming the range of user_ids
df['user_id'].max()

206209

In [42]:
# Creating a temporary dataframe to hold the number of items
# purchased by each user
df_items = pd.DataFrame()

In [43]:
df_items['total_items_in_lifetime'] = df['user_id'].value_counts().sort_index()

In [44]:
df_items.head()

Unnamed: 0,total_items_in_lifetime
1,59
2,195
3,88
4,18
5,37


In [45]:
df_items.shape

(206209, 1)

In [46]:
# Exploring totals
df_items.describe()

Unnamed: 0,total_items_in_lifetime
count,206209.0
mean,157.289396
std,204.208233
min,3.0
25%,39.0
50%,83.0
75%,188.0
max,3725.0


In [47]:
# Making the user_id for each row explicit
df_items['user_id'] = df_items.index

In [48]:
df_items['user_id'].max()

206209

In [49]:
df_items['user_id'].min()

1

In [50]:
df_items.shape

(206209, 2)

In [51]:
df_items.head()

Unnamed: 0,total_items_in_lifetime,user_id
1,59,1
2,195,2
3,88,3
4,18,4
5,37,5


In [52]:
# Merging total_items_in_lifetime with full df
df = df.merge(df_items, on = ['user_id'], how = 'left', indicator = True)

In [53]:
# Exploring the merge
df.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order_of_new_customer,product_id,add_to_cart_order,reordered,...,date_joined,n_dependants,fam_status,income,region,low_activity_flag,total_spent_in_order,total_spent_in_lifetime,total_items_in_lifetime,_merge
0,2539329,1,1,2,8,,True,196,1,0,...,2019-02-17,3,married,40423,South,False,31.6,375.700012,59,both
1,2539329,1,1,2,8,,True,14084,2,0,...,2019-02-17,3,married,40423,South,False,31.6,375.700012,59,both
2,2539329,1,1,2,8,,True,12427,3,0,...,2019-02-17,3,married,40423,South,False,31.6,375.700012,59,both
3,2539329,1,1,2,8,,True,26088,4,0,...,2019-02-17,3,married,40423,South,False,31.6,375.700012,59,both
4,2539329,1,1,2,8,,True,26405,5,0,...,2019-02-17,3,married,40423,South,False,31.6,375.700012,59,both


In [54]:
df.shape

(32434489, 39)

The shape is what we would expect: same number of rows, columns increased by 2 (the new column plus the merge field)

In [55]:
df['_merge'].value_counts()

both          32434489
left_only            0
right_only           0
Name: _merge, dtype: int64

In [56]:
# Removing the merge flag so it doesn't interfere with future merges
df = df.drop(columns = ['_merge'])

In [57]:
df.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order_of_new_customer,product_id,add_to_cart_order,reordered,...,age,date_joined,n_dependants,fam_status,income,region,low_activity_flag,total_spent_in_order,total_spent_in_lifetime,total_items_in_lifetime
0,2539329,1,1,2,8,,True,196,1,0,...,31,2019-02-17,3,married,40423,South,False,31.6,375.700012,59
1,2539329,1,1,2,8,,True,14084,2,0,...,31,2019-02-17,3,married,40423,South,False,31.6,375.700012,59
2,2539329,1,1,2,8,,True,12427,3,0,...,31,2019-02-17,3,married,40423,South,False,31.6,375.700012,59
3,2539329,1,1,2,8,,True,26088,4,0,...,31,2019-02-17,3,married,40423,South,False,31.6,375.700012,59
4,2539329,1,1,2,8,,True,26405,5,0,...,31,2019-02-17,3,married,40423,South,False,31.6,375.700012,59


In [58]:
df.shape

(32434489, 38)

### 0.4.6 avg_items_per_order_by_user

In [59]:
# Create a column with the avg_items_per_order_by_user
df['avg_items_per_order_by_user'] = df['total_items_in_lifetime'] / df['max_order']

In [60]:
# Explore new column
df['avg_items_per_order_by_user'].describe()

count    3.243449e+07
mean     1.318593e+01
std      6.547892e+00
min      1.000000e+00
25%      8.535354e+00
50%      1.200000e+01
75%      1.657143e+01
max      7.025000e+01
Name: avg_items_per_order_by_user, dtype: float64

In [61]:
df.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order_of_new_customer,product_id,add_to_cart_order,reordered,...,date_joined,n_dependants,fam_status,income,region,low_activity_flag,total_spent_in_order,total_spent_in_lifetime,total_items_in_lifetime,avg_items_per_order_by_user
0,2539329,1,1,2,8,,True,196,1,0,...,2019-02-17,3,married,40423,South,False,31.6,375.700012,59,5.9
1,2539329,1,1,2,8,,True,14084,2,0,...,2019-02-17,3,married,40423,South,False,31.6,375.700012,59,5.9
2,2539329,1,1,2,8,,True,12427,3,0,...,2019-02-17,3,married,40423,South,False,31.6,375.700012,59,5.9
3,2539329,1,1,2,8,,True,26088,4,0,...,2019-02-17,3,married,40423,South,False,31.6,375.700012,59,5.9
4,2539329,1,1,2,8,,True,26405,5,0,...,2019-02-17,3,married,40423,South,False,31.6,375.700012,59,5.9


In [62]:
df.shape

(32434489, 39)

### 0.5 Confirming Final Df

In [63]:
df.shape

(32434489, 39)

Final df has expected number of rows (32434489) and columns (33+4).

In [64]:
# Checking datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 39 columns):
 #   Column                       Dtype         
---  ------                       -----         
 0   order_id                     int64         
 1   user_id                      int64         
 2   order_number                 int16         
 3   order_day_of_week            int8          
 4   order_hour_of_day            int8          
 5   days_since_prior_order       float16       
 6   first_order_of_new_customer  bool          
 7   product_id                   int32         
 8   add_to_cart_order            int16         
 9   reordered                    int8          
 10  product_name                 object        
 11  aisle_id                     float16       
 12  department_id                float16       
 13  price                        float32       
 14  product_info_available       bool          
 15  price_available              bool          
 16

In [65]:
# Adjust datatypes for new columns to be more appropriately sized
df['region'] = df['region'].astype('category')
df['total_items_in_lifetime'] = df['total_items_in_lifetime'].astype('int32')
df['avg_items_per_order_by_user'] = df['avg_items_per_order_by_user'].astype('float16')


In [66]:
# Confirming changes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 39 columns):
 #   Column                       Dtype         
---  ------                       -----         
 0   order_id                     int64         
 1   user_id                      int64         
 2   order_number                 int16         
 3   order_day_of_week            int8          
 4   order_hour_of_day            int8          
 5   days_since_prior_order       float16       
 6   first_order_of_new_customer  bool          
 7   product_id                   int32         
 8   add_to_cart_order            int16         
 9   reordered                    int8          
 10  product_name                 object        
 11  aisle_id                     float16       
 12  department_id                float16       
 13  price                        float32       
 14  product_info_available       bool          
 15  price_available              bool          
 16

### 0.6 Exporting the new df

In [67]:
#Exporting final df
df.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_all2.pkl'))