# Table of Contents

0.1 Importing Libraries

0.2 Importing Data

0.3 Exploring Original Dfs

0.4 Merging Dfs

0.5 Confirming Merged Df

0.6 Adding Flag for product_info_available

0.7 Adding Flag for price_available

0.8 Removing the Merge Flag

0.9 Adjusting Data Types in the final Df

0.10 Exporting the Df

### 0.1 Importing Libraries

In [3]:
# Import libraries
import pandas as pd
import numpy as np
import os

### 0.2 Importing Data

In [4]:
# Identify the file pathway to data files.
# The project root can be overridden with the INSTACART_PROJECT_DIR environment
# variable so the notebook can run on other machines; when the variable is
# unset, the original local project folder is used.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'C:\Users\CJ\Documents\_CJ-Stuff\Career Foundry\Data Immersion\Ach 4 - Python\2023-03 Instacart Basket Analysis',
)

In [5]:
# Import data from products_clean.pkl and orders_products_combined.pkl

In [6]:
# Load the cleaned products table from the project's prepared-data folder
products_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'products_clean.pkl')
df_prods = pd.read_pickle(products_pickle)

In [7]:
# Load the combined orders/products table from the project's prepared-data folder
orders_products_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
df_orders_products_combined = pd.read_pickle(orders_products_pickle)

### 0.3 Exploring Original Dfs

In [8]:
# Comparing shapes of the two dfs, starting with the orders/products df
df_orders_products_combined.shape

(32434489, 10)

In [9]:
# Shape of the products df, for comparison with the orders/products df
df_prods.shape

(49668, 5)

Merge should be 32434489 rows and 15 columns.
(Columns = 10 + 5 + 1 merge field - 1 key overlap)

In [10]:
# Checking for nulls in each df: per-column null counts for the orders/products df
df_orders_products_combined.isnull().sum()

order_id                             0
user_id                              0
order_number                         0
order_day_of_week                    0
order_hour_of_day                    0
days_since_prior_order         2078068
first_order_of_new_customer          0
product_id                           0
add_to_cart_order                    0
reordered                            0
dtype: int64

In [11]:
# Per-column null counts for the products df
df_prods.isnull().sum()

product_id       0
product_name     0
aisle_id         0
department_id    0
price            2
dtype: int64

The nulls in each df are known nulls.
The days_since_prior_order NaNs are flagged via first_order_of_new_customer.
The price NaNs will have a flag added to them in this script (price_available).

In [12]:
# Let's predict the number of price nulls to expect in the final df.
# Currently there are two rows with NaN prices; isnull() already returns a
# boolean mask, so it is used directly as the row filter.
df_prods[df_prods['price'].isnull()]

Unnamed: 0,product_id,product_name,aisle_id,department_id,price
21554,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,
33666,33664,2 % Reduced Fat Milk,84,16,


In [13]:
# First extract the product_id for the rows with NaN prices.
# isnull() yields a boolean mask that .loc uses to select those rows.
no_price = df_prods.loc[df_prods['price'].isnull(), 'product_id']

In [14]:
# Display the product_ids that have no price
no_price

21554    21553
33666    33664
Name: product_id, dtype: int32

In [15]:
# Identify how many times those products were ordered:
# flag the order rows whose product_id is one of the price-less products,
# then show only those rows
ordered_without_price = df_orders_products_combined['product_id'].isin(no_price)
df_orders_products_combined[ordered_without_price]

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order_of_new_customer,product_id,add_to_cart_order,reordered
1576,912404,17,12,2,14,5.0,False,21553,5,0
1638,603376,17,22,6,16,4.0,False,21553,3,1
16534,3264360,135,2,2,21,13.0,False,21553,6,0
16540,892534,135,3,0,8,12.0,False,21553,3,1
53711,229704,342,8,1,19,30.0,False,21553,9,0
...,...,...,...,...,...,...,...,...,...,...
32350064,3172853,205650,18,1,9,7.0,False,21553,17,1
32377297,2504315,205818,3,5,15,3.0,False,21553,13,0
32377307,1108388,205818,5,4,5,1.0,False,21553,5,1
32410137,1916142,206049,1,2,17,,True,21553,2,0


The products with NaN prices were ordered 5127 times, so we can expect to see that reflected in the final merged df.

In [16]:
# Check the output of the two dfs: first five rows of the orders/products df
df_orders_products_combined.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order_of_new_customer,product_id,add_to_cart_order,reordered
0,2539329,1,1,2,8,,True,196,1,0
1,2539329,1,1,2,8,,True,14084,2,0
2,2539329,1,1,2,8,,True,12427,3,0
3,2539329,1,1,2,8,,True,26088,4,0
4,2539329,1,1,2,8,,True,26405,5,0


In [17]:
# First five rows of the products df
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,price
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [18]:
# Comparing the datatypes of the two dfs, starting with the orders/products df
df_orders_products_combined.dtypes

order_id                         int64
user_id                          int64
order_number                     int16
order_day_of_week                 int8
order_hour_of_day                 int8
days_since_prior_order         float16
first_order_of_new_customer       bool
product_id                       int32
add_to_cart_order                int16
reordered                         int8
dtype: object

In [19]:
# Datatypes of the products df
df_prods.dtypes

product_id         int32
product_name      object
aisle_id           int16
department_id      int16
price            float32
dtype: object

The product_id column will be the key for merging the two tables.

### 0.4 Merging Dfs

In [20]:
# An exploratory OUTER merge resulted in value_counts of:
#       both          32403719
#       left_only        30770
#       right_only          11
#
# LEFT merge is used for the final merge because we want to be able to
# explore all orders -- even if we aren't sure what product was purchased.
# indicator = True adds a '_merge' column recording whether each row found a
# match in df_prods.
df_merged = df_orders_products_combined.merge(df_prods, on = ['product_id'], how = 'left', indicator = True)

# A left merge never drops rows, so a different row count can only mean that
# product_id is duplicated in df_prods and order lines were multiplied.
assert len(df_merged) == len(df_orders_products_combined), 'left merge changed the row count'

### 0.5 Confirming Merged Df

In [21]:
# Count how many merged rows matched in both dfs vs. only in the left (orders) df
df_merged['_merge'].value_counts()

both          32403719
left_only        30770
right_only           0
Name: _merge, dtype: int64

In [22]:
# Shape of the merged df, to compare against the expected row and column counts
df_merged.shape

(32434489, 15)

Merge has expected number of rows (32434489) and columns (15).

In [23]:
# Preview the first five rows of the merged df
df_merged.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order_of_new_customer,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,price,_merge
0,2539329,1,1,2,8,,True,196,1,0,Soda,77.0,7.0,9.0,both
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91.0,16.0,12.5,both
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23.0,19.0,4.4,both
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23.0,19.0,4.7,both
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54.0,17.0,1.0,both


In [24]:
# Checking for nulls across df: per-column null counts after the merge
df_merged.isnull().sum()

order_id                             0
user_id                              0
order_number                         0
order_day_of_week                    0
order_hour_of_day                    0
days_since_prior_order         2078068
first_order_of_new_customer          0
product_id                           0
add_to_cart_order                    0
reordered                            0
product_name                     30770
aisle_id                         30770
department_id                    30770
price                            35897
_merge                               0
dtype: int64

The null values in days_since_prior_order have already been accounted for with the first_order_of_new_customer flag.

We anticipated 30,770 nulls for products where no info was available.

We also anticipated the additional 5127 nulls for prices due to two products not having a price available. (30,770 + 5127 = 35,897)

Now we'll add two flags:

* product_info_available for the 30,770 instances where no product_name, aisle_id, or department_id is available.

* price_available for the 35,897 instances of no price available.



### 0.6 Adding Flag for product_info_available

In [25]:
# Assigning boolean values to new column:
# True where the merge indicator reports a match in both dfs, False otherwise
matched_in_products = df_merged['_merge'].eq('both')
df_merged['product_info_available'] = matched_in_products

In [26]:
# Checking whether the first rows were correctly labeled True/False in the new product_info_available column.
df_merged.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order_of_new_customer,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,price,_merge,product_info_available
0,2539329,1,1,2,8,,True,196,1,0,Soda,77.0,7.0,9.0,both,True
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91.0,16.0,12.5,both,True
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23.0,19.0,4.4,both,True
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23.0,19.0,4.7,both,True
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54.0,17.0,1.0,both,True


In [27]:
# Confirming that the correct number of True/False values appear
# (expected: 30,770 False, matching the left_only rows of the merge)
df_merged['product_info_available'].value_counts()

True     32403719
False       30770
Name: product_info_available, dtype: int64

In [28]:
# Confirming that the new column is showing up as the correct datatype (bool)
df_merged['product_info_available'].dtype

dtype('bool')

In [29]:
# Confirming shape after adding flag (one more column than before)
df_merged.shape

(32434489, 16)

### 0.7 Adding Flag for price_available

In [30]:
# Assigning boolean values to new column:
# True where a price is present, False where the price is NaN
df_merged['price_available'] = df_merged['price'].notnull()

In [31]:
# Checking whether the first rows were correctly labeled True/False in the new price_available column.
df_merged.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order_of_new_customer,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,price,_merge,product_info_available,price_available
0,2539329,1,1,2,8,,True,196,1,0,Soda,77.0,7.0,9.0,both,True,True
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91.0,16.0,12.5,both,True,True
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23.0,19.0,4.4,both,True,True
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23.0,19.0,4.7,both,True,True
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54.0,17.0,1.0,both,True,True


In [32]:
# Confirming that the correct number of True/False values appear
# (expected: 35,897 False, matching the null count of the price column)
df_merged['price_available'].value_counts()

True     32398592
False       35897
Name: price_available, dtype: int64

In [33]:
# Confirming that the new column is showing up as the correct datatype (bool)
df_merged['price_available'].dtype

dtype('bool')

In [34]:
# Confirming shape after adding flag (one more column than before)
df_merged.shape

(32434489, 17)

### 0.8 Removing the Merge Flag

In [35]:
# The '_merge' indicator has served its purpose; drop that column into a new
# df so it doesn't interfere with future merges
df_merged_final = df_merged.drop('_merge', axis = 'columns')

In [36]:
# Preview the final df to confirm the _merge column is gone
df_merged_final.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order_of_new_customer,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,price,product_info_available,price_available
0,2539329,1,1,2,8,,True,196,1,0,Soda,77.0,7.0,9.0,True,True
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91.0,16.0,12.5,True,True
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23.0,19.0,4.4,True,True
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23.0,19.0,4.7,True,True
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54.0,17.0,1.0,True,True


In [37]:
# Confirming shape after removing flag (one fewer column than df_merged)
df_merged_final.shape

(32434489, 16)

### 0.9 Adjusting Data Types in the final Df

In [38]:
# Confirming final df details (column dtypes and memory usage)
df_merged_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 16 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   order_id                     int64  
 1   user_id                      int64  
 2   order_number                 int16  
 3   order_day_of_week            int8   
 4   order_hour_of_day            int8   
 5   days_since_prior_order       float16
 6   first_order_of_new_customer  bool   
 7   product_id                   int32  
 8   add_to_cart_order            int16  
 9   reordered                    int8   
 10  product_name                 object 
 11  aisle_id                     float64
 12  department_id                float64
 13  price                        float32
 14  product_info_available       bool   
 15  price_available              bool   
dtypes: bool(3), float16(1), float32(1), float64(2), int16(2), int32(1), int64(2), int8(3), object(1)
memory usage: 2.1+ GB


Since we now have nulls/NaNs in the aisle_id and department_id columns, pandas has converted them from integers to float64 (NumPy integer types cannot hold NaN). However, they do not need to be that large of a float since each one contains values < 150, so we'll retype them to help keep the df more efficient.

In [39]:
# aisle_id ranges between 1 and 134 and department_id between 1 and 21.
# float16 stores every whole number up to 2048 exactly, so both columns fit
# with room to grow while using a quarter of the memory of float64.
for id_column in ['aisle_id', 'department_id']:
    df_merged_final[id_column] = df_merged_final[id_column].astype('float16')

In [40]:
# Confirming final df details after retyping aisle_id and department_id
df_merged_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 16 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   order_id                     int64  
 1   user_id                      int64  
 2   order_number                 int16  
 3   order_day_of_week            int8   
 4   order_hour_of_day            int8   
 5   days_since_prior_order       float16
 6   first_order_of_new_customer  bool   
 7   product_id                   int32  
 8   add_to_cart_order            int16  
 9   reordered                    int8   
 10  product_name                 object 
 11  aisle_id                     float16
 12  department_id                float16
 13  price                        float32
 14  product_info_available       bool   
 15  price_available              bool   
dtypes: bool(3), float16(3), float32(1), int16(2), int32(1), int64(2), int8(3), object(1)
memory usage: 1.7+ GB


### 0.10 Exporting the Df

In [41]:
# Exporting df_merged_final as a pickle in the project's prepared-data folder
export_target = os.path.join(path, '02 Data','Prepared Data', 'orders_products_merged.pkl')
df_merged_final.to_pickle(export_target)