In [None]:
# This notebook is to merge the orders dataset with the order_products_prior dataset

In [None]:
# Content List
#
# 01. Importing libraries
# 02. Importing data
# Importing order data
# Importing order_products_prior data
# 03. Consistency check (df_ords_prior)
# Missing value check
# Mixed-type data check
# Duplicate check
# 04. Merging data (Task 1)
# Checking common columns
# Checking for match
# Merging dataframes
# 05. Exporting data (Task 2)

# 01. Importing libraries

In [2]:
# Import libraries

import pandas as pd
import numpy as np
import os

# 02. Importing data

In [3]:
# Defining default file directory
#
# The project root defaults to the original local folder. Set the
# INSTACART_PROJECT_DIR environment variable to run this notebook on another
# machine without editing the cell; an unset or empty variable falls back to
# the default below.

path = os.environ.get('INSTACART_PROJECT_DIR') or r'/Users/bladael/Documents/Learning/CareerFoundry_DA/Data Immersion/Achievement 4/06-2023 Instacart Basket Analysis'

## Importing order data

In [None]:
# Load the previously checked orders data from the Prepared Data folder

df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'),
    index_col=False,
)

In [9]:
# Dimension check: display (rows, columns) of the imported orders data

df_ords.shape

(3421083, 6)

## Importing order_products_prior data

In [11]:
# Load the raw order_products__prior data from the Original Data folder

df_ords_prior = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'order_products__prior.csv'),
    index_col=False,
)

In [13]:
# Dimension check: display (rows, columns) of the imported order_products_prior data

df_ords_prior.shape

(32434489, 4)

# 03. Consistency check (df_ords_prior)

## Missing value check

In [25]:
# Count missing values per column
df_ords_prior.isna().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

## Mixed-type data check

In [26]:
# Report, for every column, whether its values share a single Python type.
# Series.map(type) replaces DataFrame.applymap (deprecated since pandas 2.1).
# Counting distinct types also avoids indexing the first row, which would
# raise on an empty dataframe, and avoids building a filtered copy of the
# whole frame just to take its length.
for col in df_ords_prior.columns:
    if df_ords_prior[col].map(type).nunique() > 1:
        # More than one type found: print the offending column name
        print(col)
    else:
        print(col,': no mixed-type data')

order_id : no mixed-type data
product_id : no mixed-type data
add_to_cart_order : no mixed-type data
reordered : no mixed-type data


## Duplicate check

In [28]:
# Display rows that are full duplicates of an earlier row
df_ords_prior.loc[df_ords_prior.duplicated()]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered


In [34]:
# Check data shape again after the consistency checks (the checks above only
# inspect df_ords_prior; none of them reassigns or modifies it)

df_ords_prior.shape

(32434489, 4)

CP: The raw data looks clean (no missing values, mixed-type columns, or duplicate rows were found). Proceeding with the merge.

# 04. Merging data (Task 1)

## Checking common columns

In [35]:
# Preview the first rows of df_ords to inspect its columns
df_ords.head()

Unnamed: 0,order_id,user_id,order_sequence_number,order_days_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,0.0
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [36]:
# Preview the first rows of df_ords_prior to inspect its columns
df_ords_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


CP: order_id is the only common column between the two dataframes.

## Checking for match

In [37]:
# Trial inner join on order_id, displayed only (not assigned), with the
# _merge indicator column showing where each row was matched

df_ords.merge(df_ords_prior, how='inner', on='order_id', indicator=True)

Unnamed: 0,order_id,user_id,order_sequence_number,order_days_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,0.0,196,1,0,both
1,2539329,1,1,2,8,0.0,14084,2,0,both
2,2539329,1,1,2,8,0.0,12427,3,0,both
3,2539329,1,1,2,8,0.0,26088,4,0,both
4,2539329,1,1,2,8,0.0,26405,5,0,both
...,...,...,...,...,...,...,...,...,...,...
32434484,2977660,206209,13,1,12,7.0,14197,5,1,both
32434485,2977660,206209,13,1,12,7.0,38730,6,0,both
32434486,2977660,206209,13,1,12,7.0,31477,7,0,both
32434487,2977660,206209,13,1,12,7.0,6567,8,0,both


In [38]:
# Count rows by match status (both / left_only / right_only) using an outer join

(
    df_ords
    .merge(df_ords_prior, how='outer', on='order_id', indicator=True)['_merge']
    .value_counts()
)

both          32434489
left_only       206209
right_only           0
Name: _merge, dtype: int64

CP: The outer join shows that 206,209 records exist only in the left table (df_ords) and none exist only in the right table (df_ords_prior). However, as only the fully matched records are needed for the purpose of this Exercise, the default inner join will be used to create the merged dataframe.

## Merging dataframes

In [39]:
# Build the merged dataframe: inner join on order_id, keeping the _merge indicator column

df_merged_large = pd.merge(df_ords, df_ords_prior, how='inner', on='order_id', indicator=True)

In [40]:
# Preview the first rows of the merged dataframe
df_merged_large.head()

Unnamed: 0,order_id,user_id,order_sequence_number,order_days_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,0.0,196,1,0,both
1,2539329,1,1,2,8,0.0,14084,2,0,both
2,2539329,1,1,2,8,0.0,12427,3,0,both
3,2539329,1,1,2,8,0.0,26088,4,0,both
4,2539329,1,1,2,8,0.0,26405,5,0,both


In [41]:
# Double-check that the merged dataframe contains only fully matched records

df_merged_large['_merge'].value_counts()

both          32434489
left_only            0
right_only           0
Name: _merge, dtype: int64

# 05. Exporting data (Task 2)

In [42]:
# Write the merged dataframe to a pickle file in the Prepared Data folder

df_merged_large.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
)