In [None]:
# This notebook is to merge the orders_products data with customer data

In [None]:
# Content List
#
# 01. Importing libraries and data (Task 3)
# 02. Data wrangling (Task 4)
# 03. Consistency checks (Task 5)
# 03.01. Exploratory data analysis
# 03.02. Missing values
# 03.03. Mixed data types
# 03.04. Duplicates
# 04. Combining data (Task 6)
# 05. Exporting data (Task 8)

# 01. Importing libraries and data (Task 3)

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Defining default file path for data access.
# The project root can be overridden per machine through the INSTACART_PROJECT_PATH
# environment variable; when the variable is not set, the original local folder is used,
# so the notebook behaves exactly as before on the author's machine.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'/Users/bladael/Documents/Learning/CareerFoundry_DA/Data Immersion/Achievement 4/06-2023 Instacart Basket Analysis',
)

In [3]:
# Load the customer data set (customers.csv) from the 'Original Data' folder.
# index_col = False tells pandas not to use the first column of the file as the index.
df_customer = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'customers.csv'),
    index_col = False,
)

# 02. Data wrangling (Task 4)

In [5]:
# Check data shape: (number of rows, number of columns) of the imported customer data
df_customer.shape

(206209, 10)

In [6]:
# Preview the first five rows of the imported customer data
df_customer.head(5)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


CP: Because the planned analyses focus on spending patterns across different customer demographics, the columns containing irrelevant personal information (first name and surname) are removed.

In [24]:
# Deleting 'First Name' and 'Surnam' columns.
# The result is reassigned (instead of using inplace = True) and errors = 'ignore' is set so
# that the cell is idempotent: re-running it after the columns are already gone no longer
# raises a KeyError.
df_customer = df_customer.drop(columns = ['First Name', 'Surnam'], errors = 'ignore')

In [25]:
# Preview the first five rows of the wrangled dataframe (name columns removed)
df_customer.head(5)

Unnamed: 0,user_id,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374


# 03. Consistency checks (Task 5)

## 03.01. Exploratory data analysis

In [26]:
# Check info: column names, non-null counts, dtypes and memory usage of the wrangled dataframe
df_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   Gender        206209 non-null  object
 2   STATE         206209 non-null  object
 3   Age           206209 non-null  int64 
 4   date_joined   206209 non-null  object
 5   n_dependants  206209 non-null  int64 
 6   fam_status    206209 non-null  object
 7   income        206209 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 12.6+ MB


In [27]:
# Descriptive statistics for the quantitative column 'Age'
df_customer.loc[:, 'Age'].describe()

count    206209.000000
mean         49.501646
std          18.480962
min          18.000000
25%          33.000000
50%          49.000000
75%          66.000000
max          81.000000
Name: Age, dtype: float64

In [28]:
# Descriptive statistics for the quantitative column 'n_dependants'
df_customer.loc[:, 'n_dependants'].describe()

count    206209.000000
mean          1.499823
std           1.118433
min           0.000000
25%           0.000000
50%           1.000000
75%           3.000000
max           3.000000
Name: n_dependants, dtype: float64

In [29]:
# Descriptive statistics for the quantitative column 'income'
df_customer.loc[:, 'income'].describe()

count    206209.000000
mean      94632.852548
std       42473.786988
min       25903.000000
25%       59874.000000
50%       93547.000000
75%      124244.000000
max      593901.000000
Name: income, dtype: float64

CP: The descriptive stats over the key quantitative columns look reasonable

## 03.02. Missing values

In [30]:
# Count the missing values in every column
df_customer.isna().sum()

user_id         0
Gender          0
STATE           0
Age             0
date_joined     0
n_dependants    0
fam_status      0
income          0
dtype: int64

CP: No columns with missing values identified

## 03.03. Mixed data types

In [31]:
# Check for columns with mixed type data
#
# A column is reported as mixed-type when its values do not all share one Python type.
# Series.map(type) replaces DataFrame.applymap, which is deprecated since pandas 2.1, and
# counting the distinct types avoids indexing the first row, which fails on an empty dataframe.

for col in df_customer.columns.tolist():
    n_types = df_customer[col].map(type).nunique()
    if n_types > 1:
        print(col,': contains mixed-type data')
    else:
        print(col,': no mixed-type data')

user_id : no mixed-type data
Gender : no mixed-type data
STATE : no mixed-type data
Age : no mixed-type data
date_joined : no mixed-type data
n_dependants : no mixed-type data
fam_status : no mixed-type data
income : no mixed-type data


CP: No columns with mixed-type data identified

## 03.04. Duplicates

In [32]:
# Show every row that is an exact copy (across all columns) of an earlier row;
# an empty result means there are no fully duplicated records

df_customer.loc[df_customer.duplicated(keep = 'first')]

Unnamed: 0,user_id,Gender,STATE,Age,date_joined,n_dependants,fam_status,income


CP: No duplicates identified

# 04. Combining data (Task 6)

In [34]:
# Load the master data (orders_products merged) that the customer data will be merged into.
# NOTE: pickle files should only be loaded from a trusted source; this one is read from the
# project's own 'Prepared Data' folder.

ords_prods_merge = pd.read_pickle(
    os.path.join(
        path,
        '02 Data',
        'Prepared Data',
        'orders_products_merged_aggregate_data_analyses_cleaned.pkl',
    )
)

In [35]:
# Check info of the imported data: column names and dtypes of the master dataframe

ords_prods_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32404859 entries, 0 to 32404858
Data columns (total 25 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_sequence_number   int64  
 3   order_days_of_week      int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
 6   product_id              int64  
 7   add_to_cart_order       int64  
 8   reordered               int64  
 9   product_name            object 
 10  aisle_id                int64  
 11  department_id           int64  
 12  prices                  float64
 13  price_range_loc         object 
 14  busiest_day             object 
 15  Busiest_Days            object 
 16  busiest_period_of_day   object 
 17  max_order               int64  
 18  loyalty_flag            object 
 19  total_ord_spend         float64
 20  total_ord_count         int64  
 21  avg_ord_spend           float

In [36]:
# Check info of the dataframe to be combined (customers) to compare its columns and dtypes
# with those of the master data before merging

df_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   Gender        206209 non-null  object
 2   STATE         206209 non-null  object
 3   Age           206209 non-null  int64 
 4   date_joined   206209 non-null  object
 5   n_dependants  206209 non-null  int64 
 6   fam_status    206209 non-null  object
 7   income        206209 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 12.6+ MB


CP: It appears that the 'user_id' column, which is present in both dataframes with the same data type (int64), can be used as the common key for merging them

In [38]:
# Check matched vs. unmatched rows if a full join were to be conducted.
# Only the key column of each dataframe takes part in this trial merge: the '_merge' counts
# depend solely on the keys, so the result is identical to merging the full dataframes while
# avoiding a temporary copy of every other column for each of the rows.

pd.merge(ords_prods_merge[['user_id']], df_customer[['user_id']], on = 'user_id', indicator = True, how = 'outer')['_merge'].value_counts(dropna = False)

both          32404859
left_only            0
right_only           0
Name: _merge, dtype: int64

CP: It appears to be a full match between the two dataframes (no 'left_only' or 'right_only' rows), so proceeding with the merge

In [39]:
# Merge the two dataframes. Given that it's a full match, the default inner join is used
# (stated explicitly through how = 'inner').
# validate = 'many_to_one' makes pandas raise a MergeError if 'user_id' is not unique in
# df_customer; without it, a duplicated customer key would silently multiply the order rows.
# indicator = True adds the '_merge' column used to verify the merge result.

ords_prods_custs_merge = pd.merge(ords_prods_merge, df_customer, on = 'user_id', how = 'inner', indicator = True, validate = 'many_to_one')

In [40]:
# Preview the first five rows of the merged data

ords_prods_custs_merge.head(5)

Unnamed: 0,order_id,user_id,order_sequence_number,order_days_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,median_frequency,frequency_flag,Gender,STATE,Age,date_joined,n_dependants,fam_status,income,_merge
0,2539329,1,1,2,8,0.0,196,1,0,Soda,...,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both


In [41]:
# Remove the limit on displayed columns so that dataframes are shown with every column

pd.set_option('display.max_columns', None)

In [42]:
# Preview the first five rows again, now that all columns are displayed

ords_prods_custs_merge.head(5)

Unnamed: 0,order_id,user_id,order_sequence_number,order_days_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,Busiest_Days,busiest_period_of_day,max_order,loyalty_flag,total_ord_spend,total_ord_count,avg_ord_spend,spending_flag,median_frequency,frequency_flag,Gender,STATE,Age,date_joined,n_dependants,fam_status,income,_merge
0,2539329,1,1,2,8,0.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Normal day,Average orders,10,New customer,375.7,10,37.57,High spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest day,Average orders,10,New customer,375.7,10,37.57,High spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Regularly busy,Slowest day,Most orders,10,New customer,375.7,10,37.57,High spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest day,Average orders,10,New customer,375.7,10,37.57,High spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Least busy,Slowest day,Most orders,10,New customer,375.7,10,37.57,High spender,20.0,Regular customer,Female,Alabama,31,2/17/2019,3,married,40423,both


In [43]:
# Tally the values of the '_merge' indicator column in the merged dataframe

ords_prods_custs_merge.loc[:, '_merge'].value_counts(dropna = False)

both          32404859
left_only            0
right_only           0
Name: _merge, dtype: int64

In [44]:
# Check the count against the master data (ords_prods_merge).
# The assertion turns the visual comparison into an automatic check: it fails if the merge
# added or dropped rows. The shape of the master data is still displayed for reference.

assert len(ords_prods_custs_merge) == len(ords_prods_merge), 'merge changed the number of rows'

ords_prods_merge.shape

(32404859, 25)

CP: The merged dataframe has the same number of rows as the master data (32,404,859), so it appears that the merge was successfully executed

# 05. Exporting data (Task 8)

In [45]:
# Save the combined dataframe as a pickle file in the 'Prepared Data' folder.
# NOTE(review): the '_merge' indicator column created by the merge is still part of this
# dataframe and is written to the file with it - confirm that this is intended.

ords_prods_custs_merge.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_customers_merged.pkl')
)