## Environment setup

In [3]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [4]:
path = r'/Users/Cel/Documents/Data Analytics/09-2023 Instacart Basket Analysis'

In [5]:
# Import customer file
df_cust = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'))

In [6]:
# Import merged file
df_ords_prods = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_grouped_data.pkl'))

## Data wrangling and quality checks

#### 01. Drop unnecessary columns from both data sets

In [7]:
# Check columns in orders and products df
df_ords_prods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 24 columns):
 #   Column                         Dtype   
---  ------                         -----   
 0   order_id                       int64   
 1   user_id                        int64   
 2   number_of_orders               int64   
 3   orders_day_of_week             int64   
 4   order_hour_of_day              int64   
 5   days_since_prior_order         float64 
 6   product_id                     int64   
 7   add_to_cart_order              int64   
 8   reordered                      int64   
 9   product_name                   object  
 10  aisle_id                       int64   
 11  department_id                  int64   
 12  prices                         float64 
 13  _merge                         category
 14  price_range_loc                object  
 15  busiest_period_of_day          object  
 16  busiest_day                    object  
 17  two_busiest_days         

In [8]:
# Drop unnecessary columns
df_ords_prods = df_ords_prods.drop(columns = ['_merge','aisle_id'])

In [9]:
# Check columns in customers df
df_cust.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


All columns are potentially useful. 

#### 02. Check data types

In [37]:
df_cust.dtypes

user_id             int64
first_name         object
surname            object
gender             object
state              object
age                 int64
date_joined        object
number_children     int64
family_status      object
income              int64
dtype: object

In [10]:
# Change user_id in df_cust to string
df_cust['user_id'] = df_cust['user_id'].astype('str')

In [45]:
# Ensure user_id key column is string in df_ords_prods
df_ords_prods.dtypes

order_id                            int64
user_id                             int64
number_of_orders                    int64
orders_day_of_week                  int64
order_hour_of_day                   int64
days_since_prior_order            float64
product_id                          int64
add_to_cart_order                   int64
reordered                           int64
product_name                       object
aisle_id                            int64
department_id                       int64
prices                            float64
_merge                           category
price_range_loc                    object
busiest_period_of_day              object
busiest_day                        object
two_busiest_days                   object
max_order                           int64
loyalty_flag                       object
average_spend                     float64
spending_category                  object
median_days_since_prior_order     float64
order_frequency                   

In [11]:
# Change user_id to string in df_ords_prods
df_ords_prods['user_id'] = df_ords_prods['user_id'].astype('str')

#### 03. Renaming columns

In [12]:
#Renaming columns
df_cust.rename(columns = {'Surnam' : 'surname'}, inplace = True)

In [13]:
df_cust.rename(columns = {'First Name' : 'first_name'}, inplace = True)

In [14]:
df_cust.rename(columns = {'Gender' : 'gender'}, inplace = True)

In [15]:
df_cust.rename(columns = {'STATE' : 'state'}, inplace = True)

In [16]:
df_cust.rename(columns = {'Age' : 'age'}, inplace = True)

In [17]:
df_cust.rename(columns = {'n_dependants' : 'number_children'}, inplace = True)

In [18]:
df_cust.rename(columns = {'fam_status' : 'family_status'}, inplace = True)

In [19]:
# Check renamed columns
df_cust.head()

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,number_children,family_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


#### 04. Fix mixed-type columns

In [21]:
# Use for-loop to check for mixed-type columns
for col in df_cust.columns.tolist():
  weird = (df_cust[[col]].applymap(type) != df_cust[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_cust[weird]) > 0:
    print (col)

first_name


In [20]:
# first_name has mixed-type columns - change entire column data type to string
df_cust['first_name'] = df_cust['first_name'].astype('str')

#### 05. Find missing values

In [21]:
# Find total number of missing values in each column
df_cust.isnull().sum()

user_id            0
first_name         0
surname            0
gender             0
state              0
age                0
date_joined        0
number_children    0
family_status      0
income             0
dtype: int64

No missing values in df_cust.

#### 06. Find duplicates

In [22]:
# Create new df containing duplicates
df_dups = df_cust[df_cust.duplicated()]

In [23]:
df_dups

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,number_children,family_status,income


No full duplicates in df_cust.

## Joining data

In [24]:
# Merge data with user_id key
df_merged = df_ords_prods.merge(df_cust, on = ['user_id'])

In [26]:
df_merged.shape

(32404859, 31)

In [27]:
#Export to pickle
df_merged.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_all.pkl'))