# 03.9 Merging customers file with combined orders and products

### This script contains the following points:

### 1. Import libraries and files
### 2. Data wrangling
   #### 2.1 Dropping columns
   #### 2.2 Renaming columns
### 3. Data consistency checks
   #### 3.1 Checking for mixed-type data
   #### 3.2 Checking for missing values
   #### 3.3 Checking for duplicates
### 4. Merge the dataframe with prepared Instacart data

### 1. Import libraries and files

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [2]:
# Project root folder; the data files below are read from and written to sub-folders of it
# NOTE(review): this is an absolute, machine-specific path - adjust it when running elsewhere

path = r'C:\Users\dsadl\OneDrive\Documents\Career Foundry\Data Immersion\Project 4\05-10-2023 Instacart Basket Analysis'

In [3]:
# Load the original customers file; index_col = False stops pandas from using the first column as the index

customers_csv = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
df_cust = pd.read_csv(customers_csv, index_col = False)

In [4]:
# Load the prepared orders/products dataframe from its pickle file

ords_prods_pkl = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_viz.pkl')
df_ords_prods = pd.read_pickle(ords_prods_pkl)

### 2. Data wrangling

In [5]:
# Examine dataframe: number of rows and columns

df_cust.shape

(206209, 10)

In [6]:
# Preview the first rows of the customers dataframe

df_cust.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


#### 2.1 Dropping columns

In [7]:
# Count how often each date_joined value occurs (missing values included)
# to see whether all values are the same

df_cust['date_joined'].value_counts(dropna=False)

9/17/2018     213
2/10/2018     212
4/1/2019      211
9/21/2019     211
12/19/2017    210
             ... 
9/1/2018      141
1/22/2018     140
11/24/2017    139
7/18/2019     138
8/6/2018      128
Name: date_joined, Length: 1187, dtype: int64

In [8]:
# Inspect the data type of each column

df_cust.dtypes

user_id          int64
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

#### There are multiple values of "date_joined", so the column will not be dropped. Its dtype is `object`, so it might contain mixed data types; this will be covered by the consistency checks. There are no columns to drop.

#### 2.2 Renaming columns

In [9]:
# Renaming columns for consistency: lower-case snake_case names, applied in a single call

column_renames = {
    'First Name' : 'first_name',
    'Surnam' : 'last_name',
    'Gender' : 'gender',
    'STATE' : 'state',
    'Age' : 'age',
}
df_cust.rename(columns = column_renames, inplace = True)

In [14]:
# Check that the renaming was successful by listing the column names

df_cust.columns

Index(['user_id', 'first_name', 'last_name', 'gender', 'state', 'age',
       'date_joined', 'n_dependants', 'fam_status', 'income'],
      dtype='object')

### 3. Data consistency checks

#### 3.1 Checking for mixed-type data

In [15]:
# Mixed-type check: print the name of every column whose values are not all of one Python type

for col in df_cust.columns.tolist():
    # Series.map(type) yields the Python type of each value. It replaces
    # DataFrame.applymap, which is deprecated from pandas 2.1 onwards.
    value_types = df_cust[col].map(type)
    # More than one distinct type means the column mixes data types.
    if value_types.nunique() > 1:
        print(col)

first_name


In [16]:
# Inspect the dtype of the column flagged by the mixed-type check

df_cust['first_name'].dtype

dtype('O')

In [17]:
# Changing datatype of "first_name" column to string
# NOTE(review): in pandas versions before 3.0, astype('str') also turns missing values (NaN)
# into the literal text 'nan'. If the mixed types flagged above were NaNs, those rows
# will no longer be counted by isnull() - confirm whether that is intended.

df_cust['first_name'] = df_cust['first_name'].astype('str')

In [18]:
# Re-run the check for mixed-type data to confirm; nothing is printed when every column is uniform

for col in df_cust.columns.tolist():
    # Series.map(type) yields the Python type of each value. It replaces
    # DataFrame.applymap, which is deprecated from pandas 2.1 onwards.
    value_types = df_cust[col].map(type)
    # More than one distinct type means the column mixes data types.
    if value_types.nunique() > 1:
        print(col)

#### 3.2 Checking for missing values

In [19]:
# Missing values check: number of null entries in each column

df_cust.isna().sum()

user_id         0
first_name      0
last_name       0
gender          0
state           0
age             0
date_joined     0
n_dependants    0
fam_status      0
income          0
dtype: int64

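#### No null values are reported in the dataset. Note, however, that "first_name" was cast to string in section 3.1, so any first names that were originally missing would now be stored as the text 'nan' and would not be counted here.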

#### 3.3 Checking for duplicates

In [20]:
# Full-duplicate check: keep only the rows that repeat an earlier row in every column

is_repeat = df_cust.duplicated()
df_dups = df_cust.loc[is_repeat]

In [21]:
# Display the duplicated rows (an empty table means none were found)

df_dups

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income


#### There are no full duplicate values to address.

In [22]:
# Confirm the row and column counts after the consistency checks

df_cust.shape

(206209, 10)

### 4. Merge the dataframe with prepared Instacart data

#### The datatype for the "user_id" column in both datasets is verified to be the same, "int64."

In [23]:
# Merge the cleaned customers data (df_cust) with the prepared orders/products
# data loaded from orders_products_viz.pkl (df_ords_prods).
# how = 'inner' spells out the default join: only user_ids present in both frames are kept.
# validate = 'one_to_many' makes pandas raise a MergeError if a user_id occurs more than
# once in df_cust, which would otherwise silently duplicate order rows.

df_merged = df_cust.merge(df_ords_prods, on = 'user_id', how = 'inner', validate = 'one_to_many')

In [24]:
# Check the merged file by displaying the merged dataframe

df_merged

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,n_dependants,fam_status,income,...,busiest day,price_label,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_price,spending_flag,median_days,frequency_flag
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Regularly busy,Mid-range product,Busiest days,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer
1,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Regularly busy,Mid-range product,Regularly busy,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer
2,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Regularly busy,Mid-range product,Busiest days,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer
3,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Regularly busy,Mid-range product,Regularly busy,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer
4,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Regularly busy,Mid-range product,Least busy,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32404854,80148,Cynthia,Noble,Female,New York,55,4/1/2020,1,married,57095,...,Regularly busy,Low-range product,Regularly busy,Most orders,4,New customer,3.886667,Low spender,12.0,Regular customer
32404855,80148,Cynthia,Noble,Female,New York,55,4/1/2020,1,married,57095,...,Regularly busy,Low-range product,Regularly busy,Most orders,4,New customer,3.886667,Low spender,12.0,Regular customer
32404856,80148,Cynthia,Noble,Female,New York,55,4/1/2020,1,married,57095,...,Regularly busy,Low-range product,Regularly busy,Average orders,4,New customer,3.886667,Low spender,12.0,Regular customer
32404857,80148,Cynthia,Noble,Female,New York,55,4/1/2020,1,married,57095,...,Regularly busy,Low-range product,Regularly busy,Most orders,4,New customer,3.886667,Low spender,12.0,Regular customer


In [25]:
# List the columns of the merged dataframe
# NOTE(review): the output below still shows helper columns ('Unnamed: 0_x', '_merge',
# 'Unnamed: 0_y') - consider dropping them before export if they are not needed.

df_merged.columns

Index(['user_id', 'first_name', 'last_name', 'gender', 'state', 'age',
       'date_joined', 'n_dependants', 'fam_status', 'income', 'Unnamed: 0_x',
       'order_id', 'order_number', 'orders_day_of_week', 'order_time',
       'days_since_prior_order', 'product_id', 'add_to_cart_order',
       'reordered', '_merge', 'Unnamed: 0_y', 'product_name', 'aisle_id',
       'department_id', 'prices', 'price_range_loc', 'busiest day',
       'price_label', 'busiest_days', 'busiest_period_of_day', 'max_order',
       'loyalty_flag', 'mean_price', 'spending_flag', 'median_days',
       'frequency_flag'],
      dtype='object')

In [26]:
# Save the merged dataframe as a pickle file in the 'Prepared Data' folder

merged_pkl = os.path.join(path, '02 Data','Prepared Data', 'orders_products_customers.pkl')
df_merged.to_pickle(merged_pkl)