# 4.9.1 Incorporate customer data set

Table of Contents

#### 1. Import data and libraries
#### 2. Data wrangling
#### 3. Data consistency checks
#### 4. Merge dataframes
#### 5. Export final dataframe

## 01. Import data and libraries

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [2]:
# Create a path
#
# The project root defaults to the original author's local folder. Set the
# INSTACART_PATH environment variable to run the notebook on another machine
# without editing this cell.

path = os.environ.get(
    'INSTACART_PATH',
    r'/Users/giadairene/Documents/CareerFoundry Data Analytics/Data Analytics Immersion/Achievement 4/Instacart Basket Analysis',
)

In [3]:
# Load the raw customer data set from the 'Original Data' folder.
# index_col = False tells pandas not to use the first column as the index.

customers_csv = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
df_customer = pd.read_csv(customers_csv, index_col = False)

## 02. Data wrangling

In [4]:
# Preview the first five rows of the customer dataframe

df_customer.head(n = 5)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [5]:
# Check the dimensions of the dataframe as (rows, columns)
df_customer.shape

(206209, 10)

### Renaming columns

In [6]:
# Rename all inconsistently named columns in a single call.
# NOTE: the raw file spells the surname column 'Surnam' (see the header of the
# preview above). The key must match that spelling exactly, because rename()
# silently ignores keys that do not exist in the dataframe.

column_renames = {
    'First Name' : 'first_name',
    'Surnam' : 'last_name',
    'Gender' : 'gender',
    'STATE' : 'state',
    'Age' : 'age',
    'n_dependants' : 'no_of_dependants',
    'fam_status' : 'family_status',
}

df_customer = df_customer.rename(columns = column_renames)

# Fail loudly if any target column is missing, e.g. because a source column
# name was misspelled. This check is safe to re-run: it only looks at the
# resulting column names.
missing_columns = set(column_renames.values()) - set(df_customer.columns)
assert not missing_columns, f'Columns not renamed: {sorted(missing_columns)}'

In [14]:
# Confirm the new column names by previewing the first five rows

df_customer.head(n = 5)

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,no_of_dependants,family_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


### Changing variables datatype

In [15]:
# Inspect the data type of each column, starting with 'user_id'

df_customer.dtypes['user_id']

dtype('int64')

In [16]:
# Data type of 'first_name'
df_customer.dtypes['first_name']

dtype('O')

In [17]:
# Data type of 'last_name'
df_customer.dtypes['last_name']

dtype('O')

In [18]:
# Data type of 'gender'
df_customer.dtypes['gender']

dtype('O')

In [19]:
# Data type of 'state'
df_customer.dtypes['state']

dtype('O')

In [20]:
# Data type of 'age'
df_customer.dtypes['age']

dtype('int64')

In [21]:
# Data type of 'date_joined'
df_customer.dtypes['date_joined']

dtype('O')

In [28]:
# Show user_id and date_joined for every customer whose join date is not
# '1/1/2017', selecting rows and columns in one .loc call.
joined_later = df_customer['date_joined'] != '1/1/2017'
df_customer.loc[joined_later, ['user_id', 'date_joined']]

Unnamed: 0,user_id,date_joined
159,51556,1/2/2017
160,59989,1/2/2017
161,183738,1/2/2017
162,190771,1/2/2017
163,11866,1/2/2017
...,...,...
206204,168073,4/1/2020
206205,49635,4/1/2020
206206,135902,4/1/2020
206207,81095,4/1/2020


In [22]:
# Data type of 'no_of_dependants'
df_customer.dtypes['no_of_dependants']

dtype('int64')

In [23]:
# Data type of 'family_status'
df_customer.dtypes['family_status']

dtype('O')

In [24]:
# Data type of 'income'
df_customer.dtypes['income']

dtype('int64')

In [25]:
# Store 'user_id' as a string: it is an identifier, not a quantity

user_id_as_text = df_customer['user_id'].astype(str)
df_customer['user_id'] = user_id_as_text

In [27]:
# Confirm that 'user_id' is no longer an integer column

df_customer.dtypes['user_id']

dtype('O')

#### The data types of all the other variables look appropriate, so no further changes are needed.

## 03. Data consistency checks

In [29]:
# Check basic stats
# describe() summarises only the numeric columns by default; 'user_id' is
# left out because it was converted to a string earlier in the notebook.

df_customer.describe()

Unnamed: 0,age,no_of_dependants,income
count,206209.0,206209.0,206209.0
mean,49.501646,1.499823,94632.852548
std,18.480962,1.118433,42473.786988
min,18.0,0.0,25903.0
25%,33.0,0.0,59874.0
50%,49.0,1.0,93547.0
75%,66.0,3.0,124244.0
max,81.0,3.0,593901.0


#### The maximum income value (593,901) looks unusually high compared with the 75th percentile (124,244) and should be checked with the client.

### Missing values

In [30]:
# Count the missing values in each column

df_customer.isna().sum()

user_id                 0
first_name          11259
last_name               0
gender                  0
state                   0
age                     0
date_joined             0
no_of_dependants        0
family_status           0
income                  0
dtype: int64

In [32]:
# Create a subset containing only the rows where 'first_name' is missing.
# isnull() already returns a boolean mask, so it is used directly as the filter.

df_nan = df_customer[df_customer['first_name'].isnull()]

In [33]:
# Display the rows with a missing first name (pandas truncates long frames)
df_nan

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,no_of_dependants,family_status,income
53,76659,,Gilbert,Male,Colorado,26,1/1/2017,2,married,41709
73,13738,,Frost,Female,Louisiana,39,1/1/2017,0,single,82518
82,89996,,Dawson,Female,Oregon,52,1/1/2017,3,married,117099
99,96166,,Oconnor,Male,Oklahoma,51,1/1/2017,1,married,155673
105,29778,,Dawson,Female,Utah,63,1/1/2017,3,married,151819
...,...,...,...,...,...,...,...,...,...,...
206038,121317,,Melton,Male,Pennsylvania,28,3/31/2020,3,married,87783
206044,200799,,Copeland,Female,Hawaii,52,4/1/2020,2,married,108488
206090,167394,,Frost,Female,Hawaii,61,4/1/2020,1,married,45275
206162,187532,,Floyd,Female,California,39,4/1/2020,0,single,56325


#### The missing values in the 'first_name' column will not affect the analysis; therefore, no action is required.

### Duplicate values

In [34]:
# Collect rows that are exact copies of an earlier row (all columns equal)

is_full_duplicate = df_customer.duplicated()
df_dups = df_customer[is_full_duplicate]

In [35]:
# Display the duplicated rows; an empty frame means no full duplicates exist
df_dups

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,no_of_dependants,family_status,income


#### No duplicates found.

## 04. Merge dataframes

In [36]:
# Load the prepared orders/products dataframe produced earlier in the project

ords_prods_pkl = os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_grouped.pkl')
ords_prods_grouped = pd.read_pickle(ords_prods_pkl)

In [37]:
# Preview the first five rows of the orders/products dataframe

ords_prods_grouped.head(n = 5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,_merge,price_range_loc,busiest_day,busiest_period_of_day,max_order,loyalty_flag,average_product_price,spending_flag,median_days_between_orders,frequency_flag
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,...,both,Mid-range product,Regularly busy,Most orders,32,Regular customer,6.935811,Low spender,8.0,Frequent customer
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,...,both,Mid-range product,Regularly busy,Average orders,32,Regular customer,6.935811,Low spender,8.0,Frequent customer
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,...,both,Mid-range product,Busiest days,Average orders,5,New customer,7.930208,Low spender,8.0,Frequent customer
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,...,both,Mid-range product,Least busy,Most orders,3,New customer,4.972414,Low spender,9.0,Frequent customer
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,...,both,Mid-range product,Least busy,Average orders,3,New customer,4.972414,Low spender,9.0,Frequent customer


In [39]:
# The merge key must have the same data type in both dataframes, so check
# the type of 'user_id' in ords_prods_grouped

ords_prods_grouped.dtypes['user_id']

dtype('int64')

In [40]:
# Convert 'user_id' to string so it matches df_customer['user_id']

ords_user_id_as_text = ords_prods_grouped['user_id'].astype(str)
ords_prods_grouped['user_id'] = ords_user_id_as_text

In [41]:
# Merge customer data with the orders/products data on 'user_id'.
# how = 'inner' is the pandas default and is spelled out here for clarity.
# With an inner join only matching keys are kept, so the indicator column
# 'merge_flag' can only take the value 'both'.

df_final = pd.merge(
    df_customer,
    ords_prods_grouped,
    how = 'inner',
    on = 'user_id',
    indicator = 'merge_flag',
)

In [42]:
# Preview the first five rows of the merged dataframe

df_final.head(n = 5)

Unnamed: 0,user_id,first_name,last_name,gender,state,age,date_joined,no_of_dependants,family_status,income,...,price_range_loc,busiest_day,busiest_period_of_day,max_order,loyalty_flag,average_product_price,spending_flag,median_days_between_orders,frequency_flag,merge_flag
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Mid-range product,Busiest days,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both
1,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Mid-range product,Regularly busy,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both
2,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Mid-range product,Busiest days,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both
3,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Low-range product,Regularly busy,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both
4,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Low-range product,Least busy,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both


In [44]:
# Check whether there was a full match between the two dataframes
# NOTE(review): the merge above does not pass `how`, so pandas performs an
# inner join and every surviving row is flagged 'both'. Detecting unmatched
# user_ids would require an outer merge — confirm whether that is needed.

df_final['merge_flag'].value_counts()

merge_flag
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

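#### The resulting dataframe (after the merge) has 32,404,859 rows, and each of those rows has information found in both input dataframes, because we used an inner join for the purposes of this project. Note that an inner join keeps only matching keys, so the flag can only show 'both'; an outer join would be needed to reveal unmatched records.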

## 05. Export final dataframe

In [45]:
# Save the merged dataframe as a pickle in the 'Prepared Data' folder

final_pkl = os.path.join(path, '02 Data', 'Prepared Data', 'final.pkl')
df_final.to_pickle(final_pkl)