# 4.9 Part 1: Instacart


### This script contains the following points:
#### 01. Import libraries and data
#### 02. Descriptive Analysis
#### 03. Wrangle Data
#### 04. Data Quality & Consistency Checks
#### 05. Combine Data
#### 06. Export data

# 01 Import libraries and Data

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
#Create data path
path = r'C:\Users\fa_an\OneDrive\CareerFoundry\Tasks\Data Analytics Immersion\Tasks 4.1-4.10\02_2024 Instacart Basket Analysis'

In [3]:
path

'C:\\Users\\fa_an\\OneDrive\\CareerFoundry\\Tasks\\Data Analytics Immersion\\Tasks 4.1-4.10\\02_2024 Instacart Basket Analysis'

In [4]:
#Import customers.csv df
df_customers = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'))

# 02 Descriptive Analysis

In [5]:
df_customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [6]:
df_customers.info

<bound method DataFrame.info of         user_id First Name    Surnam  Gender           STATE  Age date_joined  \
0         26711    Deborah  Esquivel  Female        Missouri   48    1/1/2017   
1         33890   Patricia      Hart  Female      New Mexico   36    1/1/2017   
2         65803    Kenneth    Farley    Male           Idaho   35    1/1/2017   
3        125935   Michelle     Hicks  Female            Iowa   40    1/1/2017   
4        130797        Ann   Gilmore  Female        Maryland   26    1/1/2017   
...         ...        ...       ...     ...             ...  ...         ...   
206204   168073       Lisa      Case  Female  North Carolina   44    4/1/2020   
206205    49635     Jeremy   Robbins    Male          Hawaii   62    4/1/2020   
206206   135902      Doris  Richmond  Female        Missouri   66    4/1/2020   
206207    81095       Rose   Rollins  Female      California   27    4/1/2020   
206208    80148    Cynthia     Noble  Female        New York   55    4/1/2020

In [7]:
df_customers.dtypes

user_id          int64
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [8]:
df_customers.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


# 03 Wrangle Data

In [9]:
#create vars_list to not import First Name, Surnam
vars_list = ['user_id', 'Gender', 'STATE', 'Age', 'date_joined', 'n_dependants', 'fam_status', 'income']

In [10]:
#re-import customers.csv using vars_list
df_customers = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'), usecols = vars_list)

In [11]:
df_customers.head()

Unnamed: 0,user_id,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374


In [12]:
#update column names
df_customers.rename(columns = {'STATE' : 'State'}, inplace = True)

In [13]:
df_customers.rename(columns = {'n_dependants' : 'total_dependants'}, inplace = True)

In [14]:
df_customers.rename(columns = {'fam_status' : 'family_status'}, inplace = True)

In [15]:
df_customers.head()

Unnamed: 0,user_id,Gender,State,Age,date_joined,total_dependants,family_status,income
0,26711,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Female,Maryland,26,1/1/2017,1,married,40374


In [16]:
df_customers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_id           206209 non-null  int64 
 1   Gender            206209 non-null  object
 2   State             206209 non-null  object
 3   Age               206209 non-null  int64 
 4   date_joined       206209 non-null  object
 5   total_dependants  206209 non-null  int64 
 6   family_status     206209 non-null  object
 7   income            206209 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 12.6+ MB


# 04 Data Quality & Consistency Checks

In [17]:
#find missing values & frequency of variables
df_customers.isnull().sum()

user_id             0
Gender              0
State               0
Age                 0
date_joined         0
total_dependants    0
family_status       0
income              0
dtype: int64

In [18]:
df_customers['Gender'].value_counts(dropna=False)

Gender
Male      104067
Female    102142
Name: count, dtype: int64

In [19]:
df_customers['State'].value_counts(dropna=False)

State
Florida                 4044
Colorado                4044
Illinois                4044
Alabama                 4044
District of Columbia    4044
Hawaii                  4044
Arizona                 4044
Connecticut             4044
California              4044
Indiana                 4044
Arkansas                4044
Alaska                  4044
Delaware                4044
Iowa                    4044
Idaho                   4044
Georgia                 4044
Wyoming                 4043
Mississippi             4043
Oklahoma                4043
Utah                    4043
New Hampshire           4043
Kentucky                4043
Maryland                4043
Rhode Island            4043
Massachusetts           4043
Michigan                4043
New Jersey              4043
Kansas                  4043
South Dakota            4043
Minnesota               4043
Tennessee               4043
New York                4043
Washington              4043
Louisiana               4043
Montana 

In [20]:
df_customers['Age'].value_counts(dropna=False)

Age
19    3329
55    3317
51    3317
56    3306
32    3305
      ... 
65    3145
25    3127
66    3114
50    3102
36    3101
Name: count, Length: 64, dtype: int64

In [21]:
df_customers['date_joined'].value_counts(dropna = False)

date_joined
9/17/2018     213
2/10/2018     212
4/1/2019      211
9/21/2019     211
12/19/2017    210
             ... 
9/1/2018      141
1/22/2018     140
11/24/2017    139
7/18/2019     138
8/6/2018      128
Name: count, Length: 1187, dtype: int64

In [22]:
df_customers['total_dependants'].value_counts(dropna = False)

total_dependants
0    51602
3    51594
1    51531
2    51482
Name: count, dtype: int64

In [23]:
df_customers['family_status'].value_counts(dropna = False)

family_status
married                             144906
single                               33962
divorced/widowed                     17640
living with parents and siblings      9701
Name: count, dtype: int64

In [24]:
df_customers['income'].value_counts(dropna = False)

income
57192     10
95891     10
95710     10
97532      9
98675      9
          ..
73141      1
71524      1
74408      1
44780      1
148828     1
Name: count, Length: 108012, dtype: int64

In [25]:
df_customers.describe()

Unnamed: 0,user_id,Age,total_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [26]:
#change dtype of date_joined from int64 to datetime
df_customers['date_joined'] = pd.to_datetime(df_customers['date_joined'])

In [27]:
df_customers['date_joined'].dtype

dtype('<M8[ns]')

In [28]:
#change date_joined date/time format to be MDY
df_customers['date_joined'] = df_customers['date_joined'].dt.strftime('%m-%d-%Y')

In [29]:
df_customers.head()

Unnamed: 0,user_id,Gender,State,Age,date_joined,total_dependants,family_status,income
0,26711,Female,Missouri,48,01-01-2017,3,married,165665
1,33890,Female,New Mexico,36,01-01-2017,0,single,59285
2,65803,Male,Idaho,35,01-01-2017,2,married,99568
3,125935,Female,Iowa,40,01-01-2017,0,single,42049
4,130797,Female,Maryland,26,01-01-2017,1,married,40374


In [30]:
#finding mixed-type data
for col in df_customers.columns.tolist():
  weird = (df_customers[[col]].map(type) != df_customers[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_customers[weird]) > 0:
    print (col)
else:
    print('All is ok')

All is ok


In [31]:
#finding duplicates
df_dups = df_customers[df_customers.duplicated()]

In [32]:
df_dups

Unnamed: 0,user_id,Gender,State,Age,date_joined,total_dependants,family_status,income


In [33]:
#no dupes found!

In [34]:
df_customers.shape

(206209, 8)

# 05 Combine Data

In [35]:
#Combine your customer data with the rest of your prepared Instacart data.
#(Hint: Make sure the key columns are the same data type!)

In [24]:
#import ords_prods_grouped.pkl
df_ords_prods = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_grouped.pkl'))

In [25]:
df_ords_prods.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_or_repeat_order,product_id,add_to_cart_order,...,price_label,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_product_price,spending_flag,median_days_between_orders,order_frequency_flag
0,2539329,1,prior,1,2,8,,First Order,196,1,...,Mid-range product,Regularly busy,Regularly busy,Average Orders,10,New Customer,6.367797,Low Spender,20.5,Non-frequent customer
1,2398795,1,prior,2,3,7,15.0,Repeat Order,196,1,...,Mid-range product,Regularly busy,Slowest days,Average Orders,10,New Customer,6.367797,Low Spender,20.5,Non-frequent customer
2,473747,1,prior,3,3,12,21.0,Repeat Order,196,1,...,Mid-range product,Regularly busy,Slowest days,Most Orders,10,New Customer,6.367797,Low Spender,20.5,Non-frequent customer
3,2254736,1,prior,4,4,7,29.0,Repeat Order,196,1,...,Mid-range product,Least busy,Slowest days,Average Orders,10,New Customer,6.367797,Low Spender,20.5,Non-frequent customer
4,431534,1,prior,5,4,15,28.0,Repeat Order,196,1,...,Mid-range product,Least busy,Slowest days,Most Orders,10,New Customer,6.367797,Low Spender,20.5,Non-frequent customer


In [26]:
df_ords_prods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 25 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   order_id                    int64  
 1   user_id                     int64  
 2   eval_set                    object 
 3   order_number                int64  
 4   orders_day_of_week          int64  
 5   order_hour_of_day           int64  
 6   days_since_prior_order      float64
 7   first_or_repeat_order       object 
 8   product_id                  int64  
 9   add_to_cart_order           int64  
 10  reordered                   int64  
 11  product_name                object 
 12  aisle_id                    int64  
 13  department_id               int64  
 14  prices                      float64
 15  price_label                 object 
 16  busiest_day                 object 
 17  busiest_days                object 
 18  busiest_period_of_day       object 
 19  max_order          

In [None]:
#merge df_ords_prods and df_customers
df_merged = df_ords_prods.merge(df_customers, on = 'user_id', indicator = True)

In [None]:
df_merged.isnull().sum()

In [47]:
df_merged.head(25)

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_or_repeat_order,product_id,add_to_cart_order,...,median_days_between_orders,order_frequency_flag,Gender,State,Age,date_joined,total_dependants,family_status,income,_merge
0,2539329,1,prior,1,2,8,,First Order,196,1,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
1,2398795,1,prior,2,3,7,15.0,Repeat Order,196,1,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
2,473747,1,prior,3,3,12,21.0,Repeat Order,196,1,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
3,2254736,1,prior,4,4,7,29.0,Repeat Order,196,1,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
4,431534,1,prior,5,4,15,28.0,Repeat Order,196,1,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
5,3367565,1,prior,6,2,7,19.0,Repeat Order,196,1,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
6,550135,1,prior,7,1,9,20.0,Repeat Order,196,1,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
7,3108588,1,prior,8,1,14,14.0,Repeat Order,196,2,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
8,2295261,1,prior,9,1,16,0.0,Repeat Order,196,4,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
9,2550362,1,prior,10,4,8,30.0,Repeat Order,196,1,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both


In [53]:
df_merged[df_merged['user_id'] == 1]

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_or_repeat_order,product_id,add_to_cart_order,...,median_days_between_orders,order_frequency_flag,Gender,State,Age,date_joined,total_dependants,family_status,income,_merge
0,2539329,1,prior,1,2,8,,First Order,196,1,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
1,2398795,1,prior,2,3,7,15.0,Repeat Order,196,1,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
2,473747,1,prior,3,3,12,21.0,Repeat Order,196,1,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
3,2254736,1,prior,4,4,7,29.0,Repeat Order,196,1,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
4,431534,1,prior,5,4,15,28.0,Repeat Order,196,1,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
5,3367565,1,prior,6,2,7,19.0,Repeat Order,196,1,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
6,550135,1,prior,7,1,9,20.0,Repeat Order,196,1,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
7,3108588,1,prior,8,1,14,14.0,Repeat Order,196,2,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
8,2295261,1,prior,9,1,16,0.0,Repeat Order,196,4,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both
9,2550362,1,prior,10,4,8,30.0,Repeat Order,196,1,...,20.5,Non-frequent customer,Female,Alabama,31,02-17-2019,3,married,40423,both


In [54]:
df_merged['_merge'].value_counts()

_merge
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

In [56]:
df_merged.shape

(32404859, 33)

# 06 Export

In [57]:
#Export the merged file in pickle format as “ords_prods_customers.pkl”.
df_merged.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'ords_prods_customers.pkl'))