# 01. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os

# 02. Import Dataframe 'customers.csv'

In [2]:
# Create path

path = r'/Users/dianaalatriste/Documents/Instacart'

In [3]:
# Import dataframe 'customers'

df = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'), index_col = False)

In [4]:
# Check output

df.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [5]:
df.tail()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
206204,168073,Lisa,Case,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Jeremy,Robbins,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Doris,Richmond,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Rose,Rollins,Female,California,27,4/1/2020,1,married,99799
206208,80148,Cynthia,Noble,Female,New York,55,4/1/2020,1,married,57095


# 03. Cleaning dataframe by changing data types

In [8]:
# Check datatypes of columns

df.dtypes

user_id         object
First Name      object
Surnam          object
Gender          object
STATE           object
Age              int64
date_joined     object
n_dependants     int64
fam_status      object
income           int64
dtype: object

In [7]:
df['user_id'] = df['user_id'].astype('str')

In [10]:
df['Age'] = df['Age'].astype('int8')

In [11]:
df['income'] = df['income'].astype('int32')

In [12]:
df['n_dependants'] = df['n_dependants'].astype('int8')

In [51]:
#Recheck data types

df.dtypes

user_id               object
First Name            object
Surname               object
Gender                object
State                 object
Age                     int8
number_of_children      int8
fam_status            object
income                 int32
dtype: object

# 04. Wrangle data so that it follows consistent logic

### First, I will rename the columns

In [13]:
# Renaming columns with illogical names - SURNAM to SURNAME

df.rename(columns = {'Surnam' : 'Surname'}, inplace = True)

In [14]:
# Rename column n_dependants to number_of_children

df.rename(columns = {'n_dependants' : 'number_of_children'}, inplace = True)

In [17]:
#Rename column STATE to State

df.rename(columns = {'STATE' : 'State'}, inplace = True)

In [19]:
# Check output

df.head()

Unnamed: 0,user_id,First Name,Surname,Gender,State,Age,number_of_children,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1,married,40374


### With columns renamed, now drop columns that are not relevant for analysis

In [18]:
df = df.drop(columns = ['date_joined'])

In [21]:
# Check shape

df.shape

(206209, 9)

# 05. Fundamental data quality and consistency checks

## Missing values

In [22]:
# Finding missing values

df.isnull().sum()

user_id                   0
First Name            11259
Surname                   0
Gender                    0
State                     0
Age                       0
number_of_children        0
fam_status                0
income                    0
dtype: int64

#### First name column has a total of 11259 values

In [23]:
# Analyzing the missing values further

df_nan = df[df['First Name'].isnull() == True]

In [24]:
# Calling df_nan

df_nan

Unnamed: 0,user_id,First Name,Surname,Gender,State,Age,number_of_children,fam_status,income
53,76659,,Gilbert,Male,Colorado,26,2,married,41709
73,13738,,Frost,Female,Louisiana,39,0,single,82518
82,89996,,Dawson,Female,Oregon,52,3,married,117099
99,96166,,Oconnor,Male,Oklahoma,51,1,married,155673
105,29778,,Dawson,Female,Utah,63,3,married,151819
...,...,...,...,...,...,...,...,...,...
206038,121317,,Melton,Male,Pennsylvania,28,3,married,87783
206044,200799,,Copeland,Female,Hawaii,52,2,married,108488
206090,167394,,Frost,Female,Hawaii,61,1,married,45275
206162,187532,,Floyd,Female,California,39,0,single,56325


#### Missing values do not interfere with analysis so they won't be removed, nor imputed.

## Mixed-Type Data

In [25]:
# Check for mixed-type data in dataframe 'df'

for col in df.columns.tolist():
    weird = (df[[col]].applymap(type) != df[[col]].iloc[0].apply(type)).any(axis = 1)
    if len (df[weird]) > 0:
        print(col)

First Name


#### Column 'First Name' has mixed-type data

In [30]:
# Change data type of 'First Name'

df['First Name'] = df['First Name'].astype('str')

In [38]:
# Change null values no 'NA' in 'First Name' column

df['First Name'].fillna('NA', inplace=True)

In [40]:
# Check output

df.tail(40)

Unnamed: 0,user_id,First Name,Surname,Gender,State,Age,number_of_children,fam_status,income
206169,99254,Lori,Watts,Female,Delaware,75,1,married,156966
206170,114593,Christopher,Velazquez,Male,Virginia,34,2,married,75707
206171,116898,,Delgado,Female,Colorado,23,2,married,59222
206172,120440,Philip,Duran,Male,North Dakota,42,3,married,120080
206173,122524,Phyllis,Greene,Male,Nebraska,77,0,divorced/widowed,163113
206174,134553,Ralph,Avalos,Male,Indiana,25,1,married,64482
206175,167749,Deborah,Farrell,Female,Florida,28,1,married,30169
206176,186595,Ruth,Cunningham,Female,Mississippi,38,1,married,92727
206177,199732,Katherine,Abbott,Female,Iowa,71,1,married,31019
206178,138442,Gloria,Cantrell,Female,North Carolina,26,1,married,46199


In [42]:
# Re check for mixed-type data in dataframe 'df'

for col in df.columns.tolist():
    weird = (df[[col]].applymap(type) != df[[col]].iloc[0].apply(type)).any(axis = 1)
    if len (df[weird]) > 0:
        print(col)
print(c)

The code has been run


#### Data-mixed type has been addressed and solved by changing all null values to 'NA's in the 'First Name' column

## Duplicates

In [44]:
# Finding duplicates

df_dups = df[df.duplicated()]
print('The code has been run')

The code has been run


#### No duplicates were identified

# 06. Combine customer data with rest of prepared Instacart data

In [45]:
# Import 'orders_products_merged_updated3.pkl'

df_ordsprods_merged = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_updated3.pkl'))

In [56]:
# Check shape of output

df_ordsprods_merged.shape

(32404859, 22)

In [46]:
df_ordsprods_merged.dtypes

order_id                   int64
user_id                   object
order_number                int8
orders_day_of_week          int8
order_hour_of_day           int8
last_purchase            float16
product_id                 int32
add_to_cart_order          int64
reordered                   int8
product_name              object
department_id               int8
prices                   float32
price_range_loc           object
busiest_day               object
busiest_days              object
busiest_period_of_day     object
max_order                   int8
loyalty_flag              object
average_expenses         float64
spending_flag             object
user_behaviour           float16
order_frequency_flag      object
dtype: object

In [48]:
# Change data type of 'user_id' on df_ordsprods_merged

df_ordsprods_merged['user_id'] = df_ordsprods_merged['user_id'].astype('str')

In [49]:
# Re check data types of 'user_id' on df_ordsprods_merged

df_ordsprods_merged.dtypes

order_id                   int64
user_id                   object
order_number                int8
orders_day_of_week          int8
order_hour_of_day           int8
last_purchase            float16
product_id                 int32
add_to_cart_order          int64
reordered                   int8
product_name              object
department_id               int8
prices                   float32
price_range_loc           object
busiest_day               object
busiest_days              object
busiest_period_of_day     object
max_order                   int8
loyalty_flag              object
average_expenses         float64
spending_flag             object
user_behaviour           float16
order_frequency_flag      object
dtype: object

In [53]:
# Combine dataframes

df_all = df_ordsprods_merged.merge(df, on = 'user_id', indicator = True)

In [54]:
# Check output

df_all.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,last_purchase,product_id,add_to_cart_order,reordered,product_name,...,order_frequency_flag,First Name,Surname,Gender,State,Age,number_of_children,fam_status,income,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,3,married,40423,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,3,married,40423,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,3,married,40423,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,3,married,40423,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,3,married,40423,both


In [55]:
# Check output

df_all.shape

(32404859, 31)

In [57]:
# Summing up values on '_merge' column

df_all['_merge'].value_counts()

both          32404859
left_only            0
right_only           0
Name: _merge, dtype: int64

# 07. Export newly merged dataframe

In [58]:
df_all.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_customers_merged.pkl'))

In [59]:
print('End of Part 1')

End of Part 1
