## Content

## 01. Import libraries
## 02. Import data
## 03. Data Consistency Checks - Part 1
## 04. Data Consistency Checks - Part 2 for order dataframe

# 01. Import libraries

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [2]:
# Create path

path = r'C:\Users\Frederick\Documents\07-12-2023 Instacart Basket Analysis'

# 02. Import data

In [3]:
# Import data for products

df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)

In [4]:
# Import data for orders

df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'), index_col = False)

# 03. Data Consistency Checks - Part 1

In [5]:
# Create a dataframe

df_test = pd.DataFrame()

In [7]:
# Create a mixed type column

df_test['mix'] = ['a','b',1,True]

In [8]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [9]:
# Check for mixed types

for col in df_test.columns.tolist():
  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)

mix


  weird = (df_test[[col]].applymap(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)


In [10]:
# Finding missing values in product dataframe

df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [11]:
# Create subset of missing value in product name column

df_nan = df_prods[df_prods['product_name'].isnull() == True]

In [12]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [13]:
# Looking at number of rows in current dataframe

df_prods.shape

(49693, 5)

In [14]:
# Creating subset with product dataframe with no missing value

df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]

In [15]:
# Looking at number of rows(shape) in new product dataframe

df_prods_clean.shape

(49677, 5)

In [16]:
# Create subset with duplicate rows

df_dups = df_prods_clean[df_prods_clean.duplicated()]

In [17]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [18]:
# Number of rows in df_prods_clean

df_prods_clean.shape

(49677, 5)

In [19]:
# Create subset with no duplicate rows

df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [20]:
# Number of rows in df_prods_clean with no duplicates

df_prods_clean_no_dups.shape

(49672, 5)

# 04. Data Consistency Checks - Part 2 for order dataframe

In [21]:
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,user_order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


## 02. From the above table, all the statistics look pretty reasonable apart from the maximum value for user order number. The minimum order number is 1 and 75th percentile is 23. However the maximum order is 100 which seems a bit high. So it might be worthwhile to take a look at the user_order_number column to investigate further. 

In [22]:
# Check for mixed types in order dataframe

for col in df_ords.columns.tolist():
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_ords[weird]) > 0:
    print (col)

  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)


## Q4. There are no mix-types data in the order dataframe, as the check mix types function above didn't return any column(s).

In [23]:
# Finding missing values in order dataframe

df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
user_order_number              0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

## Q5. The only missing values is in days since prior order column, and there are a lot of them - total of 206209 records! One possible explanation is that some customers only order once so there are no data captured on the first order.

In [24]:
# Create subset of missing value in days_since_prior_order column

df_nan_order = df_ords[df_ords['days_since_prior_order'].isnull() == True]

In [27]:
df_nan_order.head(15)

Unnamed: 0.1,Unnamed: 0,order_id,user_id,user_order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,
11,11,2168274,2,1,2,11,
26,26,1374495,3,1,1,14,
39,39,3343014,4,1,6,11,
45,45,2717275,5,1,3,12,
50,50,2086598,6,1,5,18,
54,54,2565571,7,1,3,9,
75,75,600894,8,1,6,0,
79,79,280530,9,1,1,17,
83,83,1224907,10,1,2,14,


In [28]:
df_nan_order.tail()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,user_order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
3420930,3420930,969311,206205,1,4,12,
3420934,3420934,3189322,206206,1,3,18,
3421002,3421002,2166133,206207,1,6,19,
3421019,3421019,2227043,206208,1,1,15,
3421069,3421069,3154581,206209,1,3,11,


## Q6. After creating a subset on the missing values and looking at the top 15 rows and bottom 5 rows, it is likely that all these missing value on days since prior order were for the first order. This is because the user_order_number has a value of 1 for the sample I'm looking, indicating that they are all the first order. In this case, I would just leave the missing value blank as there will be days since prior order in the first order. Another alternative option is I could replace the missing value to 'N/A' and change the data type to string. 

In [29]:
# Create subset with duplicate rows in order table

df_dups_order = df_ords[df_ords.duplicated()]

In [30]:
df_dups_order

Unnamed: 0.1,Unnamed: 0,order_id,user_id,user_order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


## Q7. As can be seen above, there are no duplicate values in the order dataframe (i.e. no rows with exactly the same value in all columns). Therefore, there is no need to address any duplicates data. 

In [31]:
df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_cleaned.csv'))

In [32]:
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data','Prepared Data', 'products_cleaned.csv'))