# Product Data Consistency Checks

### 01. Importing Libraries and Data

In [23]:
#Import libraries
import pandas as pd
import numpy as np
import os

In [24]:
#Set project folder path
# The folder can be overridden with the INSTACART_PROJECT_DIR environment variable so the
# notebook also runs on other machines; when it is unset or empty the original location is used.
path = os.environ.get('INSTACART_PROJECT_DIR') or r'/Users/Cel/Documents/Data Analytics/09-2023 Instacart Basket Analysis'

In [25]:
#Import products data frame from the 'Original Data' folder of the project
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv')
)

In [26]:
#Import orders_wrangled data frame from the 'Prepared Data' folder of the project
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
)

### 02. Finding mixed type columns

In [27]:
#New test data frame (starts empty; a column is added to it in the next cell)
df_test = pd.DataFrame()

In [28]:
#Insert different data type values into new column
# The list holds str, int and bool values, so the 'mix' column contains more than one Python type
df_test['mix'] = ['a', 'b', 1, True]

In [29]:
#Display the test data frame
df_test

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [30]:
#Check for mixed types
# Prints the name of every column whose values do not all share the Python type
# of the column's first value.
for col in df_test.columns.tolist():
  # Series.map(type) yields each value's Python type; it replaces DataFrame.applymap,
  # which is deprecated since pandas 2.1.
  col_types = df_test[col].map(type)
  if col_types.empty:
    # A column without rows cannot contain mixed types (and has no first value to compare to)
    continue
  # True for every row whose type differs from the type of the first row
  weird = col_types != col_types.iloc[0]
  if weird.any():
    print (col)

mix


In [31]:
#Convert every value in the mix column to a string so the column holds a single type
df_test['mix'] = df_test['mix'].astype(str)

### 03. Finding missing values

In [34]:
#Count the missing observations in each column of the products data frame
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [40]:
#Create subset for missing product names
# isnull() already returns a boolean mask, so it is used directly instead of comparing it to True
df_nan = df_prods[df_prods['product_name'].isnull()]

In [41]:
#Display the rows that have no product name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


### 04. Filter out missing values

In [43]:
#Determine number of rows (and columns) in original data set, as a baseline before filtering
df_prods.shape

(49693, 5)

In [44]:
#Filter out observations with missing values
# notnull() is the boolean mask of rows that have a product name; it replaces "isnull() == False"
df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [45]:
#Check number of rows has decreased by 16 (the number of missing product names found earlier)
df_prods_clean.shape

(49677, 5)

### 05. Find and delete full duplicates

In [48]:
#Collect the exact duplicate rows in a new data frame
# keep='first' (the default) marks every repeat of a row except its first occurrence
df_dups = df_prods_clean[df_prods_clean.duplicated(keep='first')]

In [49]:
#Display the duplicated product rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


5 duplicate rows in df_prods_clean

In [51]:
#Number of rows in df_prods_clean (baseline before dropping duplicates)
df_prods_clean.shape

(49677, 5)

In [52]:
#Build a duplicate-free data frame with the drop_duplicates function
# keep='first' (the default) retains the first occurrence of each duplicated row
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

In [53]:
#Check number of rows after deleting 5 duplicates (expected: 5 fewer than df_prods_clean)
df_prods_clean_no_dups.shape

(49672, 5)

New data frame has 5 fewer rows

In [54]:
#Export the cleaned, de-duplicated products data frame to the 'Prepared Data' folder
# NOTE(review): to_csv writes the row index by default, so the file gets an unnamed first
# column — confirm downstream notebooks expect that.
df_prods_clean_no_dups.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv')
)

# Orders Data Consistency Checks

### 01. Overall consistency checks

In [62]:
#Run describe function on orders data frame to get summary statistics for its numeric columns
df_ords.describe()

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


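order_id and user_id: both are identifiers rather than measurements (they should be treated as strings), so their summary statistics carry no meaning; nothing strange.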

number_of_orders: mean number of orders per customer is 17, which seems reasonable. Min is 1, max is 100, and all quartiles also seem plausible. 

orders_day_of_week: min is 0, max is 6, which makes sense as there should only be 7 discrete values. Consistent across quartiles. 

order_hour_of_day: min is 0, max is 23. This makes sense as there are 24 discrete values. 

days_since_prior_order: min is 0, which could mean 2 things -- a customer who places more than 1 order per day, or a brand new customer who has only placed one order so far. This is potential for confusion! Max is 30 days, which seems possible. 

### 02. Finding mixed type columns

In [64]:
#Check for mixed types
# Prints the name of every column whose values do not all share the Python type
# of the column's first value.
for col in df_ords.columns.tolist():
  # Series.map(type) yields each value's Python type; it replaces DataFrame.applymap,
  # which is deprecated since pandas 2.1.
  col_types = df_ords[col].map(type)
  if col_types.empty:
    # A column without rows cannot contain mixed types (and has no first value to compare to)
    continue
  # True for every row whose type differs from the type of the first row
  weird = col_types != col_types.iloc[0]
  if weird.any():
    print (col)

Test for mixed type columns returned no results; data types in columns are consistent. 

### 04. Find missing values

In [65]:
#Count the missing observations in each column of the orders data frame
df_ords.isna().sum()

order_id                       0
user_id                        0
number_of_orders               0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

There are 206209 observations missing data in 'days_since_prior_order'. This likely means that 206209 customers have only ever placed one order with Instacart. 

In [70]:
#Count how often each 'number_of_orders' value occurs (missing values would be counted too)
df_ords.value_counts(subset='number_of_orders', dropna=False)

number_of_orders
1      206209
2      206209
3      206209
4      206209
5      182223
        ...  
96       1592
97       1525
98       1471
99       1421
100      1374
Name: count, Length: 100, dtype: int64

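This value check confirms that 206209 customers have only ever placed one order. Corroborates the 206209 missing values for 'days_since_prior_order'. (However, this doesn't explain why there are exactly 206209 customers who have placed 2, 3, and 4 orders... A possible explanation, still to be confirmed, is that 'number_of_orders' is each order's sequence number for its customer rather than a lifetime total; in that case every one of the 206209 customers has an order numbered 1, and the missing values belong to those first orders, which have no prior order.)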

### 05. Address missing values

In this case, adding another column to flag whether the customer is new (has only placed 1 order) or returning would be most helpful to explain the missing data. Deleting or filtering it out isn't helpful, and imputing wouldn't make sense either. However, keeping the null values provides enough info in itself, so no further action is needed.

### 06. Find full duplicate values

In [74]:
#Collect fully duplicated order rows in a new data frame
# keep='first' (the default) marks every repeat of a row except its first occurrence
df_ords_dups = df_ords[df_ords.duplicated(keep='first')]

In [73]:
#Display the duplicated order rows
df_ords_dups

Unnamed: 0,order_id,user_id,number_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order


No full duplicates found.

### 07. Export data frames

In [77]:
#Export the checked orders data frame to the 'Prepared Data' folder
# NOTE(review): to_csv writes the row index by default, so the file gets an unnamed first
# column — confirm downstream notebooks expect that.
df_ords.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv')
)