# Data Consistency Checks

### Contents:

#### Review Dataframe descriptives
#### Find missing data
#### Identify duplicate data
#### Check for mixed data

## 1.0 Importing Libraries

In [2]:
# Importing Libraries
import pandas as pd
import numpy as np
import os


## 2.0 Importing Data

In [4]:
# Project root folder; every data path below is built from it
project_path = r'C:\Users\Owner\Documents\Career Foundry\Instacart Basket Analysis'

# Orders: load only the columns listed in vars_list from the wrangled file
vars_list = [
    'order_id',
    'user_id',
    'order_number',
    'order_day_of_week',
    'order_hour_of_day',
    'days_since_prior_order',
]
orders_file = os.path.join(project_path, '02 Data', '02 02 Prepared Data', 'orders_wrangled.csv')
df_orders = pd.read_csv(orders_file, usecols=vars_list)

# Products: load the original file with all of its columns
products_file = os.path.join(project_path, '02 Data', '02 01 Originals', 'products.csv')
df_products = pd.read_csv(products_file)


## 3.0 Data Consistency Practice

### 3.1 Review dataframe descriptives

In [28]:
#Reviewing descriptive information (count, mean, std, min, quartiles, max) for orders
df_orders.describe()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


## 3.2 Work with mixed datatype

In [8]:
#Build a one-column test dataframe whose values deliberately mix str, int and bool
df_test = pd.DataFrame({'mix': ['a', 'b', 1, True]})
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [9]:
#Check for mixed datatypes: print the name of every column that holds
#values of more than one Python type
for col in df_test.columns.tolist():
    #Series.map(type) returns the Python type of each cell; it replaces
    #DataFrame.applymap, which is deprecated since pandas 2.1
    cell_types = df_test[col].map(type)
    #More than one distinct type means the column is mixed
    if cell_types.nunique() > 1:
        print(col)

mix


In [10]:
#Assign datatype of string to mix so every value in the column has the same type
df_test['mix'] = df_test['mix'].astype('str')

## 3.3 Missing Data

In [11]:
#Finding missing data in products: count the null values in each column
df_products.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [18]:
#Store the rows with a null product_name in their own subset
#(isnull() already returns a boolean mask, so no comparison with True is needed)
df_nan = df_products[df_products['product_name'].isnull()]
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [None]:
#Fill in missing values (templates for reference only)
#To fill in with mean
#df['column with missings'] = df['column with missings'].fillna(mean value)
#To replace with median
#df['column with missings'] = df['column with missings'].fillna(median value)

In [15]:
#Row and column count of products before removing missing values
df_products.shape

(49693, 5)

In [16]:
#Create new dataset without missing values by keeping only the rows
#whose product_name is present (notnull() is the direct inverse of isnull())
df_products_clean = df_products[df_products['product_name'].notnull()]
df_products_clean.shape

(49677, 5)

## 3.4 Duplicate values

In [21]:
#Collect the rows of the cleaned products that are flagged as duplicates
is_repeat = df_products_clean.duplicated()
df_dups = df_products_clean.loc[is_repeat]
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [26]:
#Drop duplicates: drop_duplicates() returns a new dataframe, stored under a new name
df_products_clean_no_dups = df_products_clean.drop_duplicates()
#Confirm the new row count
df_products_clean_no_dups.shape

(49672, 5)

# 4.0 Task 4.5

In [52]:
#4.2 Descriptive Stats for orders dataframe
#Reviewing descriptive information (count, mean, std, min, quartiles, max) for orders
df_orders.describe()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,8.924952
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,5.0
50%,1710542.0,102689.0,11.0,3.0,13.0,8.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


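Upon reviewing the data for the orders dataframe, the minimum and maximum values appear logical for each column.  The only item to review is the 'days_since_prior_order' column, which has missing values: in the original descriptives (section 3.1) its count is 3,214,874, lower than the 3,421,083 counted for every other column.  (The table above was produced after the nulls were filled with the mean later in this task, so it shows a full count for that column.)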

In [53]:
#4.3 Check for mixed datatypes: print the name of every column that holds
#values of more than one Python type
for col in df_orders.columns.tolist():
    #Series.map(type) returns the Python type of each cell; it replaces
    #DataFrame.applymap, which is deprecated since pandas 2.1
    cell_types = df_orders[col].map(type)
    #More than one distinct type means the column is mixed
    if cell_types.nunique() > 1:
        print(col)

#There are no mixed datatypes

In [54]:
#4.5 Missing values
#Count the null values in each column of orders
df_orders.isnull().sum()

order_id                  0
user_id                   0
order_number              0
order_day_of_week         0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64

There are 206,209 null values in the 'days_since_prior_order' column, but no other null or missing values in this dataframe.  (The output above shows 0 for that column because the cell was re-run after the nulls were filled with the mean further below.)  Since the column holds the number of days since that user last placed an order, my explanation is that these orders were all placed by new customers, or by customers with new user ids.

In [41]:
#4.6 Review total number of rows in orders
#shape is (rows, columns)
df_orders.shape

(3421083, 6)

In [43]:
#Number of rows with null value in days_since_prior_order
#(isnull() already returns a boolean mask, so no comparison with True is needed)
df_nan = df_orders[df_orders['days_since_prior_order'].isnull()]
df_nan.shape

(206209, 6)

In [50]:
#Fill null days_since_prior_order with mean
#Find mean of days since prior order (mean() skips null values by default)
mean_value = df_orders['days_since_prior_order'].mean()
#Replace null with mean_value. The result is assigned back to the column
#instead of calling fillna(inplace=True) on the selected column: that chained
#in-place call raises a warning in pandas 2.x and, under Copy-on-Write,
#modifies only a temporary object and leaves df_orders unchanged
df_orders['days_since_prior_order'] = df_orders['days_since_prior_order'].fillna(mean_value)
#Verify that there are no more nulls
df_orders.isnull().sum()

order_id                  0
user_id                   0
order_number              0
order_day_of_week         0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64

To work with the missing values in this instance, I created a new dataset that holds any orders that have a value for days since prior order.  My logic would consider that these orders are from repeat customers, versus those with a null value, who might be new customers.  This would be a question for the data collection team.  Since we don't know whether these orders are from new customers or from repeat customers who are missing a value, I chose to assign the mean to their days since prior order value, since adding the mean wouldn't skew the data too greatly.

In [51]:
#4.7 Duplicate check for orders dataframe
#Flag the duplicated rows first, then keep only the flagged rows
order_repeat_mask = df_orders.duplicated()
df_order_dups = df_orders.loc[order_repeat_mask]
df_order_dups

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order


Since the check for duplicate orders returned an empty set, there are no duplicates in this dataframe.

# 5.0 Export Data

In [55]:
#Export the cleaned products and orders dataframes to the prepared data folder
#NOTE(review): to_csv is called without index=False, so the row index is
#written as an extra first column - confirm downstream imports expect that
prepared_dir = os.path.join(project_path, '02 Data', '02 02 Prepared Data')
df_products_clean_no_dups.to_csv(os.path.join(prepared_dir, 'products_clean.csv'))
df_orders.to_csv(os.path.join(prepared_dir, 'orders_clean.csv'))