# Data Consistency Checks

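This script contains:
1. Importing Libraries & Dataframes
2. Practice Dataframe
3. Mixed Data Types
4. Finding Missing Values in products.csv
5. Checking for Duplicates
6. Exporting Cleaned Products CSV
7. df_ords Consistency Checking
8. Exporting orders_clean.csv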

# Importing Libraries and Dataframes

In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import os

In [3]:
# Setting file path
# The project root can be overridden with the INSTACART_PROJECT_PATH environment
# variable so the notebook also runs on machines where the folder lives elsewhere;
# when the variable is not set, the original location is used unchanged.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\cschw\OneDrive\Desktop\Achievement 4\Instacart Basket Analysis',
)

In [4]:
# Load the raw products table from the "Original Data" folder of the project
df_prods = pd.read_csv(
    os.path.join(path, 'Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [5]:
# Preview the first rows of df_prods to confirm the import looks right
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [6]:
# Check the number of rows and columns in df_prods
df_prods.shape

(49693, 5)

In [5]:
# Check the highest value in the prices column.
# NOTE(review): the displayed maximum (99999.0) is far above the prices shown in
# the preview and may be a placeholder or data-entry error — TODO confirm.
df_prods['prices'].max()

99999.0

In [6]:
# Load the raw orders table from the "Original Data" folder of the project
df_ords = pd.read_csv(
    os.path.join(path, 'Data', 'Original Data', 'orders.csv'),
    index_col=False,
)

In [7]:
# Preview the first rows of df_ords to confirm the import looks right
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


# Practice Dataframe

MIXED DATA TYPES

In [8]:
# Create an empty practice dataframe to experiment on
df_test = pd.DataFrame()

In [9]:
# Create a mixed type column holding two strings, an integer and a boolean
df_test['mix'] = ['a', 'b', 1, True]

In [10]:
# Display the practice dataframe
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


# Mixed Data Types

In [11]:
# Flag every column of df_test in which at least one value has a different
# Python type than the value in the first row.
for column_name in df_test.columns.tolist():
    single_col = df_test[[column_name]]
    cell_types = single_col.map(type)                 # type of every cell
    first_row_types = single_col.iloc[0].apply(type)  # type of the first cell
    differs_from_first = (cell_types != first_row_types).any(axis=1)
    if differs_from_first.any():
        print(column_name)

mix


In [12]:
# Convert the mixed-type column (strings, an integer and a boolean) to string
df_test['mix'] = df_test['mix'].astype('str')

In [13]:
# Summarize df_test (column dtype and non-null count) after the conversion
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   mix     4 non-null      object
dtypes: object(1)
memory usage: 164.0+ bytes


In [14]:
# Confirm the dtype of the converted column (reported as object, 'O', in the output below)
df_test['mix'].dtype

dtype('O')

# FINDING MISSING VALUES IN PRODUCTS.CSV

In [15]:
# Count the missing values in each column of df_prods
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [16]:
# Create a subset containing only the rows whose product_name is missing.
# isnull() already returns a boolean mask, so it is used directly as the filter
# (comparing it with True is redundant).
df_nan = df_prods[df_prods['product_name'].isnull()]

In [17]:
# Display the rows with a missing product_name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [18]:
# Record the shape of df_prods before removing rows, for later comparison
df_prods.shape

(49693, 5)

In [19]:
# Create df_prods_clean by keeping only the rows that have a product_name.
# notnull() returns the boolean mask directly, replacing the less idiomatic
# `isnull() == False` comparison; the selected rows are the same.
df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [20]:
# Check the shape after dropping null product names
# (expected: 16 fewer rows than df_prods, matching the 16 missing names counted above)
df_prods_clean.shape

(49677, 5)

# Checking for Duplicates

In [21]:
# Collect the rows of df_prods_clean that are flagged as full-row duplicates
df_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

In [22]:
# Display the duplicated product rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [23]:
# Check the number of rows and columns before removing duplicates
df_prods_clean.shape

(49677, 5)

In [24]:
# Create a new dataframe without duplicate rows (df_prods_clean itself is left unchanged)
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [25]:
# Check the shape after removing duplicates
# (expected: 5 fewer rows than df_prods_clean, one per duplicate row shown above)
df_prods_clean_no_dups.shape

(49672, 5)

# EXPORTING CLEANED PRODS CSV

In [26]:
# Export df_prods_clean_no_dups to 'Prepared Data/products_checked.csv'
# NOTE(review): to_csv also writes the row index as an extra unnamed first column
# by default; pass index=False if downstream notebooks should not receive it — TODO confirm.
df_prods_clean_no_dups.to_csv(os.path.join(path, 'Data', 'Prepared Data', 'products_checked.csv'))

# df_ords Consistency Checking

In [27]:
# Run describe() on df_ords to check counts and value ranges of the numeric columns
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


days_since_prior_order has a lower count than the other columns, which shows that it contains missing values. This could be for new accounts that have never placed an order, or for accounts that have placed two orders in the same day. It's also possible that, if the data updates automatically/frequently, the missing values simply represent an order that was placed 'today'. The other values make sense for the data and are appropriate.

In [28]:
# Flag every column of df_ords in which at least one value has a different
# Python type than the value in the first row.
for column_name in df_ords.columns.tolist():
    single_col = df_ords[[column_name]]
    cell_types = single_col.map(type)                 # type of every cell
    first_row_types = single_col.iloc[0].apply(type)  # type of the first cell
    differs_from_first = (cell_types != first_row_types).any(axis=1)
    if differs_from_first.any():
        print(column_name)

In [11]:
# Alternative mixed-type check: a column counts as mixed when its values have
# more than one distinct Python type.
# This cell needs df_ords to be loaded first; running it before the orders
# import raises a NameError.
mixed_columns = []
for col in df_ords.columns:
    distinct_types = set(df_ords[col].apply(type))
    if len(distinct_types) > 1:
        mixed_columns.append(col)
print("Columns with mixed data types:", mixed_columns)

NameError: name 'df_ords' is not defined

In [30]:
# Count the missing values in each column of df_ords
df_ords.isnull().sum()

order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

There are 206,209 null values for days_since_prior_order. These null values could represent 'zero days since prior order' - if this could be confirmed to be true, then I shouldn't do anything at all to these values. They likely correspond to other data that is correct if the null value is due to having zero days since prior order. It is also possible that they represent accounts that have been created but have not placed any orders yet. In that case they would also have zero days since prior order, which could be represented as a null value. In that case it would also be appropriate to not change anything in the dataframe. Note that in the df_ords.head() output the missing value appears on a customer's first order (order_number 1), and the number of nulls (206,209) equals the maximum user_id, so the nulls most likely mark each customer's first order, which has no prior order to measure from.

In [31]:
# Collect the rows of df_ords that are flagged as full-row duplicates
df_ords_dups = df_ords.loc[df_ords.duplicated()]

In [32]:
# Display the duplicated order rows (an empty frame means no duplicates were found)
df_ords_dups

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order


There are no duplicates in this data set. If there were, I would use the .drop_duplicates() method.

# Exporting orders_clean.csv

In [33]:
# Export df_ords to 'Prepared Data/orders_clean.csv'
# NOTE(review): to_csv also writes the row index as an extra unnamed first column
# by default; pass index=False if downstream notebooks should not receive it — TODO confirm.
df_ords.to_csv(os.path.join(path, 'Data', 'Prepared Data', 'orders_clean.csv'))