# 4.5 Data Consistency Checks

## 01. Importing Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

## 02. Importing Data

In [2]:
path = r'/Users/Documents/cf/Instacart Data Analysis'

In [3]:
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)

In [4]:
df_ords = pd.read_csv(os.path.join(path,'02 Data', 'Prepared Data', 'orders_wrangled.csv'), index_col = False)

## 03. Mixed-Type Data

In [5]:
df_test = pd.DataFrame()

In [6]:
df_test['mix'] = ['a', 'b', 1, True]

In [7]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [8]:
# Check for any mixed columns
for col in df_test.columns.tolist():
    weird = (df_test[col].map(type) != df_test[col].iloc[0].__class__).any()
    if weird:
        print(col)

mix


In [9]:
# Convert column's data from numeric to string
df_test['mix'] = df_test['mix'].astype('str')

In [10]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   mix     4 non-null      object
dtypes: object(1)
memory usage: 164.0+ bytes


## 04. Missing Values

In [11]:
# Find total number of missing values
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [12]:
# Create null dataframe
df_nan = df_prods[df_prods['product_name'].isnull() == True]

In [13]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


#### Addressing Missing Values

There a few ways to deal with missing data:
1)  Create a new variable that acts like a flag based on the missing value.
2) Impute the value with the mean or median of the column (if the variable is numeric).
3) Remove or filter out the missing data.

In [14]:
df_nan.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,16.0,16.0,16.0,16.0
mean,6684.0,89.9375,10.9375,13.0125
std,12836.665242,33.731229,4.639953,3.881731
min,34.0,26.0,1.0,1.2
25%,459.25,70.75,7.75,12.175
50%,2413.0,98.5,11.5,13.65
75%,3872.75,120.0,14.5,14.425
max,40440.0,126.0,16.0,20.9


Ex: When using the mean,
df['column with missings'].fillna(mean value, inplace=True)

In [15]:
df_nan.median()

product_id       2413.0
product_name        NaN
aisle_id           98.5
department_id      11.5
prices            13.65
dtype: object

EX: when using the median,
df['column with missings'].fillna(median value, inplace=True)

Because the missing values are strings there's not much you can do other than remove/filter the data

In [16]:
# Find shape of df
df_prods.shape

(49693, 5)

In [17]:
# Create new df
df_prods_clean = df_prods[df_prods['product_name'].isnull() == False]

In [18]:
# get shape of new data
df_prods_clean.shape

(49677, 5)

To drop all missing values --> df_prods.dropna(inplace = True)

To drop only the NaNs from a particular column --> df_prods.dropna(subset = [‘product_name’], inplace = True)

## 05. Duplicates

In [19]:
# Create duplicate dataframe
df_dups = df_prods_clean[df_prods_clean.duplicated()]

In [20]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [21]:
# Check current number of rowns in df_prods_clean
df_prods_clean.shape

(49677, 5)

In [22]:
# cleaned df without duplicates
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [23]:
df_prods_clean_no_dups.shape

(49672, 5)

## 06. Exporting Changes

In [24]:
df_prods_clean_no_dups.to_csv(os.path.join(path, '02 Data','Prepared Data', 'products_checked.csv'))

# 4-5


In [25]:
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,customers_current_order_count,order_days_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


'days_since_prior_order': Has fewer entries than other columns, suggesting missing values. A value of 0 indicates same-day orders; the max of 30 may reflect a one-month cap.

'order_number': Max value is 100, possibly indicating a limit of 100 orders.

'order_dow': Ranges from 0 to 6, representing days of the week.

'order_hour_of_day': Ranges from 0 to 23, representing hours of the day.


In [26]:
for col in df_ords.columns.tolist():
    weird = (df_ords[col].map(type) != df_ords[col].iloc[0].__class__).any()
    if weird:
        print(col)

Unnamed: 0
order_id
user_id
customers_current_order_count
order_days_of_week
order_hour_of_day
days_since_prior_order


In [27]:
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 8 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   Unnamed: 0                     int64  
 1   order_id                       int64  
 2   user_id                        int64  
 3   eval_set                       object 
 4   customers_current_order_count  int64  
 5   order_days_of_week             int64  
 6   order_hour_of_day              int64  
 7   days_since_prior_order         float64
dtypes: float64(1), int64(6), object(1)
memory usage: 208.8+ MB


In [28]:
# Find missing values
df_ords.isnull().sum()

Unnamed: 0                            0
order_id                              0
user_id                               0
eval_set                              0
customers_current_order_count         0
order_days_of_week                    0
order_hour_of_day                     0
days_since_prior_order           206209
dtype: int64

'days_since_prior_order' has 206209 values missing.

In [29]:
# Create subset of the dataframe with null values
df_ords_nan = df_ords[df_ords['days_since_prior_order'].isnull() == True]

In [30]:
df_ords_nan

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,customers_current_order_count,order_days_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,prior,1,2,8,
11,11,2168274,2,prior,1,2,11,
26,26,1374495,3,prior,1,1,14,
39,39,3343014,4,prior,1,6,11,
45,45,2717275,5,prior,1,3,12,
...,...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,prior,1,4,12,
3420934,3420934,3189322,206206,prior,1,3,18,
3421002,3421002,2166133,206207,prior,1,6,19,
3421019,3421019,2227043,206208,prior,1,1,15,


In [31]:
# fill missing values
df_ords['days_since_prior_order'].fillna(7, inplace=True)

In [33]:
df_ords[df_ords['order_id']==1]

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,customers_current_order_count,order_days_of_week,order_hour_of_day,days_since_prior_order
1868044,1868044,1,112108,train,4,4,10,9.0


In [34]:
# Create subset of df_ords for duplicate rows
df_ords_dups = df_ords[df_ords.duplicated()]

In [35]:
df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,customers_current_order_count,order_days_of_week,order_hour_of_day,days_since_prior_order


There are no duplicates in this . however, I would use '.drop_duplicates()' if they existed to creat a new df without duplicates.

In [36]:
df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_checked.csv'))