# 2 IC Data Consistency Checks - orders_products_prior

### This notebook contains the following sections
#### 01 Import Libraries
#### 02 Import Data
#### 03 First Look at Data
#### 04 Consistency Checks
#### 05 Export Data

# 01 Import Libraries

In [1]:
# Import Libraries

import pandas as pd
import numpy as np
import os

# 02 Import Data

In [2]:
# Set the project root path
# The folder is machine-specific, so it can be overridden through the
# INSTACART_PROJECT_PATH environment variable; the original location stays the
# default when the variable is unset or empty.

path = os.environ.get('INSTACART_PROJECT_PATH') or (
    r'C:\Users\Tina\Desktop\CareerFoundry\Data Analytics Immersion\Instacart Basket Analysis'
)

In [3]:
# Load "wrangled_orders_products.csv" from the "Prepared Data" folder into "ords_prods_wr"

ords_prods_wr = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'wrangled_orders_products.csv'),
    index_col = False,
)

# 03 First Look at Data

In [4]:
# Shape of "ords_prods_wr" as (number of rows, number of columns)

ords_prods_wr.shape

(32434489, 5)

In [5]:
# First few rows of "ords_prods_wr" (head() returns the first 5 rows by default)

ords_prods_wr.head()

Unnamed: 0.1,Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,0,2,33120,1,1
1,1,2,28985,2,1
2,2,2,9327,3,0
3,3,2,45918,4,1
4,4,2,30035,5,0


In [6]:
# Drop "Unnamed: 0" column
# NOTE(review): the column looks like a leftover row index from an earlier export - confirm.
# errors = 'ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises a KeyError because the column is already gone.

ords_prods_wr = ords_prods_wr.drop(columns = ['Unnamed: 0'], errors = 'ignore')

In [7]:
# Re-display the first rows to confirm that the "Unnamed: 0" column was removed

ords_prods_wr.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [8]:
# Data type of each column in "ords_prods_wr"

ords_prods_wr.dtypes

order_id             int64
product_id           int64
add_to_cart_order    int64
reordered            int64
dtype: object

# 04 Consistency Checks

In [9]:
# Descriptive statistics of "ords_prods_wr"
# (count, mean, std, min, quartiles and max per column - used to spot implausible values)

ords_prods_wr.describe()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
count,32434490.0,32434490.0,32434490.0,32434490.0
mean,1710749.0,25576.34,8.351076,0.5896975
std,987300.7,14096.69,7.126671,0.4918886
min,2.0,1.0,1.0,0.0
25%,855943.0,13530.0,3.0,0.0
50%,1711048.0,25256.0,6.0,1.0
75%,2565514.0,37935.0,11.0,1.0
max,3421083.0,49688.0,145.0,1.0


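No obvious problems were detected: every column reports the same count, the IDs and cart positions start at 1 or higher, and `reordered` only ranges from 0 to 1.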

### 01 Mixed-Type Data

In [10]:
# Check for mixed-type data
# A column name is printed when its values do not all share one Python type.
# Counting the distinct value types per column avoids building a filtered copy
# of the whole data frame just to test whether any row deviates, and it also
# works on an empty data frame (no lookup of a first row is needed).

for col in ords_prods_wr.columns:
    if ords_prods_wr[col].map(type).nunique() > 1:
        print (col)

No mixed-type data could be found.

### 02 Missing Values

In [11]:
# Check for missing values in "ords_prods_wr"
# (counts the null entries per column; all zeros means nothing is missing)

ords_prods_wr.isnull().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

No missing values could be found.

### 03 Duplicate Data

In [12]:
# Collect every row that fully repeats an earlier row into its own subset

ords_prods_wr_dups = ords_prods_wr.loc[ords_prods_wr.duplicated()]

In [13]:
# Show the duplicated rows (an empty table means no duplicates were found)

ords_prods_wr_dups

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered


There is no duplicate data in this data set.

In [14]:
# Number of rows after consistency check
# (shape is (rows, columns); none of the checks above removed any rows)

ords_prods_wr.shape

(32434489, 4)

# 05 Export Data

In [15]:
# Export "ords_prods_wr"
# index = False keeps the row index out of the file; a written index comes back
# as an extra "Unnamed: 0" column on the next import and has to be dropped again.
# NOTE(review): a later notebook that drops "Unnamed: 0" after reading this file
# would need that step removed - confirm before relying on the new file layout.

ords_prods_wr.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'checked_orders_products.csv'), index = False)