# Data Checks on: order_products__prior.csv

#### Contents. 
- Missing Data Check
- Duplicate Check
- Mixed-type Data Check
- Other inconsistency checks, if needed
- Summary of changes to the original dataframe

## Importing libraries and data

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Load the prior-orders dataset from the project's original-data folder.
path = r'C:\Users\chris\Documents\Instacart Basket Analysis'
prior_csv = os.path.join(path, '02 Data', 'Original Data', 'order_products__prior.csv')
df_prior = pd.read_csv(prior_csv)

## Viewing the dataframe

In [3]:
# Dimensions of the dataframe as a (rows, columns) tuple
df_prior.shape

(32434489, 4)

In [4]:
# Column labels of the dataframe
df_prior.columns

Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered'], dtype='object')

In [5]:
# Preview the dataframe; the notebook renders a truncated view
# showing only the first and last rows
df_prior

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0
...,...,...,...,...
32434484,3421083,39678,6,1
32434485,3421083,11352,7,0
32434486,3421083,4600,8,0
32434487,3421083,24852,9,1


In [6]:
# Data type of each column
df_prior.dtypes

order_id             int64
product_id           int64
add_to_cart_order    int64
reordered            int64
dtype: object

In [7]:
# Summary statistics per column (count, mean, std, min, quartiles, max),
# rounded to one decimal place for readability
df_prior.describe().round(1)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
count,32434489.0,32434489.0,32434489.0,32434489.0
mean,1710748.5,25576.3,8.4,0.6
std,987300.7,14096.7,7.1,0.5
min,2.0,1.0,1.0,0.0
25%,855943.0,13530.0,3.0,0.0
50%,1711048.0,25256.0,6.0,1.0
75%,2565514.0,37935.0,11.0,1.0
max,3421083.0,49688.0,145.0,1.0


### Missing Data Check

In [8]:
# Count the null entries in each column; 0 means the column has no missing values
df_prior.isna().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

##### No missing data.

### Duplicate and Mixed-type Data Checks

In [9]:
# Duplicate check: count rows that are exact copies of an earlier row
# (every column identical). A count of 0 means there are no full duplicates.
n_duplicates = int(df_prior.duplicated().sum())
print('Duplicate rows:', n_duplicates)

# Mixed-type check: a column is flagged (its name is printed) when its values
# do not all share one Python type. Series.map(type) replaces the deprecated
# DataFrame.applymap, and counting the distinct types avoids both the lookup
# of the first row (which fails on an empty dataframe) and a full boolean-mask
# copy of the dataframe made only to count the offending rows.
for col in df_prior.columns:
    if df_prior[col].map(type).nunique() > 1:
        print(col)

##### No duplicates, and no mixed-type columns (the mixed-type check flagged no columns).

## Summary: What needs to be addressed:

##### Change the ID columns (`order_id`, `product_id`) to string data type, as was done with the other dataframes

## Changing and Exporting the dataframe >> prior_wrangled.pkl

In [10]:
# Identifiers are labels, not quantities, so store them as strings
id_cols = ['product_id', 'order_id']
df_prior[id_cols] = df_prior[id_cols].astype(str)

In [11]:
# Persist the wrangled dataframe as a pickle in the prepared-data folder
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'prior_wrangled.pkl')
df_prior.to_pickle(export_file)