# Contents
1. Imports
2. Checks
3. Wrangling
4. Cleaning
5. Exports

# 1. Imports

In [2]:
#Libraries
import pandas as pd
import numpy as np
import os

In [4]:
#Path
#Project root folder; set the ONLINE_GROCERY_PROJECT_PATH environment variable to run the
#notebook on another machine without editing this cell (the original location stays the default)
path = os.environ.get(
    'ONLINE_GROCERY_PROJECT_PATH',
    r'/Users/davidgriesel/Documents/Analytics/Projects/Online Grocery Store',
)

In [6]:
#Dataset
#Read orders_products.csv from the 'Original Data' folder under the project path
df_orders_products = pd.read_csv(
    filepath_or_buffer=os.path.join(path, '02 - Data', 'Original Data', 'orders_products.csv')
)

# 2. Checks

In [8]:
#Dimensions
#(rows, columns) of the dataset as loaded
df_orders_products.shape

(32434489, 4)

In [9]:
#Preview
#First five rows, to eyeball the column names and sample values
df_orders_products.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [10]:
#Summary stats
#count, mean, std, min, quartiles and max for each numeric column
df_orders_products.describe()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
count,32434490.0,32434490.0,32434490.0,32434490.0
mean,1710749.0,25576.34,8.351076,0.5896975
std,987300.7,14096.69,7.126671,0.4918886
min,2.0,1.0,1.0,0.0
25%,855943.0,13530.0,3.0,0.0
50%,1711048.0,25256.0,6.0,1.0
75%,2565514.0,37935.0,11.0,1.0
max,3421083.0,49688.0,145.0,1.0


##### Observations:
- Dataset contains 32,434,489 records
- Possible missing records in order_id with min = 2

# 3. Wrangling

## 3.1. Drop Columns

### 3.1.1. Identify Redundant Columns

In [None]:
#Check frequency distribution of variable
#Number of rows per order_id; dropna = False also reports NaN as its own category
df_orders_products['order_id'].value_counts(dropna = False)

##### Observations:
- 3,214,874 unique values ranging between 2 and 3,421,083
- With fewer unique order_id values than the ID range spans, some orders are likely missing from the 32,434,489 records
- Retain column

In [None]:
#Check frequency distribution of variable
#Number of rows per product_id; dropna = False also reports NaN as its own category
df_orders_products['product_id'].value_counts(dropna = False)

##### Observations:
- 49,677 values ranging between 1 and 49,688
- Indicates missing records and/or that certain products have never been ordered
- Retain column

In [None]:
#Check frequency distribution of variable
#Number of rows per add_to_cart_order value; dropna = False also reports NaN as its own category
df_orders_products['add_to_cart_order'].value_counts(dropna = False)

##### Observations:
- 145 values ranging between 1 and 145
- Customers ordered up to 145 items at a time
- Retain column

In [None]:
#Check frequency distribution of variable
#Number of rows per reordered value; dropna = False also reports NaN as its own category
df_orders_products['reordered'].value_counts(dropna = False)

##### Observations:
- 2 values, with 1 indicating the customer has ordered the product before and 0 indicating a first-time order
- Retain column

### 3.1.2. Address Redundant Columns

##### Observations:
- No redundant columns identified

## 3.2. Rename Columns

### 3.2.1. Identify Unclear Descriptions

In [None]:
#Check Column Descriptions
#Lists the column labels so that unclear or inconsistent names can be spotted
df_orders_products.columns

##### Observations:
- Variables are described clearly

### 3.2.2. Address Unclear Descriptions

##### Observations:
- No unclear descriptions noted

## 3.3. Data Types

### 3.3.1. Identify Inconsistent Data Types

In [None]:
#Check data types
#Shows the dtype pandas assigned to each column on load
df_orders_products.dtypes

##### Observations:
- Data types are consistent with variable contents

### 3.3.2. Address Inconsistent Data Types

##### Observations:
- No inconsistencies noted

# 4. Cleaning

## 4.1. Accuracy

### 4.1.1. Identify Inaccurate Values

In [None]:
#Review descriptive statistics
#min/max and quartiles per numeric column, used here to look for implausible values
df_orders_products.describe()

In [None]:
#Number of unique values per variable
#Counts the distinct non-null values in each column
df_orders_products.nunique()

##### Observations:
- The dataset contains 32,434,489 records
- With 3,214,874 unique order_id values ranging between 2 and 3,421,083, the dataset likely contains duplicate and/or missing records
- With 49,677 unique product_id ranging between 1 and 49,688 it either confirms missing records, or that certain products have never been ordered
- Customers ordered between 1 and 145 products at a time
- Dataset indicates whether products have been ordered before (1) or not (0).

### 4.1.2. Address Inaccurate Values

##### Observations
- No inaccuracies noted

## 4.2. Missing Values

### 4.2.1. Identify missing values

In [None]:
#Number of missing values per variable
#Flags every NaN cell, then totals the flags column by column
df_orders_products.isna().sum()

##### Observations:
- None of the variables had NaN values

### 4.2.2. Address Missing Values

##### Observations:
- No missing values were identified

## 4.3. Mixed Type Variables

### 4.3.1. Find Mixed Type Variables

In [None]:
#Finding mixed type data
#Prints the name of every column whose values are not all of the same Python type.
#Numeric and boolean columns are skipped: their dtype already guarantees a single value type,
#so mapping type() over tens of millions of their cells would only cost time.
for col in df_orders_products.select_dtypes(exclude = ['number', 'bool']).columns:
    #Python type of every value in the column (Series.map works on all pandas versions)
    value_types = df_orders_products[col].map(type)
    #More than one distinct type means at least one value differs from the first row's type
    if value_types.nunique() > 1:
        print (col)

##### Observations:
- No variables returned

### 4.3.2. Address Mixed Type Variables

##### Observations:
- No variables identified with mixed type data

## 4.4. Duplicates

### 4.4.1. Find Duplicates

In [None]:
#Identify duplicates, create and view subset with results
#Keep only the rows that fully repeat an earlier row (the first occurrence is not flagged)
df_orders_products_duplicates = df_orders_products.loc[lambda frame: frame.duplicated()]
df_orders_products_duplicates

In [None]:
#Confirm dimensions
#(rows, columns) of the duplicates subset; zero rows means no full-row duplicates were found
df_orders_products_duplicates.shape

##### Observations:
- No records returned

### 4.4.2. Address Duplicates

##### Observations
- No duplicate records identified

# 5. Exports

In [None]:
#Confirm dimensions
#(rows, columns) of the dataset immediately before it is exported
df_orders_products.shape

In [None]:
#Export cleaned dataset
#Create the 'Prepared Data' folder first if it is missing, so the export at the end of the
#notebook cannot fail on a non-existent directory
prepared_data_dir = os.path.join(path, '02 - Data', 'Prepared Data')
os.makedirs(prepared_data_dir, exist_ok = True)
df_orders_products.to_pickle(os.path.join(prepared_data_dir, 'cleaned_orders_products.pkl'))