In [114]:
import pandas as pd
from pathlib import Path
import sys
import os
import importlib

In [None]:
# Add the parent directory of src to sys.path
#sys.path.append(str(Path("..") / "03_src"))

In [109]:
import utils

In [5]:
# Directory that holds the raw Olist CSV exports.
# The original machine-specific absolute Windows location is kept as the default so
# existing runs behave the same; set the OLIST_RAW_DIR environment variable to point
# somewhere else without editing the notebook.
path = os.environ.get(
    "OLIST_RAW_DIR",
    "C:\\Users\\acer\\Documents\\Data Analyst - Field Training Program\\Portfolio\\3. E-commerce\\olist_ops_project_dap3\\01_data\\01_raw\\",
)
# File names are appended to `path` by plain string concatenation, so an overridden
# directory must end with a path separator.
if not path.endswith(("\\", "/")):
    path += os.sep

In [7]:
# Read the four raw tables; every file is named olist_<table>_dataset.csv and is
# located directly under `path`.
customers, order_items, orders, sellers = (
    pd.read_csv(path + f"olist_{table}_dataset.csv")
    for table in ("customers", "order_items", "orders", "sellers")
)


In [25]:
# Loaded tables keyed by a short name, so later cells can loop over all of them.
datasets = dict(
    customers=customers,
    order_items=order_items,
    orders=orders,
    sellers=sellers,
)

In [None]:
# Print the (rows, columns) shape of every loaded table.
for name, frame in datasets.items():
    shape = frame.shape
    print(f"{name}: {shape}")

customers:(99441, 5)
order_items:(112650, 7)
orders:(99441, 8)
sellers:(3095, 4)


In [40]:
# Column names, non-null counts and dtypes for every table

for table_name, table in datasets.items():
    # Table name as a heading, followed by a blank line.
    print(f"{table_name}\n")
    table.info()
    # Vertical gap before the next table's summary.
    print("\n\n\n")

customers

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB




order_items

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   order_id             112650 non-null  object 
 1   order_item_id        112650 non-null  int64  
 2   product_id           112650 non-null  object 
 3   seller_id            112650 non-null  object 
 4   shipping_limit_

## Data Cleaning Checklist

- [x] Datatype conversions
- [x] Duplicates
- [x] Missing Values

### Duplicates

In [43]:
# Count rows that are exact copies of an earlier row, per table

for table_name, table in datasets.items():
    n_duplicate_rows = table.duplicated().sum()
    print(f"{table_name}: {n_duplicate_rows}")
    

customers: 0
order_items: 0
orders: 0
sellers: 0


In [17]:
# Number of customer_id values that repeat an earlier row's value.
customer_ids = customers['customer_id']
customer_ids.duplicated().sum()

np.int64(0)

In [44]:
# customer_unique_id may repeat across rows; the notebook reads each repeat as a
# returning (repeat) customer.

unique_ids = customers['customer_unique_id']
unique_ids.duplicated().sum()

np.int64(3345)

In [45]:
# order_items has one row per item per order (grain: order_id + order_item_id),
# so order_id alone is expected to repeat.

item_order_ids = order_items['order_id']
item_order_ids.duplicated().sum()

np.int64(13984)

In [23]:
# Check the composite key (order_id, order_item_id): count rows whose key pair
# repeats an earlier row.

order_items[['order_id', "order_item_id"]].duplicated().sum()

np.int64(0)

In [24]:
# Number of seller_id values that repeat an earlier row's value.
seller_ids = sellers['seller_id']
seller_ids.duplicated().sum()

np.int64(0)

### Datatype Conversions

In [47]:
# Show the column labels of `orders`.
orders.columns

Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date'],
      dtype='object')

In [48]:
# Columns of `orders` that are parsed to datetime and later checked for missing values.
order_date_cols = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]

In [49]:
# Parse every date column of `orders` in one pass. errors='coerce' is forwarded to
# pd.to_datetime, so values that cannot be parsed become NaT instead of raising.
orders[order_date_cols] = orders[order_date_cols].apply(pd.to_datetime, errors='coerce')

In [53]:
# Parse shipping_limit_date in place on `order_items`; values that cannot be parsed
# become NaT (errors='coerce').
shipping_limit_raw = order_items['shipping_limit_date']
order_items['shipping_limit_date'] = pd.to_datetime(shipping_limit_raw, errors='coerce')

In [52]:
# Print the column summary (non-null counts and dtypes) of `orders` to confirm the
# date columns are now datetime64.
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  object        
 1   customer_id                    99441 non-null  object        
 2   order_status                   99441 non-null  object        
 3   order_purchase_timestamp       99441 non-null  datetime64[ns]
 4   order_approved_at              99281 non-null  datetime64[ns]
 5   order_delivered_carrier_date   97658 non-null  datetime64[ns]
 6   order_delivered_customer_date  96476 non-null  datetime64[ns]
 7   order_estimated_delivery_date  99441 non-null  datetime64[ns]
dtypes: datetime64[ns](5), object(3)
memory usage: 6.1+ MB


In [54]:
# Print the column summary (non-null counts and dtypes) of `order_items` to confirm
# shipping_limit_date is now datetime64.
order_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             112650 non-null  object        
 1   order_item_id        112650 non-null  int64         
 2   product_id           112650 non-null  object        
 3   seller_id            112650 non-null  object        
 4   shipping_limit_date  112650 non-null  datetime64[ns]
 5   price                112650 non-null  float64       
 6   freight_value        112650 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 6.0+ MB


### Dropping missing values

In [57]:
# List the distinct values found in the order_status column.
orders['order_status'].unique()

array(['delivered', 'invoiced', 'shipped', 'processing', 'unavailable',
       'canceled', 'created', 'approved'], dtype=object)

In [66]:
# Order statuses that make up the KPI subset; used with .isin() to split `orders`.
kpi_status = ['delivered']

In [78]:
# Independent copy of the orders whose status is listed in kpi_status.
is_kpi_order = orders['order_status'].isin(kpi_status)
orders_kpi = orders.loc[is_kpi_order].copy()

In [80]:
# Independent copy of the orders whose status is NOT listed in kpi_status.
is_other_status = ~orders['order_status'].isin(kpi_status)
orders_non_delivered = orders.loc[is_other_status].copy()

In [79]:
# Number of orders left out of the KPI subset (any status not in kpi_status)
len(orders) - len(orders_kpi)

2963

In [84]:
# Missing values per date column among the KPI orders.
orders_kpi.loc[:, order_date_cols].isna().sum()

order_purchase_timestamp          0
order_approved_at                14
order_delivered_carrier_date      2
order_delivered_customer_date     8
order_estimated_delivery_date     0
dtype: int64

In [85]:
# First pass: drop KPI orders that lack an approval or customer-delivery timestamp.
required_dates = ['order_approved_at', 'order_delivered_customer_date']
orders_kpi_clean = orders_kpi.dropna(subset=required_dates).copy()

In [86]:
# Missing values per date column after the drop.
orders_kpi_clean.loc[:, order_date_cols].isna().sum()

order_purchase_timestamp         0
order_approved_at                0
order_delivered_carrier_date     1
order_delivered_customer_date    0
order_estimated_delivery_date    0
dtype: int64

In [90]:
# Rebuild orders_kpi_clean from orders_kpi, this time also requiring the carrier
# hand-over timestamp to be present.
required_dates = [
    'order_approved_at',
    'order_delivered_customer_date',
    'order_delivered_carrier_date',
]
orders_kpi_clean = orders_kpi.dropna(subset=required_dates).copy()

In [91]:
# Missing values per date column after the drop; every count should now be zero.
orders_kpi_clean.loc[:, order_date_cols].isna().sum()

order_purchase_timestamp         0
order_approved_at                0
order_delivered_carrier_date     0
order_delivered_customer_date    0
order_estimated_delivery_date    0
dtype: int64

In [119]:
# Write the cleaned KPI orders to the interim folder, without the index column.
# The target is assembled with pathlib instead of concatenating literal "\\"
# separators, so the file lands inside 02_interim on any OS, not only on Windows.
# NOTE(review): assumes utils.path() returns the data root directory as a str — confirm.
orders_kpi_clean.to_csv(
    Path(utils.path()) / "02_interim" / "orders_kpi_clean.csv",
    index=False,
)

In [120]:
# Write order_items (with shipping_limit_date parsed) to the interim folder, without
# the index column.
# The target is assembled with pathlib instead of concatenating literal "\\"
# separators, so the file lands inside 02_interim on any OS, not only on Windows.
# NOTE(review): assumes utils.path() returns the data root directory as a str — confirm.
order_items.to_csv(
    Path(utils.path()) / "02_interim" / "order_items.csv",
    index=False,
)