In [9]:
import os
from pathlib import Path

import pandas as pd

# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option("display.max_columns", None)

# Directory holding the raw CSV files read by this notebook.
# Set the OLIST_DATA_RAW environment variable to point at another location;
# when it is unset, the original local path is used so existing runs are unaffected.
DATA_RAW = Path(
    os.environ.get("OLIST_DATA_RAW", "/Users/dishasanthosh/Desktop/data_raw")
).expanduser()


In [11]:
# Logical table name -> CSV file name located inside DATA_RAW.
files = dict(
    orders="olist_orders_dataset.csv",
    items="olist_order_items_dataset.csv",
    payments="olist_order_payments_dataset.csv",
    customers="olist_customers_dataset.csv",
    products="olist_products_dataset.csv",
    reviews="olist_order_reviews_dataset.csv",
    sellers="olist_sellers_dataset.csv",
    categories="product_category_name_translation.csv",
)

# Load each CSV into a DataFrame, keyed by the same logical table name.
dfs = {
    table: pd.read_csv(DATA_RAW.joinpath(filename))
    for table, filename in files.items()
}


In [12]:
# One record per loaded table: its name plus its row and column counts.
summary = []
for name, df in dfs.items():
    n_rows, n_cols = df.shape
    summary.append(dict(table=name, rows=n_rows, cols=n_cols))

# Render the records as a table (last expression is the cell output).
pd.DataFrame(summary)


Unnamed: 0,table,rows,cols
0,orders,99441,8
1,items,112650,7
2,payments,103886,5
3,customers,99441,5
4,products,32951,9
5,reviews,99224,7
6,sellers,3095,4
7,categories,71,2


In [13]:
def key_check(df, keys):
    """Report duplicate and missing values for a candidate key of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    keys : str or iterable of str
        Column name(s) that together form the candidate key. A single
        column may be given as a bare string.

    Returns
    -------
    dict
        ``"duplicates"``: number of rows whose key values repeat an earlier
        row (as a plain ``int``);
        ``"nulls"``: mapping of each key column to its count of missing values.

    Raises
    ------
    KeyError
        If any requested key column is not present in ``df``.
    """
    # Normalise to a list so a bare string selects a one-column frame;
    # otherwise df[keys] would be a Series and .isna().sum() a scalar
    # with no .to_dict().
    key_cols = [keys] if isinstance(keys, str) else list(keys)
    return {
        # int(...) keeps the report free of numpy scalar types.
        "duplicates": int(df.duplicated(subset=key_cols).sum()),
        "nulls": df[key_cols].isna().sum().to_dict(),
    }

# Candidate key column(s) to validate for each table.
# NOTE(review): payments is checked on order_id alone; any duplicates reported
# for it may simply be several payment rows for the same order - confirm the
# intended key before treating them as a data-quality problem.
key_columns = {
    "orders": ["order_id"],
    "items": ["order_id", "order_item_id"],
    "payments": ["order_id"],
    "customers": ["customer_id"],
    "products": ["product_id"],
    "sellers": ["seller_id"],
}

# Run the duplicate/null check for every table listed above.
checks = {
    table: key_check(dfs[table], cols)
    for table, cols in key_columns.items()
}

checks


{'orders': {'duplicates': np.int64(0), 'nulls': {'order_id': 0}},
 'items': {'duplicates': np.int64(0),
  'nulls': {'order_id': 0, 'order_item_id': 0}},
 'payments': {'duplicates': np.int64(4446), 'nulls': {'order_id': 0}},
 'customers': {'duplicates': np.int64(0), 'nulls': {'customer_id': 0}},
 'products': {'duplicates': np.int64(0), 'nulls': {'product_id': 0}},
 'sellers': {'duplicates': np.int64(0), 'nulls': {'seller_id': 0}}}

In [14]:
# Columns of the orders table whose missing values are inspected here.
critical_cols = [
    "order_purchase_timestamp",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
    "order_status",
]

# Fraction of missing values per column, largest first, shown as a percentage.
orders_critical = dfs["orders"][critical_cols]
missing_share = orders_critical.isna().mean().sort_values(ascending=False)
missing_share * 100


order_delivered_customer_date    2.981668
order_purchase_timestamp         0.000000
order_estimated_delivery_date    0.000000
order_status                     0.000000
dtype: float64