<a href="https://colab.research.google.com/github/fikrifaizz/ecommerce-dashboard/blob/main/notebooks/01_data_inspection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
RAW_DATA = Path('../data/raw')

# Read each core table from its CSV under RAW_DATA.
# Every frame keeps its own explicit name because later cells refer to them directly.
orders = pd.read_csv(RAW_DATA.joinpath('olist_orders_dataset.csv'))
order_items = pd.read_csv(RAW_DATA.joinpath('olist_order_items_dataset.csv'))
products = pd.read_csv(RAW_DATA.joinpath('olist_products_dataset.csv'))
customers = pd.read_csv(RAW_DATA.joinpath('olist_customers_dataset.csv'))
payments = pd.read_csv(RAW_DATA.joinpath('olist_order_payments_dataset.csv'))
reviews = pd.read_csv(RAW_DATA.joinpath('olist_order_reviews_dataset.csv'))

print("DATASET OVERVIEW")

# Registry of the loaded frames keyed by a short label; insertion order
# determines the order in which the tables are reported.
tables = dict([
    ('orders', orders),
    ('order_items', order_items),
    ('products', products),
    ('customers', customers),
    ('payments', payments),
    ('reviews', reviews),
])

# One four-line summary per table: label, row count, column count and
# deep memory footprint (bytes summed over columns, converted to MB).
for name, df in tables.items():
    print(
        f"\n{name.upper()}",
        f"  Rows: {len(df):,}",
        f"  Columns: {len(df.columns)}",
        f"  Memory: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB",
        sep="\n",
    )

DATASET OVERVIEW

ORDERS
  Rows: 99,441
  Columns: 8
  Memory: 58.97 MB

ORDER_ITEMS
  Rows: 112,650
  Columns: 7
  Memory: 39.43 MB

PRODUCTS
  Rows: 32,951
  Columns: 9
  Memory: 6.79 MB

CUSTOMERS
  Rows: 99,441
  Columns: 5
  Memory: 29.62 MB

PAYMENTS
  Rows: 103,886
  Columns: 5
  Memory: 17.81 MB

REVIEWS
  Rows: 99,224
  Columns: 7
  Memory: 42.75 MB


In [4]:
# Banner line that opens the data-quality report printed by this cell.
print("DATA QUALITY ASSESSMENT")

def check_data_quality(df, table_name):
    """Print a short missing-value / duplicate-row report for one table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    table_name : str
        Label printed as the heading of the report.

    Returns
    -------
    tuple
        ``(null_counts, n_duplicates)`` where ``null_counts`` is a Series
        holding the number of null cells for every column (columns without
        nulls included, with a count of 0) and ``n_duplicates`` is the
        number of rows flagged by ``df.duplicated()``.
    """
    print(f"\n{table_name}:")

    # Per-column null counts; only the affected columns are printed,
    # but the full Series is what gets returned.
    null_counts = df.isnull().sum()
    affected = null_counts[null_counts > 0]
    if affected.empty:
        print("No missing values")
    else:
        print("Missing values:")
        print(affected.to_string())

    # Rows that are flagged as repeats by DataFrame.duplicated().
    n_duplicates = df.duplicated().sum()
    print(f"Duplicates: {n_duplicates:,}")

    return null_counts, n_duplicates

# Produce the quality report for every registered table, in registry order
for table_label, table_df in tables.items():
    check_data_quality(table_df, table_name=table_label)

DATA QUALITY ASSESSMENT

orders:
Missing values:
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
Duplicates: 0

order_items:
No missing values
Duplicates: 0

products:
Missing values:
product_category_name         610
product_name_lenght           610
product_description_lenght    610
product_photos_qty            610
product_weight_g                2
product_length_cm               2
product_height_cm               2
product_width_cm                2
Duplicates: 0

customers:
No missing values
Duplicates: 0

payments:
No missing values
Duplicates: 0

reviews:
Missing values:
review_comment_title      87656
review_comment_message    58247
Duplicates: 0
