# 07 — Cleaning + data quality

Mirrors: `chapters/07_cleaning_quality.md`


In [None]:
import sys
from pathlib import Path

import pandas as pd

# Make the tutorial's shared helper module importable. The directory is
# resolved against the current working directory, so it only points at the
# right place when the notebook is run from the repository root.
shared = Path.cwd() / "docs" / "tutorials" / "python" / "modules" / "pandas" / "shared"

# Keep this cell idempotent: drop any entry a previous run of the cell left
# behind before putting the directory back at the front of the search path,
# so re-running never grows sys.path with duplicates.
shared_entry = str(shared)
sys.path[:] = [entry for entry in sys.path if entry != shared_entry]
sys.path.insert(0, shared_entry)

# Imported here rather than with the other imports because the module only
# becomes importable after the sys.path change above.
from make_orders import make_orders

# Build the example orders table used by the rest of the notebook and preview it.
orders = make_orders()
orders.head()

In [None]:
# Build a deliberately corrupted copy of `orders`; `orders` itself is not modified.
dirty = orders.copy()

# (row label, column, value) triples, written into `dirty` in this order.
injected_defects = [
    (0, "shipping_status", " Delivered  "),  # padded with stray whitespace
    (1, "shipping_status", "In Transit"),  # mixed case, words separated by a space
    (2, "price", -10),  # negative price
    (3, "customer", None),  # missing customer
]
for row_label, column_name, bad_value in injected_defects:
    dirty.loc[row_label, column_name] = bad_value

# Preview only the columns that were touched.
dirty[["shipping_status", "price", "customer"]].head(5)

In [None]:
# Profile missingness: for every column of `dirty`, count the missing values
# and compute the share of rows they represent.
na_mask = dirty.isna()
missing_count = na_mask.sum().rename("missing_count")
missing_rate = na_mask.mean().rename("missing_rate")

# One row per column, ordered from the highest missing share to the lowest.
missing = (
    missing_count.to_frame()
    .join(missing_rate)
    .sort_values("missing_rate", ascending=False)
)
missing

In [None]:
# Clean
# Lower-cased status spellings that are rewritten to "in_transit".
status_map = {"in transit": "in_transit", "in-transit": "in_transit"}


def _clean_orders(frame, replacements):
    """Return a cleaned copy of ``frame`` with two validity flag columns added.

    Rows whose ``customer`` is missing are dropped. ``shipping_status`` is cast
    to the pandas ``string`` dtype, stripped of surrounding whitespace,
    lower-cased, and then passed through ``replacements`` (a value-to-value
    mapping). ``price`` and ``quantity`` are coerced to numeric, with values
    that cannot be parsed becoming missing. ``is_valid_price`` and
    ``is_valid_qty`` are true only where the coerced value is present and >= 0.
    """
    kept = frame.dropna(subset=["customer"])

    status = kept["shipping_status"].astype("string").str.strip().str.lower()
    price = pd.to_numeric(kept["price"], errors="coerce")
    quantity = pd.to_numeric(kept["quantity"], errors="coerce")

    # `assign` returns a new frame: existing columns are replaced in place and
    # the two flag columns are appended after them, in this order.
    return kept.assign(
        shipping_status=status.replace(replacements),
        price=price,
        quantity=quantity,
        is_valid_price=price.ge(0) & price.notna(),
        is_valid_qty=quantity.ge(0) & quantity.notna(),
    )


clean = _clean_orders(dirty, status_map)

clean[["shipping_status", "price", "is_valid_price"]].head()

In [None]:
# Keep only the rows that passed both the price check and the quantity check.
both_valid = clean["is_valid_price"] & clean["is_valid_qty"]
clean2 = clean[both_valid].copy()

# Row counts at each stage: messy input, after cleaning, after filtering.
tuple(len(frame) for frame in (dirty, clean, clean2))