In [2]:
import pandas as pd
import skrub
from skrub._data_ops._provenance import trace_row, explain_provenance

In [4]:
# =========================================================================
# 1. CREATE SOURCE DATA
# =========================================================================

# Each table is written row by row (one tuple per record) so that a single
# order, customer, or product can be read straight across.

# Five orders; customer_id and product_id are the join keys used later on.
orders_df = pd.DataFrame(
    [
        (101, 1, 'A', 50, '2024-01-01'),
        (102, 2, 'B', 120, '2024-01-02'),
        (103, 1, 'A', 75, '2024-01-03'),
        (104, 3, 'C', 200, '2024-01-04'),
        (105, 2, 'B', 90, '2024-01-05'),
    ],
    columns=['order_id', 'customer_id', 'product_id', 'amount', 'date'],
)

# One row per customer, keyed by customer_id.
customers_df = pd.DataFrame(
    [
        (1, 'Alice', 'East'),
        (2, 'Bob', 'West'),
        (3, 'Charlie', 'East'),
    ],
    columns=['customer_id', 'name', 'region'],
)

# One row per product, keyed by product_id.
products_df = pd.DataFrame(
    [
        ('A', 'Electronics', 30),
        ('B', 'Clothing', 80),
        ('C', 'Electronics', 150),
    ],
    columns=['product_id', 'category', 'cost'],
)

# =========================================================================
# 2. BUILD PIPELINE WITH NORMAL SKRUB SYNTAX
# =========================================================================

# Wrap every source frame in a named skrub variable. Per the original
# author's note, provenance is initialized automatically in Var.compute,
# so nothing provenance-specific has to be written in this section.
orders = skrub.var(
    'orders',
    orders_df,
)
customers = skrub.var(
    'customers',
    customers_df,
)
products = skrub.var(
    'products',
    products_df,
)

# Step 1: keep only the high-value orders (amount above 70).
high_value = orders.query(
    'amount > 70'
)

# Step 2: attach customer attributes via the customer_id key.
with_customers = high_value.merge(
    customers,
    on='customer_id',
)

# Step 3: attach product attributes via the product_id key.
full_data = with_customers.merge(
    products,
    on='product_id',
)

# Branch A: restrict the fully joined data to the Electronics category.
electronics = full_data.query(
    'category == "Electronics"'
)

# Branch B: per-region totals computed from the fully joined data (all
# categories). A dict spec is passed to agg() so the result is a DataFrame
# rather than a Series.
region_aggregations = {'amount': 'sum', 'order_id': 'count'}
sales_by_region = full_data.groupby('region').agg(region_aggregations)

# =========================================================================
# 3. EVALUATE AND QUERY PROVENANCE
# =========================================================================

# Materialize both pipelines into concrete results.
electronics_result = electronics.skb.eval()
regional_result = sales_by_region.skb.eval()

# Print each result under a 60-character banner. Only the second banner is
# preceded by a blank line, hence the per-entry prefix.
banner_rule = "=" * 60
for banner_prefix, banner_title, banner_frame in (
    ("", "ELECTRONICS ORDERS", electronics_result),
    ("\n", "SALES BY REGION", regional_result),
):
    print(banner_prefix + banner_rule)
    print(banner_title)
    print(banner_rule)
    print(banner_frame)

# =========================================================================
# 4. TRACE PROVENANCE
# =========================================================================

section_rule = "=" * 60
print("\n" + section_rule)
print("PROVENANCE QUERIES")
print(section_rule)

# Per-row explanation for every row of the electronics result.
print("\nElectronics Orders - Provenance:")
for row_pos in range(electronics_result.shape[0]):
    explanation = explain_provenance(electronics_result, row_pos)
    print(f"\n{explanation}")

# For every aggregated region, list the source rows recorded for each
# input table alongside the aggregated figures.
print("\n\nRegional Sales - Provenance:")
for row_pos in range(regional_result.shape[0]):
    sources = trace_row(regional_result, row_pos)
    region_label = regional_result.index[row_pos]
    region_row = regional_result.iloc[row_pos]
    print(f"\n  {region_label}: ${region_row['amount']} ({region_row['order_id']} orders)")
    # A table missing from the trace is reported as an empty list.
    for caption, table_name in (
        ("Orders contributing", 'orders'),
        ("Customers involved", 'customers'),
        ("Products involved", 'products'),
    ):
        print(f"    {caption}: rows {sources.get(table_name, [])}")

ELECTRONICS ORDERS
   order_id  customer_id product_id  amount        date     name region  \
1       103            1          A      75  2024-01-03    Alice   East   
2       104            3          C     200  2024-01-04  Charlie   East   

      category  cost                                 _skrub_provenance_  
1  Electronics    30  {'orders': [2], 'customers': [0], 'products': ...  
2  Electronics   150  {'orders': [3], 'customers': [2], 'products': ...  

SALES BY REGION
        amount  order_id                                 _skrub_provenance_
region                                                                     
East       275         2  {'orders': [2, 3], 'customers': [0, 2], 'produ...
West       210         2  {'orders': [1, 4], 'customers': [1], 'products...

PROVENANCE QUERIES

Electronics Orders - Provenance:

Row 0 provenance:
  - From 'customers': row 0
  - From 'orders': row 2
  - From 'products': row 0

Row 1 provenance:
  - From 'customers': row 2
  - From 'or

In [7]:
# Show the first row's full provenance entry; the printed table truncates this column.
electronics_result['_skrub_provenance_'].iloc[0]

{'orders': [2], 'customers': [0], 'products': [0]}