In [10]:
import pandas as pd
import numpy as np

# Console rendering: widen the line budget so the frames printed below do
# not wrap, and cap how many rows a printed frame may show.
pd.options.display.width = 140
pd.options.display.max_rows = 30

# ----------------------------
# 1) Create mock raw dataset
# ----------------------------
# The sample is written one invoice line per record (easier to eyeball than
# eight parallel lists) and then pivoted into the column -> values mapping
# that the DataFrame is built from.
raw_columns = (
    "InvoiceNo", "StockCode", "Description", "Quantity",
    "InvoiceDate", "UnitPrice", "CustomerID", "Country",
)
raw_records = [
    ("10001", "A1", "Mug",     2, "2010-12-01 08:45",  5.0, 12345, "United Kingdom"),
    ("10001", "B2", "T-shirt", 1, "2010-12-01 08:45", 15.0, 12345, "United Kingdom"),
    ("10002", "A1", "Mug",     3, "2010-12-02 10:05",  5.0, 99999, "France"),
    ("10003", "C3", "Hat",     1, "2010-12-05 14:20", 10.0, 77777, "Germany"),
    ("10003", "A1", "Mug",     1, "2010-12-05 14:20",  5.0, 77777, "Germany"),
    ("10003", "B2", "T-shirt", 2, "2010-12-05 14:20", 15.0, 77777, "Germany"),
]

# Transpose records into per-column lists, preserving the column order above.
data = {
    column: list(values)
    for column, values in zip(raw_columns, zip(*raw_records))
}
df_raw = pd.DataFrame(data)
print("✅ Created df_raw:", df_raw.shape)

# ----------------------------
# 2) Clean & types → df
# ----------------------------
# Work on a copy so df_raw stays available unchanged.
df = df_raw.copy()

# Header normalisation: trim, lower-case, then turn inner spaces into "_".
trimmed_lower = df.columns.str.strip().str.lower()
df.columns = trimmed_lower.str.replace(" ", "_", regex=False)

# Type coercion. errors="coerce" turns unparseable values into NaT/NaN so
# they can be removed by the dropna below instead of raising here.
df["invoicedate"] = pd.to_datetime(df["invoicedate"], errors="coerce")
for c in ("quantity", "unitprice"):
    df[c] = pd.to_numeric(df[c], errors="coerce")

# A line is only usable when all of these fields are present.
required_fields = ["invoicedate", "quantity", "unitprice", "invoiceno"]
df = df.dropna(subset=required_fields)
print("✅ Clean df:", df.shape)

# ----------------------------
# 3) Build df_full (order_items ⟷ orders)
# ----------------------------
# Split the flat table into an invoice-level table and a line-level table,
# then join them back on the invoice number.
orders_cols      = ["invoiceno", "customerid", "invoicedate", "country"]
order_item_cols  = ["invoiceno", "stockcode", "description", "quantity", "unitprice"]

# One row per invoice: the first occurrence of each invoiceno is kept.
invoice_headers = df[orders_cols].drop_duplicates(subset=["invoiceno"])
orders = invoice_headers.reset_index(drop=True)

# One row per invoice line.
order_items = df[order_item_cols].reset_index(drop=True)

df_full = order_items.merge(orders, on="invoiceno", how="inner")

# Derived columns: line revenue and the calendar month of the invoice.
df_full["revenue"] = df_full["quantity"].mul(df_full["unitprice"])
df_full["month"]   = df_full["invoicedate"].dt.to_period("M")

print("✅ df_full:", df_full.shape)
print(df_full.head())

# ----------------------------
# 4) True AOV (per invoice)
# ----------------------------
# AOV = total revenue / number of invoices, so line items are first rolled up
# into one row per invoice; averaging the line items directly would give the
# average *line* value instead.
#
# dropna=False matters: by default groupby discards every row that has a NaN
# in any grouping key. Invoices with a missing customerid (or country) would
# silently vanish, understating revenue and breaking the totals check in
# section 8. With dropna=False they are kept under a NaN key.
invoice_rev = (df_full
    .groupby(["invoiceno","month","country","customerid"],
             as_index=False, dropna=False)["revenue"].sum()
    .rename(columns={"revenue":"invoice_revenue"})
)

# Roll invoices up to month x country. `nunique` ignores NaN, so invoices
# without a customerid still count toward `invoices` and `total_revenue`
# but do not inflate `customers`.
summary_true_aov = (
    invoice_rev.groupby(["month","country"], as_index=False, dropna=False)
               .agg(
                    invoices=("invoiceno","nunique"),
                    customers=("customerid","nunique"),
                    total_revenue=("invoice_revenue","sum"),
               )
)
# Every group holds at least one invoice, so the divisor is never zero.
summary_true_aov["average_order_value"] = (
    summary_true_aov["total_revenue"] / summary_true_aov["invoices"]
)

print("\n📊 Summary (true AOV per invoice):")
print(summary_true_aov.sort_values(["month","total_revenue"], ascending=[True,False]))

# ----------------------------
# 5) Top-3 invoices per month
# ----------------------------
# Order invoices by month, biggest first, then number them within each month.
# method="first" breaks revenue ties by row order, so ranks are always unique.
invoice_rank = invoice_rev.sort_values(
    by=["month", "invoice_revenue"], ascending=[True, False]
)
invoice_rank["rank_in_month"] = (
    invoice_rank.groupby("month")["invoice_revenue"]
                .rank(method="first", ascending=False)
)

is_top3 = invoice_rank["rank_in_month"].le(3)
top3_invoices = invoice_rank.loc[is_top3]

print("\n🥇 Top-3 invoices per month:")
report_cols = ["month","invoiceno","country","invoice_revenue","rank_in_month"]
print(top3_invoices[report_cols])

# ----------------------------
# 6) Monthly country ranking (all + top-3)
# ----------------------------
# Revenue per (month, country), listed month by month with the highest
# earning country first.
revenue_by_month_country = df_full.groupby(["month","country"])["revenue"].sum()
monthly_country_rank = (
    revenue_by_month_country
        .reset_index(name="monthly_revenue")
        .sort_values(by=["month","monthly_revenue"], ascending=[True, False])
)

# The frame is already sorted, so the first three rows of each month are
# that month's three best countries.
months = monthly_country_rank.groupby("month", group_keys=False)
top3_countries_per_month = months.head(3)

print("\n🌍 Monthly country ranking (all rows):")
print(monthly_country_rank)

print("\n🏆 Top-3 countries per month:")
print(top3_countries_per_month)

# ----------------------------
# 7) Top-3 customers per country
# ----------------------------
# Lifetime revenue per (country, customer), sorted so each country's biggest
# spenders come first.
customer_totals = (
    df_full.groupby(["country","customerid"])["revenue"]
           .sum()
           .reset_index(name="total_revenue")
)
customer_totals = customer_totals.sort_values(
    by=["country","total_revenue"], ascending=[True, False]
)

# Keep at most three leading rows for every country.
top_customers_by_country = (
    customer_totals.groupby("country", group_keys=False).head(3)
)
print("\n👤 Top-3 customers per country:")
print(top_customers_by_country)

# ----------------------------
# 8) Sanity checks
# ----------------------------
# Invariant 1: rolling line items up to invoices must not gain or lose
# revenue. isclose is used because both sides are float sums.
total_raw = df_full["revenue"].sum()
total_invoice = invoice_rev["invoice_revenue"].sum()
totals_agree = np.isclose(total_raw, total_invoice)
assert totals_agree, "❌ Totals mismatch: line-items vs invoice sums!"

# Invariant 2: no line item carries negative revenue.
all_lines_non_negative = df_full["revenue"].ge(0).all()
assert all_lines_non_negative, "❌ Negative revenue found."

print("\n✅ Sanity checks passed. Everything is consistent.")

✅ Created df_raw: (6, 8)
✅ Clean df: (6, 8)
✅ df_full: (6, 10)
  invoiceno stockcode description  quantity  unitprice  customerid         invoicedate         country  revenue    month
0     10001        A1         Mug         2        5.0       12345 2010-12-01 08:45:00  United Kingdom     10.0  2010-12
1     10001        B2     T-shirt         1       15.0       12345 2010-12-01 08:45:00  United Kingdom     15.0  2010-12
2     10002        A1         Mug         3        5.0       99999 2010-12-02 10:05:00          France     15.0  2010-12
3     10003        C3         Hat         1       10.0       77777 2010-12-05 14:20:00         Germany     10.0  2010-12
4     10003        A1         Mug         1        5.0       77777 2010-12-05 14:20:00         Germany      5.0  2010-12

📊 Summary (true AOV per invoice):
     month         country  invoices  customers  total_revenue  average_order_value
1  2010-12         Germany         1          1           45.0                 45.0
2  2010-