In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

# Extract

In [2]:
# Data-layer directories, all relative to the current working directory.
DATA = Path("data")
BRONZE = DATA.joinpath("bronze")
SILVER = DATA.joinpath("silver")
GOLD = DATA.joinpath("gold")

# Only the output (gold) directory is created here; the input layers are
# expected to exist already because they are read from below.
GOLD.mkdir(parents=True, exist_ok=True)

# Title-case converter with acronym handling
ACRONYM_MAP = {"id":"ID","mrr":"MRR","arr":"ARR","api":"API","utc":"UTC","usa":"USA"}


def snake_to_title(col: str) -> str:
    """Convert a snake_case column name into a space-separated title.

    Each underscore-separated segment is capitalized, except segments found
    in ``ACRONYM_MAP`` (matched case-insensitively), which are replaced by
    their upper-case spelling: ``"account_id"`` -> ``"Account ID"``.

    Args:
        col: Column label; it is passed through ``str()`` first, so
            non-string labels are accepted.

    Returns:
        The title-cased label. If the label contains no non-empty segment
        (e.g. ``""`` or ``"_"``) the stripped label is returned unchanged so
        the column keeps a usable name.
    """
    cleaned = str(col).strip()
    # Leading, trailing or doubled underscores produce empty segments; drop
    # them so the result has no stray leading/trailing/double spaces
    # ("_id" -> "ID", "mrr__amount" -> "MRR Amount").
    parts = [p for p in cleaned.split("_") if p]
    if not parts:
        return cleaned
    return " ".join(ACRONYM_MAP.get(p.lower(), p.capitalize()) for p in parts)
def rename_title(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` with every column label passed through ``snake_to_title``.

    The input frame is not modified; ``DataFrame.rename`` produces the result.
    """
    title_by_column = dict(zip(df.columns, map(snake_to_title, df.columns)))
    return df.rename(columns=title_by_column)

# Load
# Identifier columns are read as strings and the date columns are parsed
# at read time.
accounts = pd.read_csv(
    SILVER / "silver_accounts.csv",
    dtype={"account_id": str},
    parse_dates=["signup_date"],
)
# NOTE(review): subscriptions are read from the bronze layer while the other
# two tables come from silver — confirm this is intentional.
subs = pd.read_csv(
    BRONZE / "ravenstack_subscriptions.csv",
    dtype={"subscription_id": str, "account_id": str},
    parse_dates=["start_date", "end_date"],
)
usage = pd.read_csv(
    SILVER / "silver_feature_usage.csv",
    dtype={"subscription_id": str, "usage_id": str},
    parse_dates=["usage_date"],
)

# Transform

In [3]:
# Helper months
def month_floor(dates: pd.Series) -> np.ndarray:
    """Cast the values of ``dates`` to NumPy ``datetime64[M]`` (month precision)."""
    return dates.values.astype("datetime64[M]")


# Both source frames get their month column added in place.
usage["usage_month"] = month_floor(usage["usage_date"])
accounts["signup_month"] = month_floor(accounts["signup_date"])

# First usage date by account (via sub join)
# Usage rows carry only a subscription id, so the account id is looked up
# through the subscriptions table; validate= makes pandas raise if a
# subscription_id is duplicated on the subscriptions side.
sub_to_account = subs[["subscription_id", "account_id"]]
usage_sub = pd.merge(
    usage,
    sub_to_account,
    how="left",
    on="subscription_id",
    validate="many_to_one",
)

# Usage rows whose subscription matched no account are excluded before
# taking the earliest usage date per account.
has_account = usage_sub["account_id"].notna()
first_use = (
    usage_sub[has_account]
    .groupby("account_id")["usage_date"]
    .min()
    .rename("first_usage_date")
    .reset_index()
)

# Left join: accounts with no usage keep a missing first_usage_date.
accounts_gold = pd.merge(accounts, first_use, how="left", on="account_id")

# --- Country code → name mapping (apply before rename_title) ---
country_map = dict(
    US="United States",
    AU="Australia",
    CA="Canada",
    DE="Germany",
    FR="France",
    IN="India",
    UK="United Kingdom",  # dataset uses UK; keep as requested
)

# Codes are trimmed and upper-cased only for the lookup; any value that has
# no entry in country_map falls back to its original, unmodified value.
if "country" in accounts_gold.columns:
    raw_country = accounts_gold["country"]
    country_codes = raw_country.astype(str).str.strip().str.upper()
    accounts_gold["country"] = country_codes.map(country_map).fillna(raw_country)

In [4]:
# GOLD: dim_account
dim_account_cols = [
    "account_id","account_name","industry","country",
    "signup_date","signup_month","referral_source",
    "first_usage_date"
]
# Select only the columns that are actually present, matching how the fact
# tables in this cell are built. A strict accounts_gold[dim_account_cols]
# raises KeyError as soon as one column is absent, even though the country
# mapping step explicitly allows "country" to be missing.
dim_account_present = [c for c in dim_account_cols if c in accounts_gold.columns]
dim_account_missing = [c for c in dim_account_cols if c not in accounts_gold.columns]
if dim_account_missing:
    # Make the reduced schema visible instead of dropping columns silently.
    print(f"[WARN] gold_dim_account: columns not found and skipped: {dim_account_missing}")
gold_dim_account = rename_title(accounts_gold[dim_account_present].copy())

# GOLD: fact_subscription
# Coerce the measure columns to numeric in place on `subs`; values that
# cannot be parsed become NaN (errors="coerce"). Absent columns are skipped.
numeric_measures = [name for name in ("mrr_amount", "arr_amount", "seats") if name in subs.columns]
for name in numeric_measures:
    subs[name] = pd.to_numeric(subs[name], errors="coerce")

fact_subscription_cols = [
    "subscription_id", "account_id",
    "start_date", "end_date",
    "plan_tier", "seats",
    "mrr_amount", "arr_amount",
    "is_trial", "upgrade_flag", "downgrade_flag", "churn_flag",
    "billing_frequency", "auto_renew_flag",
]
# Keep the listed order but only the columns that exist in the source file.
subscription_cols_present = [name for name in fact_subscription_cols if name in subs.columns]
gold_fact_subscription = rename_title(subs[subscription_cols_present].copy())

# GOLD: fact_feature_usage (denormalize Account Id for convenience)
# NOTE(review): this is the same usage -> subscription join that the transform
# cell builds as `usage_sub`; it is recomputed here rather than reused.
subscription_accounts = subs[["subscription_id", "account_id"]]
usage_with_acct = pd.merge(
    usage,
    subscription_accounts,
    how="left",
    on="subscription_id",
    validate="many_to_one",
)

fact_feature_usage_cols = [
    "usage_id", "subscription_id", "account_id",
    "usage_date", "usage_month",
    "feature_name", "usage_count", "usage_duration_secs",
    "error_count", "is_beta_feature",
]
# Keep the listed order but only the columns that exist after the join.
usage_cols_present = [name for name in fact_feature_usage_cols if name in usage_with_acct.columns]
gold_fact_feature_usage = rename_title(usage_with_acct[usage_cols_present].copy())

# Load

In [5]:
# Write
# Each gold table goes to its own CSV under GOLD, without the index column.
for csv_name, gold_frame in (
    ("gold_dim_account.csv", gold_dim_account),
    ("gold_fact_subscription.csv", gold_fact_subscription),
    ("gold_fact_feature_usage.csv", gold_fact_feature_usage),
):
    gold_frame.to_csv(GOLD / csv_name, index=False)

# Status summary: one header line followed by one line per written file.
print(
    "[OK] GOLD refreshed from silver_feature_usage:",
    " - data/gold/gold_dim_account.csv",
    " - data/gold/gold_fact_subscription.csv",
    " - data/gold/gold_fact_feature_usage.csv",
    sep="\n",
)

[OK] GOLD refreshed from silver_feature_usage:
 - data/gold/gold_dim_account.csv
 - data/gold/gold_fact_subscription.csv
 - data/gold/gold_fact_feature_usage.csv
