In [1]:
from pathlib import Path
import pandas as pd

# Extract

In [2]:
# Input (bronze) and output (silver) directories, relative to the working directory.
BRONZE = Path("data/bronze")
SILVER = Path("data/silver")
# Create the output directory up front; no error if it already exists.
SILVER.mkdir(parents=True, exist_ok=True)

# Accounts: ID columns are read as text, signup_date is parsed as a datetime.
accounts_csv = BRONZE.joinpath("ravenstack_accounts.csv")
accounts = pd.read_csv(
    accounts_csv,
    dtype={"account_id": str},
    parse_dates=["signup_date"],
)

# Subscriptions: both ID columns are read as text, both date columns are parsed.
subs_csv = BRONZE.joinpath("ravenstack_subscriptions.csv")
subs = pd.read_csv(
    subs_csv,
    dtype={"subscription_id": str, "account_id": str},
    parse_dates=["start_date", "end_date"],
)

# Transform

In [3]:
# Normalize IDs (defensive): strip stray whitespace so the join keys match.
# Missing IDs are kept missing. A bare `.astype(str)` would turn NaN into the
# literal string "nan", which would then group and join like a real account key.
accounts["account_id"] = (
    accounts["account_id"].astype(str).str.strip().where(accounts["account_id"].notna())
)
subs["account_id"] = (
    subs["account_id"].astype(str).str.strip().where(subs["account_id"].notna())
)

# -----------------------
# 1) Earliest subscription start per account
# -----------------------
# Rows without a start_date are ignored, so an account whose subscriptions all
# lack a start date does not appear in subs_min at all.
subs_min = (
    subs[["account_id", "start_date"]]
    .dropna(subset=["start_date"])
    .groupby("account_id", as_index=False)["start_date"]
    .min()
    .rename(columns={"start_date": "earliest_sub_start"})
)

# -----------------------
# 2) Start from accounts with the columns you want to keep
# -----------------------
cols_to_drop = ["plan_tier", "seats", "is_trial", "churn_flag"]
cols_to_keep = [c for c in accounts.columns if c not in cols_to_drop]

# subs_min holds one row per account_id, so the left merge yields exactly one
# output row per accounts row; `validate` makes the merge fail loudly otherwise.
silver_accounts = accounts[cols_to_keep].merge(
    subs_min, on="account_id", how="left", validate="many_to_one"
)

# -----------------------
# 3) Replace signup_date with earliest subscription start when available
#    (if an account has no subs, keep the original signup_date)
# -----------------------
# Snapshot both columns before overwriting so the report below can compare them.
original_signup = silver_accounts["signup_date"].copy()
earliest_sub_start = silver_accounts["earliest_sub_start"]

# A row counts as updated only when a subscription start exists AND differs from
# the original date. Comparing these row-aligned columns (instead of looking the
# originals up again by account_id) avoids two problems: NaT != NaT evaluating
# to True for accounts with no dates at all, and a reindex error when an
# account_id appears more than once.
updated_mask = (
    earliest_sub_start.notna() & earliest_sub_start.ne(original_signup)
).to_numpy()

silver_accounts["signup_date"] = earliest_sub_start.combine_first(original_signup)

# -----------------------
# 4) Drop helper column
# -----------------------
silver_accounts = silver_accounts.drop(columns=["earliest_sub_start"])

# Optional helper: signup_month for convenience downstream.
# First day of the signup month; missing dates stay NaT.
silver_accounts["signup_month"] = (
    silver_accounts["signup_date"].dt.to_period("M").dt.to_timestamp()
)

# -----------------------
# Sanity checks & reporting
# -----------------------
updated_count = int(updated_mask.sum())

print(f"[OK] silver_accounts built: {len(silver_accounts):,} rows")
print(f"[INFO] Signup date updated for {updated_count} account(s) out of {len(silver_accounts):,}.")

[OK] silver_accounts built: 500 rows
[INFO] Signup date updated for 475 account(s) out of 500.


In [4]:
# Peek: display the first five rows of the transformed table.
silver_accounts.head(n=5)

Unnamed: 0,account_id,account_name,industry,country,signup_date,referral_source,signup_month
0,A-2e4581,Company_0,EdTech,US,2024-10-20,partner,2024-10-01
1,A-43a9e3,Company_1,FinTech,IN,2023-09-20,other,2023-09-01
2,A-0a282f,Company_2,DevTools,US,2024-08-31,organic,2024-08-01
3,A-1f0ac7,Company_3,HealthTech,UK,2023-12-04,other,2023-12-01
4,A-ce550d,Company_4,HealthTech,US,2024-10-28,event,2024-10-01


# Load

In [5]:
# --- Save ---
# Write the table to the silver directory as CSV, leaving out the row index.
out_path = SILVER.joinpath("silver_accounts.csv")
silver_accounts.to_csv(path_or_buf=out_path, index=False)
print(f"[OK] Wrote {out_path}")

[OK] Wrote data\silver\silver_accounts.csv
