In [1]:
# --- Setup
import pandas as pd
import numpy as np
from pathlib import Path
import os

# Display limits: render up to 100 rows and 100 columns of a DataFrame.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

# --- Input location (edit DATA_DIR if the CSVs live somewhere else)
DATA_DIR = Path("./data/bronze")

# One path per source file (edit the filename if yours is named differently)
f_accounts = DATA_DIR.joinpath("ravenstack_accounts.csv")
f_subs = DATA_DIR.joinpath("ravenstack_subscriptions.csv")
f_usage = DATA_DIR.joinpath("ravenstack_feature_usage.csv")
f_tickets = DATA_DIR.joinpath("ravenstack_support_tickets.csv")
f_churn = DATA_DIR.joinpath("ravenstack_churn_events.csv")


# --- Helper to read with parsed dates and lowercased columns
def read_csv_with_dates(path, parse_cols=None):
    """Read a CSV, normalise its column names, and parse selected date columns.

    Column names are stripped of surrounding whitespace and lower-cased. Each
    name in ``parse_cols`` is normalised the same way before it is looked up,
    so ``"Signup_Date"`` or ``" signup_date "`` matches the normalised
    ``signup_date`` column instead of being skipped silently.

    Parameters
    ----------
    path : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.
    parse_cols : str or iterable of str, optional
        Column name(s) to convert with ``pandas.to_datetime``. A single string
        is treated as one column name. Names not present in the file are
        ignored.

    Returns
    -------
    pandas.DataFrame
        The loaded frame. Values that cannot be parsed as dates become ``NaT``.
    """
    df = pd.read_csv(path)
    df.columns = [c.strip().lower() for c in df.columns]

    # A bare string would otherwise be iterated character by character.
    if isinstance(parse_cols, str):
        parse_cols = [parse_cols]

    for requested in parse_cols or ():
        # Apply the same normalisation as the headers so the lookup can match.
        col = requested.strip().lower()
        if col in df.columns:
            # errors="coerce" turns unparseable values into NaT instead of raising.
            df[col] = pd.to_datetime(df[col], errors="coerce", utc=False)
    return df

# Load the five source tables; each call names the columns to parse as dates.
accounts = read_csv_with_dates(
    path=f_accounts,
    parse_cols=["signup_date"],
)
subs = read_csv_with_dates(
    path=f_subs,
    parse_cols=["start_date", "end_date"],
)
usage = read_csv_with_dates(
    path=f_usage,
    parse_cols=["usage_date"],
)
tickets = read_csv_with_dates(
    path=f_tickets,
    parse_cols=["submitted_at", "closed_at"],
)
churn = read_csv_with_dates(
    path=f_churn,
    parse_cols=["churn_date"],
)

In [6]:
import pandas as pd

# --- 1) Find the global date range across your DFs ---
def min_max_date(*series_list):
    """Return the earliest and latest valid date found across the given series.

    Each argument is converted with ``pd.to_datetime(..., errors='coerce')``,
    so unparseable values become ``NaT``. Inputs with no valid date at all are
    skipped.

    Returns
    -------
    tuple
        ``(earliest, latest)`` over every input that holds at least one date.

    Raises
    ------
    ValueError
        If none of the inputs contains a valid date (including when no inputs
        are passed).
    """
    lows = []
    highs = []
    for raw in series_list:
        parsed = pd.to_datetime(raw, errors='coerce')
        if not parsed.notna().any():
            # Nothing usable in this input; it contributes no bounds.
            continue
        lows.append(parsed.min())
        highs.append(parsed.max())

    if len(lows) == 0:
        raise ValueError("No valid dates found in the provided dataframes.")

    earliest = min(lows)
    latest = max(highs)
    return earliest, latest

# Feed every date column of every table to min_max_date, in table order, to get
# the overall first and last date.
start_date, end_date = min_max_date(
    *(
        frame[column]
        for frame, columns in (
            (accounts, ['signup_date']),
            (subs, ['start_date', 'end_date']),
            (usage, ['usage_date']),
            (tickets, ['submitted_at', 'closed_at']),
            (churn, ['churn_date']),
        )
        for column in columns
    )
)

# --- 2) Generate daily calendar range ---
# normalize=True snaps both bounds to midnight before the range is generated.
# Without it, a start bound that carries a time of day would stamp that time on
# every calendar row and could drop the final day of the range.
cal = pd.DataFrame({
    'Date Full': pd.date_range(start=start_date, end=end_date, freq='D', normalize=True)
})

# --- 3) Enrich with calendar attributes ---
iso = cal['Date Full'].dt.isocalendar()  # has .year, .week, .day as UInt32
cal['Year'] = cal['Date Full'].dt.year
cal['Quarter'] = cal['Date Full'].dt.quarter
cal['Month Number'] = cal['Date Full'].dt.month
cal['Month Name'] = cal['Date Full'].dt.month_name()
cal['Month Day'] = cal['Date Full'].dt.day

# ISO-8601 week number + weekday.
# Caution: 'Year' above is the calendar year, not the ISO year, so dates next to
# a year boundary can pair with the neighbouring year's week number
# (2023-01-01 -> Year 2023, Week Number 52). Use iso.year when a year that
# matches 'Week Number' is needed.
cal['Week Number'] = iso.week.astype(int)
# Monday=1 .. Sunday=7
cal['Week Day'] = cal['Date Full'].dt.weekday + 1
cal['Week Day Name'] = cal['Date Full'].dt.day_name()

# Week start/end (Monday-based): step back to Monday, then forward six days to Sunday.
cal['Week Start Date'] = cal['Date Full'] - pd.to_timedelta(cal['Date Full'].dt.weekday, unit='D')
cal['Week End Date'] = cal['Week Start Date'] + pd.Timedelta(days=6)

# Flags: Saturday (6) and Sunday (7) count as weekend.
cal['Is Weekend'] = cal['Week Day'] >= 6

In [7]:
# Expose the calendar under its gold-layer name. This binds a second name to the
# same DataFrame object (no copy), so changes made through either name affect both.
gold_dim_date = cal
# Preview the first five rows.
gold_dim_date.head(5)

Unnamed: 0,Date Full,Year,Quarter,Month Number,Month Name,Month Day,Week Number,Week Day,Week Day Name,Week Start Date,Week End Date,Is Weekend
0,2023-01-01,2023,1,1,January,1,52,7,Sunday,2022-12-26,2023-01-01,True
1,2023-01-02,2023,1,1,January,2,1,1,Monday,2023-01-02,2023-01-08,False
2,2023-01-03,2023,1,1,January,3,1,2,Tuesday,2023-01-02,2023-01-08,False
3,2023-01-04,2023,1,1,January,4,1,3,Wednesday,2023-01-02,2023-01-08,False
4,2023-01-05,2023,1,1,January,5,1,4,Thursday,2023-01-02,2023-01-08,False


In [8]:
# --- 4) Save as GOLD dimension ---
# Create the target folder first: to_csv does not create missing directories,
# so the export would otherwise fail when ./data/gold does not exist yet.
gold_dim_date_path = Path('./data/gold/gold_dim_date.csv')
gold_dim_date_path.parent.mkdir(parents=True, exist_ok=True)
gold_dim_date.to_csv(gold_dim_date_path, index=False)