In [4]:
# 01_simulate_data_bayesian_analysis.py
# Simulates semi-synthetic user and event data for a fictional streaming platform (Streamly).
# Outputs:
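# - users.csv: user-level data with user_id, region, subscription_type, and signup_date
# - events.csv: monthly user-level panel data with user_id, month, subscription_type, region,
#   usage_time, transactions, and churn_flag (a user's rows stop after the month they churn)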
# This serves as the raw input for downstream product metric teardown and Bayesian churn modeling.

In [5]:
# Load libraries
import numpy as np
import pandas as pd
from datetime import datetime
from pathlib import Path

In [6]:
# Locate the project root relative to wherever the kernel was started:
# when launched from inside a "notebooks" folder, step up one level;
# otherwise treat the working directory itself as the root.
CURRENT_DIR = Path.cwd().resolve()
if CURRENT_DIR.name == "notebooks":
    BASE_DIR = CURRENT_DIR.parent
else:
    BASE_DIR = CURRENT_DIR

# Raw outputs live under <root>/data/raw; create the folder on first run.
RAW_DATA_DIR = BASE_DIR.joinpath("data", "raw")
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

In [7]:
# Prepare synthetic dataset
# Seed NumPy's legacy global RNG so repeated runs produce identical data.
# Every draw below comes from this single global stream, so the ORDER of the
# np.random calls (region -> plan -> signup, then per user-month:
# churn draw -> usage_time -> transactions) determines the exact output.
np.random.seed(64)

# Settings
n_users = 3000
start_date = pd.to_datetime("2023-01-01")
end_date = pd.to_datetime("2024-06-30")
# One month-start timestamp per calendar month in the simulation window.
months = pd.date_range(start=start_date, end=end_date, freq="MS")

# Create users
user_ids = [f"u_{i}" for i in range(n_users)]
regions = ["US", "EU", "LATAM"]
region_probs = [0.5, 0.3, 0.2]
plans = ["trial", "monthly", "annual"]
plan_probs = [0.2, 0.6, 0.2]

users = pd.DataFrame({
    "user_id": user_ids,
    "region": np.random.choice(regions, n_users, p=region_probs),
    "subscription_type": np.random.choice(plans, n_users, p=plan_probs),
    # Signup month is drawn uniformly from the simulation window.
    "signup_date": np.random.choice(months, n_users)
})

# Assign per-month churn probability by plan (lower for annual)
churn_base = {"trial": 0.25, "monthly": 0.15, "annual": 0.05}

# Activity distributions keyed by churn_flag: the churn month (1) draws from
# lower-mean distributions than a retained month (0).
usage_gamma = {0: (3, 3), 1: (2, 2)}  # churn_flag -> (gamma shape, gamma scale)
transactions_rate = {0: 2, 1: 1}      # churn_flag -> Poisson mean

events = []

# itertuples avoids building a Series per row (as iterrows does) and keeps
# each column's native value type.
for user in users.itertuples(index=False):
    monthly_churn_prob = churn_base[user.subscription_type]

    # A user only produces rows from their signup month onward.
    for month in months[months >= user.signup_date]:
        # Draw order matters for reproducibility: churn first, then activity.
        churn_flag = int(np.random.rand() < monthly_churn_prob)
        gamma_shape, gamma_scale = usage_gamma[churn_flag]
        events.append({
            "user_id": user.user_id,
            "month": month,
            "subscription_type": user.subscription_type,
            "region": user.region,
            "usage_time": np.random.gamma(gamma_shape, gamma_scale),
            "transactions": np.random.poisson(transactions_rate[churn_flag]),
            "churn_flag": churn_flag
        })
        if churn_flag:
            break  # user churns; no rows are generated after the churn month

In [8]:
# Turn the accumulated per-month event records into a single table
events_df = pd.DataFrame(events)

# Write both tables to the root-level data/raw/ folder, without the index column
for csv_name, frame in (("users.csv", users), ("events.csv", events_df)):
    frame.to_csv(RAW_DATA_DIR / csv_name, index=False)

print(f"Confirmed: simulated users and events saved to {RAW_DATA_DIR}")

Confirmed: simulated users and events saved to C:\Users\hayde\Desktop\bayesian-product-metrics\data\raw
