# In [27]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product

# Seed NumPy's global RNG so every run regenerates the same synthetic data.
np.random.seed(64)

# Simulation parameters
n_users = 5000
n_days = 60  # 30 pre, 30 post
intervention_day = 30

# Bounds applied to the simulated ages. An unbounded N(35, 10) draw over
# thousands of users produces small children and can even reach zero or
# negative values, which are not meaningful user ages.
min_age = 18
max_age = 90

# User-level covariates (one row per user). The np.random calls run in the
# order the dict entries are written; keep that order so a given seed keeps
# producing the same draws.
users = pd.DataFrame({
    "user_id": np.arange(n_users),
    # Clip first, then truncate to int, so every age is in [min_age, max_age].
    "age": np.clip(
        np.random.normal(35, 10, size=n_users), min_age, max_age
    ).astype(int),
    "is_premium": np.random.binomial(1, 0.3, n_users),
    "device": np.random.choice(["mobile", "desktop", "tablet"], size=n_users, p=[0.6, 0.3, 0.1]),
    "region": np.random.choice(["US", "EU", "LATAM", "APAC"], size=n_users),
    "signup_channel": np.random.choice(["organic", "paid", "referral"], size=n_users, p=[0.5, 0.4, 0.1]),
    "power_user": np.random.binomial(1, 0.25, n_users),
    "is_female": np.random.binomial(1, 0.5, n_users),  # binary gender flag
})

# Assign treatment group: every premium user on a mobile device is treated,
# and any user is additionally treated when their uniform draw falls below 0.1.
premium_on_mobile = users["is_premium"].eq(1) & users["device"].eq("mobile")
random_assignment = np.random.rand(n_users) < 0.1
users["treatment"] = (premium_on_mobile | random_assignment).astype(int)

# Create user-day observations: one row per (user, day), ordered by user and
# then by day. The key columns are built with vectorised NumPy repeats rather
# than by materialising a Python list of n_users * n_days tuples.
user_ids = users["user_id"].to_numpy()
obs = pd.DataFrame({
    "user_id": np.repeat(user_ids, n_days),
    "day": np.tile(np.arange(n_days), len(user_ids)),
})
# Attach the user-level columns to every daily row.
obs = obs.merge(users, on="user_id")
# post is 1 from intervention_day onward; post_treated is the product of
# post and treatment, i.e. 1 only for treated users in the post period.
obs["post"] = (obs["day"] >= intervention_day).astype(int)
obs["post_treated"] = obs["post"] * obs["treatment"]

# Simulate engagement (main outcome) as a linear combination of the columns
# below plus standard-normal noise. The terms are combined in the same
# left-to-right order as a single written-out formula would be.
baseline = 3
time_trend = 0.05 * obs["day"]
treatment_lift = 1.5 * obs["post_treated"]
age_penalty = 0.01 * obs["age"]
power_user_bonus = 0.5 * obs["power_user"]
noise = np.random.normal(0, 1, size=len(obs))
obs["engagement"] = (
    baseline + time_trend + treatment_lift - age_penalty + power_user_bonus + noise
)

# Add prior engagement: each user's mean engagement over their
# pre-intervention rows (post == 0), broadcast to every row of that user.
# Post-period values are masked to NaN first, and the grouped mean skips NaN,
# so only pre-period rows contribute. This does in one vectorised pass what a
# per-user Python lambda re-indexing `obs` would do once per group.
pre_period_engagement = obs["engagement"].where(obs["post"] == 0)
obs["prior_engagement"] = (
    pre_period_engagement.groupby(obs["user_id"]).transform("mean")
)

# Save synthetic dataset: make sure the output directory exists, then write
# the user-day table as CSV without the DataFrame index.
output_dir = "data"
output_file = "data/simulated_user_behavior.csv"
os.makedirs(output_dir, exist_ok=True)
obs.to_csv(output_file, index=False)
