In [None]:
"""
Author: Hayden Hedman
Date: August 2025

Project: Engagement Lift Analysis
Module: 01_data_simulation.ipynb

Purpose:
--------
This notebook generates synthetic user-level engagement data to support quasi-experimental evaluation of a personalized recommendation engine.
It simulates realistic product usage patterns over time, including intervention group assignment, baseline engagement levels, and time series behavior.

Key Features:
-------------
- Simulates N users with randomized attributes and engagement baselines
- Introduces a treatment effect based on assignment to a recommendation engine variant
- Generates a longitudinal dataset of daily engagement over a configurable date range
- Outputs include:
    1. `users_df` – user-level attributes and group assignments
    2. `timeseries_df` – daily engagement records per user

Note:
-----
This data is synthetic and meant for demonstration purposes only. Files are saved locally (../data/raw/) but excluded from version control via `.gitignore`.
"""

In [2]:
# Load libraries
import os
import numpy as np
import pandas as pd
from datetime import timedelta
import random

✅ Simulated data saved to ../data/raw/


In [None]:
#------------------------------------------------------------------------------------------
# Reproducibility and simulation window
#------------------------------------------------------------------------------------------
# One seed value feeds both the stdlib generator and NumPy's global generator, so every
# Restart & Run All produces the same draws.
SEED = 64
np.random.seed(SEED)
random.seed(SEED)

# Number of simulated users
NUM_USERS = 20_000

# First and last day of the observation window, and the day the new engine is rolled out
START_DATE = pd.to_datetime("2022-01-01")
END_DATE = pd.to_datetime("2022-12-31")
ROLL_OUT_DATE = pd.to_datetime("2022-06-01")

In [None]:
#------------------------------------------------------------------------------------------
# Output location and user-level attributes
#------------------------------------------------------------------------------------------
# Destination folder for the raw CSV files; created on first run, reused afterwards
raw_data_dir = "../data/raw"
os.makedirs(raw_data_dir, exist_ok=True)

# --- User metadata -----------------------------------------------------------------------
# NOTE: the random draws below are kept in this exact order (signup, segment, plan,
# treatment) because they all consume the same seeded global NumPy stream.
user_ids = np.arange(1, NUM_USERS + 1)

# Each user signs up on a day drawn uniformly from START_DATE up to the day before rollout
signup_window = pd.date_range(start=START_DATE, end=ROLL_OUT_DATE - timedelta(days=1))
signup_dates = np.random.choice(signup_window, size=NUM_USERS)

# Users who signed up before 1 March 2022 form the "early" cohort, everyone else is "late"
cohort_cutoff = pd.to_datetime("2022-03-01")
cohorts = np.where(signup_dates < cohort_cutoff, "early", "late")

# Behavioural segment (50/30/20 split) and subscription plan (70/30 split)
segments = np.random.choice(
    ["casual", "binge", "loyal"],
    size=NUM_USERS,
    p=[0.5, 0.3, 0.2],
)
plan_types = np.random.choice(
    ["free", "premium"],
    size=NUM_USERS,
    p=[0.7, 0.3],
)

# Selection bias: "binge" users get the new engine with probability 0.7, all others with 0.3
is_binge = segments == "binge"
base_probs = np.where(is_binge, 0.7, 0.3)
treatment_flags = np.random.binomial(1, base_probs)

# Assemble one row per user
users_df = pd.DataFrame(
    dict(
        user_id=user_ids,
        signup_date=signup_dates,
        cohort=cohorts,
        segment=segments,
        plan_type=plan_types,
        new_rec_engine=treatment_flags,
    )
)

In [3]:
#------------------------------------------------------------------------------------------
# Simulate Time Series Data
#------------------------------------------------------------------------------------------
# Builds one record per (user, day) from the user's signup date through END_DATE.
# Reads:    users_df, START_DATE, END_DATE, ROLL_OUT_DATE
# Produces: timeseries_df with columns user_id, date, minutes_engaged, thumbs_up, skips,
#           genre_action, genre_romance, genre_comedy

# Simulation parameters, defined once instead of being rebuilt for every (user, day) pair
SEGMENT_BASE_MINUTES = {"casual": 10, "binge": 50, "loyal": 30}  # mean daily minutes
PRE_ROLLOUT_BIAS_MINUTES = 3    # extra baseline minutes for treated users before rollout
TREATMENT_LIFT = 1.15           # 15% lift for treated users from rollout onward
ENGAGEMENT_NOISE_SD = 5         # std. dev. of the daily Gaussian noise
HOLIDAY_MONTHS = (11, 12)
HOLIDAY_MULTIPLIER = 1.1        # holiday bump, applied after the noise is drawn

records = []
date_range = pd.date_range(start=START_DATE, end=END_DATE)

# itertuples() replaces iterrows(): it avoids building an object-dtype Series per user.
# The per-record order of random draws (normal, binomial, poisson x4) is unchanged, so the
# seeded global NumPy stream yields the same dataset as the row-by-row version.
for user in users_df.itertuples(index=False):
    segment_base = SEGMENT_BASE_MINUTES[user.segment]

    # Mean engagement before / after rollout depends only on the user, so compute it once.
    # NOTE(review): the pre-rollout bias is dropped at rollout rather than carried forward,
    # so a treated "casual" user moves from a mean of 13 to 11.5 minutes. Confirm that this
    # is the intended data-generating process for the downstream analysis.
    if user.new_rec_engine:
        pre_rollout_mean = segment_base + PRE_ROLLOUT_BIAS_MINUTES
        post_rollout_mean = segment_base * TREATMENT_LIFT
    else:
        pre_rollout_mean = post_rollout_mean = segment_base

    # Only iterate over days on or after signup instead of skipping earlier ones one by one
    active_dates = date_range[date_range >= user.signup_date]

    for date in active_dates:
        mean_minutes = post_rollout_mean if date >= ROLL_OUT_DATE else pre_rollout_mean

        # Add noise and temporal effects
        day_engagement = np.random.normal(loc=mean_minutes, scale=ENGAGEMENT_NOISE_SD)
        if date.month in HOLIDAY_MONTHS:
            day_engagement *= HOLIDAY_MULTIPLIER

        records.append({
            "user_id": user.user_id,
            "date": date,
            "minutes_engaged": max(day_engagement, 0),  # minutes cannot be negative
            "thumbs_up": np.random.binomial(1, 0.1),
            "skips": np.random.poisson(1.5),
            "genre_action": np.random.poisson(1),
            "genre_romance": np.random.poisson(1),
            "genre_comedy": np.random.poisson(1)
        })

# Build the frame once from the accumulated records (no row-by-row concat)
timeseries_df = pd.DataFrame(records)

In [4]:
#------------------------------------------------------------------------------------------
# Persist the simulated tables as CSV
#------------------------------------------------------------------------------------------
# Each frame is written to its own file under raw_data_dir; the row index is omitted
for frame, filename in (
    (users_df, "users.csv"),
    (timeseries_df, "user_timeseries.csv"),
):
    frame.to_csv(f"{raw_data_dir}/{filename}", index=False)

print("Confirmed raw data generated to ../data/raw/")

Confirmed raw data generated to ../data/raw/
