In [26]:
"""
Script: 02_causal_gee_analysis.py
Author: Hayden Hedman
Created: June 2025
Purpose: 
    This script implements a generalized estimating equations (GEE) model to estimate the causal effect 
    of a product treatment on user engagement over time, adjusting for user-level covariates. 
    The analysis accounts for repeated measures by clustering at the user level.

Inputs:
    - data/simulated_user_behavior.csv (generated from script 01)

Outputs:
    - Console summary of GEE model results
    - Interpretation-ready coefficient estimates with robust standard errors

Dependencies:
    - pandas
    - numpy
    - statsmodels
    - matplotlib (optional, for future plotting)

Notes:
    - Designed for clarity and auditability
    - Can be adapted for real-world A/B test evaluation or panel data modeling
"""

'\nScript: 02_causal_gee_analysis.py\nAuthor: Hayden Hedman\nCreated: June 2025\nPurpose: \n    This script implements a generalized estimating equations (GEE) model to estimate the causal effect \n    of a product treatment on user engagement over time, adjusting for user-level covariates. \n    The analysis accounts for repeated measures by clustering at the user level.\n\nInputs:\n    - data/simulated_user_behavior.csv (generated from script 01)\n\nOutputs:\n    - Console summary of GEE model results\n    - Interpretation-ready coefficient estimates with robust standard errors\n\nDependencies:\n    - pandas\n    - numpy\n    - statsmodels\n    - matplotlib (optional, for future plotting)\n\nNotes:\n    - Designed for clarity and auditability\n    - Can be adapted for real-world A/B test evaluation or panel data modeling\n'

In [27]:
#Load libraries
import pandas as pd
import statsmodels.api as sm
from statsmodels.genmod.cov_struct import Independence
import os

In [28]:
# Location of the simulated dataset, expressed relative to the working directory
# (written for a kernel started from the /notebooks folder).
DATA_PATH = os.path.join("..", "data", "simulated_user_behavior.csv")

# Read the CSV when the path exists; otherwise stop with an error naming the path.
if os.path.exists(DATA_PATH):
    df = pd.read_csv(DATA_PATH)
else:
    not_found_msg = f"Expected dataset not found at: {DATA_PATH}"
    raise FileNotFoundError(not_found_msg)

# Report the (rows, columns) shape of the loaded frame.
print("Confirmed: Data loaded:", df.shape)

Confirmed: Data loaded: (300000, 14)


In [29]:
# `df` was already read from DATA_PATH by the previous cell. This cell used to
# repeat that exact existence check and `pd.read_csv` call, parsing the same CSV a
# second time only to rebind `df` to identical contents. Reuse the loaded frame and
# fail fast here if it has no rows, so the model-fitting cells below are never
# handed an empty dataset.
if df.empty:
    raise ValueError(f"Dataset loaded from {DATA_PATH} contains no rows")

In [30]:
# Peek at the loaded frame: the first rows, then the complete list of column names.
print("Confirmed: data loaded:")
print(df.head())
print("\n Columns:", df.columns.tolist())

# GEE configuration: Gaussian family with an independence working correlation.
fam = sm.families.Gaussian()
ind = Independence()

# Every variable named in the formula, plus the grouping column, must be present in
# the frame before the model is built.
required_cols = [
    "engagement",
    "post",
    "treatment",
    "post_treated",
    "age",
    "is_female",
    "prior_engagement",
    "user_id",
]
missing = set(required_cols).difference(df.columns)
if len(missing) > 0:
    raise ValueError(f"Missing required columns: {missing}")

# Build the GEE model with observations grouped by user_id, then fit it.
gee_formula = "engagement ~ post + treatment + post_treated + age + is_female + prior_engagement"
model = sm.GEE.from_formula(
    gee_formula,
    groups="user_id",
    data=df,
    cov_struct=ind,
    family=fam,
)
result = model.fit()

Confirmed: data loaded:
   user_id  day  age  is_premium  device region signup_channel  power_user  \
0        0    0   46           0  mobile   APAC        organic           1   
1        0    1   46           0  mobile   APAC        organic           1   
2        0    2   46           0  mobile   APAC        organic           1   
3        0    3   46           0  mobile   APAC        organic           1   
4        0    4   46           0  mobile   APAC        organic           1   

   is_female  treatment  post  post_treated  engagement  prior_engagement  
0          0          0     0             0    2.078719          3.730117  
1          0          0     0             0    1.373123          3.730117  
2          0          0     0             0    3.510399          3.730117  
3          0          0     0             0    2.622221          3.730117  
4          0          0     0             0    3.729853          3.730117  

 Columns: ['user_id', 'day', 'age', 'is_premium', 

In [31]:
# Print a header line, then the summary table of the fitted GEE model.
results_header = "\n GEE Results:"
print(results_header)
gee_summary = result.summary()
print(gee_summary)


 GEE Results:
                               GEE Regression Results                              
Dep. Variable:                  engagement   No. Observations:               300000
Model:                                 GEE   No. clusters:                     5000
Method:                        Generalized   Min. cluster size:                  60
                      Estimating Equations   Max. cluster size:                  60
Family:                           Gaussian   Mean cluster size:                60.0
Dependence structure:         Independence   Num. iterations:                     2
Date:                     Sun, 17 Aug 2025   Scale:                           1.184
Covariance type:                    robust   Time:                         13:40:07
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.7881      0.022     36.366      0.00