In [3]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols

# Plot design settings: apply the seaborn theme, then set matplotlib
# defaults one key at a time so each override is easy to find and tweak.
sns.set_theme(palette="viridis", style="whitegrid")

# Default canvas size and line weight
plt.rcParams["figure.figsize"] = (11, 6)
plt.rcParams["lines.linewidth"] = 2.5

# Faint dashed grid lines
plt.rcParams["axes.grid"] = True
plt.rcParams["grid.alpha"] = 0.3
plt.rcParams["grid.linestyle"] = "--"

# Hide the top and right axes spines
plt.rcParams["axes.spines.top"] = False
plt.rcParams["axes.spines.right"] = False

# Anchor every path at the current working directory (also valid in notebooks)
base_dir = os.getcwd()
results_dir = os.path.join(base_dir, "..", "results")
data_dir = os.path.join(base_dir, "..", "data", "raw")

# Make sure the output folder exists before anything is written to it
os.makedirs(results_dir, exist_ok=True)

# Input CSV locations inside the raw-data folder
users_path, ts_path = (
    os.path.join(data_dir, csv_name)
    for csv_name in ("users.csv", "user_timeseries.csv")
)

# Load data: one row per user, and one row per user per date
users = pd.read_csv(users_path, parse_dates=["signup_date"])
timeseries = pd.read_csv(ts_path, parse_dates=["date"])

# Rename if needed: accept "minutes_engaged" as an alias for the outcome
# column, but never overwrite an existing "engagement" column.
if "minutes_engaged" in timeseries.columns and "engagement" not in timeseries.columns:
    timeseries = timeseries.rename(columns={"minutes_engaged": "engagement"})

# Fail fast with a clear message instead of an opaque pandas error later,
# when the outcome column is first selected for aggregation.
if "engagement" not in timeseries.columns:
    raise KeyError(
        "Missing 'engagement' (or 'minutes_engaged') in user timeseries data."
    )

# Merge and validate: attach user attributes to every timeseries row
df = pd.merge(timeseries, users, on="user_id", how="left")
if "new_rec_engine" not in df.columns:
    raise KeyError("Missing 'new_rec_engine' in merged dataset.")

df["new_rec_engine"] = df["new_rec_engine"].astype(int)

# Summarize by user (average engagement)
user_avg = df.groupby("user_id")["engagement"].mean().reset_index()
user_data = users.merge(user_avg, on="user_id", how="left")

# The model is fit on user_data, which is rebuilt from the raw users table,
# so the check and cast applied to df above do not carry over. Validate
# and cast here as well: otherwise a boolean flag is treated as categorical
# by the formula (term "new_rec_engine[T.True]"), and a flag that exists
# only in the timeseries would pass the check above yet be absent here.
if "new_rec_engine" not in user_data.columns:
    raise KeyError("Missing 'new_rec_engine' in user-level dataset.")

user_data["new_rec_engine"] = user_data["new_rec_engine"].astype(int)

# Build the regression formula: the treatment term is always present, and
# each candidate covariate found in user_data is added as a categorical term.
available_covariates = ["segment", "plan_type", "cohort"]
included_covariates = [
    f"C({name})"
    for name in available_covariates
    if name in user_data.columns
]

# Joining the base formula with the (possibly empty) covariate list yields
# the bare base formula when no covariate is available.
formula = " + ".join(["engagement ~ new_rec_engine", *included_covariates])

print(f"Using formula: {formula}")

# Fit model: ordinary least squares on the per-user table
model = ols(formula, data=user_data).fit()

# Output summary
summary = model.summary().as_text()
print(summary)

# Save to results folder
output_path = os.path.join(results_dir, "causal_estimates.txt")
# Explicit encoding: without it open() uses the locale default, so the
# saved report would differ between machines and could fail to write when
# category levels in the term names contain non-ASCII characters.
with open(output_path, "w", encoding="utf-8") as f:
    f.write(summary)


Using formula: engagement ~ new_rec_engine + C(segment) + C(plan_type) + C(cohort)
                            OLS Regression Results                            
Dep. Variable:             engagement   R-squared:                       0.997
Model:                            OLS   Adj. R-squared:                  0.997
Method:                 Least Squares   F-statistic:                 3.610e+05
Date:                Sat, 16 Aug 2025   Prob (F-statistic):               0.00
Time:                        15:08:42   Log-Likelihood:                -7255.4
No. Observations:                5000   AIC:                         1.452e+04
Df Residuals:                    4994   BIC:                         1.456e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------