# In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import dowhy
from dowhy import CausalModel
import statsmodels.api as sm

# Set random seed for reproducibility
np.random.seed(42)

# ----------------------------------------------------------------------
# Step 1: Simulate Synthetic Data for Difference-in-Differences
# ----------------------------------------------------------------------
# Balanced two-group, two-period panel: the first half of the units is the
# treatment group and the second half is the control group.
# Every unit is observed in period 0 (pre-treatment) and period 1 (post-treatment).

n_units = 200                        # Total units, split evenly between the two groups
n_treated = n_units // 2             # Units with index < n_treated receive the treatment
baseline_mean, baseline_sd = 50, 10  # Distribution of the unit-specific baseline outcome
common_trend = 5                     # Shift shared by both groups in the post period
true_effect = 10                     # Extra shift for treated units in the post period
noise_sd = 5                         # Std. dev. of the per-observation noise

data = []
for unit in range(n_units):
    treated = int(unit < n_treated)
    # The baseline is drawn once per unit, so it is shared by both of the unit's rows.
    baseline = np.random.normal(baseline_mean, baseline_sd)
    for time_period in (0, 1):
        trend = common_trend * time_period
        # Non-zero only for treated units in the post-treatment period.
        treatment_effect = true_effect * treated * time_period
        # Outcome = unit baseline + common trend + treatment effect + fresh noise.
        y = baseline + trend + treatment_effect + np.random.normal(0, noise_sd)
        data.append([unit, treated, time_period, y])

# One row per (unit, period) observation.
df = pd.DataFrame(data, columns=["unit", "treated", "post", "y"])

# Interaction term (treated x post) used as the DiD regressor.
df["did"] = df["treated"] * df["post"]

# ----------------------------------------------------------------------
# Step 2: Estimate DiD Effect Using OLS Regression
# ----------------------------------------------------------------------
# The DiD model is specified as:
#   y = β₀ + β₁ * treated + β₂ * post + β₃ * (treated × post) + ε
# The coefficient β₃ is the DiD causal effect estimate.

X = sm.add_constant(df[["treated", "post", "did"]])

# Each unit contributes two rows (pre and post), so the errors of those rows are
# not independent. Cluster the standard errors by unit; this changes the reported
# uncertainty (std. errors, t-stats, confidence intervals) but not the coefficients.
model_ols = sm.OLS(df["y"], X).fit(
    cov_type="cluster",
    cov_kwds={"groups": df["unit"]},
)
print(model_ols.summary())

# The coefficient on the "did" variable represents the causal impact of the treatment.
print("Difference-in-Differences Estimate (β₃):", model_ols.params["did"])

# ----------------------------------------------------------------------
# Step 3: Formalize the Causal Model Using DoWhy
# ----------------------------------------------------------------------
# In line with the decision tree in the Uber paper, we define a simplified causal graph.
# Node names must match the DataFrame column names so that DoWhy can link the graph
# to the data:
#   - treated (group membership) and post (time period) jointly determine did, the
#     indicator for actually being exposed to the treatment (treated x post);
#   - treated and post also affect the outcome directly (group level and common trend);
#   - did -> y is the effect of interest, i.e. the DiD effect;
#   - U stands for unobserved factors that influence the outcome only and are assumed
#     to affect both groups similarly (parallel trends).
# Treatment assignment does not cause time, so there is no edge from treated to post.

causal_graph = """
digraph {
    U [label="Unobserved factors"];
    treated [label="Treatment group (treated)"];
    post [label="Post period (post)"];
    did [label="Treatment exposure (treated x post)"];
    y [label="Outcome (y)"];
    treated -> did;
    post -> did;
    treated -> y;
    post -> y;
    did -> y;
    U -> y;
}
"""

# Create a causal model with DoWhy based on the simulated data and graph.
# The causal variable is the exposure indicator "did", not group membership alone.
model_dowhy = CausalModel(
    data=df,
    treatment="did",
    outcome="y",
    graph=causal_graph
)

# (Optional) Visualize the causal graph in a Jupyter Notebook:
# model_dowhy.view_model()

# Identify the causal effect from the graph and data.
# proceed_when_unidentifiable=True avoids an interactive prompt about the unobserved node.
identified_estimand = model_dowhy.identify_effect(proceed_when_unidentifiable=True)
print("\nIdentified Estimand:", identified_estimand)

# Estimate the effect using the backdoor linear regression estimator available in DoWhy.
# With treated and post as the backdoor adjustment set, this is expected to reproduce
# the β₃ coefficient on "did" from the Step 2 regression.
estimate = model_dowhy.estimate_effect(identified_estimand,
                                       method_name="backdoor.linear_regression")
print("DoWhy Causal Estimate (via Regression):", estimate.value)
