In [1]:
import numpy as np
import pandas as pd

In [2]:
def generate_data_dml(n=2000, seed=42):
    """Simulate data from a partially linear treatment-effect model.

    Ten independent standard-normal covariates ``X0..X9`` are drawn. A binary
    treatment ``D`` is sampled with propensity
    ``sigmoid(0.2*X0 + 0.3*X1)``, and the outcome is
    ``Y = 5 + 2*D + 0.5*X0 + 0.2*X1**2 - 0.3*sin(X2) + eps`` with
    ``eps ~ N(0, 1)``, so the true treatment effect is 2.

    Parameters
    ----------
    n : int, default 2000
        Number of observations (rows) to simulate.
    seed : int or None, default 42
        Seed for the private random generator used by this function.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with columns ``X0..X9``, ``D`` and ``Y``.
    """
    # A private legacy RandomState produces the same stream as
    # np.random.seed(seed) followed by np.random.* calls, so the simulated
    # values are unchanged, but the caller's global NumPy RNG is left alone.
    rng = np.random.RandomState(seed)

    # X: 10-dimensional standard-normal covariates
    X = rng.normal(size=(n, 10))

    # True propensity function for D: p(X) = sigmoid(0.2*X0 + 0.3*X1).
    # The propensity itself is deterministic in X; the only randomness is the
    # Bernoulli draw of D below.
    logits = 0.2 * X[:, 0] + 0.3 * X[:, 1]
    p = 1 / (1 + np.exp(-logits))
    D = rng.binomial(1, p)

    # True outcome function
    # Y = 5 + 2*D + f(X) + noise
    # with f(X) = 0.5*X0 + 0.2*X1^2 - 0.3*sin(X2)
    f_X = 0.5 * X[:, 0] + 0.2 * (X[:, 1] ** 2) - 0.3 * np.sin(X[:, 2])
    Y = 5 + 2 * D + f_X + rng.normal(0, 1, n)  # ATE = 2

    df = pd.DataFrame(X, columns=[f"X{i}" for i in range(10)])
    df['D'] = D
    df['Y'] = Y
    return df

# Simulate the data set with the default sample size and seed, then preview
# the first five rows.
df_dml = generate_data_dml(n=2000, seed=42)
df_dml.head(5)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,D,Y
0,0.496714,-0.138264,0.647689,1.52303,-0.234153,-0.234137,1.579213,0.767435,-0.469474,0.54256,1,6.888174
1,-0.463418,-0.46573,0.241962,-1.91328,-1.724918,-0.562288,-1.012831,0.314247,-0.908024,-1.412304,1,6.046513
2,1.465649,-0.225776,0.067528,-1.424748,-0.544383,0.110923,-1.150994,0.375698,-0.600639,-0.291694,0,5.345171
3,-0.601707,1.852278,-0.013497,-1.057711,0.822545,-1.220844,0.208864,-1.95967,-1.328186,0.196861,0,6.450545
4,0.738467,0.171368,-0.115648,-0.301104,-1.478522,-0.719844,-0.460639,1.057122,0.343618,-1.76304,0,5.884974


In [3]:
from doubleml import DoubleMLData, DoubleMLDIDData, DoubleMLPLR
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# Prepare data in DoubleML format.
# df_dml is the frame built by generate_data_dml() in the previous cell, so it
# is reused here rather than regenerated.
# NOTE(review): the AttributeError about numpy._no_nep50_warning recorded in
# this cell's output looks like a NumPy / dependency version mismatch in the
# environment rather than a problem in this code - confirm package versions
# and restart the kernel after changing them.
feature_cols = [f"X{i}" for i in range(10)]
X = df_dml[feature_cols].values
y = df_dml['Y'].values
d = df_dml['D'].values

# Step 1: Create DoubleMLData object
data = DoubleMLData.from_arrays(X, y, d)

# Step 2: Specify learners
learner_outcome = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
learner_treatment = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)

# Step 3: Initialize the Double ML estimator (Partially Linear Regression).
# Seed the global NumPy RNG explicitly so the cross-fitting fold assignment is
# reproducible (previously this relied on seeding done inside the data
# generator as a side effect).
np.random.seed(42)
dml_plr = DoubleMLPLR(
    data,
    ml_l=learner_outcome,     # model for E[Y|X] (named ml_l in current DoubleML)
    ml_m=learner_treatment,   # model for E[D|X] = P(D=1|X); classifier is OK for binary D
    n_folds=5,                # cross-fitting folds
    score='partialling out'   # PLR accepts 'partialling out' or 'IV-type', not 'ATE'
)

# Step 4: Fit
dml_plr.fit()

# Step 5: Extract the estimated effect and its standard error.
# DoubleML exposes these as `coef` / `se` (no trailing underscore). With the
# constant treatment effect simulated here, the PLR coefficient is the ATE.
ate = dml_plr.coef
ate_se = dml_plr.se
print(f"DoubleML ATE estimate: {ate[0]:.3f} (SE: {ate_se[0]:.3f})")

AttributeError: module 'numpy' has no attribute '_no_nep50_warning'