In [2]:
import pandas as pd
import numpy as np

url_email_obs = "https://raw.githubusercontent.com/matheusfacure/causal-inference-in-python-code/main/causal-inference-in-python/data/email_obs_data.csv"
url_email_rnd = "https://raw.githubusercontent.com/matheusfacure/causal-inference-in-python-code/main/causal-inference-in-python/data/email_rnd_data.csv"

data_biased = pd.read_csv(url_email_obs)
data_rnd = pd.read_csv(url_email_rnd)

In [8]:
y = "next_mnth_pv"
T = "mkt_email"
X = list(data_rnd.drop(columns=[y, T]).columns)

train, test = data_biased, data_rnd

## T-Learner

In [19]:
from lightgbm import LGBMRegressor

np.random.seed(123)
m0 = LGBMRegressor()
m1 = LGBMRegressor()

m0.fit(train.query(f"{T}==0")[X], train.query(f"{T}==0")[y])
m1.fit(train.query(f"{T}==1")[X], train.query(f"{T}==1")[y])

t_learner_cate_test = test.assign(cate=m1.predict(test[X]) - m0.predict(test[X]))

In [20]:

from toolz import curry

@curry
def effect(data, y, t):
    return (np.sum((data[t] - data[t].mean()) * data[y]) / sum( (data[t] - data[t].mean())**2 ))

def cumulative_gain_curve(df, prediction, y, t, ascending=False, normalize=False, steps=100):
    effect_fn = effect(t = t, y = y)
    normalizer = effect_fn(df) if normalize else 0

    size = len(df)
    ordered_df = (df
                  .sort_values(prediction, ascending=ascending)
                  .reset_index(drop=True))
    
    steps = np.linspace(size / steps, size, steps).round(0)
    effects = [(effect_fn(ordered_df.query(f"index <= {row}")) - normalizer) * (row / size) for row in steps]

    return np.array([0] + effects)

In [22]:
np.trapz(cumulative_gain_curve(t_learner_cate_test, "cate", y, T, normalize=True))

10532.16173183944

## X Learner

In [26]:
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMRegressor

# Propensity score model
ps_model = LogisticRegression(penalty="none")
ps_model.fit(train[X], train[T])

# First stage models
train_t0 = train.query(f"{T}==0")
train_t1 = train.query(f"{T}==1")

m0 = LGBMRegressor()
m1 = LGBMRegressor()

np.random.seed(123)

m0.fit(train_t0[X], train_t0[y], sample_weight=1 / ps_model.predict_proba(train_t0[X])[:, 0])
m1.fit(train_t1[X], train_t1[y], sample_weight=1 / ps_model.predict_proba(train_t1[X])[:, 1])

# Second stage
tau_hat_0 = m1.predict(train_t0[X]) - train_t0[y]
tau_hat_1 = m0.predict(train_t1[X]) - train_t1[y]

m_tau_0 = LGBMRegressor()
m_tau_1 = LGBMRegressor()

np.random.seed(123)

m_tau_0.fit(train_t0[X], tau_hat_0)
m_tau_1.fit(train_t1[X], tau_hat_1)

# estimate the Cate
ps_test = ps_model.predict_proba(test[X])[:, 1]
x_cate_test = test.assign(
    cate = (ps_test * m_tau_0.predict(test[X]) + (1 - ps_test)*m_tau_1.predict(test[X]))
)


In [27]:
np.trapz(cumulative_gain_curve(x_cate_test, "cate", y, T, normalize=True))

3083.5822393048334