# IPW v1

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [2]:
path = "../data"

In [3]:
lalonde_obs = pd.read_csv(f"{path}/lalonde_psid.csv")

## Variable Transformation

### Age

In [4]:
age_bins = [0, 25, 33, 40, float("inf")]
age_labels = ["0-25", "26-33", "34-40", "41+"]

In [5]:
lalonde_obs["age_group"] = pd.cut(
    lalonde_obs["age"],
    bins=age_bins,
    labels=age_labels,
    right=False,
    include_lowest=True,
)

### Education

In [6]:
bins = [0, 5, 8, 12, float("inf")]
labels = ["0-5", "6-8", "9-12", "13+"]

In [7]:
lalonde_obs["education_group"] = pd.cut(
    lalonde_obs["education"], bins=bins, labels=labels, right=True, include_lowest=True
)

## IPW - With Grouped Variables

In [8]:
lalonde_obs_dummified_df = pd.get_dummies(
    lalonde_obs[
        [
            "married",
            "re74",
            "re75",
            "treat",
            "age_group",
            "black",
            "hispanic",
            "education_group",
            "id",
        ]
    ]
)

In [9]:
bool_columns = [
    col
    for col in lalonde_obs_dummified_df.columns
    if "age_group" in col or "education_group" in col
]
for col in bool_columns:
    lalonde_obs_dummified_df[col] = lalonde_obs_dummified_df[col].astype(int)

In [10]:
X = lalonde_obs_dummified_df.drop(["treat", "id"], axis=1)
y = lalonde_obs_dummified_df["treat"]

X = sm.add_constant(X)

In [11]:
model = sm.Logit(y, X).fit()

Optimization terminated successfully.
         Current function value: 0.087341
         Iterations 11


In [12]:
print(model.summary())

                           Logit Regression Results                           
Dep. Variable:                  treat   No. Observations:                 2675
Model:                          Logit   Df Residuals:                     2663
Method:                           MLE   Df Model:                           11
Date:                Sun, 19 May 2024   Pseudo R-squ.:                  0.6527
Time:                        01:48:11   Log-Likelihood:                -233.64
converged:                       True   LL-Null:                       -672.65
Covariance Type:            nonrobust   LLR p-value:                3.285e-181
                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   -0.6734        nan        nan        nan         nan         nan
married                 -1.9431      0.271     -7.170      0.000      -2.474      -1.412
re74        

In [13]:
propensity_scores = model.predict(X).reset_index(drop=True)
lalonde_obs_dummified_df["propensity"] = propensity_scores
lalonde_obs_dummified_df["ipw"] = 1 / lalonde_obs_dummified_df["propensity"]
lalonde_obs_dummified_df = pd.merge(
    lalonde_obs_dummified_df, lalonde_obs[["id", "re78"]], on="id"
)

In [14]:
lalonde_obs_dummified_df.groupby("treat").apply(
    lambda x: np.average(x["re78"], weights=x["ipw"])
)

  lalonde_obs_dummified_df.groupby("treat").apply(


treat
0    88663.593725
1    16560.554625
dtype: float64

## IPW - With no groupings

In [15]:
lalonde_obs_confounders_df = lalonde_obs[
    [
        "married",
        "re74",
        "re75",
        "treat",
        "age",
        "black",
        "hispanic",
        "education",
        "id",
    ]
].copy()

In [16]:
X = lalonde_obs_confounders_df.drop(["treat", "id"], axis=1)
y = lalonde_obs_confounders_df["treat"]

X = sm.add_constant(X)

In [17]:
model = sm.Logit(y, X).fit()

Optimization terminated successfully.
         Current function value: 0.088313
         Iterations 11


In [18]:
print(model.summary())

                           Logit Regression Results                           
Dep. Variable:                  treat   No. Observations:                 2675
Model:                          Logit   Df Residuals:                     2667
Method:                           MLE   Df Model:                            7
Date:                Sun, 19 May 2024   Pseudo R-squ.:                  0.6488
Time:                        01:48:11   Log-Likelihood:                -236.24
converged:                       True   LL-Null:                       -672.65
Covariance Type:            nonrobust   LLR p-value:                3.544e-184
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.3027      0.911      2.527      0.012       0.517       4.089
married       -1.5884      0.260     -6.103      0.000      -2.098      -1.078
re74          -0.0001   2.94e-05     -3.986      0.0

In [19]:
propensity_scores = model.predict(X).reset_index(drop=True)
lalonde_obs_confounders_df["propensity"] = propensity_scores
lalonde_obs_confounders_df["ipw"] = 1 / lalonde_obs_confounders_df["propensity"]
lalonde_obs_confounders_df = pd.merge(
    lalonde_obs_confounders_df, lalonde_obs[["id", "re78"]], on="id"
)

In [20]:
lalonde_obs_confounders_df.groupby("treat").apply(
    lambda x: np.average(x["re78"], weights=x["ipw"])
)

  lalonde_obs_confounders_df.groupby("treat").apply(


treat
0    88663.593714
1     9211.082147
dtype: float64

## IPW - with correlated variables

In [21]:
lalonde_obs_confounders_df = lalonde_obs[
    [
        "married",
        "re74",
        "re75",
        "treat",
        "age",
        "black",
        "hispanic",
        "education",
        "nodegree",
        "id",
    ]
].copy()

In [22]:
X = lalonde_obs_confounders_df.drop(["treat", "id"], axis=1)
y = lalonde_obs_confounders_df["treat"]

X = sm.add_constant(X)

In [23]:
model = sm.Logit(y, X).fit()

Optimization terminated successfully.
         Current function value: 0.087406
         Iterations 11


In [24]:
print(model.summary())

                           Logit Regression Results                           
Dep. Variable:                  treat   No. Observations:                 2675
Model:                          Logit   Df Residuals:                     2666
Method:                           MLE   Df Model:                            8
Date:                Sun, 19 May 2024   Pseudo R-squ.:                  0.6524
Time:                        01:48:11   Log-Likelihood:                -233.81
converged:                       True   LL-Null:                       -672.65
Covariance Type:            nonrobust   LLR p-value:                3.683e-184
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6371      1.180      0.540      0.589      -1.676       2.950
married       -1.6273      0.263     -6.195      0.000      -2.142      -1.112
re74          -0.0001   2.91e-05     -3.747      0.0

In [25]:
propensity_scores = model.predict(X).reset_index(drop=True)
lalonde_obs_confounders_df["propensity"] = propensity_scores
lalonde_obs_confounders_df["ipw"] = 1 / lalonde_obs_confounders_df["propensity"]
lalonde_obs_confounders_df = pd.merge(
    lalonde_obs_confounders_df, lalonde_obs[["id", "re78"]], on="id"
)

In [26]:
lalonde_obs_confounders_df.groupby("treat").apply(
    lambda x: np.average(x["re78"], weights=x["ipw"])
)

  lalonde_obs_confounders_df.groupby("treat").apply(


treat
0    88663.593723
1    10299.169290
dtype: float64