In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from causaldata import black_politicians

In [4]:
# Load the dataset in this cell so the preview works on a fresh kernel
# (Restart & Run All). `data` is otherwise first assigned in a cell that
# appears further down the notebook, so this cell would raise NameError.
data = black_politicians.load_pandas().data

# Preview the first five rows
data.head()

Unnamed: 0,leg_black,treat_out,responded,totalpop,medianhhincom,black_medianhh,white_medianhh,blackpercent,statessquireindex,nonblacknonwhite,urbanpercent,leg_senator,leg_democrat,south
0,0,0,0,1.5873,5.0625,2.6814,2.6586,0.007119,0.227,0,0.695601,0,0,0
1,0,0,1,1.6218,4.9713,2.7126,2.6619,0.005796,0.227,0,0.618073,0,0,0
2,0,0,1,1.671,6.9646,2.3087,2.9973,0.012029,0.227,0,0.824331,0,0,0
3,0,0,1,1.6122,4.1811,2.4668,2.4887,0.00428,0.227,1,0.0,0,0,0
4,0,1,1,1.5622,3.1152,2.149,2.0597,0.008258,0.227,1,0.0,0,1,0


In [5]:
# Pull the black_politicians dataset into a pandas DataFrame
data = black_politicians.load_pandas().data

# Treatment variable (T): the `leg_black` column
T = data['leg_black']

# Features (X): every remaining column except `responded` and the treatment itself
X = data.drop(columns=['responded', 'leg_black'])

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, T_train, T_test = train_test_split(
    X,
    T,
    test_size=0.2,
    random_state=42,
)

# Gradient-boosted classifier with a logistic objective, used to model the treatment
model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=200,
    learning_rate=0.05,
)

# Fit on the training split only
model.fit(X=X_train, y=T_train)

# Propensity scores for the held-out rows: second column of predict_proba
propensity_scores = model.predict_proba(X_test)[:, 1]

# Evaluate the held-out scores with the area under the ROC curve and report it
auc = roc_auc_score(y_true=T_test, y_score=propensity_scores)
print(f"AUC: {auc}")

AUC: 0.9787448405863044


In [7]:
# Predict propensity scores for the entire dataset (second column of predict_proba).
# NOTE(review): `model` was fit on the training split only, so the scores for those
# rows are in-sample and may be overconfident -- consider out-of-fold predictions.
propensity_scores_all = model.predict_proba(X)[:, 1]

# Bound the scores away from 0 and 1 before inverting them. A score at (or very
# near) 0 or 1 would otherwise produce an infinite or extremely large weight, and
# np.where evaluates BOTH branches, so a divide-by-zero warning is raised even for
# the branch that is not selected. The bound is a tunable analysis choice.
PROPENSITY_CLIP = 0.01
propensity_scores_clipped = np.clip(
    propensity_scores_all, PROPENSITY_CLIP, 1 - PROPENSITY_CLIP
)

# Calculate IPTW weights: 1 / e(x) for treated rows (T == 1), 1 / (1 - e(x)) otherwise
iptw_weights = np.where(
    T == 1,
    1 / propensity_scores_clipped,
    1 / (1 - propensity_scores_clipped),
)