In [1]:
import pandas as pd
import statsmodels.formula.api as smf
from causaldata import black_politicians

In [2]:
# Load data
br = black_politicians.load_pandas().data
br.head()

Unnamed: 0,leg_black,treat_out,responded,totalpop,medianhhincom,black_medianhh,white_medianhh,blackpercent,statessquireindex,nonblacknonwhite,urbanpercent,leg_senator,leg_democrat,south
0,0,0,0,1.5873,5.0625,2.6814,2.6586,0.007119,0.227,0,0.695601,0,0,0
1,0,0,1,1.6218,4.9713,2.7126,2.6619,0.005796,0.227,0,0.618073,0,0,0
2,0,0,1,1.671,6.9646,2.3087,2.9973,0.012029,0.227,0,0.824331,0,0,0
3,0,0,1,1.6122,4.1811,2.4668,2.4887,0.00428,0.227,1,0.0,0,0,0
4,0,1,1,1.5622,3.1152,2.149,2.0597,0.008258,0.227,1,0.0,0,1,0


In [3]:
# Create bins for our continous matching variables
br['inc_bins'] = pd.qcut(br['medianhhincom'], 6)
br['bp_bins'] = pd.qcut(br['blackpercent'], 6)

In [4]:
# Count the number of treated and untreated units in each bin
# Create a function to do this
def count_units_treated_untreated(
    df: pd.DataFrame, 
    treatment_col: str, 
    treatment_status: int, 
    bin_cols: list, 
    name_of_col: str)->pd.DataFrame:
    """Counts the number of units that have been treated or untreated, and groups the results by the specified bin columns.

    Args:
        df (pd.DataFrame): The DataFrame to count.
        treatment_col (str): The name of the column that indicates whether a unit has been treated or untreated.
        treatment_status (int): The value of the treatment_col column that indicates that a unit has been treated.
        bin_cols (list): A list of column names to group the results by.
        name_of_col (str): The name of the column to store the counts in.

    Returns:
        pd.DataFrame: A DataFrame with the following columns:
            * bin_cols: The values of the bin_cols columns.
            * name_of_col: The counts of the units in each bin.
    """

    # Check that the treatment_col column exists in the DataFrame.
    if treatment_col not in df.columns:
        raise ValueError(f"The column '{treatment_col}' does not exist in the DataFrame.")

    # Check that the treatment_status value is valid.
    if treatment_status not in df[treatment_col].unique():
        raise ValueError(f"The treatment_status value '{treatment_status}' is not valid.")
    
    # Check that the bin_cols columns exist in the DataFrame.
    if not all([col in df.columns for col in bin_cols]):
        raise ValueError(f"One or more of the columns in '{bin_cols}' do not exist in the DataFrame.")
    
    # Calculate the counts.
    output = (
        df[df[treatment_col] == treatment_status]
              .groupby(bin_cols)
              .size()
              .to_frame(name_of_col)
              .reset_index()
              )
    return output

In [5]:
# Count the number of treated and untreated units in each bin
treated = count_units_treated_untreated(br, 'leg_black', 1, ['inc_bins', 'bp_bins', 'leg_democrat'], 'treated')
control = count_units_treated_untreated(br, 'leg_black', 0, ['inc_bins', 'bp_bins', 'leg_democrat'], 'control')

In [6]:
# Merge the two DataFrames on the bin columns to the br DataFrame
br = br.merge(treated, on=['inc_bins', 'bp_bins', 'leg_democrat'], how='left')
br = br.merge(control, on=['inc_bins', 'bp_bins', 'leg_democrat'], how='left')

In [9]:
# Create weights for the treated and control units
# Weight is 1 if  if there are any control matches, 0 otherwise
br['weight'] = br[br['leg_black'] == 1]['control'].apply(lambda x: 1 if x > 0 else 0)
totalcontrol = br[(br['leg_black'] == 0) & (br['treated']>0)]['control'].sum()
totaltreated = br[(br['leg_black'] == 1) & (br['control']>0)]['treated'].sum()

In [11]:
# Create control weights treated/control in the bin times control/treated overall
br['controlweights'] = (br['treated'] /br['control'])*(totalcontrol/totaltreated)
br.loc[br['leg_black']==0, 'weight'] = br['controlweights']

In [12]:
# Use the weights to estimate the ATT
m = smf.wls(formula = 'responded ~ leg_black', weights = br['weight'], data = br).fit()
m.summary()

  llf += 0.5 * np.sum(np.log(self.weights))


0,1,2,3
Dep. Variable:,responded,R-squared:,0.001
Model:,WLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,5.28
Date:,"Mon, 24 Apr 2023",Prob (F-statistic):,0.0216
Time:,21:36:42,Log-Likelihood:,-inf
No. Observations:,5593,AIC:,inf
Df Residuals:,5591,BIC:,inf
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4302,0.007,57.951,0.000,0.416,0.445
leg_black,-0.0373,0.016,-2.298,0.022,-0.069,-0.005

0,1,2,3
Omnibus:,1267.587,Durbin-Watson:,1.959
Prob(Omnibus):,0.0,Jarque-Bera (JB):,13278.218
Skew:,0.78,Prob(JB):,0.0
Kurtosis:,10.385,Cond. No.,2.59
