In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist, mahalanobis
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the first homework dataset and preview its leading rows.
# NOTE(review): the preview shows 'Unnamed: 0.1' and 'Unnamed: 0' columns, which
# look like index columns written back to the CSV — confirm whether they should
# be dropped; only X, Y and Z are used by the cells visible in this notebook.
df = pd.read_csv(filepath_or_buffer='homework_8.1.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,X,Y,Z
0,0,1,4.109218,1.764052
1,1,0,2.259504,0.400157
2,2,0,-0.647584,0.978738
3,3,0,2.106071,2.240893
4,4,1,3.583464,1.867558


In [3]:
# --- Inverse-probability-weighted estimate of the effect of X on Y ---
# Z is the single covariate, X the binary treatment indicator, Y the outcome.
X_covariate = df[['Z']]
treatment = df['X']
is_treated = df['X'] == 1
is_untreated = df['X'] == 0

# Propensity model P(X = 1 | Z).
# NOTE(review): LogisticRegression() is called with library defaults, which in
# scikit-learn include a regularization penalty — confirm that is intended for
# propensity estimation.
log_reg = LogisticRegression().fit(X_covariate, treatment)

# Column 1 of predict_proba is the probability of the positive class (X = 1).
propensity_scores = log_reg.predict_proba(X_covariate)[:, 1]

# Each unit is weighted by the inverse probability of the arm it actually
# received: 1/e(Z) for treated rows, 1/(1 - e(Z)) for untreated rows.
prob_of_received_arm = np.where(is_treated, propensity_scores, 1 - propensity_scores)
weights = 1 / prob_of_received_arm

# Weight-normalized mean outcome within each arm (weighted sum / sum of weights).
weighted_outcome = df['Y'] * weights
weighted_y_treated = np.sum(weighted_outcome[is_treated]) / np.sum(weights[is_treated])
weighted_y_untreated = np.sum(weighted_outcome[is_untreated]) / np.sum(weights[is_untreated])

# Average Treatment Effect (ATE): difference of the two weighted means.
ate_ipw = weighted_y_treated - weighted_y_untreated
ate_ipw

np.float64(2.2743411898510133)

In [8]:
# Preview the estimated propensity scores for the first three rows of df
propensity_scores[0:3]

array([0.84011371, 0.58464597, 0.71108245])

In [10]:
# Nearest-neighbour matching on (Z1, Z2) using Mahalanobis distance.
df2 = pd.read_csv('homework_8.2.csv')

# Split into treated and untreated. reset_index(drop=True) makes row labels
# equal to row positions, so positional argmin results address rows directly.
treated = df2[df2['X'] == 1].reset_index(drop=True)
untreated = df2[df2['X'] == 0].reset_index(drop=True)
if treated.empty or untreated.empty:
    raise ValueError(
        "Matching needs at least one treated (X == 1) and one untreated (X == 0) row."
    )

# Inverse covariance of the covariates, estimated on the full sample.
# np.cov expects variables in rows, hence the transpose.
Z_all = df2[['Z1', 'Z2']].values.T
cov_matrix = np.cov(Z_all)
inv_cov_matrix = np.linalg.inv(cov_matrix)

# All treated-vs-untreated Mahalanobis distances in one vectorized call:
# entry [i, j] is the distance from treated row i to untreated row j.
# This replaces a Python-level loop that evaluated one pair at a time.
pairwise_distances = cdist(
    treated[['Z1', 'Z2']].to_numpy(dtype=float),
    untreated[['Z1', 'Z2']].to_numpy(dtype=float),
    metric='mahalanobis',
    VI=inv_cov_matrix,
)

# For each treated unit, the position of its closest untreated unit
# (matching with replacement; the first minimum wins on ties).
nearest_positions = pairwise_distances.argmin(axis=1)
matched_outcomes = untreated['Y'].to_numpy()[nearest_positions].tolist()

# Mean of (Y_treated - Y_matched_untreated).
# NOTE(review): only treated units are matched, so this averages the effect
# over treated units; the `ate_` name is kept so later cells keep working.
ate_mahalanobis = (treated['Y'].values - np.array(matched_outcomes)).mean()
ate_mahalanobis

np.float64(3.437678997912609)

In [11]:
# Find the treated unit with the least common support: the one whose *closest*
# untreated neighbour (in Mahalanobis distance on Z1, Z2) is farthest away.
if treated.empty or untreated.empty:
    raise ValueError(
        "Common-support check needs at least one treated and one untreated row."
    )

# Entry [i, j] is the distance from treated row i to untreated row j.
pairwise_distances = cdist(
    treated[['Z1', 'Z2']].to_numpy(dtype=float),
    untreated[['Z1', 'Z2']].to_numpy(dtype=float),
    metric='mahalanobis',
    VI=inv_cov_matrix,
)

# Distance from each treated unit to its best available match.
min_distances = pairwise_distances.min(axis=1)

# argmax returns the first maximum, matching a strict ">" scan over the rows.
# `treated` has a reset (0..n-1) index, so the position is also its row label.
worst_match_index = int(min_distances.argmax())
max_min_distance = min_distances[worst_match_index]

# Get the Z1 and Z2 values for the treated item with least common support
least_support_Z_values = treated.loc[worst_match_index, ['Z1', 'Z2']]
least_support_Z_values.tolist()

[2.6962240525635797, 0.5381554886023228]