In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import statsmodels.api as sm
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
df = pd.read_csv('homework_1.1.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,X1,X2,X3,Y
0,0,-0.440646,-0.390227,0.156718,-0.877671
1,1,-3.810099,-1.304665,-1.105117,-10.130388
2,2,-1.425451,-0.340049,1.115908,0.284068
3,3,-1.32575,0.161906,-0.25467,-1.994344
4,4,3.120263,1.487343,-1.164839,2.03003


In [5]:
X = df[['X1', 'X2', 'X3']]
y = df['Y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
coefficients = dict(zip(X.columns, model.coef_))
intercept = model.intercept_

mse, r2, coefficients, intercept

(0.27038744076297094,
 0.9908211302615575,
 {'X1': np.float64(1.0092442544468478),
  'X2': np.float64(1.9532067277273115),
  'X3': np.float64(2.960663144315282)},
 np.float64(0.00048148172336404116))

In [6]:
import statsmodels.api as sm

# Add a constant term for the intercept
X_with_const = sm.add_constant(X)

# Fit the OLS model
ols_model = sm.OLS(y, X_with_const).fit()

# Get the summary of the regression results
ols_summary = ols_model.summary()
ols_model.tvalues.sort_values(ascending=False)

X3       196.645240
X1        60.984011
X2        53.283212
const      0.166181
dtype: float64

In [7]:
simple_models = {
    col: sm.OLS(y, sm.add_constant(X[[col]])).fit()
    for col in X.columns
}

# Extract coefficients from simple regressions
simple_coefs = {col: model.params[col] for col, model in simple_models.items()}

# Extract coefficients from the multiple regression (already computed)
multi_coefs = ols_model.params.drop('const')

# Compute the absolute differences between the simple and multiple regression coefficients
coef_diff = {col: abs(simple_coefs[col] - multi_coefs[col]) for col in X.columns}

coef_diff

{'X1': np.float64(0.8346234440931537),
 'X2': np.float64(2.1190439844996054),
 'X3': np.float64(0.12155266688640776)}

In [10]:
df = pd.read_csv('homework_1.2.csv')

# Split the data into treatment and control groups based on X
treated = df[df['X'] == 1]
control = df[df['X'] == 0]

# Use Z variables for matching
Z_cols = [col for col in df.columns if col.startswith('Z')]
Z_treated = treated[Z_cols]
Z_control = control[Z_cols]

# Fit NearestNeighbors on the control group
nn = NearestNeighbors(n_neighbors=1)
nn.fit(Z_control)

# Find nearest neighbors for each treated unit
distances, indices = nn.kneighbors(Z_treated)
farthest_distance = distances.max()
print("Farthest match distance:", farthest_distance)

# Create a DataFrame of matched pairs
matched_control = control.iloc[indices.flatten()].copy()
matched_control.index = treated.index  # Align indices for easy comparison

# Optionally, concatenate matched pairs
matched_pairs = pd.concat([treated, matched_control], axis=1, keys=['Treated', 'Matched_Control'])

matched_pairs

Farthest match distance: 0.2102170871093757


Unnamed: 0_level_0,Treated,Treated,Treated,Treated,Matched_Control,Matched_Control,Matched_Control,Matched_Control
Unnamed: 0_level_1,Unnamed: 0,X,Y,Z,Unnamed: 0,X,Y,Z
1,1,1,1.215189,0.715189,93,0,0.716327,0.716327
5,5,1,1.145894,0.645894,56,0,0.653108,0.653108
6,6,1,0.937587,0.437587,41,0,0.437032,0.437032
7,7,1,1.391773,0.891773,18,0,0.778157,0.778157
8,8,1,1.463663,0.963663,18,0,0.778157,0.778157
9,9,1,0.883442,0.383442,29,0,0.414662,0.414662
10,10,1,1.291725,0.791725,18,0,0.778157,0.778157
13,13,1,1.425597,0.925597,18,0,0.778157,0.778157
17,17,1,1.33262,0.83262,18,0,0.778157,0.778157
19,19,1,1.370012,0.870012,18,0,0.778157,0.778157


In [11]:
avg_y_treated = treated['Y'].mean()

# Compute the average Y for matched control units (X = 0 matched to each treated)
avg_y_matched_control = matched_control['Y'].mean()

# Calculate the treatment effect
effect = avg_y_treated - avg_y_matched_control
print("Estimated treatment effect:", effect)

Estimated treatment effect: 0.5433600651913855


In [14]:
# Fit NearestNeighbors with enough neighbors to check distances within threshold
nn_all = NearestNeighbors(radius=0.2)
nn_all.fit(Z_control)

# Find all neighbors within 0.2 radius for each treated observation
neighbors_within_radius = nn_all.radius_neighbors(Z_treated, return_distance=True)

# Extract indices and distances
distances_list, indices_list = neighbors_within_radius

# Build a list of matched pairs (treated index -> matched control indices)
matched_indices = []
for treated_idx, control_indices in zip(treated.index, indices_list):
    for control_idx in control_indices:
        matched_indices.append((treated_idx, control_idx))

# Create a DataFrame of matched control observations
matched_controls_within_radius = control.loc[[idx for _, idx in matched_indices]].copy()
matched_controls_within_radius['matched_to'] = [treated_idx for treated_idx, _ in matched_indices]

print(matched_controls_within_radius.head())

KeyError: '[1, 5, 9, 17, 20, 21, 25, 31, 35, 48, 27, 13, 39, 42, 22, 49, 6, 7, 8, 10, 19, 23, 33, 36, 38, 40, 47, 50, 51] not in index'