In [1]:
import numpy as np
from scipy.stats import pearsonr

# Fix the global NumPy seed so the draws below are repeatable
np.random.seed(42)

# Sample size shared by every simulated series
n = 1000

# Independent standard-normal draws, taken in the order W, epsilon1, epsilon2
W = np.random.normal(loc=0, scale=1, size=n)
epsilon1 = np.random.normal(loc=0, scale=1, size=n)
epsilon2 = np.random.normal(loc=0, scale=1, size=n)

# X is W plus its own noise; Z is held at the constant 1 (a normal with mean 1
# and zero spread); Y adds X, Z, W and a second noise term
X = W + epsilon1
Z = np.full(n, 1.0)
Y = X + Z + W + epsilon2

# Substituting W = X - epsilon1 into Y gives Y = 2X + Z + (epsilon2 - epsilon1).
# With only X as a regressor, everything other than the 2X term is treated as
# the error, so it is built directly from its components instead of from
# fitted residuals.
# NOTE(review): measured against the generating equation instead
# (error = Y - X = Z + W + epsilon2) the error would co-move positively with X
# through W — confirm which definition the exercise intends.
error_term = epsilon2 - epsilon1 + Z

# Pearson correlation between the regressor and that error (p-value discarded)
corr = pearsonr(X, error_term)[0]

print(f"Correlation between X and the error term: {corr:.4f}")

Correlation between X and the error term: -0.4900


In [2]:
# Re-seed so this cell reproduces the same draws when run on its own
np.random.seed(42)

# Sample size
n = 1000

# Three independent standard-normal series of length n, drawn in the order
# W, epsilon1, epsilon2
W, epsilon1, epsilon2 = (np.random.normal(0, 1, n) for _ in range(3))

# Z is the constant 1 for every observation
Z = np.full(n, 1.0)

# X is W plus noise; Y adds X, Z, W and a second noise term.
# Substituting W = X - epsilon1 gives Y = 2X + Z + (epsilon2 - epsilon1).
X = W + epsilon1
Y = X + Z + W + epsilon2

# With both X and Z in the regression, what is left of that rewritten equation
# is epsilon2 - epsilon1; it is built directly rather than from fitted residuals.
# NOTE(review): this is the error relative to the rewritten equation with
# coefficient 2 on X — confirm that is the definition the exercise intends.
error = epsilon2 - epsilon1

# Pearson correlation between X and that error (p-value discarded)
corr = pearsonr(X, error)[0]

print(f"Correlation between X and the error term (controlling for Z): {corr:.4f}")

Correlation between X and the error term (controlling for Z): -0.4900


In [4]:
import pandas as pd
import statsmodels.api as sm


# Read the homework dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="homework_7.1.csv")

# Preview the leading rows to see which columns are available.
# NOTE(review): the rendered preview lists two "Unnamed" columns next to
# X, W, Z and Y — presumably index columns written out when the CSV was
# saved; verify before relying on them.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,X,W,Z,Y
0,0,1.137055,1.221768,0.327829,1.944532
1,1,-0.112905,0.465835,0.59965,0.655514
2,2,2.077755,1.795414,-0.063393,5.934411
3,3,0.456373,-0.512159,1.177413,-0.188064
4,4,-1.012402,0.080002,-0.275697,-0.533775


In [5]:
# Values of W to condition on, and the half-width of the band kept around each
target_W_values = [-1, 0, 1]
window = 0.1

# Maps each target W value to the fitted coefficient on X
coefficients = {}

for w_val in target_W_values:
    # Keep only the rows whose W lies in [w_val - window, w_val + window]
    # (both ends included)
    near_target = df['W'].between(w_val - window, w_val + window)
    subset = df.loc[near_target]

    # OLS of Y on X and Z with an intercept, fitted on this band only
    design = sm.add_constant(subset[['X', 'Z']])
    fit = sm.OLS(subset['Y'], design).fit()

    # Record just the slope on X for this band
    coefficients[w_val] = fit.params['X']

coefficients

{-1: np.float64(0.857978125432604),
 0: np.float64(1.3832110460956801),
 1: np.float64(1.9580971601672839)}

In [7]:
import numpy as np
import statsmodels.api as sm

# Define the custom error function with temporal correlation
def make_error(corr_const, num):
    """Generate a serially dependent error series of length ``num``.

    A starting value is drawn from a standard normal and is not itself
    returned. Each returned element is then
    ``corr_const * previous + (1 - corr_const) * shock``, where ``shock`` is a
    fresh standard-normal draw from NumPy's global random state.

    Parameters
    ----------
    corr_const : float
        Weight placed on the previous value; the new shock gets
        ``1 - corr_const``.
    num : int
        Number of elements to generate.

    Returns
    -------
    numpy.ndarray
        One-dimensional array holding the ``num`` generated values in order.
    """
    shock_weight = 1 - corr_const

    # Seed value of the recursion; consumed by the first step, never emitted
    state = np.random.normal(loc=0, scale=1)

    series = []
    for _step in range(num):
        shock = np.random.normal(loc=0, scale=1)
        state = corr_const * state + shock_weight * shock
        series += [state]

    return np.asarray(series)

import pandas as pd

# Seed the global NumPy generator so this cell yields the same table on every
# run, regardless of how many draws earlier cells consumed (the earlier
# simulation cells seed themselves the same way)
np.random.seed(42)

# Simulation parameters
n = 500  # sample size
trials = 500  # number of trials
corr_consts = [0.2, 0.5, 0.8]

# Maps each correlation constant to its summary statistics
results = {}

# The loop variable is named corr_const so it does not overwrite the `corr`
# correlation result computed in the earlier cells
for corr_const in corr_consts:
    beta_estimates = []
    se_estimates = []

    for _ in range(trials):
        # Serially dependent errors for the treatment and the outcome,
        # generated independently of each other
        error_X = make_error(corr_const, n)
        error_Y = make_error(corr_const, n)

        # Covariate Z: independent standard-normal draws
        Z = np.random.normal(0, 1, n)

        # Treatment: X depends on Z and error_X
        X = 0.5 * Z + error_X

        # Outcome: Y depends on X (coefficient 2.0), Z, and error_Y
        Y = 2.0 * X + 0.3 * Z + error_Y

        # Regression of Y on X and Z with an intercept; after add_constant the
        # design columns are [const, X, Z], so index 1 is the X term
        X_design = sm.add_constant(np.column_stack((X, Z)))
        model = sm.OLS(Y, X_design).fit()

        # Collect the X coefficient and the standard error reported for it
        beta_estimates.append(model.params[1])
        se_estimates.append(model.bse[1])

    # (i) spread of the X estimates across trials, (ii) average reported
    # standard error, and their ratio; a ratio above 1 means the reported
    # standard errors understate the spread actually seen across trials
    std_dev_beta = np.std(beta_estimates)
    mean_se = np.mean(se_estimates)
    ratio = std_dev_beta / mean_se

    results[corr_const] = {
        'std_dev_beta': std_dev_beta,
        'mean_se': mean_se,
        'ratio': ratio
    }

# One row per correlation constant; display labels are attached by key so they
# cannot drift out of step with the statistics they describe
results_df = pd.DataFrame(results).T.rename(columns={
    'std_dev_beta': 'Std Dev of β (i)',
    'mean_se': 'Mean of SE(β) (ii)',
    'ratio': 'Ratio (i)/(ii)',
})

results_df

Unnamed: 0,Std Dev of β (i),Mean of SE(β) (ii),Ratio (i)/(ii)
0.2,0.044478,0.044839,0.991931
0.5,0.058059,0.044715,1.298426
0.8,0.097418,0.045003,2.164707
