## 1. Two-Stage Least Squares

In [21]:
from typing import Tuple
from dataclasses import dataclass

import numpy as np
from scipy.stats import bootstrap


def least_squares(y: np.ndarray, x: np.ndarray) -> Tuple[float, float]:
    """Fit ``y = intercept + slope * x`` by ordinary least squares.

    The coefficients are obtained by solving the normal equations
    ``(X'X) beta = X'y`` where ``X`` is ``x`` with a leading column of ones.

    Args:
        y: One-dimensional array with the dependent variable.
        x: One-dimensional array with the single regressor.

    Returns:
        The pair ``(intercept, slope)``.
    """
    assert y.ndim == 1
    assert x.ndim == 1

    # Design matrix with a constant column for the intercept; shape (N, 2).
    design = np.column_stack((np.ones_like(x), x))

    gram = design.T @ design  # X'X, shape (2, 2)
    moment = design.T @ y  # X'y, shape (2,)

    intercept, slope = np.linalg.solve(gram, moment)
    return intercept, slope


@dataclass
class DGP:
    """Simulated data-generating process with a two-stage least squares fit.

    Calling an instance draws ``number_of_simulations`` independent samples of
    size ``N`` and returns the second-stage slope estimate from each sample.
    """

    # Observations per simulated sample and number of samples to draw.
    N: int = 1_000
    number_of_simulations: int = 1_000

    # Linear weights used when building x and z from the other variables
    # (see __call__).
    rho_xu: float = 0.5
    rho_zx: float = 0.4
    rho_zq: float = 0.4

    # Intercept and slope used to generate y.
    true_beta_0: float = 2.0
    true_beta_1: float = 5.0

    def __call__(self) -> np.ndarray:
        """Run every simulation and return the beta_1 estimate from each.

        Returns:
            Array of shape ``(number_of_simulations,)`` with the second-stage
            slope of each simulated sample.
        """
        # NOTE: All samples are drawn up front as (number_of_simulations, N)
        # arrays, which can be memory intensive when either dimension is large.
        draw_shape = (self.number_of_simulations, self.N)

        u = np.random.standard_normal(size=draw_shape)
        x = self.rho_xu * u + np.random.standard_normal(size=draw_shape)
        q = np.random.standard_normal(size=draw_shape)
        # z is built from both x and q; q also enters y directly below.
        z = self.rho_zx * x + self.rho_zq * q + np.random.standard_normal(size=draw_shape)

        y = self.true_beta_0 + self.true_beta_1 * x + u + 2 * q

        slopes = []
        for x_row, z_row, y_row in zip(x, z, y):
            # Stage one: project x on z and keep the fitted values.
            first_intercept, first_slope = least_squares(x_row, z_row)
            x_fitted = first_intercept + first_slope * z_row

            # Stage two: regress y on the fitted values; only the slope is kept.
            _, second_slope = least_squares(y_row, x_fitted)
            slopes.append(second_slope)

        return np.array(slopes, dtype=float)


# Run the simulation with the default settings.
dgp = DGP()
estimates = dgp()

# Average slope estimate across simulations and its gap to the true slope.
avg_beta_1 = estimates.mean()
bias = avg_beta_1 - dgp.true_beta_1

print("Mean of beta_1 estimates: {:.4f}".format(avg_beta_1))
print("Bias of beta_1 estimates: {:.4f}".format(bias))

# Bootstrap confidence interval for the mean estimation error.
bs = bootstrap(
    data=[estimates - dgp.true_beta_1],
    statistic=np.mean,
    confidence_level=0.95,
)
low, high = bs.confidence_interval.low, bs.confidence_interval.high
print("Bias 95% CI: [{:.4f}, {:.4f}]".format(low, high))

print("True beta_1: {:.4f}".format(dgp.true_beta_1))


Mean of beta_1 estimates: 7.0189
Bias of beta_1 estimates: 2.0189
Bias 95% CI: [2.0053, 2.0325]
True beta_1: 5.0000
