In [1]:
import numpy as np
from sklearn.utils import resample
import statsmodels.api as sm


# Function to generate synthetic data
def generate_data(n, break_point, alpha0, alpha1, alpha2, beta0, gamma0):
    """Simulate an outcome series and its observed covariates.

    The outcome is ``c1 = alpha0 + alpha1 * c2 + alpha2 * d1 + e`` where ``d1``
    is a step of size ``beta0`` switching on for ``index > break_point``, minus
    a shared shock, plus unit-scale noise. ``d1`` itself is NOT returned.

    All randomness comes from NumPy's global ``np.random`` state, so seed it
    beforehand for reproducible draws.

    Parameters
    ----------
    n : int
        Number of observations.
    break_point : number
        Position after which the ``d1`` step is active.
    alpha0, alpha1, alpha2 : float
        Intercept, ``c2`` slope and ``d1`` slope of the outcome equation.
    beta0 : float
        Size of the step in ``d1``.
    gamma0 : float
        Size of the step in ``d2`` (active for ``index > break_point + 0.4 * n``).
        ``d2`` is generated but not used in the returned values.

    Returns
    -------
    c1 : ndarray, shape (n,)
        Simulated outcome.
    data : ndarray, shape (n, 7)
        Columns: observation index, ``c2``, then five control columns.
    """
    positions = np.arange(n)

    # The order of the draws below fixes the random stream; do not reorder.
    c2 = np.random.normal(size=n)
    outcome_noise = np.random.normal(size=n)
    shared_shock = np.random.normal(0, 0.5, size=n)  # common factor that correlates the regressors

    step_first = (positions > break_point).astype(float)
    d1 = beta0 * step_first - shared_shock + np.random.normal(scale=1, size=n)

    # d2 does not enter c1 or data; it is still generated so that the random
    # draws consumed before `controls` stay the same.
    step_second = (positions > (break_point + n * 0.4)).astype(float)
    _d2 = gamma0 * step_second - shared_shock + np.random.normal(scale=1, size=n)

    # Five noisy control columns, each shifted by the shared shock.
    controls = np.random.normal(scale=2, size=(n, 5)) - shared_shock.reshape(-1, 1)

    c1 = alpha0 + alpha1 * c2 + alpha2 * d1 + outcome_noise

    data = np.column_stack((positions.reshape(-1, 1), c2, controls))
    return c1, data

# Function to compute delta and return model parameters
def compute_delta(c1, data, theta, break_point):
    """Estimate the post-break change in the ``c2`` slope by least squares.

    Fits ``c1 ~ 1 + c2 + c2 * 1[index >= break_point] + data[:, theta]`` and
    reads off the coefficient on the interaction term.

    The design matrix is assembled explicitly (intercept first) rather than via
    ``statsmodels.add_constant``: that helper skips adding the intercept when a
    column is already a nonzero constant, which would silently shift the
    coefficient positions and make ``coeffs[2]`` refer to a different regressor.

    Parameters
    ----------
    c1 : ndarray, shape (n,)
        Outcome.
    data : ndarray, shape (n, k)
        Column 0 is the observation index, column 1 is ``c2``.
    theta : int
        Column of ``data`` used as the additional control.
    break_point : number
        Observations with ``index >= break_point`` are treated as post-break.
        NOTE(review): ``generate_data`` uses a strict ``>`` for its step —
        confirm which convention is intended.

    Returns
    -------
    tuple
        ``(-abs(delta), delta)`` where ``delta`` is the interaction coefficient.
    """
    index = data[:, 0]
    c2 = data[:, 1]
    dj = data[:, theta]

    post_break = index >= break_point
    # Column order is fixed: [intercept, c2, c2 * post_break, dj].
    X = np.column_stack((np.ones(data.shape[0]), c2, c2 * post_break, dj))

    coeffs = np.linalg.lstsq(X, c1, rcond=None)[0]
    delta = coeffs[2]
    return -1 * np.abs(delta), delta


# --- Simulation configuration -------------------------------------------------
# Seed for NumPy's global RNG so that Restart Kernel -> Run All reproduces the
# numbers below (generate_data draws from np.random directly).
SEED = 0

n = 250
true_break_point = int(n*.5)
alpha0 = 0
alpha1 = 2
alpha2 = 3
beta0 = 2
gamma0 = 1
A_CUTOFF = 500  # forwarded as `a_cutoff` to compute_confidence_intervals

np.random.seed(SEED)

# `generate_data` returns `c1` and a matrix whose columns are the observation
# index, `c2`, and five control columns (`d1` and `d2` are not included).
c1, data = generate_data(n, true_break_point, alpha0, alpha1, alpha2, beta0, gamma0)

# Candidates are the control columns of `data`; start at 2 to skip index and `c2`.
candidate_breaks = np.arange(2, data.shape[1])
for theta in candidate_breaks:
    print(compute_delta(c1, data, theta, true_break_point))

(-1.2983524700610847, 1.2983524700610847)
(-1.33090178394337, 1.33090178394337)
(-1.2877784813586688, 1.2877784813586688)
(-1.3223570586370166, 1.3223570586370166)
(-1.324457530538616, 1.324457530538616)


In [2]:
def bootstrap_covariance(c1, data, theta_tilde, theta_i,break_point, n_bootstraps=1000):
    """Bootstrap the joint covariance of the estimates for two candidates.

    Each replication resamples ``(c1, data)`` with ``resample`` and evaluates
    ``compute_delta`` for ``theta_tilde`` and ``theta_i`` on that same sample.

    Returns
    -------
    ndarray
        ``np.cov`` of the replicated statistics, with variables ordered as
        ``(x_i, y_i, x_tilde, y_tilde)`` — i.e. index 3 is ``y_tilde``.
    """
    def one_replication():
        # Both candidates are evaluated on the same resampled data set.
        c1_star, data_star = resample(c1, data)
        x_tilde, y_tilde = compute_delta(c1_star, data_star, theta_tilde, break_point)
        x_i, y_i = compute_delta(c1_star, data_star, theta_i, break_point)
        return x_i, y_i, x_tilde, y_tilde

    replications = [one_replication() for _ in range(n_bootstraps)]

    # Transpose so each row is one variable, as np.cov expects by default.
    return np.cov(np.asarray(replications).T)

def compute_matrices(cov_matrix, x_tilde, y_tilde, print_stuff=False):
    """Build the quadratic coefficients and roots used for the interval bounds.

    Parameters
    ----------
    cov_matrix : ndarray, shape (4, 4)
        Covariance with variables ordered ``(x_i, y_i, x_tilde, y_tilde)``,
        the order produced by ``bootstrap_covariance``.
    x_tilde, y_tilde : float
        The two values returned by ``compute_delta`` for the selected candidate.
    print_stuff : bool, default False
        When True, print the covariance matrix and the entries read from it.
        (Previously this flag was accepted but had no effect.)

    Returns
    -------
    tuple
        ``(A, B, C, D, H, G, K)`` where ``A, B, C`` are the quadratic
        coefficients, ``D = B**2 - 4*A*C`` is the discriminant, ``H = -C / B``
        (NaN when ``B == 0``), and ``G``/``K`` are the roots
        ``(-B -/+ sqrt(D)) / (2*A)`` (NaN when ``D < 0`` or ``A == 0``).
    """
    cov_y_x_same = cov_matrix[1, 0]        # cov(y_i, x_i); only shown in debug output
    var_y_tilde = cov_matrix[3, 3]         # var(y_tilde)
    cov_x_y_tilde = cov_matrix[2, 3]       # cov(x_tilde, y_tilde)
    cov_x_i_y_tilde = cov_matrix[0, 3]     # cov(x_i, y_tilde); cov(y_i, x_tilde) would be [1, 2]

    # NOTE(review): both projections below are built from (y_tilde, x_tilde);
    # the candidate's own estimate is not an input — confirm this matches the
    # intended formulation.
    Z = y_tilde - (cov_x_i_y_tilde / var_y_tilde) * x_tilde
    Z_tilde = y_tilde - (cov_x_y_tilde / var_y_tilde) * x_tilde

    A = var_y_tilde**(-2) * (cov_x_y_tilde**2 - cov_x_i_y_tilde**2)
    B = 2 * var_y_tilde**(-1) * (cov_x_y_tilde * Z_tilde - cov_x_i_y_tilde * Z)
    C = Z_tilde**2 - Z**2

    D = B ** 2 - 4 * A * C
    H = -C / B if B != 0 else np.nan

    # Roots exist only for a genuine quadratic with non-negative discriminant.
    has_roots = D >= 0 and A != 0
    G = (-B - np.sqrt(D)) / (2 * A) if has_roots else np.nan
    K = (-B + np.sqrt(D)) / (2 * A) if has_roots else np.nan

    if print_stuff:
        print('------------------')
        print(cov_matrix)
        print(cov_x_i_y_tilde,'//',cov_matrix[0, 3],  cov_matrix[3, 0],'//',cov_matrix[1,2],cov_matrix[1, 2],'//',cov_y_x_same,cov_matrix[0, 1])

    return A, B, C, D, H, G, K


def compute_confidence_intervals(theta_tilde, candidate_breaks, c1, data, a_cutoff=1, break_point=true_break_point,
                                 print_stuff=False):
    """Intersect the per-candidate root bounds into one truncation interval.

    For every candidate a bootstrap covariance is computed against
    ``theta_tilde``; ``compute_matrices`` turns it into quadratic roots, and the
    running maximum of the lower roots / minimum of the upper roots is kept.

    Parameters
    ----------
    theta_tilde : int
        Selected candidate (column of ``data``).
    candidate_breaks : sequence of int
        All candidates to compare against; must be non-empty.
    c1, data : ndarray
        Outcome and covariate matrix, passed on to ``compute_delta`` and
        ``bootstrap_covariance``.
    a_cutoff : number, default 1
        Currently unused; kept for interface compatibility.
    break_point : number
        Break location handed to ``compute_delta``. The default is bound to the
        module-level ``true_break_point`` at definition time.
    print_stuff : bool, default False
        Print the per-candidate quantities while iterating.

    Returns
    -------
    tuple
        ``(np.array([max_lower_bound, min_upper_bound]), y_tilde, sigma)`` where
        ``sigma`` is the bootstrap variance of ``y_tilde`` (entry ``[3, 3]``)
        taken from the LAST candidate's covariance matrix. Bounds stay at
        ``-inf`` / ``inf`` when no candidate yields a usable constraint.

    Raises
    ------
    ValueError
        If ``candidate_breaks`` is empty.
    """
    candidates = list(candidate_breaks)
    if not candidates:
        raise ValueError("candidate_breaks must contain at least one candidate")

    x_tilde, y_tilde = compute_delta(c1, data, theta_tilde, break_point)

    # Run all bootstraps first (same order as the candidates) and keep the
    # resulting covariance matrices keyed by candidate.
    cov_by_theta = {}
    for theta in candidates:
        cov_by_theta[theta] = bootstrap_covariance(c1, data, theta_tilde, theta, break_point)

    # Explicitly use the last candidate's estimate of var(y_tilde) instead of
    # relying on a loop variable that happens to survive the loop.
    sigma = cov_by_theta[candidates[-1]][3, 3]

    max_lower_bound = -np.inf
    min_upper_bound = np.inf

    for theta in candidates:
        A, B, C, D, H, G, K = compute_matrices(cov_by_theta[theta], x_tilde, y_tilde, print_stuff=print_stuff)
        if print_stuff:
            print(A, B, C,'//', D, H,'//', G, K)

        # Only candidates with real roots (D >= 0) and A != 0 constrain the interval.
        if D >= 0:
            if A > 0:
                max_lower_bound = max(max_lower_bound, G)
                min_upper_bound = min(min_upper_bound, K)
            elif A < 0:
                # With A negative the roles of the two roots are swapped.
                max_lower_bound = max(max_lower_bound, K)
                min_upper_bound = min(min_upper_bound, G)

    # TODO: handle A == 0 (or numerically close to 0), where the constraint is linear:
    #   Y(θ̃) >= -C_Z(θ̃,θ) / B_Z(θ̃,θ)  for all θ ∈ Θ with A(θ̃,θ) = 0 and B_Z(θ̃,θ) > 0
    #   Y(θ̃) <= -C_Z(θ̃,θ) / B_Z(θ̃,θ)  for all θ ∈ Θ with A(θ̃,θ) = 0 and B_Z(θ̃,θ) < 0
    return np.array([max_lower_bound, min_upper_bound]), y_tilde, sigma


# Generate data with control variables included
for i in range(10):
    # Fresh synthetic sample: outcome `c1` plus a matrix holding the index,
    # `c2`, and the control columns.
    c1, data = generate_data(n, true_break_point, alpha0, alpha1, alpha2, beta0, gamma0)

    # Every control column is a candidate; columns 0 and 1 are index and `c2`.
    candidate_breaks = np.arange(2, data.shape[1])

    # compute_delta returns (-|delta|, delta); tuples compare on the first
    # element first, so the maximum picks the candidate with the smallest |delta|.
    theta_hat, min_delta = max(
        [(theta, compute_delta(c1, data, theta, true_break_point)) for theta in candidate_breaks],
        key=lambda scored: scored[1],
    )

    # Truncation interval, point estimate and bootstrap variance for the winner.
    interval_left, mean, variance = compute_confidence_intervals(
        theta_hat, candidate_breaks, c1, data, a_cutoff=A_CUTOFF
    )

    # Report this replication.
    print(f"Iteration {i+1}:")
    for label, value in (
        ("Theta_hat (control variable index chosen):", theta_hat),
        ("Delta coefficient (minimized):", min_delta),
    ):
        print(label, value)
    print("Confidence Intervals:")
    for label, value in (
        ("Left Interval:", interval_left),
        ("Mean delta for bootstraps:", mean),
        ("Variance for delta_hat:", variance),
    ):
        print(label, value)
    print('-----')

Iteration 1:
Theta_hat (control variable index chosen): 4
Delta coefficient (minimized): (-0.3014280360443669, 0.3014280360443669)
Confidence Intervals:
Left Interval: [-0.30142804  0.26758001]
Mean delta for bootstraps: 0.3014280360443669
Variance for delta_hat: 0.3673765146046641
-----
Iteration 2:
Theta_hat (control variable index chosen): 2
Delta coefficient (minimized): (-0.07109730589616477, -0.07109730589616477)
Confidence Intervals:
Left Interval: [-0.07109731  0.19245716]
Mean delta for bootstraps: -0.07109730589616477
Variance for delta_hat: 0.2760668668776926
-----
Iteration 3:
Theta_hat (control variable index chosen): 3
Delta coefficient (minimized): (-0.09968152670790888, -0.09968152670790888)
Confidence Intervals:
Left Interval: [-0.09968153  0.29556245]
Mean delta for bootstraps: -0.09968152670790888
Variance for delta_hat: 0.36014338069785895
-----
Iteration 4:
Theta_hat (control variable index chosen): 6
Delta coefficient (minimized): (-0.06571673150647744, 0.06571673

In [3]:
import numpy as np
from scipy.stats import norm

def truncated_normal_quantile(quantile, interval, mean, variance):
    """Quantile of a normal distribution truncated to a single interval.

    Parameters
    ----------
    quantile : float or array_like
        Probability level(s) in [0, 1].
    interval : sequence of two numbers
        ``(lower, upper)`` truncation bounds; ``-inf`` / ``inf`` are allowed.
    mean : float
        Mean of the untruncated normal.
    variance : float
        Variance of the untruncated normal.

    Returns
    -------
    float or ndarray
        The requested quantile on the original scale. NaN is returned when the
        interval is inverted (``upper < lower``), because the truncated
        distribution is then undefined.

    Notes
    -----
    When the lower bound lies above the mean, the computation is done with the
    survival function (``sf`` / ``isf``) instead of the CDF. In the far upper
    tail ``norm.cdf`` rounds to 1.0 for both bounds, which would make the
    interval mass 0 and the result ``inf``; the survival function keeps the
    tail probabilities representable.
    """
    lower, upper = interval

    # Standardize the bounds.
    scale = np.sqrt(variance)
    a_std = (lower - mean) / scale
    b_std = (upper - mean) / scale

    # CDF form: accurate when the interval starts at or below the mean.
    cdf_a = norm.cdf(a_std)
    mass_cdf = norm.cdf(b_std) - cdf_a
    z_from_cdf = norm.ppf(cdf_a + quantile * mass_cdf)

    # Survival-function form: same quantity, accurate in the upper tail.
    sf_a = norm.sf(a_std)
    mass_sf = sf_a - norm.sf(b_std)
    z_from_sf = norm.isf(sf_a - quantile * mass_sf)

    z = np.where(a_std > 0, z_from_sf, z_from_cdf)

    # An inverted interval has no valid truncated distribution.
    z = np.where(b_std < a_std, np.nan, z)

    # Convert back to the original scale; [()] unwraps 0-d results to a scalar.
    return np.asarray(z * scale + mean)[()]


# 5% and 95% quantiles of the normal truncated to the interval from the last
# simulation run above (uses `interval_left`, `mean`, `variance` from that cell).
quantile_lower, quantile_upper = (
    truncated_normal_quantile(level, interval_left, mean, variance)
    for level in (0.05, 0.95)
)
print(quantile_lower, quantile_upper)

-0.5969700469539285 0.04133393961952425
