In [1]:
import numpy as np
from sklearn.utils import resample
import statsmodels.api as sm


def generate_data(n, break_point, alpha0, alpha1, alpha2, beta0, gamma0, rng=None):
    """Simulate an outcome whose slope on ``c2`` depends on a treatment with a break.

    Draw order (kept stable so seeded runs are repeatable): ``c2``, ``e``,
    ``u1``, the ``d1`` noise, the ``d2`` noise, then the control matrix.

    Parameters
    ----------
    n : int
        Number of observations; the index column runs 0..n-1.
    break_point : number
        ``d1`` jumps by ``beta0`` for observations with ``index > break_point``.
    alpha0, alpha1, alpha2 : float
        Outcome coefficients: ``c1 = alpha0 + alpha1*c2 + alpha2*c2*d1 + e``.
    beta0 : float
        Size of the jump in ``d1``.
    gamma0 : float
        Size of the jump in ``d2`` (which happens at ``break_point + 0.4*n``).
    rng : optional
        Any object exposing ``normal(loc, scale, size)`` such as
        ``np.random.default_rng(seed)``. ``None`` (default) uses NumPy's
        global random state, exactly as before this parameter existed.

    Returns
    -------
    c1 : ndarray, shape (n,)
        The outcome.
    data : ndarray, shape (n, 7)
        Columns are ``[index, c2, control_1, ..., control_5]``. Neither ``d1``
        nor ``d2`` is included in the returned matrix.
    """
    # Default to the module-level functions so callers relying on
    # np.random.seed(...) keep getting the same stream of draws.
    rng = np.random if rng is None else rng

    index = np.arange(n)
    c2 = rng.normal(size=n)
    e = rng.normal(size=n)

    # Shared factor (sd 0.5) that correlates the treatments and the controls.
    u1 = rng.normal(0, 0.5, size=n)

    # Treatment that enters the outcome: level shift of beta0 after the break.
    d1 = beta0 * (index > break_point).astype(float) + u1 + rng.normal(scale=1, size=n)

    # Second treatment with a later break. It is not used in the outcome and
    # not returned; the draw is kept so the random stream feeding `controls`
    # is unchanged.
    d2 = gamma0 * (index > (break_point + n * 0.4)).astype(float) + u1 + rng.normal(scale=1, size=n)

    # Five noise controls (sd 2), each sharing the common factor u1.
    controls = rng.normal(scale=2, size=(n, 5)) + u1[:, np.newaxis]

    # Outcome: the slope on c2 is alpha1 + alpha2 * d1.
    c1 = alpha0 + alpha1 * c2 + alpha2 * c2 * d1 + e

    data = np.column_stack((index, c2, controls))

    return c1, data

def compute_delta(c1, data, theta):
    """Estimate the post-break shift in the slope of ``c1`` on ``c2``.

    Fits, by ordinary least squares,

        c1 ~ const + c2 + c2 * 1[index >= theta] + c2 * data[:, theta]

    and reports the coefficient on ``c2 * 1[index >= theta]``.

    NOTE(review): ``theta`` is used both as the break threshold on the index
    column and as the column position of the interacted regressor. That
    double role is preserved from the original code; confirm it is intended.

    Parameters
    ----------
    c1 : array-like, shape (n,)
        Outcome.
    data : ndarray, shape (n, k)
        Column 0 is the observation index, column 1 is ``c2``.
    theta : int
        Break threshold / column position (see note above).

    Returns
    -------
    tuple
        ``(-abs(delta), delta)`` where ``delta`` is the fitted coefficient.
    """
    index = data[:, 0]
    c2 = data[:, 1]
    dj = data[:, theta]

    # Compare against theta itself rather than index[theta]: the two agree
    # when rows are in their original 0..n-1 order, but on a bootstrap
    # resample row `theta` holds an arbitrary observation, which would make
    # the threshold random from draw to draw.
    after_break = index >= theta

    # Build the intercept column explicitly. statsmodels' add_constant
    # defaults to has_constant='skip' and would silently omit the intercept
    # if a column were already a non-zero constant, shifting the position of
    # the coefficient read below.
    X = np.column_stack((np.ones(len(c2)), c2, c2 * after_break, c2 * dj))
    coeffs = np.linalg.lstsq(X, c1, rcond=None)[0]

    # Columns are [const, c2, c2*after_break, c2*dj] -> position 2.
    delta = coeffs[2]

    return -1 * np.abs(delta), delta


# --- Simulation settings -------------------------------------------------
n = 250
true_break_point = n // 2          # break halfway through the sample
alpha0, alpha1, alpha2 = 0, 2, 2   # outcome coefficients
beta0, gamma0 = 2, 1               # jump sizes of the two simulated treatments
A_CUTOFF = 500

# Draw ten data sets in a row; every pass overwrites the previous one, so only
# the last draw (and its candidate list) is left in the namespace afterwards.
for i in range(10):
    # c1 is the outcome; data holds the index, c2 and the control columns
    c1, data = generate_data(
        n, true_break_point, alpha0, alpha1, alpha2, beta0, gamma0
    )

    # Candidates are the column positions after index (0) and c2 (1)
    candidate_breaks = np.arange(2, data.shape[1])

In [2]:
def bootstrap_covariance(c1, data, theta_tilde, theta_i, n_bootstraps=1000):
    """Bootstrap the joint covariance of the statistics at two candidates.

    Each replication resamples ``(c1, data)`` jointly with
    ``sklearn.utils.resample`` and evaluates ``compute_delta`` at both
    ``theta_tilde`` and ``theta_i`` on that same resample.

    Returns the 4x4 ``np.cov`` matrix of the replicated statistics, with
    variables ordered ``(x_i, y_i, x_tilde, y_tilde)``, where ``x`` and ``y``
    are the first and second values returned by ``compute_delta``.
    """

    def one_replication():
        # Resample outcome and covariates together so rows stay aligned.
        c1_star, data_star = resample(c1, data)
        tilde_stats = compute_delta(c1_star, data_star, theta_tilde)
        i_stats = compute_delta(c1_star, data_star, theta_i)
        return (*i_stats, *tilde_stats)

    replications = [one_replication() for _ in range(n_bootstraps)]

    # np.cov expects one variable per row, hence the transpose.
    return np.cov(np.asarray(replications).T)

def compute_matrices(cov_matrix, x_tilde, y_tilde, print_stuff=False):
    """Build the quadratic ``A*t**2 + B*t + C`` and its roots from a covariance matrix.

    ``cov_matrix`` is indexed as a 4x4 array; the entries read are
    ``[3, 3]``, ``[2, 3]`` and ``[0, 3]`` (plus a few more for the optional
    debug print).

    Returns ``(A, B, C, D, H, G, K)`` where ``D`` is the discriminant,
    ``H = -C / B`` (NaN when ``B == 0``) and ``G``/``K`` are the smaller and
    larger-signed roots ``(-B -/+ sqrt(D)) / (2A)`` (NaN unless ``D >= 0``
    and ``A != 0``).
    """
    var_y_tilde = cov_matrix[3, 3]
    cov_xy_tilde = cov_matrix[2, 3]
    cov_x_y_tilde = cov_matrix[0, 3]  # covariance between x and y_tilde

    # y_tilde with the part predicted from x_tilde removed, once per covariance
    resid = y_tilde - (cov_x_y_tilde / var_y_tilde) * x_tilde
    resid_tilde = y_tilde - (cov_xy_tilde / var_y_tilde) * x_tilde

    # Quadratic coefficients and discriminant
    A = var_y_tilde**(-2) * (cov_xy_tilde**2 - cov_x_y_tilde**2)
    B = 2 * var_y_tilde**(-1) * (cov_xy_tilde * resid_tilde - cov_x_y_tilde * resid)
    C = resid_tilde**2 - resid**2
    D = B ** 2 - 4 * A * C

    # Solution of the linear case; undefined when B vanishes
    H = np.nan
    if B != 0:
        H = -C / B

    # Real roots exist only for a non-negative discriminant and a true quadratic
    G = K = np.nan
    if D >= 0 and A != 0:
        G = (-B - np.sqrt(D)) / (2 * A)
        K = (-B + np.sqrt(D)) / (2 * A)

    if print_stuff:
        print('------------------')
        print(cov_matrix)
        print(
            cov_x_y_tilde, '//', cov_matrix[0, 3], cov_matrix[3, 0],
            '//', cov_matrix[1, 2], cov_matrix[1, 2],
            '//', cov_matrix[1, 0], cov_matrix[0, 1],
        )

    return A, B, C, D, H, G, K


def compute_confidence_intervals(theta_tilde, candidate_breaks, c1, data, a_cutoff=1,
                                 print_stuff=False, n_bootstraps=1000):
    """Intersect the root intervals obtained from every candidate.

    For each candidate a bootstrap covariance matrix is estimated against
    ``theta_tilde`` and turned into quadratic roots by ``compute_matrices``.
    Candidates with ``A > 0`` and ``D >= 0`` contribute their roots: the
    largest ``G`` becomes the lower bound, the smallest ``K`` the upper bound.

    Parameters
    ----------
    theta_tilde : int
        The selected candidate.
    candidate_breaks : iterable of int
        All candidates to compare against; must not be empty.
    c1, data :
        Outcome and covariate matrix, passed through to ``compute_delta`` and
        ``bootstrap_covariance``.
    a_cutoff :
        Accepted for backward compatibility; not used by the computation.
    print_stuff : bool
        Forwarded to ``compute_matrices`` to enable its debug output.
    n_bootstraps : int
        Bootstrap replications per candidate (default 1000, the default of
        ``bootstrap_covariance``).

    Returns
    -------
    bounds : ndarray, shape (2,)
        ``[max_lower_bound, min_upper_bound]``; ``[-inf, inf]`` if no
        candidate qualified.
    y_tilde :
        Second value returned by ``compute_delta`` at ``theta_tilde`` on the
        full sample.
    sigma :
        Bootstrap variance of ``y_tilde`` (entry ``[3, 3]``), taken from the
        run for the last candidate. Every run estimates this same quantity.

    Raises
    ------
    ValueError
        If ``candidate_breaks`` is empty.
    """
    # Materialise once: supports one-shot iterables and gives a well-defined
    # emptiness check for NumPy arrays.
    candidates = list(candidate_breaks)
    if not candidates:
        raise ValueError("candidate_breaks must contain at least one candidate")

    x_tilde, y_tilde = compute_delta(c1, data, theta_tilde)

    max_lower_bound = -np.inf
    min_upper_bound = np.inf
    cov_matrix = None

    for theta in candidates:
        cov_matrix = bootstrap_covariance(
            c1, data, theta_tilde, theta, n_bootstraps=n_bootstraps
        )
        A, B, C, D, H, G, K = compute_matrices(
            cov_matrix, x_tilde, y_tilde, print_stuff=print_stuff
        )

        # Only an upward-opening quadratic with real roots yields a bounded
        # interval [G, K]; intersect it with what has been found so far.
        if A > 0 and D >= 0:
            max_lower_bound = max(max_lower_bound, G)
            min_upper_bound = min(min_upper_bound, K)

    # Read explicitly from the last matrix instead of relying on a loop
    # variable leaking out of an earlier loop.
    sigma = cov_matrix[3, 3]

    return np.array([max_lower_bound, min_upper_bound]), y_tilde, sigma



# Simulate a data set, pick a candidate, and report its interval
for i in range(1):
    # Fresh draw: c1 is the outcome, data holds the index, c2 and the controls
    c1, data = generate_data(
        n, true_break_point, alpha0, alpha1, alpha2, beta0, gamma0
    )

    # Candidates are the column positions after index (0) and c2 (1)
    candidate_breaks = np.arange(2, data.shape[1])

    # compute_delta returns (-|delta|, delta); the maximum of that pair
    # therefore picks the candidate with the smallest |delta|. min_delta keeps
    # the whole pair for the selected candidate.
    theta_hat, min_delta = max(
        [(theta, compute_delta(c1, data, theta)) for theta in candidate_breaks],
        key=lambda scored: scored[1],
    )

    # Bounds, full-sample estimate and bootstrap variance for theta_hat
    interval_left, mean, variance = compute_confidence_intervals(
        theta_hat, candidate_breaks, c1, data, a_cutoff=A_CUTOFF
    )

    # Report
    print(f"Iteration {i+1}:")
    print("Theta_hat (control variable index chosen):", theta_hat)
    print("Delta coefficient (minimized):", min_delta)
    print("Confidence Intervals:")
    print("Left Interval:", interval_left)
    print("Mean delta for bootstraps:", mean)
    print("Variance for delta_hat:", variance)
    print('-----')

Iteration 1:
Theta_hat (control variable index chosen): 6
Delta coefficient (minimized): (-1.6419096697464919, 1.6419096697464919)
Confidence Intervals:
Left Interval: [-1.64190967  1.44051614]
Mean delta for bootstraps: 1.6419096697464919
Variance for delta_hat: 2.0030405928994477
-----


In [3]:
import numpy as np
from scipy.stats import norm

def truncated_normal_quantile(quantile, interval, mean, variance):
    """Quantile of a normal distribution truncated to a single interval.

    Parameters
    ----------
    quantile : float or array-like
        Probability level(s) in [0, 1] within the truncated distribution.
    interval : tuple
        ``(lower, upper)`` truncation bounds on the original scale.
    mean, variance : float
        Parameters of the untruncated normal; ``variance`` should be positive.

    Returns
    -------
    The value ``x`` in ``[lower, upper]`` with
    ``P(X <= x | lower <= X <= upper) = quantile``.

    Notes
    -----
    When the lower bound lies above the mean, both CDF values are close to 1
    and their difference loses all precision (e.g. the interval (9, 10) on a
    standard normal used to return ``inf``). In that case the computation is
    done with survival functions, which stay accurate in the upper tail. For
    intervals starting at or below the mean the original CDF formula is used
    unchanged.
    """
    lower, upper = interval

    # Standardise the bounds.
    scale = np.sqrt(variance)
    lower_std = (lower - mean) / scale
    upper_std = (upper - mean) / scale

    # Lower-tail form: F(x) = F(lower) + q * (F(upper) - F(lower)).
    cdf_lower = norm.cdf(lower_std)
    via_cdf = norm.ppf(cdf_lower + quantile * (norm.cdf(upper_std) - cdf_lower))

    # Upper-tail form: S(x) = S(lower) - q * (S(lower) - S(upper)).
    sf_lower = norm.sf(lower_std)
    via_sf = norm.isf(sf_lower - quantile * (sf_lower - norm.sf(upper_std)))

    # Pick the numerically stable branch; [()] turns a 0-d result back into a
    # scalar so scalar inputs still yield a scalar.
    quantile_value_std = np.where(lower_std > 0, via_sf, via_cdf)[()]

    # Back to the original scale.
    return quantile_value_std * scale + mean

# Demo: 5% and 95% quantiles of a standard normal truncated to [-1, 1]
mean, variance = 0, 1
interval = (-1.0, 1.0)  # (lower, upper) truncation bounds

quantile_lower, quantile_upper = (
    truncated_normal_quantile(level, interval, mean, variance)
    for level in (0.05, 0.95)
)
print(quantile_lower, quantile_upper)

-0.8676618854186884 0.8676618854186879
