In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from matplotlib import pyplot as plt
import folktables
from folktables import ACSDataSource, ACSEmployment

In [2]:
def has_converged(x, size, epsilon):
    """Report whether the newest entry of ``x`` is close to its recent predecessors.

    The last element of ``x`` is compared, via ``np.linalg.norm(..., np.inf)``,
    with each of the ``size`` elements that precede it. A look-back that would
    reach past the start of ``x`` is clamped to the first element.

    Parameters
    ----------
    x : sequence of numpy arrays
        History of iterates, oldest first.
    size : int
        Number of look-back steps to test.
    epsilon : float
        Largest norm of a difference that still counts as converged.

    Returns
    -------
    bool
        ``False`` as soon as one difference exceeds ``epsilon``, else ``True``.
    """
    history_len = len(x)
    for lag in range(2, size + 2):
        # Clamp at index 0 so short histories fall back to the oldest entry.
        older = x[max(history_len - lag, 0)]
        gap = np.linalg.norm(x[history_len - 1] - older, np.inf)
        if gap > epsilon:
            return False
    return True

In [3]:
def linear_proxies(x, y, z, iters, epsilon, eta_val=None):
    """Iteratively update linear weights ``theta`` so that ``x @ theta`` tracks ``z``.

    Starting from the no-intercept least-squares fit of ``z`` on ``x``
    (rounded to 10 decimals), each iteration:

    1. forms ``zhat = x @ theta`` and the signed costs
       ``(zhat - z) * (1 - 2*y)``;
    2. regresses the costs and their negation on ``x``, thresholds each
       prediction at zero, and keeps the candidate ``h`` whose positive
       predictions have the larger sum;
    3. computes ``err_cost = (zhat - z)^T |h - y|`` and
       ``overall_diff = sum(zhat)/sum(z) - 1``;
    4. takes a step of size ``eta`` against whichever of the two quantities
       has the larger magnitude.

    A running average of the iterates is maintained, and every 100 iterations
    ``has_converged(theta_average, 10, epsilon)`` is consulted to stop early.

    Parameters
    ----------
    x : ndarray, shape (n, d)
        Feature matrix (no intercept is added here).
    y : ndarray
        Labels. Expected as an (n, 1) column; ``1 - 2*y`` assumes 0/1 values
        -- TODO confirm with callers.
    z : ndarray
        Target to be proxied. Expected as an (n, 1) column, since the
        ``[[err_cost]]`` unpacking needs a 1x1 product.
    iters : int
        The loop runs for ``t = 1 .. iters - 1``.
    epsilon : float
        Tolerance forwarded to ``has_converged``.
    eta_val : float, optional
        Fixed step size. When ``None`` the step size is ``1/sqrt(t)``.

    Returns
    -------
    theta : list of ndarray
        All iterates, starting with the initial least-squares fit.
    grad_l : list
        Applied steps; entry 0 is the placeholder ``0``.
    theta_average : ndarray
        Running average of the iterates after the last iteration performed.
    """
    decimals = 100  # decimal places for the np.round calls below (unchanged)

    initial_fit = LinearRegression(fit_intercept=False).fit(x, z)
    theta = [np.round(initial_fit.coef_.reshape(-1, 1), 10)]
    grad_l = [0]
    # The averages need their own list: binding theta_average to `theta`
    # itself would make both appends below land in one shared list and
    # corrupt the [t-1] lookups.
    theta_average = [theta[0]]
    converged = False

    # Loop-invariant sums.
    z_sum = np.sum(z)
    x_col_sum = np.sum(x, axis=0)

    print("Iterations:")

    for t in range(1, iters):
        # `is None` (not truthiness) so that eta_val=0 is honoured.
        eta = 1/np.sqrt(t) if eta_val is None else eta_val

        if converged:
            print("Converged")
            break

        if t%200 == 0:
            print(t)

        zhat = np.round(np.matmul(x, theta[t-1]), decimals)
        residual = np.round(zhat - z, decimals)
        costs = residual*(1 - 2*y)

        prc_1 = LinearRegression(fit_intercept=False).fit(x, costs)
        prc_2 = LinearRegression(fit_intercept=False).fit(x, -costs)
        h_s_1 = np.round(prc_1.predict(x), decimals)
        h_s_2 = np.round(prc_2.predict(x), decimals)

        # Threshold each prediction at zero and keep the candidate whose
        # positive predictions sum to more.
        h_1 = (h_s_1 > 0)
        h_2 = (h_s_2 > 0)
        if np.sum(h_s_1[h_1]) > np.sum(h_s_2[h_2]):
            h = h_1
        else:
            h = h_2
        h = h.astype(int)

        err_points = np.round(np.abs(h - y))
        [[err_cost]] = np.matmul(residual.T, err_points)
        overall_diff = (np.sum(zhat)/z_sum) - 1

        # Step against whichever quantity is currently larger in magnitude.
        if np.abs(overall_diff) >= np.abs(err_cost):
            direction = np.sign(overall_diff) * x_col_sum/z_sum
        else:
            direction = np.sign(err_cost) * np.matmul(np.transpose(x), err_points)
        penalty = eta*direction.reshape(-1, 1)

        grad_l.append(penalty)
        theta.append(theta[t-1] - penalty)
        # Incremental mean of theta[0..t].
        theta_average.append((t*theta_average[t-1] + theta[t])/(t+1))

        if t%100 == 0 and has_converged(theta_average, 10, epsilon):
            converged = True

    return theta, grad_l, theta_average[-1]

In [4]:
def evaluate_proxy(x, y, z, theta_average):
    """Score the proxy ``x @ theta_average`` against the true ``z``.

    A no-intercept regression is fit to the signed costs
    ``(zhat - z) * (1 - 2*y)``. Its predictions are split by sign, and the side
    with the larger total magnitude becomes the 0/1 vector ``h``. Rows where
    ``h`` differs from ``y`` are then used to compare ``zhat`` with ``z``.

    Parameters
    ----------
    x : ndarray, shape (n, d)
        Feature matrix.
    y : ndarray
        Labels, expected as an (n, 1) column.
    z : ndarray
        True values, expected as an (n, 1) column (the 1x1 unpacking below
        relies on this).
    theta_average : ndarray, shape (d, 1)
        Proxy weights.

    Returns
    -------
    overall_diff : scalar
        ``sum(zhat)/sum(z) - 1``.
    model_diff : scalar
        ``zhat`` summed over mismatch rows minus ``z`` summed over them.
    proxy_error : scalar
        Difference of the same two sums after each is divided by its
        respective total.
    """
    z_proxy = x @ theta_average
    signed_gap = (1 - 2*y) * (z_proxy - z)

    scores = LinearRegression(fit_intercept=False).fit(x, signed_gap).predict(x)
    positive = scores > 0
    negative = scores < 0

    # Pick the sign class carrying more total predicted magnitude.
    positive_mass = np.sum(scores[positive])
    negative_mass = -np.sum(scores[negative])
    chosen = positive if positive_mass > negative_mass else negative
    h = chosen.astype(int)

    proxy_total = np.sum(z_proxy)
    z_total = np.sum(z)
    mismatches = np.abs(h - y)

    [[z_on_mismatch]] = z.T @ mismatches
    [[proxy_on_mismatch]] = z_proxy.T @ mismatches

    return (
        (proxy_total/z_total) - 1,
        proxy_on_mismatch - z_on_mismatch,
        proxy_on_mismatch/proxy_total - z_on_mismatch/z_total,
    )

I. Census Data Experiments

In [None]:
# Load the 2018 1-Year ACS person survey for the state code "CA".
# NOTE(review): download=False presumably requires the survey files to already
# be on disk -- confirm against the folktables documentation before a fresh run.
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
acs_data = data_source.get_data(states=["CA"], download=False)
# Convert the survey frame into the arrays used by the following cells.
features, label, group = ACSEmployment.df_to_numpy(acs_data)

In [None]:
# Keep only the rows whose group code is below 3.
bw_indices = group < 3

# 70/30 split with a fixed seed; group codes are shifted down by one.
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    features[bw_indices, :],
    label[bw_indices],
    group[bw_indices] - 1,
    test_size=0.30,
    random_state=0,
)

# Design matrix: a column of ones followed by the training features.
intercept = np.ones((X_train.shape[0], 1))
x = np.hstack((intercept, X_train))
# Labels and group values as column vectors.
y = y_train.astype(int).reshape(-1, 1)
z = group_train.reshape(-1, 1)

coefficients, gradients, theta_average = linear_proxies(x, y, z, iters=1000, epsilon=0.001)
overall_diff, model_diff, proxy_error = evaluate_proxy(x, y, z, theta_average)

print("Overall diff: " + str(overall_diff))
print("Model diff: " + str(model_diff))
print("Proxy error: " + str(proxy_error))

In [None]:
# Histogram of the proxy prediction minus z on the training design matrix.
plt.hist(np.matmul(x,theta_average)-z)
plt.xlabel("x @ theta_average - z")
plt.ylabel("count")
plt.title("Proxy residuals on the training split");

II. Synthetic Data Experimental Framework

In [None]:
def synthetic_proxy_experiments(T, trials, n, d, epsilon, eta, unit=False):
    """Run ``linear_proxies`` / ``evaluate_proxy`` on freshly generated synthetic data.

    Each trial seeds numpy's global RNG with the trial index, draws random
    0/1 labels ``y``, and builds ``(x, z)`` in one of two ways:

    * ``unit=True``: ``theta`` (d x 1) and every row of ``x`` (n x d) are
      scaled to unit Euclidean norm, ``zhat = x @ theta``, and ``z`` is one
      0/1 draw per row with success probability ``zhat``.
    * ``unit=False``: a random 0/1 matrix ``Z`` (n x n) and a random ``theta``
      (n x n) are drawn and ``x = Z @ inv(theta)``, so ``x @ theta`` reproduces
      ``Z``. An intercept column is prepended to ``x`` and ``z`` is the first
      column of ``Z``. ``d`` is not used in this mode.

    Parameters
    ----------
    T : int
        Passed to ``linear_proxies`` as ``iters``.
    trials : int
        Number of independent trials (seeds ``0 .. trials - 1``).
    n : int
        Number of samples per trial.
    d : int
        Feature dimension when ``unit=True``.
    epsilon : float
        Passed to ``linear_proxies`` as ``epsilon``.
    eta : float or None
        Passed to ``linear_proxies`` as ``eta_val``.
    unit : bool, default False
        Selects the data-generating scheme described above.

    Returns
    -------
    discrepancy_total, discrepancy_h, discrepancy_proxy : list
        Per-trial ``overall_diff``, ``model_diff`` and ``proxy_error`` as
        returned by ``evaluate_proxy``.
    """
    discrepancy_proxy = []
    discrepancy_total = []
    discrepancy_h = []
    intercept = np.ones((n, 1))

    for trial in range(trials):
        np.random.seed(trial)
        y = np.round(np.random.rand(n, 1))

        if unit:
            theta = np.random.rand(d, 1)
            theta = theta/np.linalg.norm(theta)
            x = np.random.rand(n, d)
            # Scale every row to unit Euclidean norm in one vectorised step.
            x = x/np.linalg.norm(x, axis=1, keepdims=True)

            zhat = np.matmul(x, theta)
            # One Bernoulli draw per row so that z is 0/1 with mean zhat.
            # Passing the sample count n as the number of trials would give
            # counts in 0..n instead. The clip guards against p overshooting
            # 1 through floating-point rounding.
            z = np.random.binomial(n=1, p=np.clip(zhat, 0.0, 1.0))

        else:
            z = np.round(np.random.rand(n, n))
            theta = np.random.rand(n, n)
            # x is built so that x @ theta equals the binary matrix z.
            x = np.matmul(z, np.linalg.inv(theta))
            x = np.hstack((intercept, x))
            z = z[:, 0].reshape(-1, 1)

        coefficients, gradients, theta_average = linear_proxies(x, y, z, T, epsilon, eta)
        overall_diff, model_diff, proxy_error = evaluate_proxy(x, y, z, theta_average)

        discrepancy_total.append(overall_diff)
        discrepancy_h.append(model_diff)
        discrepancy_proxy.append(proxy_error)

    return discrepancy_total, discrepancy_h, discrepancy_proxy