In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from matplotlib import pyplot as plt
import folktables
from folktables import ACSDataSource, ACSEmployment

In [2]:
def has_converged(x, size, epsilon):
    for i in range(0,size):
            if np.linalg.norm(x[-1:][0] - x[-(2+i):][0], np.inf) > epsilon:
                return False
    return True

In [12]:
def linear_proxies(x, y, z, iters, epsilon, eta_val=None):
    
    theta = [LinearRegression(fit_intercept=False).fit(x,z).coef_.reshape(-1,1)]
    #theta = [np.random.rand(d,1)]
    grad_l = [0]
    theta_average = theta
    converged = False
    print("Iterations:")
    
    for t in range(1, iters):
        eta = 1/np.sqrt(t)
        if converged:
            break
            
        if t%200 == 0:
            print(t)
            
        zhat = np.matmul(x,theta[t-1])  
        costs = (zhat-z)*(1-2*y)
        
        prc = LinearRegression(fit_intercept=False).fit(x,costs)
        h_s = prc.predict(x)
        
        #Use matmul
        h_1 = (h_s > 0)       
        h_2 = (h_s < 0)
    
        if np.sum(h_s[h_1]) > - np.sum(h_s[h_2]):
            h = h_1
        else:
            h = h_2

        h = h.astype(int)
        zhat_sum = np.sum(zhat)
        z_sum = np.sum(z)
        err_points = np.abs(h-y)
        [[err_cost]] = np.matmul((zhat-z).T,err_points)
        overall_diff = (zhat_sum/z_sum) - 1
        
        if np.abs(overall_diff) >= np.abs(err_cost):           
            penalty = np.sign(overall_diff) * np.sum(x, axis=0)/z_sum
            penalty = eta*penalty.reshape(-1,1)
        else:
            penalty = np.sign(err_cost) * np.matmul(np.transpose(x),err_points) 
            penalty = eta*penalty.reshape(-1,1)

        grad_l.append(penalty)
        new_theta = (theta[t-1] - grad_l[t])

        theta.append(new_theta)
        theta_average.append((t*theta_average[t-1]+theta[t])/(t+1))
        
        if t%100 == 0 :
            if has_converged(theta_average, 10, epsilon):
                converged = True
                    
    return theta, grad_l, theta_average[-1]   

In [4]:
#TODO: Functionalize this evaluation system
def evaluate_proxy(x, y, z, theta_average):
    zhat = np.matmul(x, theta_average) 
    final_costs = (zhat-z)*(1-2*y)
    prc = LinearRegression(fit_intercept=False).fit(x, final_costs)  
    h_s = prc.predict(x)
    h_plus = (h_s > 0)
    h_minus = (h_s < 0)
    
    if np.sum(h_s[h_plus]) > -np.sum(h_s[h_minus]):
        h = h_plus
    else:
        h = h_minus
    
    h = h.astype(int)    
       
    zhat_sum = np.sum(zhat)
    z_sum = np.sum(z)
    err_points = np.abs(h-y)

    [[err_cost_z]] = np.matmul(z.T, err_points)
    [[err_cost_zhat]] = np.matmul(zhat.T, err_points)
    overall_diff = (zhat_sum/z_sum) - 1
    model_diff = err_cost_zhat - err_cost_z
    proxy_error = err_cost_zhat/zhat_sum - err_cost_z/z_sum
    
    return overall_diff, model_diff, proxy_error

I. Census Data Experiments

In [5]:
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
acs_data = data_source.get_data(states=["CA"], download=False)
features, label, group = ACSEmployment.df_to_numpy(acs_data)

In [14]:
bw_indices = (group<3)

X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    features[bw_indices,:], label[bw_indices], group[bw_indices]-1, test_size=0.30, random_state=0)

#Normalize feature matrix
x = X_train/(np.linalg.norm(X_train, axis=1).max())
print(x.shape)
y = y_train.astype(int).reshape(-1,1)
z = group_train.reshape(-1,1)

coefficients, gradients, theta_average = linear_proxies(x, y, z, 10000, 0.01, eta_val = 0.001)
overall_diff, model_diff, proxy_error = evaluate_proxy(x,y,z,theta_average)

print("Overall diff: " + str(overall_diff))
print("Model diff: " + str(model_diff))
print("Proxy error: " + str(proxy_error))

(175592, 16)
Iterations:
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200
5400
5600
5800
6000
6200
6400
6600
6800
7000
7200
7400
7600
7800
8000
8200
8400
8600
8800
9000
9200
9400
9600
9800
Overall diff: -20837.139196393608
Model diff: -196583201.2371413
Proxy error: 0.012743676877850385


In [18]:
print(np.max(np.matmul(x,theta_average)))

-17.573253088924687


II. Synthetic Data Experimental Framework

In [7]:
def synthetic_proxy_experiments(T, trials, n, d, epsilon, eta, unit=False):
    discrepancy_proxy = []
    discrepancy_total = []
    discrepancy_h = []
    intercept = np.ones(n).reshape(-1,1)
    
    for i in range(0,trials):
        np.random.seed(i)
        y = np.round(np.random.rand(n,1))
        
        if unit:
            theta = np.random.rand(d,1)
            theta = theta/np.linalg.norm(theta)
            x = np.random.rand(n,d)
            
            for j in range(0,n):
                x[j,:] = x[j,:]/np.linalg.norm(x[j,:])
                
            zhat = np.matmul(x,theta)
            z = np.random.binomial(n=n,p=zhat)   
            
        else:
            z = np.round(np.random.rand(n,n))
            theta = np.random.rand(n,n)
            x = np.matmul(z, np.linalg.inv(theta))
            x = np.hstack((intercept, x))
            z = z[:,0].reshape(-1,1)

        coefficients, gradients, theta_average = linear_proxies(x, y, z, T, epsilon, eta)
        overall_diff, model_diff, proxy_error = evaluate_proxy(x,y,z,theta_average)
       
        discrepancy_total.append(overall_diff)
        discrepancy_h.append(model_diff)
        discrepancy_proxy.append(proxy_error)
        
    return discrepancy_total, discrepancy_h, discrepancy_proxy