In [47]:
# This code creates a (k, lambda, eta, beta)- base synopsis generator that is (epsilon,delta)-DP 

import numpy as np
import math
import sympy as sp
from itertools import combinations
import pandas as pd
import time
import copy


lower_bound = -1 # data lower bound
upper_bound = 1  # data upper bound
data_precision = 2 # data precision

n = 200 
num_points = 2*n # number of data points
m = num_points*math.log(((upper_bound - lower_bound)/10**(-data_precision)) +1) # size of the domain universe

delta =  0.3 # DP parameter
eta = 0.01 # edge for boosting
beta = 0.2 # failure probability of the base synopsis
k = math.ceil(2*((math.log(2/beta)+m)/(1-2*eta))) # number of query sample as demanded by Lemma 6.5 
# Assume coefficient \in (0,1], changing one x_i can at most change 1*[(1+x_j)^2 - (-1+x_j)^2] = 4x_j <= 4.

rho = 4/num_points-2/num_points**2 # l_1 sensitivity of our query = 4 sub
# rho = 16



Llambda = 0.4 # accuracy parameter lambda
epsilon = (math.log(1/beta)*rho*math.sqrt(k*math.log(1/delta)))/Llambda




In [49]:
epsilon

2.9028448235520354

In [48]:

# query does the following:
# Given a database X, it selects two points x_i, x_j \in X uniformly at random and computes
# q_c(X) = c(x_i + x_j)^2.
# The set of queries Q consists of all such q_c with coefficient c \in (0, 1] 
# Note that we are not setting the precision of coefficients to allow a large size of Q

c, x_i, x_j = sp.symbols('c x_i x_j') 
# c, x_1, x_2, x_3, x_4, x_5  = sp.symbols('c x_1, x_2, x_3, x_4, x_5')

# deg_2_poly = Function('deg_2_poly')

deg_2_poly = c*(x_i+x_j)**2 
# deg_2_poly = c*(x_1+ x_2+ x_3+ x_4+ x_5)**2

gradient_deg_2_poly = sp.diff(deg_2_poly, x_i)
hession_deg_2_poly = sp.diff(gradient_deg_2_poly, x_i) 
# gradient_deg_2_poly = sp.diff(deg_2_poly, x_1)
# hession_deg_2_poly = sp.diff(gradient_deg_2_poly, x_1)

q_hat = sp.symbols('q_hat')

gradient_loss_per_q = (1/Llambda**2)*2*(deg_2_poly-q_hat)*gradient_deg_2_poly
hession_loss_per_q = (1/Llambda**2)*(2*(gradient_deg_2_poly)**2+2*(deg_2_poly-q_hat)*hession_deg_2_poly)

loss_per_q = (1/Llambda**2)*(deg_2_poly-q_hat)**2
# evaluate a function
# deg_2_poly.evalf(4, subs={c:1, x_i:2.32, x_j:3})


In [4]:
# all possible pairs of data among the given data points
all_possible_pairs= np.array(list(combinations(range(num_points), 2))) 
# all_possible_pairs= np.array(list(combinations(range(num_points), 5)))

# The entire query set Q corresponds to a coefficient_array of (0,1] of precision 0.001
all_coeff = 1-np.linspace(0, 1, num=len(all_possible_pairs), endpoint=False, dtype=None)

# for each query, we also need to fix a pair of x_i, x_j
UAR_among_all_pairs= np.random.choice(range(len(all_possible_pairs)), size=len(all_coeff), replace=False)

# the k-th row stores a pair of indices (i,j) used by q_k = c_k(x_i+x_j)^2
all_pairs_indices = all_possible_pairs[UAR_among_all_pairs,:] 

# create a dataframe to store the coefficient and pairs indices for each query in Q
df_deg_2_poly = {'coefficient': all_coeff, 'pairs indices 1': all_pairs_indices[:,0],'pairs indices 2': all_pairs_indices[:,1]} 
# df_deg_2_poly = {'coefficient': all_coeff, 'pairs indices 1': all_pairs_indices[:,0],'pairs indices 2': all_pairs_indices[:,1], 
                #  'pairs indices 3': all_pairs_indices[:,2],'pairs indices 4': all_pairs_indices[:,3], 
                #  'pairs indices 5': all_pairs_indices[:,4],'pairs indices 6': all_pairs_indices[:,5]}

# Create the DataFrame
df_deg_2_poly = pd.DataFrame(df_deg_2_poly).apply(lambda x: round(x, data_precision+2))


In [5]:
# decide on a set of real data 
real_X = np.random.uniform(low=lower_bound, high=upper_bound, size=num_points)
# initialize the synopsis to be some arbirary set of data, say from the standard normal
# fake_X = np.random.randn(num_points)
fake_X = copy.copy(real_X)
fake_X_copy = copy.copy(fake_X) # save a copy of fake_X

# for verification purposes, if fake_X = real_X, the initial error should be the same as the added laplace noise
# fake_X = real_X 

# To initialize boosting, we choose UAR a size k subset of Q 
# initially, queries are sampled UAR. This distribution will change after each iteration of boosting
df_sampled_queries = df_deg_2_poly.sample(n=k, replace=False, weights=None, axis=None)
sampled_queries_dict = df_sampled_queries.reset_index().to_dict()
#### BOOSTING LOOP STARTS ####

# initialize all-zero arrays to store noiselss query output, noisy output, and laplace noise  
real_output = np.zeros(k)
real_data_noisy_output = np.zeros(k)
lap_noise = np.zeros(k)
fake_data_output = np.zeros(k)
error = np.zeros(k) # store |q(X) - noisy_output| for each q


# for each query, compute its real output, noisy output, and error
for index, coefficient in enumerate(df_sampled_queries['coefficient']):
    # store the original index 
    ori_idx_queries = df_sampled_queries['coefficient'].index
    # store the real x_i, x_j used for each query
    real_xi = real_X[df_sampled_queries['pairs indices 1'][ori_idx_queries[index]]]
    real_xj = real_X[df_sampled_queries['pairs indices 2'][ori_idx_queries[index]]]
    # real output
    real_output[index] = deg_2_poly.evalf(data_precision+2, subs={c:df_sampled_queries['coefficient'][ori_idx_queries[index]], x_i:real_xi, x_j:real_xj})

    
    # compute noisy output on the real data
    lap_noise[index] = np.random.laplace(loc=0.0, scale=rho*(2*math.sqrt(2*k*math.log(1/delta))/epsilon), size=None) 
    # lap_noise[index] = 0
    real_data_noisy_output[index] = real_output[index] + lap_noise[index]
    
    # compute query output on the current synopsis 
    fake_xi = fake_X[df_sampled_queries['pairs indices 1'][ori_idx_queries[index]]]
    fake_xj = fake_X[df_sampled_queries['pairs indices 2'][ori_idx_queries[index]]]
    
    # compute query output on fake data 
    fake_data_output[index] = deg_2_poly.evalf(data_precision+4, subs={c:df_sampled_queries['coefficient'][ori_idx_queries[index]], x_i:fake_xi, x_j:fake_xj})
 
    # calculate initial error
    # notice that this is |q(X) - real_data_noisy_output|
    error[index] = abs(fake_data_output[index]-real_data_noisy_output[index])


In [None]:

#### COORDINATE DESCENT LOOP STARTS HERE ###
#### In this loop, we do coordinate descent, NOT multivariate Newton's method ####

# initialize number of coordinate descent iterations = 0
num_iter_descent = 0
q_idx = 0


# while we don't have |q(X) - noisy_output|<lambda/2 for all q, continue coordinate descent 
while not np.all(error < Llambda/2):
    # calculate total loss
    total_loss = 0
    q_idx = 0
    while q_idx < k:
        # calculate the loss of query indexed by q_idx
        xi = sampled_queries_dict['pairs indices 1'][q_idx]
        xi = fake_X[xi]
        xj = sampled_queries_dict['pairs indices 2'][q_idx]
        xj = fake_X[xj]
        loss_q = loss_per_q.evalf(data_precision+4,subs={c: sampled_queries_dict['coefficient'][q_idx], x_i:xi, x_j:xj, q_hat:real_data_noisy_output[q_idx]})
        total_loss += loss_q 

        # print("query #1 has loss = ", loss_q, 'total loss = ', total_loss)
        q_idx += 1

    # evaluate partial derivative wrt each coordinate of X
    if num_iter_descent ==0: 

        # initialize the partial derivative of each coordinate to be zero
        part_derivative = np.zeros(num_points)
        
        # compute the partial derivative of the loss function with respect to each coordinate     
        for coordinate in range(num_points):
            
            # calculate partial derivative of total_loss wrt to the coordinate 
            # initiate the partial derivative of total loss to be 0.
            part_deri_total_loss = 0 
            
            # eventually this is a sum over gradient_loss_per_q for all q in df_sampled_queries
            for idx_q in range(k):
                
                # check if coordinate is one of the x_i x_j used in the query
                coords_chosen = [sampled_queries_dict['pairs indices 1'][idx_q], sampled_queries_dict['pairs indices 2'][idx_q]]
                bool_coord_chosen = coordinate in coords_chosen    
                # if the coordinate is in the chosen pair of x_i, x_j
                if bool_coord_chosen == True:
                    coeff = df_sampled_queries[['coefficient']].iloc[idx_q].iloc[0]
                    xi = df_sampled_queries[['pairs indices 1']].iloc[idx_q].iloc[0]
                    xi = fake_X[xi]
                    xj = df_sampled_queries[['pairs indices 2']].iloc[idx_q].iloc[0]
                    xj = fake_X[xj]
                    noisy_output = real_data_noisy_output[idx_q]
                    part_deri_q_loss =  gradient_loss_per_q.evalf(data_precision+4, subs={c:coeff, x_i:xi, x_j:xj, q_hat:noisy_output})
                
                    # add the partial derivative of loss per every query to the total part deri
                    part_deri_total_loss += part_deri_q_loss
            
            part_derivative[coordinate] = part_deri_total_loss
            # print(coordinate, part_derivative[coordinate])
        
    else: 
    # change the partial derivative wrt to the queries which were impacted by x_coord_descent  
        
        idx_part_deri_to_change = df_sampled_queries[df_sampled_queries['pairs indices 1']==x_coord_descent]['pairs indices 2']
        idx_part_deri_to_change = pd.concat([idx_part_deri_to_change,df_sampled_queries[df_sampled_queries['pairs indices 2']==x_coord_descent]['pairs indices 1']])
        idx_part_deri_to_change = np.append(x_coord_descent, idx_part_deri_to_change.to_numpy())

        for coordinate in idx_part_deri_to_change:
            
            # calculate partial derivative of total_loss wrt to the coordinate 
            # initiate the partial derivative of total loss to be 0.
            part_deri_total_loss = 0 
            
            # eventually this is a sum over gradient_loss_per_q for all q in df_sampled_queries
            for idx_q in range(k):
                
                # check if coordinate is one of the x_i x_j used in the query
                coords_chosen =  [sampled_queries_dict['pairs indices 1'][idx_q], sampled_queries_dict['pairs indices 2'][idx_q]]
                bool_coord_chosen = coordinate in coords_chosen    
                # if the coordinate is in the chosen pair of x_i, x_j
                if bool_coord_chosen == True:
                    coeff = df_sampled_queries[['coefficient']].iloc[idx_q].iloc[0]
                    xi = df_sampled_queries[['pairs indices 1']].iloc[idx_q].iloc[0]
                    xi = fake_X[xi]
                    xj = df_sampled_queries[['pairs indices 2']].iloc[idx_q].iloc[0]
                    xj = fake_X[xj]
                    noisy_output = real_data_noisy_output[idx_q]
                    part_deri_q_loss =  gradient_loss_per_q.evalf(data_precision+4, subs={c:coeff, x_i:xi, x_j:xj, q_hat:noisy_output})
                
                    # add the partial derivative of loss per every query to the total part deri
                    part_deri_total_loss += part_deri_q_loss
            
            part_derivative[coordinate] = part_deri_total_loss

       
    # find the coordinate with the max absolute value part_derivative 
    x_coord_descent = np.argmax(np.abs(part_derivative))
     
    # update x value at the coordinate x_coord_descent 
    
    # First, we calculate the 2nd derivative wrt x_coord_descent
    deg_2_part_deri_total_loss = 0
    for idx_q in range(k):
            
        # check if x_coord_descent is one of the x_i x_j used in the query
        coords_chosen =  [sampled_queries_dict['pairs indices 1'][idx_q], sampled_queries_dict['pairs indices 2'][idx_q]]
        bool_coord_chosen = x_coord_descent in coords_chosen    
        # if the x_coord_descent is in the chosen pair of x_i, x_j
        if bool_coord_chosen == True:
            coeff = df_sampled_queries[['coefficient']].iloc[idx_q].iloc[0]
            xi = df_sampled_queries[['pairs indices 1']].iloc[idx_q].iloc[0]
            xi = fake_X[xi]
            xj = df_sampled_queries[['pairs indices 2']].iloc[idx_q].iloc[0]
            xj = fake_X[xj]
            noisy_output = real_data_noisy_output[idx_q]
            deg_2_part_deri_q_loss =  hession_loss_per_q.evalf(data_precision+4, subs={c:coeff, x_i:xi, x_j:xj, q_hat:noisy_output})
        
            # add the partial derivative of loss per every query to the total part deri
            deg_2_part_deri_total_loss += deg_2_part_deri_q_loss
    
    # Now we are ready to update fake_X at x_coord_descent
    fake_X[x_coord_descent] = fake_X[x_coord_descent] - (part_derivative[x_coord_descent]/deg_2_part_deri_total_loss)
    
    
    # recalculate error for queries that were impacted
    # save a list of queries whose error were updated bc of change in x_coord_descent
    which_q_impacted = [] 
    for idx_q in range(k):
            
        # check if x_coord_descent is one of the x_i x_j used in the query
        coords_chosen =  [sampled_queries_dict['pairs indices 1'][idx_q], sampled_queries_dict['pairs indices 2'][idx_q]]
        bool_coord_chosen = x_coord_descent in coords_chosen    
        # if the coordinate is in the chosen pair of x_i, x_j, we update the error for that query
        if bool_coord_chosen == True:
        
            # compute query output on the current synopsis 
            fake_xi = fake_X[df_sampled_queries['pairs indices 1'].iloc[idx_q]]
            fake_xj = fake_X[df_sampled_queries['pairs indices 2'].iloc[idx_q]]
    
            # compute query output on fake data 
            fake_data_output[idx_q] = deg_2_poly.evalf(data_precision+4, subs={c:df_sampled_queries['coefficient'].iloc[idx_q], x_i:fake_xi, x_j:fake_xj})
 
            # update error for that query
            # notice that this is |q(X) - real_data_noisy_output|
            error[idx_q] = abs(fake_data_output[idx_q]-real_data_noisy_output[idx_q])
            which_q_impacted.append(idx_q)
    
    
    print(f"#iter {num_iter_descent} x_co={x_coord_descent}, 1st={part_derivative[x_coord_descent]}, 2nd={ deg_2_part_deri_total_loss}, # queries below={sum(error<Llambda)} fake x={fake_X[x_coord_descent]}")
    print('Total loss = ', total_loss)
    


    



    # add 1 to num_iter_descent 
    num_iter_descent += 1

    


    
        
        
      

#iter 0 x_co=250, 1st=-95.21737670898438, 2nd=199.989, # queries below=262 fake x=-0.3836529706613294
Total loss =  1122.44
#iter 1 x_co=302, 1st=56.009254455566406, 2nd=138.471, # queries below=261 fake x=0.5506466854612742
Total loss =  1095.05
#iter 2 x_co=152, 1st=54.56489562988281, 2nd=26.8783, # queries below=262 fake x=-2.291702159907795
Total loss =  1081.92
#iter 3 x_co=393, 1st=-580.3302001953125, 2nd=789.279, # queries below=260 fake x=-0.2099780648235119
Total loss =  1382.51
#iter 4 x_co=152, 1st=-186.23306274414062, 2nd=386.952, # queries below=259 fake x=-1.810420662173963
Total loss =  1127.96
#iter 5 x_co=123, 1st=48.26048278808594, 2nd=-49.5918, # queries below=259 fake x=0.5989809599187121
Total loss =  1076.54
#iter 6 x_co=152, 1st=-40.48711395263672, 2nd=224.456, # queries below=260 fake x=-1.6300420412677235
Total loss =  1091.59
#iter 7 x_co=270, 1st=37.12384033203125, 2nd=79.3091, # queries below=260 fake x=-0.22068775594824053
Total loss =  1087.65
#iter 8 x_co

KeyboardInterrupt: 

1. plot loss function and see if it's decreasing. If not, that means we have a bug 
2. try fake_X = real_X and see if it converges quickly. if not, might mean we have added too much noise
3. if our current code's loss function is converging correctly, and it just needs more time, we can let it run for longer and speed up the code by descending on all coordinates at once 

In [None]:
# sampled_queries_dict = df_sampled_queries.reset_index().to_dict()


# for idx_q in range(k):
    
# #     # check if coordinate is one of the x_i x_j used in the query
#     coords_chosen = [sampled_queries_dict['pairs indices 1'][idx_q], sampled_queries_dict['pairs indices 2'][idx_q]]

# coords_chosen

In [None]:

# #### COORDINATE DESCENT LOOP STARTS HERE ###
# #### In this loop, we do coordinate descent, NOT multivariate Newton's method ####

# # initialize number of coordinate descent iterations = 0
# num_iter_descent = 0

# # while we don't have |q(X) - noisy_output|<lambda/2 for all q, continue coordinate descent 
# while not np.all(error < Llambda/2):
    
#     # initialize the partial derivative of each coordinate to be zero
#     part_derivative = np.zeros(num_points)
    
#     # compute the partial derivative of the loss function with respect to each coordinate     
#     for coordinate in range(num_points):
        
#         # calculate partial derivative of total_loss wrt to the coordinate 
#         # initiate the partial derivative of total loss to be 0.
#         part_deri_total_loss = 0 
        
#         # eventually this is a sum over gradient_loss_per_q for all q in df_sampled_queries
#         for idx_q in range(k):
            
#             # check if coordinate is one of the x_i x_j used in the query
#             coords_chosen = [df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[0], df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[1]]
#             bool_coord_chosen = coordinate in coords_chosen    
#             # if the coordinate is in the chosen pair of x_i, x_j
#             if bool_coord_chosen == True:
#                 coeff = df_sampled_queries[['coefficient']].iloc[idx_q].iloc[0]
#                 xi = df_sampled_queries[['pairs indices 1']].iloc[idx_q].iloc[0]
#                 xi = fake_X[xi]
#                 xj = df_sampled_queries[['pairs indices 2']].iloc[idx_q].iloc[0]
#                 xj = fake_X[xj]
#                 noisy_output = real_data_noisy_output[idx_q]
#                 part_deri_q_loss =  gradient_loss_per_q.evalf(data_precision, subs={c:coeff, x_i:xi, x_j:xj, q_hat:noisy_output})
            
#             # if the coordinate is not one of the chosen ones, then the partial derivative = 0
#             else: 
                
#                 part_deri_q_loss = 0
            
#             # add the partial derivative of loss per every query to the total part deri
#             part_deri_total_loss += part_deri_q_loss
        
#         part_derivative[coordinate] = part_deri_total_loss
        
#     part_deri_copy_1 = part_derivative 
       
#     # find the coordinate with a negative part_derivative which has the largest absolute value
    
#     # Get indices of negative values
#     negative_part_deri_indices = np.where(part_derivative < 0)[0]    
#     # Get the negative values
#     negative_part_deri_values = part_derivative[negative_part_deri_indices]    
#     # Find the index of the maximum absolute value among the negative values
#     max_neg_idx = np.argmax(np.abs(negative_part_deri_values))
#     # Get the index in part_derivative
#     x_coord_descent = negative_part_deri_indices[max_neg_idx]
#     #     # Get the value from part_derivative
#     # max_part_deri_value = part_derivative[max_part_deri_index]
    
    
#     # update x value at the coordinate x_coord_descent 
    
#     # First, we calculate the 2nd derivative wrt x_coord_descent
#     deg_2_part_deri_total_loss = 0
#     for idx_q in range(k):
            
#             # check if x_coord_descent is one of the x_i x_j used in the query
#             coords_chosen = [df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[0], df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[1]]
#             bool_coord_chosen = x_coord_descent in coords_chosen    
#             # if the x_coord_descent is in the chosen pair of x_i, x_j
#             if bool_coord_chosen == True:
#                 coeff = df_sampled_queries[['coefficient']].iloc[idx_q].iloc[0]
#                 xi = df_sampled_queries[['pairs indices 1']].iloc[idx_q].iloc[0]
#                 xi = fake_X[xi]
#                 xj = df_sampled_queries[['pairs indices 2']].iloc[idx_q].iloc[0]
#                 xj = fake_X[xj]
#                 noisy_output = real_data_noisy_output[idx_q]
#                 deg_2_part_deri_q_loss =  hession_loss_per_q.evalf(data_precision, subs={c:coeff, x_i:xi, x_j:xj, q_hat:noisy_output})
            
#             # if the coordinate is not one of the chosen ones, then the partial derivative = 0
#             else: 
                
#                 deg_2_part_deri_q_loss = 0
            
#             # add the partial derivative of loss per every query to the total part deri
#             deg_2_part_deri_total_loss += deg_2_part_deri_q_loss
      
#     # Now we are ready to update fake_X at x_coord_descent
#     fake_X[x_coord_descent] = fake_X[x_coord_descent] - (part_derivative[x_coord_descent]/deg_2_part_deri_total_loss)
    
    
#     # recalculate error for queries that were impacted
#     # save a list of queries whose error were updated bc of change in x_coord_descent
#     which_q_impacted = [] 
#     for idx_q in range(k):
            
#         # check if x_coord_descent is one of the x_i x_j used in the query
#         coords_chosen = [df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[0], df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[1]]
#         bool_coord_chosen = x_coord_descent in coords_chosen    
#         # if the coordinate is in the chosen pair of x_i, x_j, we update the error for that query
#         if bool_coord_chosen == True:
        
#             # compute query output on the current synopsis 
#             fake_xi = fake_X[df_sampled_queries['pairs indices 1'].iloc[idx_q]]
#             fake_xj = fake_X[df_sampled_queries['pairs indices 2'].iloc[idx_q]]
    
#             # compute query output on fake data 
#             fake_data_output[idx_q] = deg_2_poly.evalf(data_precision, subs={c:df_sampled_queries['coefficient'].iloc[idx_q], x_i:fake_xi, x_j:fake_xj})
 
#             # update error for that query
#             # notice that this is |q(X) - real_data_noisy_output|
#             error[idx_q] = abs(fake_data_output[idx_q]-real_data_noisy_output[idx_q])
#             which_q_impacted.append(idx_q)
    
#     # add 1 to num_iter_descent 
#     num_iter_descent += 1
    
        
        
      

In [None]:
# ### DEBUGGING CELL 1 
# # Time = 2 ins
# # initialize the partial derivative of each coordinate to be zero
# part_derivative = np.zeros(num_points)

# num_iter_chosen = 0



# # compute the partial derivative of the loss function with respect to each coordinate     
# for coordinate in range(num_points):
    
#     # calculate partial derivative of total_loss wrt to the coordinate 
#     # initiate the partial derivative of total loss to be 0.
#     part_deri_total_loss = 0 
    
#     # eventually this is a sum over gradient_loss_per_q for all q in df_sampled_queries
#     for idx_q in range(k):
        
#         # check if coordinate is one of the x_i x_j used in the query
#         # The next two lines take 0.0005 s to run
#         # start_time = time.time()
#         coords_chosen = [df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[0], df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[1]]
#         bool_coord_chosen = coordinate in coords_chosen    
#         # print("--- %s seconds ---" % (time.time() - start_time))
        

#         # if the coordinate is in the chosen pair of x_i, x_j
#         if bool_coord_chosen == True: # time = 0.002 - 0.003
#             num_iter_chosen += 1
#             coeff = df_sampled_queries[['coefficient']].iloc[idx_q].iloc[0] # time = 0.0003 
#             xi = df_sampled_queries[['pairs indices 1']].iloc[idx_q].iloc[0]
#             xi = fake_X[xi]
#             xj = df_sampled_queries[['pairs indices 2']].iloc[idx_q].iloc[0]
#             xj = fake_X[xj]
            
#             noisy_output = real_data_noisy_output[idx_q] # time = 10^-7
            
#             # start_time = time.time()
#             part_deri_q_loss =  gradient_loss_per_q.evalf(data_precision, subs={c:coeff, x_i:xi, x_j:xj, q_hat:noisy_output}) # time = 0.0015 - 0.002
#             # print(coordinate, idx_q, "--- %s seconds ---" % (time.time() - start_time))
           
#         # if the coordinate is not one of the chosen ones, then the partial derivative = 0
            
#         else: 
#             # start_time = time.time()
#             part_deri_q_loss = 0
#             # print(coordinate, idx_q, "--- %s seconds ---" % (time.time() - start_time))
#         # add the partial derivative of loss per every query to the total part deri
#         part_deri_total_loss += part_deri_q_loss
        
    
#     part_derivative[coordinate] = part_deri_total_loss

# # print(num_iter_chosen)   
    
# # find the coordinate with a negative part_derivative which has the largest absolute value



In [None]:
# #### DEBUGGING CELL 2

# # Get indices of negative values
# # Time = 0.0003 
# negative_part_deri_indices = np.where(part_derivative < 0)[0]    
# # Get the negative values
# negative_part_deri_values = part_derivative[negative_part_deri_indices]    
# # Find the index of the maximum absolute value among the negative values
# max_neg_idx = np.argmax(np.abs(negative_part_deri_values))
# # Get the index in part_derivative
# x_coord_descent = negative_part_deri_indices[max_neg_idx]
# #     # Get the value from part_derivative
# # max_part_deri_value = part_derivative[max_part_deri_index]


# # update x value at the coordinate x_coord_descent 

# # First, we calculate the 2nd derivative wrt x_coord_descent
# deg_2_part_deri_total_loss = 0

# for idx_q in range(k): # Time = ~0.9 
        
#         # check if x_coord_descent is one of the x_i x_j used in the query
#         coords_chosen = [df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[0], df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[1]]
#         bool_coord_chosen = x_coord_descent in coords_chosen    
#         # if the x_coord_descent is in the chosen pair of x_i, x_j
#         if bool_coord_chosen == True: # Time = 0.01 
            
#             coeff = df_sampled_queries[['coefficient']].iloc[idx_q].iloc[0]
#             xi = df_sampled_queries[['pairs indices 1']].iloc[idx_q].iloc[0]
#             xi = fake_X[xi]
#             xj = df_sampled_queries[['pairs indices 2']].iloc[idx_q].iloc[0]
#             xj = fake_X[xj]
#             noisy_output = real_data_noisy_output[idx_q]
#             deg_2_part_deri_q_loss =  hession_loss_per_q.evalf(data_precision, subs={c:coeff, x_i:xi, x_j:xj, q_hat:noisy_output})
        
            
#         # if the coordinate is not one of the chosen ones, then the partial derivative = 0
#         else: 
            
#             deg_2_part_deri_q_loss = 0
        
#         # add the partial derivative of loss per every query to the total part deri
#         deg_2_part_deri_total_loss += deg_2_part_deri_q_loss

# # Now we are ready to update fake_X at x_coord_descent
# # start_time = time.time()
# # Time = 0.002 
# fake_X[x_coord_descent] = fake_X[x_coord_descent] - (part_derivative[x_coord_descent]/deg_2_part_deri_total_loss)
# # print("--- %s seconds ---" % (time.time() - start_time))

# # recalculate error for queries that were impacted
# # save a list of queries whose error were updated bc of change in x_coord_descent
# which_q_impacted = [] 
# for idx_q in range(k):
        
#     # check if x_coord_descent is one of the x_i x_j used in the query
#     coords_chosen = [df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[0], df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[1]]
#     bool_coord_chosen = x_coord_descent in coords_chosen    
#     # if the coordinate is in the chosen pair of x_i, x_j, we update the error for that query
    
#     if bool_coord_chosen == True: # Time = 0.02 
        
#         # compute query input on the current synopsis 
#         fake_xi = fake_X[df_sampled_queries['pairs indices 1'].iloc[idx_q]]
#         fake_xj = fake_X[df_sampled_queries['pairs indices 2'].iloc[idx_q]]

#         # compute query output on fake data 
#         fake_data_output[idx_q] = deg_2_poly.evalf(data_precision, subs={c:df_sampled_queries['coefficient'].iloc[idx_q], x_i:fake_xi, x_j:fake_xj})

#         # update error for that query
#         # notice that this is |q(X) - real_data_noisy_output|
#         error[idx_q] = abs(fake_data_output[idx_q]-real_data_noisy_output[idx_q])
#         which_q_impacted.append(idx_q)
    
   

In [None]:

# #### COORDINATE DESCENT LOOP STARTS HERE ###
# #### In this loop, we do coordinate descent, NOT multivariate Newton's method ####

# # initialize number of coordinate descent iterations = 0
# num_iter_descent = 0

# # while we don't have |q(X) - noisy_output|<lambda/2 for all q, continue coordinate descent 
# while not np.all(error < Llambda/2):
    
#     # initialize the partial derivative of each coordinate to be zero
#     part_derivative = np.zeros(num_points)
    
#     # compute the partial derivative of the loss function with respect to each coordinate     
#     for coordinate in range(num_points):
        
#         # calculate partial derivative of total_loss wrt to the coordinate 
#         # initiate the partial derivative of total loss to be 0.
#         part_deri_total_loss = 0 
        
#         # eventually this is a sum over gradient_loss_per_q for all q in df_sampled_queries
#         for idx_q in range(k):
            
#             # check if coordinate is one of the x_i x_j used in the query
#             coords_chosen = [df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[0], df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[1]]
#             bool_coord_chosen = coordinate in coords_chosen    
#             # if the coordinate is in the chosen pair of x_i, x_j
#             if bool_coord_chosen == True:
#                 coeff = df_sampled_queries[['coefficient']].iloc[idx_q].iloc[0]
#                 xi = df_sampled_queries[['pairs indices 1']].iloc[idx_q].iloc[0]
#                 xi = fake_X[xi]
#                 xj = df_sampled_queries[['pairs indices 2']].iloc[idx_q].iloc[0]
#                 xj = fake_X[xj]
#                 noisy_output = real_data_noisy_output[idx_q]
#                 part_deri_q_loss =  gradient_loss_per_q.evalf(data_precision, subs={c:coeff, x_i:xi, x_j:xj, q_hat:noisy_output})
            
#             # if the coordinate is not one of the chosen ones, then the partial derivative = 0
#             else: 
                
#                 part_deri_q_loss = 0
            
#             # add the partial derivative of loss per every query to the total part deri
#             part_deri_total_loss += part_deri_q_loss
        
#         part_derivative[coordinate] = part_deri_total_loss
        
        
#     # find the coordinate with a negative part_derivative which has the largest absolute value
    
#     # Get indices of negative values
#     negative_part_deri_indices = np.where(part_derivative < 0)[0]    
#     # Get the negative values
#     negative_part_deri_values = part_derivative[negative_part_deri_indices]    
#     # Find the index of the maximum absolute value among the negative values
#     max_neg_idx = np.argmax(np.abs(negative_part_deri_values))
#     # Get the index in part_derivative
#     x_coord_descent = negative_part_deri_indices[max_neg_idx]
#     #     # Get the value from part_derivative
#     # max_part_deri_value = part_derivative[max_part_deri_index]
    
    
#     # update x value at the coordinate x_coord_descent 
    
#     # First, we calculate the 2nd derivative wrt x_coord_descent
#     deg_2_part_deri_total_loss = 0
#     for idx_q in range(k):
            
#             # check if x_coord_descent is one of the x_i x_j used in the query
#             coords_chosen = [df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[0], df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[1]]
#             bool_coord_chosen = x_coord_descent in coords_chosen    
#             # if the x_coord_descent is in the chosen pair of x_i, x_j
#             if bool_coord_chosen == True:
#                 coeff = df_sampled_queries[['coefficient']].iloc[idx_q].iloc[0]
#                 xi = df_sampled_queries[['pairs indices 1']].iloc[idx_q].iloc[0]
#                 xi = fake_X[xi]
#                 xj = df_sampled_queries[['pairs indices 2']].iloc[idx_q].iloc[0]
#                 xj = fake_X[xj]
#                 noisy_output = real_data_noisy_output[idx_q]
#                 deg_2_part_deri_q_loss =  hession_loss_per_q.evalf(data_precision, subs={c:coeff, x_i:xi, x_j:xj, q_hat:noisy_output})
            
#             # if the coordinate is not one of the chosen ones, then the partial derivative = 0
#             else: 
                
#                 deg_2_part_deri_q_loss = 0
            
#             # add the partial derivative of loss per every query to the total part deri
#             deg_2_part_deri_total_loss += deg_2_part_deri_q_loss
      
#     # Now we are ready to update fake_X at x_coord_descent
#     fake_X[x_coord_descent] = fake_X[x_coord_descent] - (part_derivative[x_coord_descent]/deg_2_part_deri_total_loss)
    
    
#     # recalculate error for queries that were impacted
#     # save a list of queries whose error were updated bc of change in x_coord_descent
#     which_q_impacted = [] 
#     for idx_q in range(k):
            
#         # check if x_coord_descent is one of the x_i x_j used in the query
#         coords_chosen = [df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[0], df_sampled_queries[['pairs indices 1', 'pairs indices 2']].iloc[idx_q].iloc[1]]
#         bool_coord_chosen = x_coord_descent in coords_chosen    
#         # if the coordinate is in the chosen pair of x_i, x_j, we update the error for that query
#         if bool_coord_chosen == True:
        
#             # compute query output on the current synopsis 
#             fake_xi = fake_X[df_sampled_queries['pairs indices 1'].iloc[idx_q]]
#             fake_xj = fake_X[df_sampled_queries['pairs indices 2'].iloc[idx_q]]
    
#             # compute query output on fake data 
#             fake_data_output[idx_q] = deg_2_poly.evalf(data_precision, subs={c:df_sampled_queries['coefficient'].iloc[idx_q], x_i:fake_xi, x_j:fake_xj})
 
#             # update error for that query
#             # notice that this is |q(X) - real_data_noisy_output|
#             error[idx_q] = abs(fake_data_output[idx_q]-real_data_noisy_output[idx_q])
#             which_q_impacted.append(idx_q)
    
#     # add 1 to num_iter_descent 
#     num_iter_descent += 1
    
        
        
      

In [None]:

# query does the following:
# Given a database X, it selects two points x_i, x_j \in X uniformly at random and computes
# q_c(X) = c(x_i + x_j)^2.
# The set of queries Q consists of all such q_c with coefficient \in (0, 1] 
# Note that we are not setting the precision of coefficients to allow a large size of Q

# c, x_i, x_j = sp.symbols('c x_i x_j')

# # deg_2_poly = Function('deg_2_poly')

# deg_2_poly = c*(x_i+x_j)**2

# gradient_deg_2_poly = sp.diff(deg_2_poly, x_i)

# # evaluate a function
# gradient_deg_2_poly.evalf(4, subs={c:1, x_i:2.32, x_j:3})

# gradient_deg_2_poly(1,2,3)

# def query_quadratic(data, coefficient):
    
#     # select a pair of x_i and x_j uniformly at random from data
#     pair_indices = np.random.choice(len(data) , size = 2, replace = False, p = None)
#     pair = data[pair_indices]
#     output = coefficient*(pair[0]+pair[1])**2
    
#     return pair_indices, pair, output

# partial derivative of the loss function wrt to a selected coordinate of X
# this function is specific for the query q(x) = c(x_i+x_j)^2
# we might be able to generalize this using built in derivative functions







# def partial_derivative_loss(X, coordinate, c):
#     part_deri_total_loss = 0
#     order_2_part_deri_total_loss = 0
#     for index, coefficient in enumerate(c):
#         # check if the coordinate wrt which we are taking the partial derivative is in the pair
#         # if in the pair, take the partial derivative, otherwise partial derivative = 0
#         if coordinate in pairs_indices[index,:]:
#             x_i = pairs[index,:][0]
#             x_j = pairs[index,:][0]
#             part_deri_query_loss = (1/Llambda**2)*coefficient**2*4*(x_i+x_j)**3-4*noisy_output[index]*coefficient*(x_i+x_j)
#             order_2_part_deri_query_loss = (1/Llambda**2)*12*coefficient**2*(x_i+x_j)**2-4*coefficient*noisy_output[index]
#         else: 
#             part_deri_query_loss = 0
#             order_2_part_deri_query_loss = 0
#         part_deri_total_loss += part_deri_query_loss
#         order_2_part_deri_total_loss += order_2_part_deri_query_loss
#     return part_deri_total_loss, order_2_part_deri_total_loss


