<a href="https://colab.research.google.com/github/dirknbr/privacy/blob/main/privacy_aggregation_compression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Privacy: aggregation, compression and unpacking

Imagine you have some N data points. For privacy reasons you need to aggregate the data at some group level. If we record some stats (mean, std, correlation matrix) about the distributions can we reconstruct the full data, and then retrieve our true effects between the variables.

Here we simulate 200 rows with 2 covariates and 5 groups.

Deaggregation assumes a multivariate normal distribution.

In [7]:

import numpy as np
# from scipy.special import expit
import pandas as pd
from sklearn import linear_model

def corr2cov(C, S):
  return np.diag(S).dot(C).dot(np.diag(S))

def mape(y, pred):
  return np.mean(abs(y - pred) / y)

# conversion test
C = np.array([[1, 0.25, .9], [.25, 1, .5], [.9, .5, 1]])
S = np.array([1, 4, 9])
print(corr2cov(C, S))



[[ 1.   1.   8.1]
 [ 1.  16.  18. ]
 [ 8.1 18.  81. ]]


In [8]:
# simulate data
np.random.seed(22)
N = 200
G = 5 # groups
x1 = np.random.normal(10, 1.1, size=N)
x2 = np.random.normal(12, 1.5, size=N)
# we have unequal groups
prob = np.random.uniform(size=G)
prob = prob / sum(prob)
gr = np.random.choice(G, size=N, p=prob)
# true beta is [.5, -.5]
y = 10 + (gr == 2) + .5 * x1 - .5 * x2 + np.random.normal(0, .5, size=N)
df = pd.DataFrame({'y': y, 'x1': x1, 'x2': x2, 'gr': gr})
print(df.describe())
print(df.corr())

                y          x1          x2          gr
count  200.000000  200.000000  200.000000  200.000000
mean     9.189527   10.122275   12.122534    2.025000
std      1.194722    1.123926    1.623312    1.599113
min      5.722211    6.953514    8.044215    0.000000
25%      8.345902    9.381114   11.056564    0.000000
50%      9.142382   10.086897   12.116145    2.000000
75%     10.072746   10.820536   13.225434    4.000000
max     12.267408   13.382937   16.271999    4.000000
           y        x1        x2        gr
y   1.000000  0.495406 -0.693098  0.002618
x1  0.495406  1.000000 -0.050348 -0.081966
x2 -0.693098 -0.050348  1.000000 -0.050279
gr  0.002618 -0.081966 -0.050279  1.000000


In [9]:
# the true model we cannot see
model1 = linear_model.LinearRegression()
model1.fit(df[['x1', 'x2']], df.y)
print('coef', model1.intercept_, model1.coef_)
print('r2', model1.score(df[['x1', 'x2']], df.y))
print('mape', mape(df.y, model1.predict(df[['x1', 'x2']])))



coef 10.198279557905902 [ 0.49076153 -0.49299726]
r2 0.6929933422207775
mape 0.05711989213548861


In [10]:
# aggregate
df_agg = pd.pivot_table(df, index='gr', values=['y', 'x1', 'x2'],
	aggfunc=[np.mean, np.std, len]).reset_index()
# record all bivariate correlations
df_agg2 = df.groupby('gr').apply(lambda x:
	np.corrcoef(x.y, x.x1)[0, 1]).reset_index()
df_agg3 = df.groupby('gr').apply(lambda x:
	np.corrcoef(x.y, x.x2)[0, 1]).reset_index()
df_agg4 = df.groupby('gr').apply(lambda x:
	np.corrcoef(x.x1, x.x2)[0, 1]).reset_index()
df_agg = pd.merge(df_agg, df_agg2, on='gr')
df_agg = pd.merge(df_agg, df_agg3, on='gr')
df_agg = pd.merge(df_agg, df_agg4, on='gr')
df_agg.rename(columns={'0_x': 'c_y_x1', '0_y': 'c_y_x2', 0: 'c_x1_x2'}, inplace=True)
print(df_agg)
print(list(df_agg))


   gr  (gr, )  (mean, x1)  (mean, x2)  ...  (len, y)    c_y_x1    c_y_x2   c_x1_x2
0   0       0   10.206112   12.133294  ...      65.0  0.585624 -0.801932 -0.109793
1   1       1   10.244797   11.509837  ...       4.0  0.766124 -0.953576 -0.922497
2   2       2   10.185683   12.570055  ...      45.0  0.469898 -0.757229  0.068230
3   3       3   10.132624   11.827099  ...      33.0  0.552282 -0.783936 -0.136691
4   4       4    9.949929   11.959557  ...      53.0  0.388246 -0.808595  0.018465

[5 rows x 14 columns]
['gr', ('gr', ''), ('mean', 'x1'), ('mean', 'x2'), ('mean', 'y'), ('std', 'x1'), ('std', 'x2'), ('std', 'y'), ('len', 'x1'), ('len', 'x2'), ('len', 'y'), 'c_y_x1', 'c_y_x2', 'c_x1_x2']




In [11]:
# deaggregate
y_deagg = np.zeros(N)
x1_deagg = np.zeros(N)
x2_deagg = np.zeros(N)
i = 0
for idx, row in df_agg.iterrows():
  # print(idx, row)
  n = int(row[[('len', 'y')]])
  idx2 = np.arange(i, i + n)
  means = row[[('mean', 'y'), ('mean', 'x1'), ('mean', 'x2')]]
  S = row[[('std', 'y'), ('std', 'x1'), ('std', 'x2')]]
  C = np.ones((3, 3))
  C[0, 1] = row['c_y_x1']
  C[0, 2] = row['c_y_x2']
  C[1, 2] = row['c_x1_x2']
  C = np.maximum(C, C.T) # C is symmetric
  cov = corr2cov(C, S)
  # print(means, cov, n)
  yx = np.random.multivariate_normal(means, cov, size=n)
  print(n, yx.shape)
  y_deagg[idx2] = yx[:, 0]
  x1_deagg[idx2] = yx[:, 1]
  x2_deagg[idx2] = yx[:, 2]
  i += n


65 (65, 3)
4 (4, 3)
45 (45, 3)
33 (33, 3)
53 (53, 3)


In [12]:
# model after deaggregation
df_deagg = pd.DataFrame({'y': y, 'x1': x1_deagg, 'x2': x2_deagg})
model2 = linear_model.LinearRegression()
model2.fit(df_deagg[['x1', 'x2']], df_deagg.y)
print('coef', model2.intercept_, model1.coef_)
# note that R2 will be zero here because of our MVN
print('r2', model2.score(df_deagg[['x1', 'x2']], df_deagg.y))
print('mape', mape(df_deagg.y, model2.predict(df_deagg[['x1', 'x2']])))


coef 10.24300843272974 [ 0.49076153 -0.49299726]
r2 0.0062336945376377795
mape 0.10660918172816865


We have successfully got our true beta back despite our compression of 200 rows into 5. The error has worsened but not dramatically. Our R2 has probably lost its purpose here.

