# Generate synpuf disclosure risk

Compare synthetic PUFs trained from a 10% sample, both to the training set and a 10% holdout. Synthetic file (1) is from synthimpute random forests; (2) is from the synthpop R package.

## Setup

### Imports

In [1]:
import pandas as pd
import numpy as np
import synthimpute as si
import synpuf





**UPDATE!**

In [2]:
SYNTHESIS_ID = 17
PCT_TRAIN = 100

Folders.

In [3]:
PUF_SAMPLE_DIR = '~/Downloads/puf/'
SYN_DIR = '~/Downloads/syntheses/'
NEAREST_DIR = '~/Downloads/nearest/'

### Load data

In [4]:
synth = pd.read_csv(SYN_DIR + 'synpuf' + str(SYNTHESIS_ID) + '.csv')
train = pd.read_csv(PUF_SAMPLE_DIR + 'puf.csv')

## Preprocessing

Drop calculated features used as seeds, and drop s006.

In [5]:
synpuf.add_subtracted_features(train)
DROPS = ['S006', 'e00600_minus_e00650', 'e01500_minus_e01700',
         'RECID', 'E00100', 'E09600']
train.drop(DROPS, axis=1, inplace=True)
synth.columns = [x.upper() for x in synth.columns]
synth = synth[train.columns]

In [6]:
synth.columns

Index(['DSI', 'EIC', 'FDED', 'F2441', 'F6251', 'MARS', 'MIDR', 'N24', 'XTOT',
       'E00200', 'E00300', 'E00400', 'E00650', 'E00700', 'E00800', 'E00900',
       'E01100', 'E01200', 'E01400', 'E01700', 'E02000', 'E02100', 'E02300',
       'E02400', 'E03150', 'E03210', 'E03220', 'E03230', 'E03270', 'E03240',
       'E03290', 'E03300', 'E03400', 'E03500', 'E07240', 'E07260', 'E07300',
       'E07400', 'E07600', 'P08000', 'E09700', 'E09800', 'E09900', 'E11200',
       'E17500', 'E18400', 'E18500', 'E19200', 'E19800', 'E20100', 'E20400',
       'P22250', 'P23250', 'E24515', 'E24518', 'E26270', 'E27200', 'E32800',
       'E58990', 'E62900', 'E87521', 'E87530', 'E00600', 'E01500'],
      dtype='object')

In [7]:
synth.reset_index(drop=True, inplace=True)
train.reset_index(drop=True, inplace=True)

## Nearest calculation

Compare nearest standardized Euclidean distance. Takes ~10 hours.

In [37]:
%%time
nearest = si.nearest_record(synth.sample(frac=0.01), train, k=3, scale=True)

CPU times: user 4min 56s, sys: 151 ms, total: 4min 56s
Wall time: 4min 56s


In [36]:
XA = synth.sample(frac=0.005)
means = XA.mean()
stds = XA.std()
((XA - means) / stds).isnull().sum().sum()

0

In [38]:
nearest.describe()

Unnamed: 0,id_A,id_B1,dist1,id_B2,dist2,id_B3,dist3
count,8189.0,8189.0,8189.0,8189.0,8189.0,8189.0,8189.0
mean,406508.659665,75527.957992,1.30798,75775.094273,1.370221,74506.29039,1.4701
std,235531.961951,47002.877388,2.945142,47102.515958,3.102298,47305.750708,3.228872
min,121.0,131.0,0.0,130.0,0.0,130.0,0.0
25%,199901.0,33642.0,0.141111,33625.0,0.165039,31624.0,0.209669
50%,405861.0,73629.0,0.579294,73837.0,0.625315,72481.0,0.717788
75%,609068.0,115463.0,1.43005,116299.0,1.48713,114154.0,1.60336
max,818921.0,163733.0,104.90986,163782.0,110.575039,163783.0,115.252957


In [10]:
nearest.to_csv(NEAREST_DIR + 'nearest' + str(SYNTHESIS_ID) + '.csv',
               index=False)

In [47]:
n = nearest.copy(deep=True)

In [48]:
n['min_dist'] = n[['dist1', 'dist2', 'dist3']].min(axis=1)
n['max_dist'] = n[['dist1', 'dist2', 'dist3']].max(axis=1)
n['mid_dist'] = n[['dist1', 'dist2', 'dist3']].sum(axis=1) - n.min_dist - n.max_dist

In [49]:
n['min_id'] = np.where(n.min_dist == n.dist1, n.id_B1,
                       np.where(n.min_dist == n.dist2, n.id_B2, n.id_B3))
n['max_id'] = np.where(n.max_dist == n.dist1, n.id_B1,
                       np.where(n.max_dist == n.dist2, n.id_B2, n.id_B3))
n['mid_id'] = n[['id_B1', 'id_B2', 'id_B3']].sum(axis=1) - n.min_id - n.max_id

In [50]:
n['id_B1'] = n.min_id
n['id_B2'] = n.mid_id
n['id_B3'] = n.max_id

n['dist1'] = n.min_dist
n['dist2'] = n.mid_dist
n['dist3'] = n.max_dist

In [51]:
n[nearest.columns].describe()

Unnamed: 0,id_A,id_B1,dist1,id_B2,dist2,id_B3,dist3
count,8189.0,8189.0,8189.0,8189.0,8189.0,8189.0,8189.0
mean,406508.659665,75051.59409,1.223916,76536.427647,1.414414,74221.320918,1.50997
std,235531.961951,47354.913184,2.872669,47117.25428,3.127181,47384.808385,3.265456
min,121.0,131.0,0.0,-42101.0,0.0,130.0,0.0
25%,199901.0,32690.0,0.105027,34387.0,0.188821,30989.0,0.22738
50%,405861.0,73694.0,0.509639,74663.0,0.67394,71543.0,0.756264
75%,609068.0,115032.0,1.326712,117116.0,1.534164,114474.0,1.645695
max,818921.0,163733.0,104.90986,282200.0,110.575039,163783.0,115.252957


In [52]:
n[nearest.columns].to_csv(NEAREST_DIR + 'nearest' + str(SYNTHESIS_ID) + '.csv',
                          index=False)