# Generate synpuf disclosure risk

Compare synthetic PUFs trained from a 10% sample, both to the training set and a 10% holdout. Synthetic file (1) is from synthimpute random forests; (2) is from the synthpop R package.

## Setup

### Imports

In [1]:
import pandas as pd
import numpy as np
import synthimpute as si
import synpuf

**UPDATE these settings before each run!**

In [2]:
# ID of the synthesis to evaluate. It is embedded in the input file name
# (synpuf<ID>.csv) and in the output file name (nearest<ID>.csv).
SYNTHESIS_ID = 20
# NOTE(review): not referenced by any cell visible in this notebook.
# Presumably the percent of the PUF used for training -- confirm or remove.
PCT_TRAIN = 100

Folders.

In [3]:
# File paths are built by plain string concatenation, so every directory
# must keep its trailing slash.
# Directory holding the training file (puf.csv).
PUF_SAMPLE_DIR = '~/Downloads/puf/'
# Directory holding the synthetic files (synpuf<ID>.csv).
SYN_DIR = '~/Downloads/syntheses/'
# Directory the nearest-record results (nearest<ID>.csv) are written to.
NEAREST_DIR = '~/Downloads/nearest/'

### Load data

In [4]:
# Synthetic file for the selected synthesis run.
synth_file = 'synpuf' + str(SYNTHESIS_ID) + '.csv'
synth = pd.read_csv(SYN_DIR + synth_file)
# Training file that the synthetic records are compared against.
train = pd.read_csv(PUF_SAMPLE_DIR + 'puf.csv')

## Preprocessing

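Drop calculated features used as seeds, and drop `S006` and `RECID`. Then upper-case the synthetic file's column names and select them in the same order as the training set's columns.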

In [5]:
# Called for its effect on `train` (the return value is unused); presumably
# this adds the two `*_minus_*` columns that are dropped just below.
synpuf.add_subtracted_features(train)
# Columns removed from the training set before computing distances.
DROPS = [
    'S006',
    'e00600_minus_e00650',
    'e01500_minus_e01700',
    'RECID',
    'E00100',
    'E09600',
]
train = train.drop(DROPS, axis=1)
# Upper-case the synthetic column names, then keep exactly the training
# set's columns, in the training set's order.
synth.columns = synth.columns.str.upper()
synth = synth.loc[:, train.columns]

In [6]:
# Display the columns that remain after aligning synth to train.
synth.columns

Index(['DSI', 'EIC', 'FDED', 'F2441', 'F6251', 'MARS', 'MIDR', 'N24', 'XTOT',
       'E00200', 'E00300', 'E00400', 'E00650', 'E00700', 'E00800', 'E00900',
       'E01100', 'E01200', 'E01400', 'E01700', 'E02000', 'E02100', 'E02300',
       'E02400', 'E03150', 'E03210', 'E03220', 'E03230', 'E03270', 'E03240',
       'E03290', 'E03300', 'E03400', 'E03500', 'E07240', 'E07260', 'E07300',
       'E07400', 'E07600', 'P08000', 'E09700', 'E09800', 'E09900', 'E11200',
       'E17500', 'E18400', 'E18500', 'E19200', 'E19800', 'E20100', 'E20400',
       'P22250', 'P23250', 'E24515', 'E24518', 'E26270', 'E27200', 'E32800',
       'E58990', 'E62900', 'E87521', 'E87530', 'E00600', 'E01500'],
      dtype='object')

In [7]:
# Give both frames a fresh default integer index, discarding the old one.
synth = synth.reset_index(drop=True)
train = train.reset_index(drop=True)

## Nearest calculation

Find each synthetic record's three nearest training records by standardized Euclidean distance. This is slow: the run recorded below took ~21 hours of wall time.

In [8]:
%%time
# For each row of `synth`, look up its k=3 nearest rows in `train`. The
# result has one row per synthetic record (id_A) with the neighbor IDs
# (id_B1..id_B3) and their distances (dist1..dist3).
# scale=True presumably standardizes the features before measuring
# distance -- TODO confirm against the synthimpute documentation.
# The commented-out `.sample(frac=0.01)` looks like a switch for a quick
# trial on 1% of the synthetic records; the full run recorded in this
# notebook took about 21 hours of wall time.
nearest = si.nearest_record(synth, #.sample(frac=0.01),
                            train, k=3, scale=True)

CPU times: user 13h 26min 29s, sys: 6h 57min 24s, total: 20h 23min 53s
Wall time: 20h 51min 49s


In [9]:
# Persist the raw result right away so the long computation is not lost.
nearest_path = NEAREST_DIR + 'nearest' + str(SYNTHESIS_ID) + '.csv'
nearest.to_csv(nearest_path, index=False)

In [10]:
# Work on an independent (deep, the default) copy so that `nearest` keeps
# the raw result while the columns of `n` are rewritten below.
n = nearest.copy()

In [11]:
# Order each row's three neighbor distances from smallest to largest.
# The middle value is taken from a row-wise sort rather than computed as
# sum - min - max: that subtraction is subject to floating-point rounding,
# so the result may not equal any of the three real distances and can even
# land outside [min_dist, max_dist] when distances tie.
# Distances are assumed to be non-missing (np.sort places NaN last).
ndist = n[['dist1', 'dist2', 'dist3']]
sorted_dist = np.sort(ndist.values, axis=1)
n['min_dist'] = sorted_dist[:, 0]
n['max_dist'] = sorted_dist[:, 2]
n['mid_dist'] = sorted_dist[:, 1]

In [12]:
# ID of the closest neighbor: the first of id_B1, id_B2, id_B3 whose
# distance equals the row minimum.
n['min_id'] = np.select(
    [n['min_dist'] == n['dist1'], n['min_dist'] == n['dist2']],
    [n['id_B1'], n['id_B2']],
    default=n['id_B3'])
# ID of the farthest neighbor. Candidates are tried in the opposite order
# (id_B3 first) so that tied distances, e.g. exact matches, do not give
# min_id and max_id the same ID.
n['max_id'] = np.select(
    [n['max_dist'] == n['dist3'], n['max_dist'] == n['dist2']],
    [n['id_B3'], n['id_B2']],
    default=n['id_B1'])
# The middle ID is what remains of the three IDs' sum once the min and
# max IDs are taken out.
id_total = n[['id_B1', 'id_B2', 'id_B3']].sum(axis=1)
n['mid_id'] = id_total - n['min_id'] - n['max_id']

In [13]:
# Overwrite the original neighbor columns with their sorted counterparts,
# so that rank 1 is the closest neighbor and rank 3 the farthest.
for ranked_col, sorted_col in [('id_B1', 'min_id'),
                               ('id_B2', 'mid_id'),
                               ('id_B3', 'max_id'),
                               ('dist1', 'min_dist'),
                               ('dist2', 'mid_dist'),
                               ('dist3', 'max_dist')]:
    n[ranked_col] = n[sorted_col]

In [14]:
# Summary statistics for the re-sorted result, limited to the original
# output columns (the helper min/mid/max columns are left out).
n.loc[:, nearest.columns].describe()

Unnamed: 0,id_A,id_B1,dist1,id_B2,dist2,id_B3,dist3
count,818930.0,818930.0,818930.0,818930.0,818930.0,818930.0,818930.0
mean,409464.5,78952.3548,0.967648,78743.28484,1.134613,78542.301992,1.21759
std,236404.872311,46608.589266,2.814877,46513.650653,3.25581,46520.7558,3.480952
min,0.0,1.0,0.0,1.0,0.0,1.0,0.0
25%,204732.25,38269.0,0.000495,38251.0,0.010425,37885.0,0.015152
50%,409464.5,78207.0,0.147468,77925.0,0.234045,77624.5,0.277552
75%,614196.75,118456.0,0.969709,118013.75,1.196888,117909.0,1.316261
max,818929.0,163785.0,228.548483,163785.0,231.076655,163784.0,237.43558


Data checks.

In [15]:
# No record may list the same neighbor ID in two of its three slots.
has_repeated_id = (
    (n['id_B1'] == n['id_B2'])
    | (n['id_B1'] == n['id_B3'])
    | (n['id_B2'] == n['id_B3'])
)
assert not has_repeated_id.any()

In [16]:
# No neighbor ID may be negative. All three ID columns are checked,
# including id_B2: it holds the ID derived by subtraction
# (sum of IDs - min_id - max_id), so it is the one most likely to go out
# of range if the min/max selection ever picked the same ID twice.
assert not (n[['id_B1', 'id_B2', 'id_B3']] < 0).any().any(), \
    'negative neighbor ID found'

In [17]:
# Save the re-sorted result with the original column set. This is the same
# path the raw result was written to earlier, so that file is replaced.
sorted_path = NEAREST_DIR + 'nearest' + str(SYNTHESIS_ID) + '.csv'
n[nearest.columns].to_csv(sorted_path, index=False)