# Generate synpuf disclosure risk

Compare synthetic PUFs trained from a 10% sample, both to the training set and a 10% holdout. Synthetic file (1) is from synthimpute random forests; (2) is from the synthpop R package.

## Setup

### Imports

In [1]:
import pandas as pd
import numpy as np
import synthimpute as si
import synpuf

**UPDATE!**

In [2]:
SYNTHESIS_ID = 17
PCT_TRAIN = 100

Folders.

In [3]:
PUF_SAMPLE_DIR = '~/Downloads/puf/'
SYN_DIR = '~/Downloads/syntheses/'
NEAREST_DIR = '~/Downloads/nearest/'

### Load data

In [4]:
synth = pd.read_csv(SYN_DIR + 'synpuf' + str(SYNTHESIS_ID) + '.csv')
train = pd.read_csv(PUF_SAMPLE_DIR + 'puf.csv')

## Preprocessing

Drop calculated features used as seeds, and drop s006.

In [5]:
synpuf.add_subtracted_features(train)
DROPS = ['S006', 'e00600_minus_e00650', 'e01500_minus_e01700',
         'RECID', 'E00100', 'E09600']
train.drop(DROPS, axis=1, inplace=True)
synth.columns = [x.upper() for x in synth.columns]
synth = synth[train.columns]

In [6]:
synth.columns

Index(['DSI', 'EIC', 'FDED', 'F2441', 'F6251', 'MARS', 'MIDR', 'N24', 'XTOT',
       'E00200', 'E00300', 'E00400', 'E00650', 'E00700', 'E00800', 'E00900',
       'E01100', 'E01200', 'E01400', 'E01700', 'E02000', 'E02100', 'E02300',
       'E02400', 'E03150', 'E03210', 'E03220', 'E03230', 'E03270', 'E03240',
       'E03290', 'E03300', 'E03400', 'E03500', 'E07240', 'E07260', 'E07300',
       'E07400', 'E07600', 'P08000', 'E09700', 'E09800', 'E09900', 'E11200',
       'E17500', 'E18400', 'E18500', 'E19200', 'E19800', 'E20100', 'E20400',
       'P22250', 'P23250', 'E24515', 'E24518', 'E26270', 'E27200', 'E32800',
       'E58990', 'E62900', 'E87521', 'E87530', 'E00600', 'E01500'],
      dtype='object')

In [7]:
synth.reset_index(drop=True, inplace=True)
train.reset_index(drop=True, inplace=True)

## Nearest calculation

Compare nearest standardized Euclidean distance. Takes ~10 hours.

In [8]:
%%time
nearest = si.nearest_record(synth,  #.sample(frac=0.01),
                            train, k=3, scale=True)

CPU times: user 13h 51min 11s, sys: 5h 3min 40s, total: 18h 54min 51s
Wall time: 19h 22min 33s


In [9]:
nearest.describe()

Unnamed: 0,id_A,id_B1,dist1,id_B2,dist2,id_B3,dist3
count,818930.0,818930.0,818930.0,818930.0,818930.0,818930.0,818930.0
mean,409464.5,74917.518713,1.213341,75571.156228,1.269989,74486.31953,1.373517
std,236404.872311,47277.49936,2.928344,46943.353897,3.062981,47295.793056,3.289503
min,0.0,2.0,0.0,2.0,0.0,2.0,0.0
25%,204732.25,32427.0,0.127777,33366.0,0.154334,32254.0,0.197878
50%,409464.5,72443.0,0.526357,73487.0,0.574475,71849.5,0.653988
75%,614196.75,115449.75,1.305382,116090.0,1.363631,114651.0,1.479125
max,818929.0,163785.0,280.242324,163785.0,207.052363,163785.0,280.261679


In [10]:
nearest.to_csv(NEAREST_DIR + 'nearest' + str(SYNTHESIS_ID) + '.csv',
               index=False)

In [11]:
n = nearest.copy(deep=True)

In [12]:
n['min_dist'] = n[['dist1', 'dist2', 'dist3']].min(axis=1)
n['max_dist'] = n[['dist1', 'dist2', 'dist3']].max(axis=1)
n['mid_dist'] = n[['dist1', 'dist2', 'dist3']].sum(axis=1) - n.min_dist - n.max_dist

In [13]:
n['min_id'] = np.where(n.min_dist == n.dist1, n.id_B1,
                       np.where(n.min_dist == n.dist2, n.id_B2, n.id_B3))
n['max_id'] = np.where(n.max_dist == n.dist1, n.id_B1,
                       np.where(n.max_dist == n.dist2, n.id_B2, n.id_B3))
n['mid_id'] = n[['id_B1', 'id_B2', 'id_B3']].sum(axis=1) - n.min_id - n.max_id

In [14]:
n['id_B1'] = n.min_id
n['id_B2'] = n.mid_id
n['id_B3'] = n.max_id

n['dist1'] = n.min_dist
n['dist2'] = n.mid_dist
n['dist3'] = n.max_dist

In [15]:
n[nearest.columns].describe()

Unnamed: 0,id_A,id_B1,dist1,id_B2,dist2,id_B3,dist3
count,818930.0,818930.0,818930.0,818930.0,818930.0,818930.0,818930.0
mean,409464.5,74512.875658,1.128435,76272.177344,1.316411,74189.94147,1.412
std,236404.872311,47482.771099,2.708568,46928.018169,3.155054,47379.099869,3.385077
min,0.0,2.0,0.0,-112931.0,0.0,2.0,0.0
25%,204732.25,31485.25,0.09441,33783.0,0.178369,32208.0,0.207307
50%,409464.5,71916.0,0.461061,74545.0,0.615724,71240.0,0.686823
75%,614196.75,115113.0,1.212425,117259.0,1.409424,114534.0,1.519579
max,818929.0,163784.0,195.111502,254924.0,207.265275,163785.0,280.261679


In [16]:
n[nearest.columns].to_csv(NEAREST_DIR + 'nearest' + str(SYNTHESIS_ID) + '.csv',
                          index=False)

In [17]:
n = nearest.copy(deep=True)
ndist = n[['dist1', 'dist2', 'dist3']]

n['min_dist'] = ndist.min(axis=1)
n['max_dist'] = ndist.max(axis=1)
n['mid_dist'] = ndist.sum(axis=1) - n.min_dist - n.max_dist

n['min_id'] = np.where(n.min_dist == n.dist1, n.id_B1,
                       np.where(n.min_dist == n.dist2, n.id_B2, n.id_B3))
# Run through these in a different order to avoid using the same ID
# for min and max, if they have the same distance (e.g. exact matches).
n['max_id'] = np.where(n.max_dist == n.dist3, n.id_B3,
                       np.where(n.max_dist == n.dist2, n.id_B2, n.id_B1))
n['mid_id'] = n[['id_B1', 'id_B2', 'id_B3']].sum(axis=1) - n.min_id - n.max_id

In [18]:
n['id_B1'] = n.min_id
n['id_B2'] = n.mid_id
n['id_B3'] = n.max_id

n['dist1'] = n.min_dist
n['dist2'] = n.mid_dist
n['dist3'] = n.max_dist

In [19]:
n[nearest.columns].to_csv(NEAREST_DIR + 'nearest' + str(SYNTHESIS_ID) + '.csv',
                          index=False)