# Generate synpuf disclosure risk

Compare Don's synthetic PUF trained from a 100% sample to the training set.

## Setup

### Imports

In [1]:
import pandas as pd
import numpy as np
import synthimpute as si
import synpuf

**UPDATE!** Set the synthesis ID and the training-sample percentage for the run being evaluated.

In [2]:
# Identifier of the synthesis under review; it selects the synpuf<ID>.csv
# input and names the nearest<ID>.csv output.
SYNTHESIS_ID = 19
# Percentage of the PUF used as the training sample (the intro describes a
# 100% sample).
PCT_TRAIN = 100

Folders.

In [3]:
# Directory holding the PUF training sample CSV.
PUF_SAMPLE_DIR = '~/Downloads/puf/'
# Directory holding the synthetic files (synpuf<ID>.csv).
SYN_DIR = '~/Downloads/syntheses/'
# Directory the nearest-record results are written to.
NEAREST_DIR = '~/Downloads/nearest/'

### Load data

In [5]:
# Synthetic file for the synthesis selected in the config cell.
synth = pd.read_csv(SYN_DIR + 'synpuf' + str(SYNTHESIS_ID) + '.csv')
# Training sample. The file name is derived from PCT_TRAIN so that the config
# cell is the only place to edit when a different sample is evaluated.
train = pd.read_csv(PUF_SAMPLE_DIR + 'train' + str(PCT_TRAIN) + '.csv')

## Preprocessing

Drop calculated features used as seeds, drop `S006` and `RECID`, and give the synthetic file the same columns (in the same order) as the training file.

In [6]:
# Presumably adds the two "*_minus_*" columns to `train` in place (the return
# value is unused) -- TODO confirm against synpuf.
synpuf.add_subtracted_features(train)

# Columns excluded from the distance comparison.
DROPS = [
    'S006',
    'e00600_minus_e00650',
    'e01500_minus_e01700',
    'RECID',
    'E00100',
    'E09600',
]
train = train.drop(columns=DROPS)

# Upper-case the synthetic column names, then keep exactly the training
# columns, in the training file's order.
synth = synth.rename(columns=str.upper)[train.columns]

In [7]:
# Show the columns both files share after alignment to the training file.
synth.columns

Index(['DSI', 'EIC', 'FDED', 'F2441', 'F6251', 'MARS', 'MIDR', 'N24', 'XTOT',
       'E00200', 'E00300', 'E00400', 'E00650', 'E00700', 'E00800', 'E00900',
       'E01100', 'E01200', 'E01400', 'E01700', 'E02000', 'E02100', 'E02300',
       'E02400', 'E03150', 'E03210', 'E03220', 'E03230', 'E03270', 'E03240',
       'E03290', 'E03300', 'E03400', 'E03500', 'E07240', 'E07260', 'E07300',
       'E07400', 'E07600', 'P08000', 'E09700', 'E09800', 'E09900', 'E11200',
       'E17500', 'E18400', 'E18500', 'E19200', 'E19800', 'E20100', 'E20400',
       'P22250', 'P23250', 'E24515', 'E24518', 'E26270', 'E27200', 'E32800',
       'E58990', 'E62900', 'E87521', 'E87530', 'E00600', 'E01500'],
      dtype='object')

In [8]:
# Give both frames a fresh 0..n-1 index so index labels and row positions agree.
synth = synth.reset_index(drop=True)
train = train.reset_index(drop=True)

## Nearest calculation

Compare nearest standardized Euclidean distance. Takes ~10 hours for the full synthetic file; the cell below uses a 1% sample, which takes about 6 minutes.

In [20]:
%%time
nearest = si.nearest_record(synth.sample(frac=0.01),
                            train, k=3, scale=True)

CPU times: user 6min 20s, sys: 197 ms, total: 6min 20s
Wall time: 6min 21s


In [24]:
def nonzero_rows(df):
    """Keep only the nonzero part of a Series or DataFrame.

    Args:
        df: A pandas Series or DataFrame.

    Returns:
        For a Series, the entries that are not equal to 0.
        For a DataFrame, the rows with at least one value not equal to 0.
    """
    keep = df.ne(0)
    if not isinstance(df, pd.Series):
        # DataFrame: reduce the cell-level mask to one flag per row.
        keep = keep.any(axis=1)
    return df.loc[keep]

In [26]:
# Training record 26403: the distance-0 nearest neighbour of synthetic record 42011.
nonzero_rows(train.iloc[26403])

FDED          2.0
MARS          1.0
XTOT          1.0
E00200    29400.0
E02000    -7310.0
Name: 26403, dtype: float64

In [25]:
# Synthetic record 42011, shown for comparison with training record 26403.
nonzero_rows(synth.iloc[42011])

FDED          2.0
MARS          1.0
XTOT          1.0
E00200    29400.0
E02000    -7310.0
Name: 42011, dtype: float64

In [22]:
# Sampled synthetic records whose first-listed neighbour is at distance 0.
nearest.loc[nearest['dist1'] == 0]

Unnamed: 0,id_A,id_B1,dist1,id_B2,dist2,id_B3,dist3
42011,42011,26403,0.0,114278,0.002874,45946,0.004334
95588,95588,135803,0.0,134178,0.000000,60251,0.000000
138821,138821,48062,0.0,108970,0.000000,1141,0.000000
52006,52006,47091,0.0,38392,0.002735,41962,0.002568
869610,869610,109216,0.0,11756,0.000000,37262,0.000000
52603,52603,97347,0.0,107922,0.227633,98102,0.297669
71366,71366,5692,0.0,26972,0.000000,104849,0.000000
37106,37106,83140,0.0,111143,2.075801,73176,2.320902
25184,25184,103362,0.0,48951,0.137960,86961,0.217955
400627,400627,1036,0.0,11457,0.000224,4489,0.000224


In [9]:
# Save the raw nearest-record result for this synthesis (index omitted).
nearest_path = NEAREST_DIR + 'nearest' + str(SYNTHESIS_ID) + '.csv'
nearest.to_csv(nearest_path, index=False)

In [27]:
# Work on an independent copy so the raw `nearest` result stays unchanged.
n = nearest.copy()
# The three neighbour distances, kept as their own frame for row-wise stats.
ndist = n.loc[:, ['dist1', 'dist2', 'dist3']]

In [28]:
# Row-wise smallest, largest and middle of the three neighbour distances.
n['min_dist'] = ndist.min(axis=1)
n['max_dist'] = ndist.max(axis=1)
# The median of three values is exactly the middle one. Computing it as
# sum - min - max accumulates floating-point rounding error and can yield a
# value that matches none of the three distances.
n['mid_dist'] = ndist.median(axis=1)

In [29]:
# ID of the training record at the minimum distance. When distances tie, the
# first match in the order dist1, dist2, dist3 wins.
n['min_id'] = np.where(n.min_dist == n.dist1, n.id_B1,
                       np.where(n.min_dist == n.dist2, n.id_B2, n.id_B3))
# Run through these in a different order to avoid using the same ID
# for min and max, if they have the same distance (e.g. exact matches).
# Here the first match in the order dist3, dist2, dist1 wins.
n['max_id'] = np.where(n.max_dist == n.dist3, n.id_B3,
                       np.where(n.max_dist == n.dist2, n.id_B2, n.id_B1))
# The middle ID is whichever of the three is left once min and max are removed.
n['mid_id'] = n[['id_B1', 'id_B2', 'id_B3']].sum(axis=1) - n.min_id - n.max_id

In [30]:
# Overwrite the neighbour columns so that slot 1 is the closest record and
# slot 3 the farthest. Each pair is (column to overwrite, ranked source column).
for target_col, ranked_col in [
    ('id_B1', 'min_id'),
    ('id_B2', 'mid_id'),
    ('id_B3', 'max_id'),
    ('dist1', 'min_dist'),
    ('dist2', 'mid_dist'),
    ('dist3', 'max_dist'),
]:
    n[target_col] = n[ranked_col]

In [31]:
# Summary statistics of the re-ranked result, limited to the original columns.
n.loc[:, nearest.columns].describe()

Unnamed: 0,id_A,id_B1,dist1,id_B2,dist2,id_B3,dist3
count,9827.0,9827.0,9827.0,9827.0,9827.0,9827.0,9827.0
mean,492843.504935,79505.963773,0.861242,78437.353007,1.128884,78508.793426,1.220631
std,284932.974593,46484.909288,2.77396,46382.525631,3.18558,46523.856789,3.326625
min,41.0,17.0,0.0,1.0,0.0,7.0,0.0
25%,246929.5,38993.5,6.6e-05,38379.5,0.016134,37889.5,0.021575
50%,493479.0,78982.0,0.086967,76932.0,0.2771,78204.0,0.325703
75%,741560.0,119022.0,0.814428,117281.5,1.206932,117706.0,1.358239
max,982695.0,163783.0,110.665245,163784.0,111.133769,163763.0,115.235383


In [35]:
# Training record 54214: the distance-0 nearest neighbour of synthetic record 30854.
nonzero_rows(train.iloc[54214])

DSI          1.0
FDED         2.0
MARS         1.0
E00200    6720.0
P22250    -110.0
Name: 54214, dtype: float64

In [34]:
# Synthetic record 30854, shown for comparison with training record 54214.
nonzero_rows(synth.iloc[30854])

DSI          1.0
FDED         2.0
MARS         1.0
E00200    6720.0
P22250    -110.0
Name: 30854, dtype: float64

In [37]:
# Share of sampled synthetic records whose closest training record is at distance 0.
len(n[n.dist1 == 0]) / len(n)

0.2422916454665717

In [36]:
# Same share, computed as the mean of the boolean "distance is 0" flag.
n.dist1.eq(0).mean()

0.2422916454665717

In [33]:
# Re-ranked rows with a distance-0 closest match, limited to the original columns.
n.loc[n.dist1 == 0, nearest.columns]

Unnamed: 0,id_A,id_B1,dist1,id_B2,dist2,id_B3,dist3
30854,30854,54214,0.0,57469,0.000350,55967,0.000350
42011,42011,26403,0.0,114278,0.002874,45946,0.004334
877301,877301,132387,0.0,66767,0.000956,17526,0.001010
95588,95588,135803,0.0,134178,0.000000,60251,0.000000
28067,28067,82258,0.0,39497,0.145122,132010,0.348595
43881,43881,28320,0.0,7649,0.003306,5740,0.008868
138821,138821,48062,0.0,108970,0.000000,1141,0.000000
52006,52006,47091,0.0,41962,0.002568,38392,0.002735
361340,361340,134548,0.0,93718,0.815662,137630,0.853986
11739,11739,74085,0.0,61201,0.733190,66529,0.832816


In [15]:
# Save the re-ranked result with the original column layout (index omitted).
# This is the same path the raw result was written to earlier.
ranked_path = NEAREST_DIR + 'nearest' + str(SYNTHESIS_ID) + '.csv'
n[nearest.columns].to_csv(ranked_path, index=False)