# Generate synpuf disclosure risk

Compare synthetic PUFs trained from a 10% sample, both to the training set and a 10% holdout. Synthetic file (1) is from synthimpute random forests; (2) is from the synthpop R package.

## Setup

### Imports

In [1]:
import pandas as pd
import numpy as np
import synthimpute as si

### Load data

In [2]:
# Read the synthetic file and the train/test files, in that order.
synth, train, test = [
    pd.read_csv(path)
    for path in ('~/Downloads/puf_synth_50p_sample.csv',
                 '~/Downloads/puf_50p_sample_train.csv',
                 '~/Downloads/puf_50p_sample_test.csv')
]
# synthpop = pd.read_csv('~/Downloads/synthpop_samp.csv')

## Preprocessing

Drop the calculated features used as seeds (`E00100` and `E09600`), and drop `S006`.

In [3]:
# Remove the same three columns from every file, in place.
for frame in (synth, train, test):
    frame.drop(['E00100', 'E09600', 'S006'], axis=1, inplace=True)

Check pending https://github.com/MaxGhenis/synthimpute/issues/8.

In [None]:
# synthpop = synthpop[synth.columns]

In [4]:
# Give every file a fresh 0..n-1 index, discarding the old one.
for frame in (synth, train, test):
    frame.reset_index(drop=True, inplace=True)
# synthpop.reset_index(drop=True, inplace=True)

In [5]:
# TODO: Do this in the file creation.
def add_subtracted_features(df):
    """Rebuild E00600 and E01500 from their stored differences, in place.

    Each total is the sum of a component column and a difference column:
    E00600 = E00650 + e00600_minus_e00650, and
    E01500 = E01700 + e01500_minus_e01700.
    The difference columns are dropped once the totals are in place.

    The function is safe to re-run: a total whose difference column is
    already gone (because an earlier call restored it) is left untouched.

    Args:
        df: DataFrame to modify in place.

    Returns:
        None. df is mutated.

    Raises:
        KeyError: If a total is absent and the columns needed to rebuild it
            are missing too.
    """
    # (total, component, stored difference): total = component + difference.
    specs = [('E00600', 'E00650', 'e00600_minus_e00650'),
             ('E01500', 'E01700', 'e01500_minus_e01700')]
    # Skip only the totals that a previous call has already restored.
    pending = [(total, component, diff) for total, component, diff in specs
               if diff in df.columns or total not in df.columns]
    # Compute every total before dropping anything, so a failure leaves the
    # difference columns intact.
    for total, component, diff in pending:
        df[total] = df[component] + df[diff]
    df.drop([diff for _, _, diff in pending], axis=1, inplace=True)
    
# Restore E00600 and E01500 on the train and test files (modified in place).
# NOTE(review): synth is not passed through this step - presumably it already
# carries E00600/E01500 rather than the difference columns; confirm.
add_subtracted_features(train)
add_subtracted_features(test)

## RF

Compare nearest standardized Euclidean distance.

In [6]:
from scipy.spatial.distance import cdist

def nearest_record1(XA1, XB):
    """Find the record in XB that lies closest to one record.

    Args:
        XA1: Series holding a single record.
        XB: DataFrame of candidate records.

    Returns:
        Series with two entries: dist, the smallest distance from XA1 to
        any row of XB (Euclidean, cdist's default metric), and id_B, the
        zero-based row position of that nearest row within XB.
    """
    # cdist wants 2-D input, so present the record as a one-row matrix.
    query = XA1.values[np.newaxis, :]
    distances = cdist(query, XB)[0]
    closest = distances.argmin()
    return pd.Series([distances.min(), closest], index=['dist', 'id_B'])

def nearest_record(XA, XB):
    """Get the nearest record in XB for each record in XA.

    Distances are Euclidean (cdist's default metric). They are computed in
    row chunks of XA, so the work is vectorized without ever holding the
    full len(XA) x len(XB) distance matrix in memory.

    Args:
        XA: DataFrame. Each record is matched against the nearest in XB.
        XB: DataFrame with the same columns, in the same order, as XA.

    Returns:
        DataFrame indexed like XA with columns id_A (XA's index labels),
        id_B (zero-based row position of the nearest record in XB), and
        dist (distance to that record). Each id_A maps to a single id_B.
        An empty XA yields an empty DataFrame with these three columns.

    Raises:
        ValueError: If XA has records but XB has none.
    """
    # Convert each frame once instead of once per row.
    A = np.asarray(XA, dtype=float)
    B = np.asarray(XB, dtype=float)
    n_A, n_B = A.shape[0], B.shape[0]
    if n_A > 0 and n_B == 0:
        raise ValueError('XB has no records to match against.')
    # Cap each chunk's distance matrix at roughly ten million entries.
    rows_per_chunk = max(1, 10000000 // max(1, n_B))
    id_B = np.empty(n_A, dtype=int)
    dist = np.empty(n_A, dtype=float)
    for start in range(0, n_A, rows_per_chunk):
        stop = start + rows_per_chunk
        block = cdist(A[start:stop], B)
        # argmin returns the first minimum, so ties go to the earliest row.
        id_B[start:stop] = block.argmin(axis=1)
        dist[start:stop] = block.min(axis=1)
    res = pd.DataFrame({'id_B': id_B, 'dist': dist}, index=XA.index)
    res['id_A'] = XA.index
    # Reorder columns.
    return res[['id_A', 'id_B', 'dist']]

In [None]:
# Unblocked search with the local helper: every synth record is compared
# against all of train.
nearest = nearest_record(synth, train)

In [7]:
# Columns whose value combinations define the blocks used by the blocked
# nearest-record searches below.
BLOCKS = ['MARS', 'DSI', 'XTOT']  # Reduce after optimizing.

In [8]:
# Bind the generic names used by the blocking cells that follow:
# XA is the synthetic file, XB the training file, block_vars the block columns.
XA, XB, block_vars = synth, train, BLOCKS

In [9]:
# Distinct block_vars combinations on each side.
A_blocks = XA.loc[:, block_vars].drop_duplicates()
B_blocks = XB.loc[:, block_vars].drop_duplicates()
# The inner merge keeps only the combinations present in both XA and XB.
blocks = pd.merge(A_blocks, B_blocks, on=block_vars)
n_blocks = len(blocks)
# Accumulates one result per block.
res = list()

In [10]:
blocks

Unnamed: 0,MARS,DSI,XTOT
0,2,0,2
1,2,0,3
2,1,0,1
3,2,0,5
4,2,0,4
5,4,0,3
6,4,0,2
7,4,0,1
8,1,1,0
9,1,0,2


In [11]:
# For each block shared by XA and XB, match only the records that fall in
# that block and append the per-block result to res. Progress is always
# printed; there is no verbose switch in this cell.
for index, row in blocks.iterrows():
    print('Running block {} of {}...'.format(index + 1, n_blocks))
    res.append(
        si.nearest_record_single(
            si.subset_from_row(XA, row),
            si.subset_from_row(XB, row)
        )
    )

Running block 1 of 15...


KeyboardInterrupt: 

In [13]:
# Blocked search through synthimpute; the recorded run of this cell was
# interrupted (KeyboardInterrupt) before it finished.
nr = si.nearest_record(synth, train, BLOCKS)

KeyboardInterrupt: 

In [12]:
# Preview of the synthimpute search that also takes test; the recorded run
# was interrupted (KeyboardInterrupt) after block 5 of 15.
si.nearest_record2(synth, train, test, BLOCKS).head()

Running block 1 of 15...
Running block 2 of 15...
Running block 3 of 15...
Running block 4 of 15...
Running block 5 of 15...


KeyboardInterrupt: 

In [10]:
# NOTE(review): the recorded run of this cell raised
# "ValueError: The truth value of a DataFrame is ambiguous". Here `test` sits
# in the third positional slot, where the earlier si.nearest_record call
# passed BLOCKS - confirm the intended signature before re-running.
si.nearest_record(synth, train, test, BLOCKS).head()

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [8]:
%%time
# Blocked nearest-record search for the RF synthetic file via synthimpute.
# The recorded log passes over the blocks twice - presumably once for train
# and once for test; confirm.
nearest = si.nearest_synth_train_test(synth, train, test, BLOCKS, metric='euclidean')

Running block 1 of 15...
Running block 2 of 15...
Running block 3 of 15...
Running block 4 of 15...
Running block 5 of 15...
Running block 6 of 15...
Running block 7 of 15...
Running block 8 of 15...
Running block 9 of 15...
Running block 10 of 15...
Running block 11 of 15...
Running block 12 of 15...
Running block 13 of 15...
Running block 14 of 15...
Running block 15 of 15...
Running block 1 of 15...
Running block 2 of 15...
Running block 3 of 15...
Running block 4 of 15...
Running block 5 of 15...
Running block 6 of 15...
Running block 7 of 15...
Running block 8 of 15...
Running block 9 of 15...
Running block 10 of 15...
Running block 11 of 15...
Running block 12 of 15...
Running block 13 of 15...
Running block 14 of 15...
Running block 15 of 15...
CPU times: user 1min 46s, sys: 27.7 s, total: 2min 13s
Wall time: 1min 49s


In [9]:
# Save the RF nearest-record results. With to_csv's defaults the index is
# written as the first column.
nearest.to_csv('~/Downloads/nearest_synpuf7.csv')

## synthpop

In [10]:
%%time
nearest2 = si.nearest_synth_train_test(synthpop, train, test, BLOCKS, metric='euclidean')

Running block 1 of 14...
Running block 2 of 14...
Running block 3 of 14...
Running block 4 of 14...
Running block 5 of 14...
Running block 6 of 14...
Running block 7 of 14...
Running block 8 of 14...
Running block 9 of 14...
Running block 10 of 14...
Running block 11 of 14...
Running block 12 of 14...
Running block 13 of 14...
Running block 14 of 14...
Running block 1 of 14...
Running block 2 of 14...
Running block 3 of 14...
Running block 4 of 14...
Running block 5 of 14...
Running block 6 of 14...
Running block 7 of 14...
Running block 8 of 14...
Running block 9 of 14...
Running block 10 of 14...
Running block 11 of 14...
Running block 12 of 14...
Running block 13 of 14...
Running block 14 of 14...
CPU times: user 1min 42s, sys: 22.9 s, total: 2min 5s
Wall time: 1min 42s


In [11]:
# Save the synthpop nearest-record results. With to_csv's defaults the index
# is written as the first column.
nearest2.to_csv('~/Downloads/nearest_synthpop_samp.csv')