# Generate samples for disclosure risk survey

Generate a semi-random sample of synthetic records and their nearest training-record matches, for a survey asking respondents which matches seem most problematic.

## Setup

### Imports

In [1]:
import pandas as pd
import numpy as np
import synthimpute as si

### Load data

In [2]:
# Local directories for the CSV inputs read below: the PUF sample, the
# synthesized files, and the nearest-record files.
PUF_SAMPLE_DIR = '~/Downloads/puf/'
SYN_DIR = '~/Downloads/syntheses/'
NEAREST_DIR = '~/Downloads/nearest/'

In [3]:
# Load the PUF sample; it is used as the `train` set in the rest of the notebook.
train = pd.read_csv(PUF_SAMPLE_DIR + 'puf.csv')

## Preprocessing

In [4]:
# TODO: Do this in the file creation.
def add_subtracted_features(df):
    """Rebuild E00600 and E01500 in place from their stored differences.

    Each total is the sum of a base column and a "<total>_minus_<base>"
    difference column. After both totals are written, the two difference
    columns are removed from *df*.

    Args:
        df: DataFrame holding E00650, E01700, e00600_minus_e00650 and
            e01500_minus_e01700. Modified in place.

    Returns:
        None.
    """
    # (total to create, base column, difference column)
    restorations = (
        ('E00600', 'E00650', 'e00600_minus_e00650'),
        ('E01500', 'E01700', 'e01500_minus_e01700'),
    )
    for total, base, difference in restorations:
        df[total] = df[base] + df[difference]
    df.drop(columns=[difference for _, _, difference in restorations],
            inplace=True)
    
# Restore E00600 and E01500 on the training data (modifies train in place).
add_subtracted_features(train)

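Align columns: drop four columns from `train`; each synthetic file is later subset to `train`'s remaining columns.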

In [5]:
# Remove four columns from train in place; each synthesis is later
# restricted to whatever columns remain in train.
train.drop(columns=['E00100', 'E09600', 'RECID', 'S006'], inplace=True)

In [6]:
def nonzero_rows(df):
    """Drop the entries of *df* that are entirely zero.

    Args:
        df: A pandas Series or DataFrame.

    Returns:
        For a Series, the elements that differ from 0. For a DataFrame, the
        rows in which at least one column differs from 0.
    """
    keep = df != 0
    if not isinstance(df, pd.Series):
        # Collapse the element-wise mask to a single flag per row.
        keep = keep.any(axis=1)
        return df.loc[keep]
    return df[keep]

## Settings

In [7]:
# Bin edges handed to pd.cut in add_blocks. The printed block labels show
# right-closed intervals, so the first bin, (-inf, 0], holds distances of
# exactly zero (and any negative values).
BLOCK_BOUNDS = [-np.inf, 0, 0.01, 0.1, 1, 5, np.inf]

How many records must a bin have, at minimum, to be sampled from?

In [8]:
# Minimum row count a (dist1_block, dist2_3_block) combination needs before
# add_cases draws a sample from it.
THRESHOLD = 1

## First synth

In [9]:
# Load synthesis number ID and its nearest-record file, then give synth the
# same columns, in the same order, as train. ID, synth and nearest are read
# as globals by the cells that follow.
ID = 19
synth = pd.read_csv(SYN_DIR + 'synpuf' + str(ID) + '.csv')
nearest = pd.read_csv(NEAREST_DIR + 'nearest' + str(ID) + '.csv')
synth = synth[train.columns]

Define blocks based on `dist1` and `dist2_3` (the mean of `dist2` and `dist3`).

In [10]:
def add_blocks(nearest):
    """Add distance-block columns to *nearest* in place.

    Three columns are written:
      * dist2_3: row-wise mean of dist2 and dist3.
      * dist1_block: dist1 binned with pd.cut using BLOCK_BOUNDS.
      * dist2_3_block: dist2_3 binned the same way.

    Args:
        nearest: DataFrame with dist1, dist2 and dist3 columns.

    Returns:
        None.
    """
    def to_block(distances):
        # Both block columns share the same bin edges.
        return pd.cut(distances, bins=BLOCK_BOUNDS)

    second_and_third = nearest[['dist2', 'dist3']]
    nearest['dist2_3'] = second_and_third.mean(axis=1)
    nearest['dist1_block'] = to_block(nearest['dist1'])
    nearest['dist2_3_block'] = to_block(nearest['dist2_3'])

In [11]:
# Adds dist2_3, dist1_block and dist2_3_block to nearest in place.
add_blocks(nearest)

In [12]:
# Row count per (dist1_block, dist2_3_block) combination. Both groupers are
# categoricals produced by pd.cut, so observed=True is passed explicitly:
# only combinations that actually occur are listed, independent of the
# pandas version's default for categorical groupers.
nearest.groupby(['dist1_block', 'dist2_3_block'], observed=True).size()

dist1_block  dist2_3_block
(-inf, 0.0]  (-inf, 0.0]       47631
             (0.0, 0.01]       23606
             (0.01, 0.1]        1120
             (0.1, 1.0]          290
             (1.0, 5.0]           24
             (5.0, inf]            3
(0.0, 0.01]  (0.0, 0.01]       98865
             (0.01, 0.1]       27598
             (0.1, 1.0]         4450
             (1.0, 5.0]          565
             (5.0, inf]            1
(0.01, 0.1]  (0.01, 0.1]      102066
             (0.1, 1.0]        29151
             (1.0, 5.0]          603
             (5.0, inf]            9
(0.1, 1.0]   (0.1, 1.0]       232811
             (1.0, 5.0]        35038
             (5.0, inf]           30
(1.0, 5.0]   (1.0, 5.0]       181692
             (5.0, inf]         6255
(5.0, inf]   (5.0, inf]        27122
dtype: int64

In [13]:
# Number of block combinations with at least THRESHOLD rows, i.e. how many
# cases add_cases will draw from this synthesis. observed=True restricts the
# count to combinations that occur in the data (the groupers are
# categoricals), so empty combinations can never be counted.
(nearest.groupby(['dist1_block', 'dist2_3_block'], observed=True).size()
 >= THRESHOLD).sum()

21

In [14]:
# Collects one DataFrame per sampled case across all syntheses; filled by
# add_cases and concatenated in the Finalize section.
l = []

In [15]:
def add_cases(l, nearest):
    """Append one randomly sampled case per populated block pair to *l*.

    Every combination of the distinct dist1_block and dist2_3_block values in
    *nearest* is visited. For each combination holding at least THRESHOLD
    rows, one row is drawn at random and passed, with the notebook globals
    synth and train, to si.nearest_synth_train_records (k=3). The result is
    cast to int, filtered through nonzero_rows, tagged with the global model
    ID, the sampled row's id_A and the two block labels, and appended to *l*.

    Args:
        l: List collecting the per-case DataFrames. Mutated in place.
        nearest: DataFrame with dist1_block, dist2_3_block and id_A columns.

    Returns:
        None.

    NOTE(review): synth, train, ID and THRESHOLD are read from the notebook's
    global namespace at call time, so the matching load cell must have been
    run immediately before this is called.
    """
    first_blocks = nearest.dist1_block.unique()
    second_blocks = nearest.dist2_3_block.unique()
    for first_block in first_blocks:
        in_first_block = nearest.dist1_block == first_block
        for second_block in second_blocks:
            in_second_block = nearest.dist2_3_block == second_block
            candidates = nearest[in_first_block & in_second_block]
            if candidates.shape[0] < THRESHOLD:
                # Too few rows in this combination to sample from.
                continue
            picked = candidates.sample(1)
            records = si.nearest_synth_train_records(
                picked, synth, train,
                k=3, verbose=False, label_distance=False)
            case = nonzero_rows(records.astype(int))
            case['model_id'] = ID
            case['synth_id'] = picked.iloc[0].id_A
            case['dist1_block'] = first_block
            case['dist2_3_block'] = second_block
            l.append(case)

In [16]:
# Sample cases for the synthesis loaded above (uses the current ID, synth and
# train globals) and append them to l.
add_cases(l, nearest)

## Second synth

In [17]:
# Rebind ID, synth and nearest to the second synthesis, then give synth the
# same columns, in the same order, as train. add_cases reads these globals,
# so this cell must run before the second add_cases call.
ID = 20
synth = pd.read_csv(SYN_DIR + 'synpuf' + str(ID) + '.csv')
nearest = pd.read_csv(NEAREST_DIR + 'nearest' + str(ID) + '.csv')
synth = synth[train.columns]

In [18]:
# Adds dist2_3, dist1_block and dist2_3_block to the second nearest table in place.
add_blocks(nearest)

In [19]:
# Row count per (dist1_block, dist2_3_block) combination for the second
# synthesis. Both groupers are categoricals produced by pd.cut, so
# observed=True is passed explicitly: only combinations that actually occur
# are listed, independent of the pandas version's default.
nearest.groupby(['dist1_block', 'dist2_3_block'], observed=True).size()

dist1_block  dist2_3_block
(-inf, 0.0]  (-inf, 0.0]       50823
             (0.0, 0.01]       57454
             (0.01, 0.1]       15168
             (0.1, 1.0]         8912
             (1.0, 5.0]         1805
             (5.0, inf]           41
(0.0, 0.01]  (0.0, 0.01]       85442
             (0.01, 0.1]       32563
             (0.1, 1.0]         8355
             (1.0, 5.0]         1373
             (5.0, inf]           23
(0.01, 0.1]  (0.01, 0.1]       83494
             (0.1, 1.0]        32087
             (1.0, 5.0]         1658
             (5.0, inf]           30
(0.1, 1.0]   (0.1, 1.0]       204677
             (1.0, 5.0]        34195
             (5.0, inf]          135
(1.0, 5.0]   (1.0, 5.0]       163846
             (5.0, inf]         6805
(5.0, inf]   (5.0, inf]        30044
dtype: int64

In [20]:
# Number of block combinations with at least THRESHOLD rows in the second
# synthesis. observed=True restricts the count to combinations that occur in
# the data (the groupers are categoricals), so empty combinations can never
# be counted.
(nearest.groupby(['dist1_block', 'dist2_3_block'], observed=True).size()
 >= THRESHOLD).sum()

21

In [21]:
# Sample cases for the second synthesis and append them to the same list l.
add_cases(l, nearest)

## Finalize

Create single `DataFrame` from list of `DataFrame`s.

In [22]:
# Stack the per-case frames row-wise; each frame's own index labels are kept,
# so index values repeat across cases.
dat = pd.concat(l)

Add a random identifier: build a combined model/synth id, then map each distinct value to a random masked id.

In [23]:
# Combine model_id and synth_id into a single key, with model_id shifted into
# the billions place. The 1e9 literal is a float, so the resulting column is
# float-valued.
# NOTE(review): uniqueness of the key assumes synth_id < 1e9 - confirm.
dat['model_synth_id'] = dat.model_id * 1e9 + dat.synth_id

In [24]:
# Distinct combined ids, in order of first appearance, and how many there are.
model_synth_ids = dat['model_synth_id'].unique()
n_model_synth_ids = len(model_synth_ids)

In [25]:
# Pair each distinct model_synth_id with a distinct integer drawn from a
# random permutation of 0 .. n_model_synth_ids - 1.
model_synth_id_map = pd.DataFrame({
    'model_synth_id': model_synth_ids,
    'masked_model_synth_id': np.random.permutation(n_model_synth_ids),
})

In [26]:
# Attach the masked id to every row of dat. A column-on-column merge ignores
# both frames' indexes and returns a fresh RangeIndex, which would throw away
# the index labels dat inherited from the per-case frames before export.
# Looking the value up by key keeps dat's index and row order intact and adds
# the same masked_model_synth_id column as the last column.
# NOTE(review): presumably dat's index carries meaningful labels from
# si.nearest_synth_train_records - confirm against that function's output.
_masked_id_by_key = model_synth_id_map.set_index(
    'model_synth_id')['masked_model_synth_id']
# .values avoids index alignment, which would fail on dat's repeated labels.
dat['masked_model_synth_id'] = dat['model_synth_id'].map(
    _masked_id_by_key).values

Export

In [27]:
# Write the survey sample to disk. to_csv is called with its defaults, so
# dat's index is written as the first (unnamed) column.
dat.to_csv('~/Downloads/disclosure_survey_samples.csv')