# Generate samples for disclosure risk survey

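Generate a semi-random sample of synthetic records together with their nearest training-record matches, for a survey asking respondents which matches seem most problematic from a disclosure-risk standpoint.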

## Setup

### Imports

In [1]:
import pandas as pd
import numpy as np
import synthimpute as si

### Load data

In [2]:
# Hard-coded, home-relative input locations.
PUF_SAMPLE_DIR = '~/Downloads/puf/'  # directory containing puf.csv (the training file)
SYN_DIR = '~/Downloads/syntheses/'  # directory containing the synpuf<ID>.csv files
NEAREST_DIR = '~/Downloads/nearest/'  # directory containing the nearest<ID>.csv files

In [3]:
# Training records; later passed as `train` to si.nearest_synth_train_records.
train = pd.read_csv(PUF_SAMPLE_DIR + 'puf.csv')

## Preprocessing

In [4]:
# TODO: Do this in the file creation.
def add_subtracted_features(df):
    """Rebuild `E00600` and `E01500` in place from their stored differences.

    The frame carries `e00600_minus_e00650` and `e01500_minus_e01700`.
    Each difference is added back to its base column (`E00650`, `E01700`)
    to produce the total, after which both difference columns are removed.

    Args:
        df: DataFrame holding the four columns named above. It is modified
            in place.

    Returns:
        None.
    """
    # (column to create, base column, stored difference column)
    rebuild_specs = (
        ('E00600', 'E00650', 'e00600_minus_e00650'),
        ('E01500', 'E01700', 'e01500_minus_e01700'),
    )
    for total, base, difference in rebuild_specs:
        df[total] = df[base] + df[difference]
    # The differences are redundant once the totals exist.
    df.drop(columns=[spec[2] for spec in rebuild_specs], inplace=True)
    
# Mutates `train` in place. Re-running this cell on the same frame fails,
# because the difference columns it reads have already been dropped.
add_subtracted_features(train)

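Align columns: drop `E00100`, `E09600`, `RECID`, and `S006` from `train`. Each synthetic file is later subset to `train.columns`, so this determines the columns used from both.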

In [5]:
# In-place removal; re-running this cell raises because the columns are already gone.
train.drop(columns=['E00100', 'E09600', 'RECID', 'S006'], inplace=True)

In [6]:
def nonzero_rows(df):
    """Drop the all-zero entries of a Series or the all-zero rows of a DataFrame.

    Args:
        df: A pandas Series or DataFrame.

    Returns:
        For a Series, the elements that are not equal to 0. For a DataFrame,
        the rows in which at least one column is not equal to 0. The index
        labels of the surviving entries are kept as they were.
    """
    keep = df != 0
    if not isinstance(df, pd.Series):
        # DataFrame: collapse the element-wise mask to one flag per row.
        keep = keep.any(axis=1)
    return df.loc[keep]

## Settings

In [7]:
# Bin edges handed to pd.cut for dist1/dist2/dist3. The resulting intervals are
# right-closed, so the first bin (-inf, 0] holds the zero-distance records.
BLOCK_BOUNDS = [-np.inf, 0, 0.01, 0.1, 1, 5, np.inf]

Minimum number of records a block (bin) must contain to be counted and sampled from.

In [8]:
# Minimum block size; block record counts are compared against it with `>= THRESHOLD`.
THRESHOLD = 5

## First synth

In [9]:
ID = 19
# The synthesis file and the nearest file share the same numeric suffix.
synth = pd.read_csv('{}synpuf{}.csv'.format(SYN_DIR, ID))
nearest = pd.read_csv('{}nearest{}.csv'.format(NEAREST_DIR, ID))
# Keep exactly the columns of `train`, in `train`'s column order.
synth = synth[train.columns]

Define blocks based on `dist1`, `dist2`, and `dist3`.

In [10]:
# Summary statistics of the three distance columns the blocks are built from.
dist_cols = ['dist1', 'dist2', 'dist3']
nearest[dist_cols].describe()

Unnamed: 0,dist1,dist2,dist3
count,818930.0,818930.0,818930.0
mean,0.975344,1.116985,1.197715
std,2.582479,2.995367,3.220225
min,0.0,0.0,0.0
25%,0.010162,0.018833,0.024078
50%,0.221509,0.290681,0.336787
75%,1.071412,1.263653,1.375725
max,178.274368,225.001829,234.371409


In [11]:
# Bin each distance column into the BLOCK_BOUNDS intervals, stored as <name>_block.
for dist_name in ['dist1', 'dist2', 'dist3']:
    nearest[dist_name + '_block'] = pd.cut(nearest[dist_name], BLOCK_BOUNDS)

In [12]:
# Record count for each (dist1, dist2, dist3) block combination.
block_sizes = nearest.groupby(['dist1_block', 'dist2_block', 'dist3_block']).size()
block_sizes

dist1_block  dist2_block  dist3_block
(-inf, 0.0]  (-inf, 0.0]  (-inf, 0.0]     47631
                          (0.0, 0.01]      8796
                          (0.01, 0.1]        48
                          (0.1, 1.0]          7
             (0.0, 0.01]  (0.0, 0.01]     14524
                          (0.01, 0.1]       578
                          (0.1, 1.0]         25
                          (1.0, 5.0]          2
             (0.01, 0.1]  (0.01, 0.1]       727
                          (0.1, 1.0]        120
                          (1.0, 5.0]          2
             (0.1, 1.0]   (0.1, 1.0]        178
                          (1.0, 5.0]         11
                          (5.0, inf]          1
             (1.0, 5.0]   (1.0, 5.0]         21
                          (5.0, inf]          2
             (5.0, inf]   (5.0, inf]          1
(0.0, 0.01]  (0.0, 0.01]  (0.0, 0.01]     94597
                          (0.01, 0.1]     10804
                          (0.1, 1.0]        525
  

In [13]:
# How many block combinations hold at least THRESHOLD records.
block_sizes = nearest.groupby(['dist1_block', 'dist2_block', 'dist3_block']).size()
block_sizes.ge(THRESHOLD).sum()

40

In [14]:
# Collects one DataFrame per sampled block, across all syntheses; concatenated
# in the Finalize section. Re-running this cell discards what was collected.
l = []

In [15]:
# For every (dist1, dist2, dist3) block combination holding at least THRESHOLD
# records, draw one row of `nearest` at random, fetch the records that
# si.nearest_synth_train_records returns for it (k=3), and append the labelled
# result to `l`.
# NOTE(review): `.sample(1)` is unseeded, so every run draws different rows;
# pass `random_state=` if the survey sample must be reproducible.
for block1 in nearest.dist1_block.unique():
    # Each mask is built once per level instead of once per innermost iteration.
    in_block1 = nearest.dist1_block == block1
    for block2 in nearest.dist2_block.unique():
        in_block12 = in_block1 & (nearest.dist2_block == block2)
        for block3 in nearest.dist3_block.unique():
            candidates = nearest[in_block12 & (nearest.dist3_block == block3)]
            # Same minimum block size as the `>= THRESHOLD` count above and as
            # the other synthesis; previously any non-empty block was sampled.
            if candidates.shape[0] < THRESHOLD:
                continue
            picked = candidates.sample(1)
            records = si.nearest_synth_train_records(
                picked, synth, train,
                k=3, verbose=False, label_distance=False).astype(int)
            # nonzero_rows returns a `.loc` slice; copy it so the column
            # assignments below act on an independent object (avoids pandas'
            # SettingWithCopyWarning).
            records = nonzero_rows(records).copy()
            records['model_id'] = ID
            records['synth_id'] = picked.iloc[0].id_A
            records['dist1_block'] = block1
            records['dist2_block'] = block2
            records['dist3_block'] = block3
            l.append(records)

## Second synth

In [16]:
ID = 20
# The synthesis file and the nearest file share the same numeric suffix.
synth = pd.read_csv('{}synpuf{}.csv'.format(SYN_DIR, ID))
nearest = pd.read_csv('{}nearest{}.csv'.format(NEAREST_DIR, ID))
# Keep exactly the columns of `train`, in `train`'s column order.
synth = synth[train.columns]

In [17]:
# Summary statistics of the three distance columns the blocks are built from.
dist_cols = ['dist1', 'dist2', 'dist3']
nearest[dist_cols].describe()

Unnamed: 0,dist1,dist2,dist3
count,818930.0,818930.0,818930.0
mean,0.967648,1.134613,1.21759
std,2.814877,3.25581,3.480952
min,0.0,0.0,0.0
25%,0.000495,0.010425,0.015152
50%,0.147468,0.234045,0.277552
75%,0.969709,1.196888,1.316261
max,228.548483,231.076655,237.43558


In [18]:
# Bin each distance column into the BLOCK_BOUNDS intervals, stored as <name>_block.
# Blocks are defined on the raw distances, exactly as for the first synthesis.
for dist_name in ['dist1', 'dist2', 'dist3']:
    nearest[dist_name + '_block'] = pd.cut(nearest[dist_name], BLOCK_BOUNDS)

In [19]:
# Record count for each (dist1, dist2, dist3) block combination.
block_sizes = nearest.groupby(['dist1_block', 'dist2_block', 'dist3_block']).size()
block_sizes

dist1_block  dist2_block  dist3_block
(-inf, 0.0]  (-inf, 0.0]  (-inf, 0.0]     50823
                          (0.0, 0.01]     14565
                          (0.01, 0.1]       184
                          (0.1, 1.0]         42
                          (1.0, 5.0]          6
             (0.0, 0.01]  (0.0, 0.01]     41280
                          (0.01, 0.1]      3962
                          (0.1, 1.0]        498
                          (1.0, 5.0]         22
             (0.01, 0.1]  (0.01, 0.1]     11853
                          (0.1, 1.0]       2188
                          (1.0, 5.0]         39
             (0.1, 1.0]   (0.1, 1.0]       6468
                          (1.0, 5.0]        717
                          (5.0, inf]          1
             (1.0, 5.0]   (1.0, 5.0]       1506
                          (5.0, inf]         24
             (5.0, inf]   (5.0, inf]         25
(0.0, 0.01]  (0.0, 0.01]  (0.0, 0.01]     81606
                          (0.01, 0.1]      9771
  

In [20]:
# How many block combinations hold at least THRESHOLD records.
block_sizes = nearest.groupby(['dist1_block', 'dist2_block', 'dist3_block']).size()
block_sizes.ge(THRESHOLD).sum()

46

In [21]:
# Walk every (dist1, dist2, dist3) block combination; from each one holding at
# least THRESHOLD records, draw a single row of `nearest`, look up its records
# via si.nearest_synth_train_records (k=3), label them, and append them to `l`.
# NOTE(review): the draw is unseeded, so results differ between runs.
for d1 in nearest.dist1_block.unique():
    for d2 in nearest.dist2_block.unique():
        for d3 in nearest.dist3_block.unique():
            in_block = (
                (nearest.dist1_block == d1)
                & (nearest.dist2_block == d2)
                & (nearest.dist3_block == d3)
            )
            block_rows = nearest[in_block]
            if len(block_rows) < THRESHOLD:
                # Too few records in this combination to sample from.
                continue
            draw = block_rows.sample(1)
            matches = si.nearest_synth_train_records(
                draw, synth, train,
                k=3, verbose=False, label_distance=False)
            matches = nonzero_rows(matches.astype(int))
            # Label the result with the synthesis, the sampled row, and its block.
            matches['model_id'] = ID
            matches['synth_id'] = draw.iloc[0].id_A
            matches['dist1_block'] = d1
            matches['dist2_block'] = d2
            matches['dist3_block'] = d3
            l.append(matches)

## Finalize

In [22]:
# Stack the per-block frames collected in `l` into one frame.
dat = pd.concat(l)
# Written with pandas' defaults, so the index is included as the first CSV column;
# `ignore_index` was not set above, so each piece keeps its own index labels.
dat.to_csv('~/Downloads/disclosure_survey_samples.csv')