# Remove disclosures from synpuf20

## Setup

### Imports

In [1]:
import pandas as pd
import numpy as np
import synthimpute as si

### Load data

In [2]:
# Numeric suffix of the synthesis to process; it selects which
# synpuf<ID>.csv and nearest<ID>.csv files are read below.
ID = 20

In [3]:
# Directory that holds the training sample (train100.csv).
PUF_SAMPLE_DIR = '~/Downloads/puf/'
# Directory that holds the synthetic files (synpuf<ID>.csv).
SYN_DIR = '~/Downloads/syntheses/'
# Directory that holds the nearest-record tables (nearest<ID>.csv).
NEAREST_DIR = '~/Downloads/'

In [4]:
# Read the three inputs: the synthetic file and its nearest-record table
# (both selected by ID), plus the training sample.
synth = pd.read_csv(f'{SYN_DIR}synpuf{ID}.csv')
nearest = pd.read_csv(f'{NEAREST_DIR}nearest{ID}.csv')
train = pd.read_csv(f'{PUF_SAMPLE_DIR}train100.csv')

In [5]:
# Show the first synthetic record to check which columns were loaded.
synth.head(1)

Unnamed: 0,DSI,E00200,E00300,E00400,E00600,E00650,E00700,E00800,E00900,E01100,...,FDED,MARS,MIDR,N24,P08000,P22250,P23250,S006,XTOT,RECID
0,0.0,176500.0,4620.0,0.0,190.0,190.0,230.0,0.0,119589.0,0.0,...,1.0,2,0.0,1.0,0.0,0.0,0.0,1015,3.0,1


In [6]:
# Preview the nearest-record table: each row pairs an id_A with three
# (id_B*, dist*) columns.
nearest.head()

Unnamed: 0,id_A,id_B1,dist1,id_B2,dist2,id_B3,dist3
0,0,98408,0.221333,113108,0.298573,135299,0.36212
1,1,113322,0.050151,90265,0.062779,89516,0.064895
2,2,51274,3.824837,131622,3.911574,50354,3.918281
3,3,160852,0.66309,121766,0.729695,103645,0.77467
4,4,81262,0.334493,37695,0.48846,95821,0.532224


In [7]:
# Select the id_A of every row whose first distance is exactly 0 while the
# second distance is strictly positive.
# NOTE(review): presumably this means the synthetic record duplicates exactly
# one training record -- confirm against whatever produced the nearest file.
disclosures = nearest.loc[
    nearest['dist1'].eq(0) & nearest['dist2'].gt(0),
    'id_A',
]

Share of records to drop.

In [8]:
# Fraction of rows in the nearest-record table that were flagged.
len(disclosures) / len(nearest)

0.08374708461040627

In [9]:
# Remove the flagged rows. The drop matches on index labels, so this relies on
# the index of `synth` lining up with the id_A values in `disclosures`.
synth_no_disclosures = synth.drop(labels=disclosures, axis='index')

The share of rows actually dropped should equal the share computed above.

In [10]:
# Fraction of synthetic rows removed by the drop.
1 - len(synth_no_disclosures) / len(synth)

0.08374708461040625

## Export

In [11]:
# Write the filtered synthesis without the pandas index. The file name is
# built from ID so it always matches the synthesis that was loaded, instead of
# being pinned to 20 (for ID = 20 the path is unchanged).
synth_no_disclosures.to_csv(
    '~/Downloads/synpuf' + str(ID) + '_no_disclosures.csv',
    index=False)

## Checks

In [12]:
def nonzero_rows(df):
    """Return only the nonzero part of a pandas object.

    Args:
        df: A pandas Series or DataFrame.

    Returns:
        For a Series, the entries that are not equal to zero. Otherwise
        (a DataFrame), the rows containing at least one entry that is not
        equal to zero, with all of their columns.
    """
    is_nonzero = df != 0
    if not isinstance(df, pd.Series):
        # Row-wise test: a row is kept as soon as any column is nonzero.
        return df.loc[is_nonzero.any(axis=1)]
    return df[is_nonzero]

Problematic record from earlier checks.

In [13]:
# Nonzero fields of the synthetic row at position 522157, the record flagged
# as problematic in earlier checks.
nonzero_rows(synth.iloc[522157])

E00200     25300.0
E09800       790.0
E87521      2500.0
FDED           2.0
MARS           4.0
N24            2.0
S006      145165.0
XTOT           3.0
RECID     522158.0
Name: 522157, dtype: float64

In [14]:
# Nonzero fields of the training row at position 62701, for comparison with
# the synthetic record.
# NOTE(review): presumably this is that record's closest training match --
# confirm against `nearest`.
nonzero_rows(train.iloc[62701])

FDED           2.0
MARS           4.0
N24            2.0
XTOT           3.0
E00200     25300.0
E00100     25300.0
E09800       790.0
E87521      2500.0
RECID     139060.0
S006      145165.0
Name: 62701, dtype: float64

Second-closest training record.

In [15]:
# Nonzero fields of the training row at position 48383 (the 2nd-closest
# record).
nonzero_rows(train.iloc[48383])

EIC            1.0
FDED           2.0
MARS           2.0
N24            1.0
XTOT           4.0
E00200     12100.0
E02000    -11000.0
E00100      1090.0
E09800       680.0
E26270    -11000.0
RECID     105703.0
S006      144809.0
Name: 48383, dtype: float64

In [16]:
# Search the retained synthetic rows for the flagged record's combination of
# values; the result shows whether any such row is still present.
synth_no_disclosures.loc[
    synth_no_disclosures['E00200'].eq(25300)
    & synth_no_disclosures['E09800'].eq(790)
    & synth_no_disclosures['E87521'].eq(2500)
    & synth_no_disclosures['FDED'].eq(2)
    & synth_no_disclosures['MARS'].eq(4)
    & synth_no_disclosures['N24'].eq(2)
    & synth_no_disclosures['XTOT'].eq(3)
]

Unnamed: 0,DSI,E00200,E00300,E00400,E00600,E00650,E00700,E00800,E00900,E01100,...,FDED,MARS,MIDR,N24,P08000,P22250,P23250,S006,XTOT,RECID


A single record in the training set has these characteristics.

In [18]:
# Search the training sample for the same combination of values to see how
# many training rows carry it.
train.loc[
    train['E00200'].eq(25300)
    & train['E09800'].eq(790)
    & train['E87521'].eq(2500)
    & train['FDED'].eq(2)
    & train['MARS'].eq(4)
    & train['N24'].eq(2)
    & train['XTOT'].eq(3)
]

Unnamed: 0,DSI,EIC,FDED,F2441,F6251,MARS,MIDR,N24,XTOT,E00200,...,E27200,E32800,E58990,E62900,E87521,E87530,RECID,S006,e00600_minus_e00650,e01500_minus_e01700
62701,0,0,2,0,0,4,0,2,3,25300.0,...,0.0,0,0.0,0.0,2500.0,0,139060,145165,0.0,0.0


In [17]:
# Count how many flagged ids equal 522157, the row inspected earlier.
disclosures.eq(522157).sum()

1