# Synthesis 

## Setup

In [1]:
from pathlib import Path

import pandas as pd
import synpuf





In [2]:
# Source CSV read in the Load section.
INFILE = "~/puf2011.csv"
# Destination CSV written in the Export section.
OUTFILE = "~/Downloads/syntheses/synpuf20.csv"

In [3]:
# Columns handed to the synthesizer as `seed_cols`; they are also kept in the
# training frame next to the COL_ORDER columns.
SEED_COLS = [
    'MARS', 'S006', 'E00100', 'E04600',
    'P04470', 'E04800', 'E62100', 'E05800',
    'E08800', 'E59560', 'E26190',
]

In [4]:
# RECID values 999996 through 999999; rows with these ids are removed from the
# training data in the Load section (presumably aggregate records - confirm).
AGG_RECIDS = list(range(999996, 1000000))

In [5]:
# Columns to synthesize, passed as `synth_cols`. The list order matters: it is
# the order in which features are reported during synthesis.
# The two lowercase `*_minus_*` entries are difference columns created in the
# Load section.
COL_ORDER = [
    'XTOT', 'EIC', 'F6251', 'F2441', 'DSI',
    'MIDR', 'N24', 'FDED', 'E00200', 'E01700',
    'e01500_minus_e01700', 'E01500', 'E02400', 'E02000', 'E26270',
    'E19200', 'P23250', 'E00900', 'E18400', 'E01400',
    'e00600_minus_e00650', 'E00600', 'E00650', 'E18500', 'E19800',
    'E17500', 'E20400', 'E00300', 'E02300', 'E00400',
    'E20100', 'E09600', 'E87521', 'E00700', 'E03270',
    'E03300', 'E32800', 'E87530', 'E07300', 'E62900',
    'E24515', 'E03150', 'E03210', 'E03240', 'E00800',
    'E03500', 'E09900', 'E27200', 'E03230', 'E24518',
    'E03290', 'E07400', 'E58990', 'E01100', 'E11200',
    'E07260', 'E07240', 'E03220', 'E07600', 'E03400',
    'P08000', 'E09700', 'E09800', 'E02100', 'P22250',
    'E01200',
]

In [6]:
# Columns written to the output file (RECID is appended at export time).
# Kept in sorted order. MARS and S006 come from SEED_COLS; the rest come from
# COL_ORDER, minus the two helper `*_minus_*` difference columns.
# NOTE(review): E09600 is listed in COL_ORDER but not here, so it is
# synthesized but never exported - confirm that this is intentional.
COLS = [
    'DSI',
    'E00200', 'E00300', 'E00400', 'E00600', 'E00650', 'E00700', 'E00800', 'E00900',
    'E01100', 'E01200', 'E01400', 'E01500', 'E01700',
    'E02000', 'E02100', 'E02300', 'E02400',
    'E03150', 'E03210', 'E03220', 'E03230', 'E03240',
    'E03270', 'E03290', 'E03300', 'E03400', 'E03500',
    'E07240', 'E07260', 'E07300', 'E07400', 'E07600',
    'E09700', 'E09800', 'E09900',
    'E11200', 'E17500', 'E18400', 'E18500', 'E19200', 'E19800',
    'E20100', 'E20400', 'E24515', 'E24518', 'E26270', 'E27200',
    'E32800', 'E58990', 'E62900', 'E87521', 'E87530',
    'EIC', 'F2441', 'F6251', 'FDED', 'MARS', 'MIDR', 'N24',
    'P08000', 'P22250', 'P23250', 'S006', 'XTOT',
]

## Load

In [7]:
# Tax-Calculator needs e00600 >= e00650 and e01500 >= e01700 (see
# https://github.com/donboyd5/synpuf/issues/17), so the difference of each
# pair is added as its own column for the synthesis step.
# Rows whose RECID is listed in AGG_RECIDS are then dropped, and only the
# seed and synthesis columns are kept.
train = (
    pd.read_csv(INFILE)
    .assign(
        e00600_minus_e00650=lambda puf: puf['E00600'] - puf['E00650'],
        e01500_minus_e01700=lambda puf: puf['E01500'] - puf['E01700'],
    )
    .loc[lambda puf: ~puf['RECID'].isin(AGG_RECIDS), SEED_COLS + COL_ORDER]
)

## Synthesize

In [8]:
%%time
synth = synpuf.synthesize_puf_rf(
    train, seed_cols=SEED_COLS, synth_cols=COL_ORDER,
    trees=200, n=train.shape[0]*5  # Generate more records.
)

Synthesizing feature 1 of 66: XTOT...
Synthesizing feature 2 of 66: EIC...
Synthesizing feature 3 of 66: F6251...
Synthesizing feature 4 of 66: F2441...
Synthesizing feature 5 of 66: DSI...
Synthesizing feature 6 of 66: MIDR...
Synthesizing feature 7 of 66: N24...
Synthesizing feature 8 of 66: FDED...
Synthesizing feature 9 of 66: E00200...
Synthesizing feature 10 of 66: E01700...
Synthesizing feature 11 of 66: e01500_minus_e01700...
Synthesizing feature 12 of 66: E01500...
Synthesizing feature 13 of 66: E02400...
Synthesizing feature 14 of 66: E02000...
Synthesizing feature 15 of 66: E26270...
Synthesizing feature 16 of 66: E19200...
Synthesizing feature 17 of 66: P23250...
Synthesizing feature 18 of 66: E00900...
Synthesizing feature 19 of 66: E18400...
Synthesizing feature 20 of 66: E01400...
Synthesizing feature 21 of 66: e00600_minus_e00650...
Synthesizing feature 22 of 66: E00600...
Synthesizing feature 23 of 66: E00650...
Synthesizing feature 24 of 66: E18500...
Synthesizing fea

## Export

In [9]:
# Number the synthetic records 1..n by position. Using `synth.index + 1`
# gives the same values only when the frame has a default 0..n-1 index; if the
# synthesizer returns any other index (repeated or non-numeric labels), that
# form would produce duplicate or invalid RECIDs.
synth['RECID'] = range(1, len(synth) + 1)

In [10]:
# Create the destination folder before writing: to_csv does not create missing
# directories, and a failure here would only surface after the long synthesis
# run has finished.
out_path = Path(OUTFILE).expanduser()
out_path.parent.mkdir(parents=True, exist_ok=True)
# Write only the export columns plus RECID, without the DataFrame index.
synth[COLS + ['RECID']].to_csv(out_path, index=False)