# Linked Births and Deaths

The CDC provides a linkage between births and infant deaths here: https://www.cdc.gov/nchs/data_access/vitalstatsonline.htm#Downloadable

These data have columns very similar to those of the natality set, so they are important for understanding how infant fatalities are distributed with respect to other factors. There are exploratory publications (e.g., https://www.cdc.gov/nchs/products/databriefs/db285.htm) which focus largely on how the deaths are distributed among different demographic groups; something important to notice is that they "standardize" some of the results as "deaths per 1000 live births", which must mean that they combined the linked data with the natality data, knowledge that we can also distil by combining the two data sets. However, there are many other measured factors to consider.

**11)** How do the different factors in the linked data differ in distribution from the natality data itself? Could you create an objective "industry" or target audience in which understanding such differences could have value?

In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
def linked_Parser( line ):
    """Slice one fixed-width record into a Series of named string fields.

    Parameters
    ----------
    line : str
        One raw record. Columns are addressed with 0-based, half-open
        Python slices, so ``line[8:12]`` covers characters 9-12 of the
        record.
        NOTE(review): the offsets presumably follow the CDC linked
        birth / infant death file layout -- confirm against the user guide.

    Returns
    -------
    pandas.Series
        108 entries, indexed by field name in the order declared below.
        Every value is the raw substring (no stripping, no type
        conversion). Because slicing never raises, a line shorter than a
        field's offset yields ``''`` for that field.
    """
    # Function-local import; presumably kept so the parser is self-contained
    # when it is executed inside worker processes.
    import pandas as pd
    # NOTE: a few keys are misspelled ('steriods', 'pluarlity_imputed',
    # 'no_cong_anamolies_checked'). They are emitted as column names, so
    # they are deliberately left untouched to keep the output stable.
    ret_dict = dict(
        birth_year = line[8:12],
        birth_month = line[12:14],
        birth_time = line[18:22],
        birth_day_of_wk = line[22:23],
        birth_place = line[31:32],
        mothers_age_imputed = line[72:73],
        mothers_age = line[74:76],
        mothers_nativity = line[83:84],
        mothers_residence_status = line[103:104],
        mothers_race = line[104:106],
        mothers_race_imputed = line[110:111],
        mothers_hispanic_origin = line[111:112],
        mothers_hispanic_origin2 = line[116:117],
        paternity_acknow = line[118:119],
        mothers_marital_status = line[119:120],
        mothers_maristat_imputed = line[120:121],
        mothers_education = line[123:124],
        fathers_age = line[146:148],
        fathers_race = line[150:152],
        fathers_hispanic_origin = line[158:159],
        fathers_hispanic_origin2 = line[161:162],
        fathers_education = line[162:163],
        prior_living_births = line[170:172],
        prior_dead_births = line[172:174],
        prior_terminations = line[174:176],
        mo_since_last_live_birth = line[197:200],
        mo_since_last_other_birth = line[205:208],
        mo_prenatal_care_began = line[223:225],
        n_prenatal_visits = line[237:239],
        wic = line[250:251],
        cigs_tri1 = line[254:256],
        cigs_tri2 = line[256:258],
        cigs_tri3 = line[258:260],
        mothers_height = line[279:281],
        mothers_bmi = line[282:286],
        pre_preg_lbs = line[291:294],
        delivery_lbs = line[298:301],
        pre_preg_diab = line[312:313],
        gest_diab = line[313:314],
        pre_preg_hypten = line[314:315],
        gest_hypten = line[315:316],
        hypten_ecl = line[316:317],
        prev_preterm_birth = line[317:318],
        infertility_treatment = line[324:325],
        fertil_enhance = line[325:326],
        asst_repro_tech = line[326:327],
        n_prev_cesar = line[331:333],
        no_risk_reported = line[336:337],
        gonorrhea = line[342:343],
        syphilis = line[343:344],
        chlamydia = line[344:345],
        hepB = line[345:346],
        hepC = line[346:347],
        no_infection_reported = line[352:353],
        success_ext_cep = line[359:360],
        fail_ext_cep = line[360:361],
        induced_labor = line[382:383],
        aug_labor = line[383:384],
        steriods = line[384:385],
        antibiotics = line[385:386],
        chorioamnionitis = line[386:387],
        anesthesia = line[387:388],
        fetal_present_at_birth = line[400:401],
        final_delivery_method = line[401:402],
        trial_of_labor_attempt = line[402:403],
        maternal_transfusion = line[414:415],
        perineal_laceration = line[415:416],
        rupt_uterus = line[416:417],
        unplanned_hyster = line[417:418],
        admit_to_IC = line[418:419],
        attendant_at_birth = line[432:433],
        mother_transferred = line[433:434],
        delivery_payment_source = line[434:435],
        APGAR_score_5min = line[443:445],
        APGAR_score_10min = line[447:449],
        plurality = line[453:454],
        pluarlity_imputed = line[455:456],
        sex_of_infant = line[474:475],
        sex_of_infant_imputed = line[475:476],
        last_norm_menses_mo = line[476:478],
        last_norm_menses_yr = line[480:484],
        combined_gestation_imputed = line[487:488],
        obst_est_of_gestation_used = line[488:489],
        combined_gestation_wk = line[489:491],
        obst_est_edit_wk = line[498:500],
        birth_weight_gm = line[503:507],
        assist_vent_immed = line[516:517],
        assist_vent_after6 = line[517:518],
        admit_NICU = line[518:519],
        surfactant = line[519:520],
        antibiotics_for_newborn = line[520:521],
        seizures = line[521:522],
        anencephaly = line[536:537],
        meningo_spina_bif = line[537:538],
        cyn_cong_heart_disease = line[538:539],
        cong_diaph_hernia = line[539:540],
        omphalocele = line[540:541],
        gastroschisis = line[541:542],
        limb_reduc_defect = line[548:549],
        cleft_lip_or_palate = line[549:550],
        cleft_palate_only = line[550:551],
        down_syndr = line[551:552],
        suspect_chromo_disorder = line[552:553],
        hypospadias = line[553:554],
        no_cong_anamolies_checked = line[560:561],
        infant_transferred = line[566:567],
        infant_living_at_report = line[567:568],
        infant_breastfed_at_discharge = line[568:569]
    )
    return pd.Series( ret_dict )

# The helper functions in this file refer to the parser as `linkedBD_Parser`;
# expose that name too so those calls resolve instead of raising NameError.
linkedBD_Parser = linked_Parser

def __single_df( idx_line ):
    """Parse one ``(index, line)`` pair into a one-row DataFrame.

    Parameters
    ----------
    idx_line : tuple
        ``(idx, line)`` as produced by ``enumerate``; ``idx`` becomes the
        row label and ``line`` is the raw record handed to the parser.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``idx`` whose columns are the parsed field
        names.
    """
    import pandas as pd
    idx, line = idx_line
    # Wrap the Series in a list so it becomes one ROW with the field names as
    # columns. Passing the bare Series together with index=[idx] would
    # instead reindex the Series to the label `idx`, producing a 1x1 frame
    # holding NaN.
    # `linked_Parser` is the parser name actually defined in this file.
    return pd.DataFrame( [ linked_Parser( line ) ], index = [idx] )

def createNat2018DF( lines ):
    """Parse an iterable of raw records into one DataFrame, in parallel.

    Each line is paired with its position via ``enumerate`` and handed to
    ``__single_df`` in a multiprocessing pool; the per-line frames are then
    concatenated row-wise.

    Parameters
    ----------
    lines : iterable of str
        Raw records, one per element.

    Returns
    -------
    pandas.DataFrame
        The concatenation of the per-line frames, labelled by line
        position. An empty input returns an empty ``DataFrame`` (no rows, no
        columns) rather than letting ``pd.concat`` fail on an empty list.
    """
    import pandas as pd
    import multiprocessing as mp
    indexed_lines = list( enumerate( lines ) )
    if not indexed_lines:
        return pd.DataFrame()
    # Leave one core free, but never request zero workers: on a single-core
    # machine cpu_count() - 1 == 0 and Pool(0) raises ValueError.
    n_workers = max( 1, mp.cpu_count() - 1 )
    # The context manager tears the pool down even if a worker raises.
    with mp.Pool( n_workers ) as pool:
        row_frames = pool.map( __single_df, indexed_lines )
    return pd.concat( row_frames, axis = 0 )

def divvyLinked2017OverCSV( linked17fwf_fp,
                         nfiles = 10,
                         output_dir = None,
                         output_fprefix = None ):
    """Split a fixed-width file into ``nfiles`` CSV files, round-robin.

    Every input line is parsed with ``linked_Parser`` and written as one
    comma-joined row; line ``i`` (0-based) goes to output ``i % nfiles``.
    Each output starts with a header row of the parser's field names.
    Existing output files are overwritten.

    Parameters
    ----------
    linked17fwf_fp : str
        Path of the fixed-width input file.
    nfiles : int, default 10
        Number of CSV files to spread the rows over; must be at least 1.
    output_dir : str, optional
        Directory for the CSV files. Defaults to the absolute directory of
        ``linked17fwf_fp``.
    output_fprefix : str, optional
        Output file name prefix; files are named ``<prefix>_<k>.csv`` for
        ``k`` in ``1..nfiles``. Defaults to the base name of
        ``linked17fwf_fp``.

    Raises
    ------
    ValueError
        If ``nfiles`` is smaller than 1.
    """
    from contextlib import ExitStack
    from os.path import dirname, abspath, basename, join
    # Fail early with a clear message instead of a late modulo-by-zero.
    if nfiles < 1:
        raise ValueError( 'nfiles must be at least 1, got %r' % (nfiles,) )
    if output_dir is None:
        output_dir = abspath( dirname( linked17fwf_fp ) )
    if output_fprefix is None:
        output_fprefix = basename( linked17fwf_fp )
    csv_files = [ join( output_dir, output_fprefix ) + ('_%d.csv' % i)
                  for i in range( 1, nfiles + 1 ) ]
    # Parsing a blank dummy record yields the field names in output order.
    # `linked_Parser` is the parser name actually defined in this file.
    colnames = ','.join( linked_Parser( ' '*600 ).index )
    # ExitStack closes every handle opened so far, even if a later open() or
    # a write fails part-way through.
    with ExitStack() as stack:
        # Open the input first so a missing input does not truncate outputs.
        fin = stack.enter_context( open( linked17fwf_fp, 'r' ) )
        fps_conn = [ stack.enter_context( open( path, 'w' ) )
                     for path in csv_files ]
        for conn in fps_conn:
            conn.write( colnames + '\n' )
        for line_no, line in enumerate( fin ):
            # The parser always returns its fields in the same order as the
            # header above, so the values can be joined directly.
            row = ','.join( linked_Parser( line ) )
            fps_conn[ line_no % nfiles ].write( row + '\n' )