# Linked Births and Deaths

The CDC provides linked birth and infant-death records here: https://www.cdc.gov/nchs/data_access/vitalstatsonline.htm#Downloadable

These data share most of their columns with the natality set, which makes them useful for understanding how infant fatalities are distributed with respect to other factors. There are exploratory publications (e.g., https://www.cdc.gov/nchs/products/databriefs/db285.htm) that focus largely on how the deaths are distributed among different demographic groups; something important to notice is that they "standardize" some of the results as "deaths per 1000 live births", which must mean that they combined the death counts with live-birth counts, something we can also distil by combining the linked data with the natality data. However, there are many other measured factors to consider.

**11)** How do the different factors in the linked data differ in distribution from the natality data itself? Could you create an objective "industry" or target audience in which understanding such differences could have value?

In [13]:
import pandas as pd
import numpy as np
import sklearn

# Lift pandas' display truncation: None means "no limit", so every column
# and every row is rendered when a frame is shown.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Large Data File Parser

In [2]:
def linked_Parser( line ):
    """Cut one fixed-width record into named fields.

    Every field is the raw substring ``line[start:stop]`` (0-based,
    end-exclusive). Nothing is stripped or converted, and a line that is
    shorter than a field's offsets simply yields a truncated or empty string
    for that field.

    Returns a ``pandas.Series`` of strings indexed by field name, in the
    order of the layout table below.
    """
    import pandas as pd
    # (field name, start offset, stop offset)
    layout = (
        ('birth_year', 8, 12),
        ('birth_month', 12, 14),
        ('birth_day_of_wk', 22, 23),
        ('birth_place', 31, 32),
        ('mothers_age', 74, 76),
        ('mothers_age_recode', 78, 79),
        ('mothers_nativity', 83, 84),
        ('mothers_race', 106, 107),
        ('mothers_hispanic_origin', 114, 115),
        ('paternity_acknow', 118, 119),
        ('mothers_marital_status', 119, 120),
        ('mothers_education', 123, 124),
        ('fathers_age', 148, 150),
        ('fathers_race', 152, 153),
        ('fathers_hispanic_origin', 159, 160),
        ('fathers_education', 162, 163),
        ('prior_living_births', 170, 172),
        ('prior_dead_births', 172, 174),
        ('prior_terminations', 174, 176),
        ('mo_since_last_live_birth', 200, 202),
        ('mo_since_last_other_birth', 208, 210),
        ('mo_prenatal_care_began', 226, 227),
        ('n_prenatal_visits', 237, 239),
        ('wic', 250, 251),
        ('cigs_before_preg', 252, 254),
        ('cigs_tri1', 254, 256),
        ('cigs_tri2', 256, 258),
        ('cigs_tri3', 258, 260),
        ('mothers_bmi', 286, 287),
        ('pre_preg_lbs', 291, 294),
        ('weight_gain', 303, 305),
        ('pre_preg_diab', 312, 313),
        ('gest_diab', 313, 314),
        ('pre_preg_hypten', 314, 315),
        ('gest_hypten', 315, 316),
        ('hypten_ecl', 316, 317),
        ('prev_preterm_birth', 317, 318),
        ('infertility_treatment', 324, 325),
        ('fertil_enhance', 325, 326),
        ('asst_repro_tech', 326, 327),
        ('prev_cesar', 330, 331),
        ('no_risk_reported', 336, 337),
        ('gonorrhea', 342, 343),
        ('syphilis', 343, 344),
        ('chlamydia', 344, 345),
        ('hepB', 345, 346),
        ('hepC', 346, 347),
        ('no_infection_reported', 352, 353),
        ('success_ext_cep', 359, 360),
        ('fail_ext_cep', 360, 361),
        ('induced_labor', 382, 383),
        ('aug_labor', 383, 384),
        ('steriods', 384, 385),
        ('antibiotics', 385, 386),
        ('chorioamnionitis', 386, 387),
        ('anesthesia', 387, 388),
        ('fetal_present_at_birth', 400, 401),
        ('trial_of_labor_attempt', 402, 403),
        ('delivery_method', 407, 408),
        ('maternal_transfusion', 414, 415),
        ('perineal_laceration', 415, 416),
        ('rupt_uterus', 416, 417),
        ('unplanned_hyster', 417, 418),
        ('admit_to_ICU', 418, 419),
        ('attendant_at_birth', 432, 433),
        ('mother_transferred', 433, 434),
        ('payment_source', 435, 436),
        ('APGAR_score_5min', 443, 445),
        ('APGAR_score_10min', 447, 449),
        ('sex_of_infant', 474, 475),
        ('obst_est_of_gestation_used', 488, 489),
        ('combined_gestation_week', 489, 491),
        ('combined_gestation_week_recode', 491, 493),
        ('birth_weight_gm', 503, 507),
        ('birth_weight_gm_recode', 508, 510),
        ('assist_vent_immed', 516, 517),
        ('assist_vent_after6', 517, 518),
        ('admit_NICU', 518, 519),
        ('surfactant', 519, 520),
        ('antibiotics_for_newborn', 520, 521),
        ('seizures', 521, 522),
        ('anencephaly', 536, 537),
        ('meningo_spina_bif', 537, 538),
        ('cyn_cong_heart_disease', 538, 539),
        ('cong_diaph_hernia', 539, 540),
        ('omphalocele', 540, 541),
        ('gastroschisis', 541, 542),
        ('limb_reduc_defect', 548, 549),
        ('cleft_lip_or_palate', 549, 550),
        ('cleft_palate_only', 550, 551),
        ('down_syndr', 551, 552),
        ('suspect_chromo_disorder', 552, 553),
        ('hypospadias', 553, 554),
        ('no_cong_anamolies_checked', 560, 561),
        ('infant_living_at_report', 567, 568),
        ('infant_breastfed_at_discharge', 568, 569),
        ('age_at_death_in_days', 1355, 1358),
        ('age_at_death_intervals', 1358, 1359),
        ('manner_of_death', 1361, 1362),
        ('place_of_injury', 1365, 1366),
        ('infant_cause_of_death', 1372, 1375),
    )
    fields = { name: line[start:stop] for name, start, stop in layout }
    return pd.Series( fields )

def __single_df( idx_line ):
    """Turn one ``(index, line)`` pair into a one-row DataFrame.

    Parameters
    ----------
    idx_line : sequence of length 2
        ``idx_line[0]`` is the row label, ``idx_line[1]`` the fixed-width
        record handed to ``linked_Parser``.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``idx`` whose columns are the parsed field names.
    """
    import pandas as pd
    idx, line = idx_line[0], idx_line[1]
    # Wrapping the Series in a list makes it one ROW with the field names as
    # columns. Passing the bare Series together with index=[idx] makes pandas
    # reindex the Series to the label `idx`, which is not a field name, so
    # the result was a single NaN cell and every parsed value was lost.
    return pd.DataFrame( [ linked_Parser( line ) ], index = [idx] )

def createLinked2017DF( lines ):
    """Parse an iterable of fixed-width records in parallel into one DataFrame.

    Each record is paired with its position via ``enumerate`` and converted
    by ``__single_df`` in a worker process; the per-record frames are then
    concatenated row-wise.

    Parameters
    ----------
    lines : iterable of str
        Raw fixed-width records.

    Returns
    -------
    pandas.DataFrame
        The row-wise concatenation of the frames returned by ``__single_df``.
    """
    import pandas as pd
    import multiprocessing as mp
    # Leave one core free when possible, but never request zero workers:
    # cpu_count() - 1 is 0 on a single-core host and Pool(0) raises ValueError.
    n_workers = max( 1, mp.cpu_count() - 1 )
    # The context manager shuts the pool down even when a worker raises;
    # previously the pool was only closed on the success path.
    with mp.Pool( n_workers ) as pool:
        frames = pool.map( __single_df, enumerate( lines ) )
    return pd.concat( frames, axis = 0 )

def divvyLinked2017OverCSV( linked17fwf_fp,
                         nfiles = 12,
                         output_dir = None,
                         output_fprefix = None ):
    """Split a fixed-width linked file round-robin into several CSV files.

    Record ``k`` (0-based) of the input is parsed with ``linked_Parser`` and
    written to output file number ``k % nfiles``. Every output file starts
    with a header row holding the field names.

    Parameters
    ----------
    linked17fwf_fp : str
        Path of the fixed-width input file.
    nfiles : int
        Number of CSV files to produce; must be at least 1.
    output_dir : str or None
        Directory for the CSV files. Defaults to the input file's directory.
    output_fprefix : str or None
        File-name prefix. Defaults to the input file's base name. The files
        are named ``<prefix>_1.csv`` ... ``<prefix>_<nfiles>.csv`` and any
        existing file of the same name is overwritten.

    Raises
    ------
    ValueError
        If ``nfiles`` is smaller than 1.
    """
    import csv
    from contextlib import ExitStack
    from os.path import dirname, abspath, basename, join
    if nfiles < 1:
        raise ValueError( 'nfiles must be at least 1, got %r' % (nfiles,) )
    if output_dir is None:
        output_dir = abspath( dirname( linked17fwf_fp ) )
    if output_fprefix is None:
        output_fprefix = basename( linked17fwf_fp )
    csv_files = [ join( output_dir, output_fprefix ) + ('_%d.csv' % i)
                  for i in range( 1, nfiles + 1 ) ]
    # Parsing a blank dummy record is a cheap way to obtain the field names
    # in the order linked_Parser emits them.
    colnames = list( linked_Parser( ' ' * 600 ).index )
    # ExitStack closes every handle opened so far, even if a later open() or
    # a write fails part-way through.
    with ExitStack() as stack:
        # Open the input first so a missing input file does not truncate
        # CSV files left over from an earlier run.
        fin = stack.enter_context( open( linked17fwf_fp, 'r' ) )
        # csv.writer quotes any field containing a comma or quote character,
        # which a plain ','.join would silently turn into extra columns.
        writers = [ csv.writer( stack.enter_context( open( path, 'w', newline = '' ) ),
                                lineterminator = '\n' )
                    for path in csv_files ]
        for writer in writers:
            writer.writerow( colnames )
        for idx, line in enumerate( fin ):
            # Drop the line terminator so a short record cannot leak a
            # newline into one of its trailing fields.
            record = line.rstrip( '\r\n' )
            writers[ idx % nfiles ].writerow( linked_Parser( record ).tolist() )

In [3]:
# Relative path of the fixed-width 2017 linked file; it is the input handed
# to divvyLinked2017OverCSV in the next cell.
fp = '../../data/Linked_BD_17/Linked_2017'
print(fp)

../../data/Linked_BD_17/Linked_2017


In [12]:
# Split the fixed-width file into CSVs using the function's defaults:
# 12 files named Linked_2017_1.csv ... Linked_2017_12.csv, written next to the input.
divvyLinked2017OverCSV(fp)

# Preliminary Data Analysis

In [3]:
# Load the first of the CSV files produced by divvyLinked2017OverCSV.
# NOTE(review): the output under this cell looks like the tail of a warning
# (possibly a pandas mixed-dtype DtypeWarning) — confirm, and consider passing
# explicit dtypes to read_csv.
linked_bd = pd.read_csv('../../data/Linked_BD_17/Linked_2017_1.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [19]:
# Load the second CSV file produced by divvyLinked2017OverCSV.
linked_bd_2 = pd.read_csv('../../data/Linked_BD_17/Linked_2017_2.csv')

In [28]:
# Load the third CSV file produced by divvyLinked2017OverCSV.
# The directory is spelled 'Linked_BD_17' to match every other path in this
# notebook; the previous 'Linked_Bd_17' only resolves on case-insensitive
# filesystems.
linked_bd_3 = pd.read_csv('../../data/Linked_BD_17/Linked_2017_3.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [24]:
# Columns to get rid of; any that are already gone are skipped.
def linked_cleaner(df):
    """Drop the columns this analysis does not use and return the frame.

    The frame is modified in place and also returned, so both
    ``linked_cleaner(df)`` and ``df = linked_cleaner(df)`` work. Columns that
    are already absent are ignored, so re-running the cell on an
    already-cleaned frame is a no-op rather than a ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    unwanted = [
        'birth_day_of_wk',
        'age_at_death_in_days',
        'age_at_death_intervals',
        'manner_of_death',
        'place_of_injury',
        'infant_cause_of_death',
        'mother_transferred',
        'mo_since_last_other_birth',
    ]
    # errors='ignore' is what makes the "if not already gone" promise true;
    # `del df[col]` raised KeyError as soon as one column was missing.
    df.drop(columns=unwanted, errors='ignore', inplace=True)
    return df

In [29]:
# Run the same column clean-up over each of the three loaded frames.
linked_bd, linked_bd_2, linked_bd_3 = [
    linked_cleaner(frame) for frame in (linked_bd, linked_bd_2, linked_bd_3)
]

In [30]:
# Tally the infant_living_at_report codes for each loaded frame, one table per frame.
for frame in (linked_bd, linked_bd_2, linked_bd_3):
    print(frame.infant_living_at_report.value_counts())

Y    320652
N       876
U       538
Name: infant_living_at_report, dtype: int64
Y    320645
N       861
U       559
Name: infant_living_at_report, dtype: int64
Y    320736
N       788
U       541
Name: infant_living_at_report, dtype: int64


In [33]:
# Stack the three cleaned frames into a single frame.
dfs = [linked_bd, linked_bd_2, linked_bd_3]
# ignore_index=True renumbers the rows 0..N-1. Without it each frame keeps
# the 0-based labels read_csv gave it, so every label appears three times
# and label-based lookups (and the index written by to_csv) are ambiguous.
linked_data = pd.concat(dfs, ignore_index=True)
linked_data.shape # 966,196 rows, quarter of the entire 2017 period linked birth/death data set

Unnamed: 0,birth_year,birth_month,birth_place,mothers_age,mothers_age_recode,mothers_nativity,mothers_race,mothers_hispanic_origin,paternity_acknow,mothers_marital_status,mothers_education,fathers_age,fathers_race,fathers_hispanic_origin,fathers_education,prior_living_births,prior_dead_births,prior_terminations,mo_since_last_live_birth,mo_prenatal_care_began,n_prenatal_visits,wic,cigs_before_preg,cigs_tri1,cigs_tri2,cigs_tri3,mothers_bmi,pre_preg_lbs,weight_gain,pre_preg_diab,gest_diab,pre_preg_hypten,gest_hypten,hypten_ecl,prev_preterm_birth,infertility_treatment,fertil_enhance,asst_repro_tech,prev_cesar,no_risk_reported,gonorrhea,syphilis,chlamydia,hepB,hepC,no_infection_reported,success_ext_cep,fail_ext_cep,induced_labor,aug_labor,steriods,antibiotics,chorioamnionitis,anesthesia,fetal_present_at_birth,trial_of_labor_attempt,delivery_method,maternal_transfusion,perineal_laceration,rupt_uterus,unplanned_hyster,admit_to_ICU,attendant_at_birth,payment_source,APGAR_score_5min,APGAR_score_10min,sex_of_infant,obst_est_of_gestation_used,combined_gestation_week,combined_gestation_week_recode,birth_weight_gm,birth_weight_gm_recode,assist_vent_immed,assist_vent_after6,admit_NICU,surfactant,antibiotics_for_newborn,seizures,anencephaly,meningo_spina_bif,cyn_cong_heart_disease,cong_diaph_hernia,omphalocele,gastroschisis,limb_reduc_defect,cleft_lip_or_palate,cleft_palate_only,down_syndr,suspect_chromo_disorder,hypospadias,no_cong_anamolies_checked,infant_living_at_report,infant_breastfed_at_discharge
0,2017,1,1,31,5,1,2,0,X,1,5,5,2,0,4,3,0,0,5,1,11,Y,0,0,0,0,4,220,16,N,N,N,N,N,N,N,X,X,N,1,N,N,N,N,N,1,N,N,N,N,N,N,N,Y,1,X,1,N,N,N,N,N,3,4,9,88,F,,40,8,,10,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,1,Y,Y
1,2017,1,1,26,4,1,1,0,Y,2,3,4,1,0,4,2,0,0,2,2,10,N,0,0,0,0,2,113,25,N,N,N,N,N,Y,N,X,X,N,0,N,N,N,N,N,1,N,N,N,N,N,Y,N,N,1,X,1,N,N,N,N,N,1,1,9,88,M,,39,7,,9,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,1,Y,Y
2,2017,1,2,35,6,1,1,0,X,1,7,6,1,0,4,2,0,0,5,1,12,N,0,0,0,0,2,170,32,N,N,N,N,N,N,N,X,X,N,1,N,N,N,N,N,1,N,N,N,N,N,N,N,N,1,X,1,N,N,N,N,N,3,2,9,88,F,,38,6,,9,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,1,Y,Y
3,2017,1,1,26,4,1,1,0,N,2,3,11,9,9,9,1,0,3,7,2,22,Y,20,10,5,5,2,125,48,N,N,N,N,N,N,N,X,X,N,1,N,N,N,N,N,1,N,N,Y,N,N,Y,N,N,1,X,1,N,N,N,N,N,1,1,9,88,F,,41,9,,8,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,1,Y,Y
4,2017,1,1,38,6,1,1,0,X,1,5,5,1,0,4,1,0,0,4,1,11,N,0,0,0,0,3,164,36,N,N,N,N,N,N,N,X,X,N,1,N,N,N,N,N,1,N,N,N,N,N,N,N,N,1,X,1,N,N,N,N,N,1,2,9,88,M,,41,9,,10,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,1,Y,Y


In [None]:
# Save the combined frame to the current working directory. to_csv is called
# with its defaults, so the row index is written as the first column.
linked_data.to_csv('mortality_2017.csv')

# Group By's

In [None]:
def most_frequent_value(x):
    """Return the most frequent value of *x*.

    This is the first index entry of ``x.value_counts()``. It replaces a
    dangling ``.agg(lambda ...)`` fragment, which was a syntax error when it
    stood alone in its own cell. Use it as ``<groupby>.agg(most_frequent_value)``.
    """
    return x.value_counts().index[0]

In [11]:
# Count the non-null admit_NICU entries for every
# (mothers_race, wic, infant_living_at_report) combination.
linked_bd.groupby(['mothers_race', 'wic', 'infant_living_at_report'])[['admit_NICU']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,admit_NICU
mothers_race,wic,infant_living_at_report,Unnamed: 3_level_1
1,N,N,375
1,N,U,217
1,N,Y,151798
1,U,N,22
1,U,U,22
1,U,Y,2404
1,Y,N,141
1,Y,U,164
1,Y,Y,79961
2,N,N,168


In [48]:
# Number of records (non-null birth_year) for every
# (mothers_race, mothers_education, admit_NICU) combination.
df_nicu_count = (
    linked_bd
    .groupby(['mothers_race', 'mothers_education', 'admit_NICU'])[['birth_year']]
    .count()
)
df_nicu_count

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,birth_year
mothers_race,mothers_education,admit_NICU,Unnamed: 3_level_1
1,1,N,7469
1,1,U,9
1,1,Y,632
1,2,N,19960
1,2,U,13
1,2,Y,1995
1,3,N,51767
1,3,U,46
1,3,Y,4966
1,4,N,42398


In [43]:
# Share (in %) of each admit_NICU status within every
# (mothers_race, mothers_education) group. The denominator is the group's
# total over all admit_NICU values. df_nicu_count is built from every record
# in linked_bd, so these are percentages of all births in the group, not
# only of infants who died.
df_nicu_percent = df_nicu_count*100 / df_nicu_count.groupby(level=['mothers_race', 'mothers_education']).sum()
df_nicu_percent

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,birth_year
mothers_race,mothers_education,admit_NICU,Unnamed: 3_level_1
1,1,N,92.096178
1,1,U,0.110974
1,1,Y,7.792848
1,2,N,90.859432
1,2,U,0.059177
1,2,Y,9.081391
1,3,N,91.172793
1,3,U,0.081016
1,3,Y,8.746191
1,4,N,91.209879


In [44]:
# Number of records (non-null birth_year) for every (mothers_race, admit_NICU) pair.
df_nicu_count2 = linked_bd.groupby(['mothers_race', 'admit_NICU'])[['birth_year']].count()

In [46]:
# Share (in %) of each admit_NICU status within every mothers_race group;
# the denominator is the race total over all admit_NICU values.
df_nicu_percent2 = df_nicu_count2*100 / df_nicu_count2.groupby(level='mothers_race').sum()

In [47]:
# Display the per-race NICU admission percentages.
df_nicu_percent2

Unnamed: 0_level_0,Unnamed: 1_level_0,birth_year
mothers_race,admit_NICU,Unnamed: 2_level_1
1,N,91.530982
1,U,0.090598
1,Y,8.37842
2,N,88.291758
2,U,0.086217
2,Y,11.622026
3,N,89.358845
3,U,0.100705
3,Y,10.54045
4,N,91.89515
