# Parser Script

In [1]:
import pandas as pd
import numpy as np
import sklearn

In [3]:
def nat2018Parser( line ):
    import pandas as pd
    ret_dict = dict(
        birth_year = line[8:12],
        birth_month = line[12:14],
        birth_time = line[18:22],
        birth_day_of_wk = line[22:23],
        birth_place = line[31:32],
        mothers_age_imputed = line[72:73],
        mothers_age = line[74:76],
        mothers_nativity = line[83:84],
        mothers_residence_status = line[103:104],
        mothers_race = line[104:106],
        mothers_race_imputed = line[110:111],
        mothers_hispanic_origin = line[111:112],
        mothers_hispanic_origin2 = line[116:117],
        paternity_acknow = line[118:119],
        mothers_marital_status = line[119:120],
        mothers_maristat_imputed = line[120:121],
        mothers_education = line[123:124],
        fathers_age = line[146:148],
        fathers_race = line[150:152],
        fathers_hispanic_origin = line[158:159],
        fathers_hispanic_origin2 = line[161:162],
        fathers_education = line[162:163],
        prior_living_births = line[170:172],
        prior_dead_births = line[172:174],
        prior_terminations = line[174:176],
        mo_since_last_live_birth = line[197:200],
        mo_since_last_other_birth = line[205:208],
        mo_prenatal_care_began = line[223:225],
        n_prenatal_visits = line[237:239],
        wic = line[250:251],
        cigs_tri1 = line[254:256],
        cigs_tri2 = line[256:258],
        cigs_tri3 = line[258:260],
        mothers_height = line[279:281],
        mothers_bmi = line[282:286],
        pre_preg_lbs = line[291:294],
        delivery_lbs = line[298:301],
        pre_preg_diab = line[312:313],
        gest_diab = line[313:314],
        pre_preg_hypten = line[314:315],
        gest_hypten = line[315:316],
        hypten_ecl = line[316:317],
        prev_preterm_birth = line[317:318],
        infertility_treatment = line[324:325],
        fertil_enhance = line[325:326],
        asst_repro_tech = line[326:327],
        n_prev_cesar = line[331:333],
        no_risk_reported = line[336:337],
        gonorrhea = line[342:343],
        syphilis = line[343:344],
        chlamydia = line[344:345],
        hepB = line[345:346],
        hepC = line[346:347],
        no_infection_reported = line[352:353],
        success_ext_cep = line[359:360],
        fail_ext_cep = line[360:361],
        induced_labor = line[382:383],
        aug_labor = line[383:384],
        steriods = line[384:385],
        antibiotics = line[385:386],
        chorioamnionitis = line[386:387],
        anesthesia = line[387:388],
        fetal_present_at_birth = line[400:401],
        final_delivery_method = line[401:402],
        trial_of_labor_attempt = line[402:403],
        maternal_transfusion = line[414:415],
        perineal_laceration = line[415:416],
        rupt_uterus = line[416:417],
        unplanned_hyster = line[417:418],
        admit_to_IC = line[418:419],
        attendant_at_birth = line[432:433],
        mother_transferred = line[433:434],
        delivery_payment_source = line[434:435],
        APGAR_score_5min = line[443:445],
        APGAR_score_10min = line[447:449],
        plurality = line[453:454],
        pluarlity_imputed = line[455:456],
        sex_of_infant = line[474:475],
        sex_of_infant_imputed = line[475:476],
        last_norm_menses_mo = line[476:478],
        last_norm_menses_yr = line[480:484],
        combined_gestation_imputed = line[487:488],
        obst_est_of_gestation_used = line[488:489],
        combined_gestation_wk = line[489:491],
        obst_est_edit_wk = line[498:500],
        birth_weight_gm = line[503:507],
        assist_vent_immed = line[516:517],
        assist_vent_after6 = line[517:518],
        admit_NICU = line[518:519],
        surfactant = line[519:520],
        antibiotics_for_newborn = line[520:521],
        seizures = line[521:522],
        anencephaly = line[536:537],
        meningo_spina_bif = line[537:538],
        cyn_cong_heart_disease = line[538:539],
        cong_diaph_hernia = line[539:540],
        omphalocele = line[540:541],
        gastroschisis = line[541:542],
        limb_reduc_defect = line[548:549],
        cleft_lip_or_palate = line[549:550],
        cleft_palate_only = line[550:551],
        down_syndr = line[551:552],
        suspect_chromo_disorder = line[552:553],
        hypospadias = line[553:554],
        no_cong_anamolies_checked = line[560:561],
        infant_transferred = line[566:567],
        infant_living_at_report = line[567:568],
        infant_breastfed_at_discharge = line[568:569]
    )
    return pd.Series( ret_dict )

def __single_df( idx_line ):
    import pandas as pd
    idx = idx_line[0]
    line = idx_line[1]
    return pd.DataFrame( nat2018Parser( line ), index = [idx] )

def createNat2018DF( lines ):
    import pandas as pd
    import multiprocessing as mp
    pool = mp.Pool( mp.cpu_count() - 1 )
    ret_df =  pd.concat(  pool.map( __single_df, enumerate(lines) ), axis = 0 )
    pool.close()
    return ret_df

def divvyNat2018OverCSV( nat18fwf_fp,
                         nfiles = 10,
                         output_dir = None,
                         output_fprefix = None ):
    from os.path import dirname, abspath, basename, join
    import numpy as np
    import pandas as pd
    if output_dir is None:
        output_dir = abspath( dirname( nat18fwf_fp ) )
    if output_fprefix is None:
        output_fprefix = basename( nat18fwf_fp )
    csv_files = pd.Series( [ join(output_dir, output_fprefix ) + ('_%d.csv' % i) for i in range(1, nfiles+1) ] )
    # open connections to all the output file (erasing any previous file)
    fps_conn = csv_files.apply( lambda x: open(x, 'w') )
    # write the column names
    idx_order = nat2018Parser(' '*600).index
    colnames = ','.join( idx_order )
    fps_conn.apply( lambda conn: conn.write( colnames + '\n' ) )
    # close and reopen to append
    fps_conn.apply( lambda conn: conn.close() )
    fps_conn = csv_files.apply( lambda x: open(x, 'a') )
    # write the lines
    with open( nat18fwf_fp, 'r' ) as fin:
        idx = 0
        line = fin.readline()
        while line:
            conn = fps_conn[ idx ]
            conn.write( ','.join( nat2018Parser(line)[idx_order] ) + '\n' )
            line = fin.readline()
            idx = (idx + 1)%nfiles
    # close the connections
    fps_conn.apply( lambda conn: conn.close() )

In [3]:
nat18fwf_fp = '../../data/Nat2017PublicUS.c20180516.r20180808.txt'
print(nat17fwf_fp)

divvyNat2018OverCSV(nat17fwf_fp)

../../data/Nat2017PublicUS.c20180516.r20180808.txt


# Load Natality Data

In [3]:
nat17 = pd.read_csv('../../data/Nat2017_txt_1.csv')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
nat17.head().T

Unnamed: 0,0,1,2,3,4
birth_year,2017,2017,2017,2017,2017
birth_month,1,1,1,1,1
birth_time,735,1818,345,837,219
birth_day_of_wk,2,4,4,6,3
birth_place,1,1,1,1,1
mothers_age_imputed,,,,,
mothers_age,31,22,33,33,35
mothers_nativity,1,1,1,1,1
mothers_residence_status,1,1,1,2,1
mothers_race,2,1,1,1,10


In [6]:
del nat17['birth_time']
del nat17['birth_day_of_wk'] 
del nat17['mothers_age_imputed']
del nat17['mothers_nativity'] 
del nat17['mothers_race_imputed'] 
del nat17['mothers_hispanic_origin']
del nat17['mothers_maristat_imputed'] 
del nat17['fathers_hispanic_origin'] 
del nat17['unplanned_hyster'] 
del nat17['mother_transferred'] 
del nat17['plurality']
del nat17['pluarlity_imputed'] 
del nat17['sex_of_infant_imputed']
del nat17['last_norm_menses_mo']
del nat17['last_norm_menses_yr'] 
del nat17['combined_gestation_imputed'] 
del nat17['obst_est_of_gestation_used']
del nat17['obst_est_edit_wk'] 
del nat17['infant_transferred']

In [7]:
print(nat17.dtypes.to_string())

birth_year                         int64
birth_month                        int64
birth_place                        int64
mothers_age                        int64
mothers_residence_status           int64
mothers_race                       int64
mothers_hispanic_origin2           int64
paternity_acknow                  object
mothers_marital_status            object
mothers_education                  int64
fathers_age                        int64
fathers_race                       int64
fathers_hispanic_origin2           int64
fathers_education                  int64
prior_living_births                int64
prior_dead_births                  int64
prior_terminations                 int64
mo_since_last_live_birth           int64
mo_since_last_other_birth          int64
mo_prenatal_care_began             int64
n_prenatal_visits                  int64
wic                              float64
cigs_tri1                          int64
cigs_tri2                          int64
cigs_tri3       

In [19]:
nat17.mothers_marital_status = nat17.mothers_marital_status.astype(str)

In [25]:
nat17.prior_living_births.unique()

array([ 3,  0,  1,  4,  2,  9,  8,  6,  5,  7, 10, 13, 99, 14, 12, 11, 15,
       17, 16])

In [31]:
nat17.groupby(['mothers_education', 'fathers_education']).agg({'n_prenatal_visits':'median', 'birth_year':'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,n_prenatal_visits,birth_year
mothers_education,fathers_education,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,10.0,6661
1,2,11.0,1853
1,3,10.5,1528
1,4,11.0,346
1,5,10.0,86
1,6,10.0,143
1,7,9.0,29
1,8,13.0,7
1,9,9.0,1958
2,1,11.0,2593


In [7]:
def cleanNatality(natality):
    natality = natality.drop(['Notes', 'Infant Birth Weight Code', 'Year Code', 'Gender Code'], axis=1)
    natality = natality.loc[~(natality.isna().all(axis=1))]
    print(natality.loc[natality.isna().any(axis=1)])
    natality['State Code'] = natality['State Code'].astype(int).astype(str)
    natality['Age of Mother'] = natality['Age of Mother'].astype(str)
    natality.Year = natality.Year.astype(int).astype(str)
    natality.Births = natality.Births.astype(int)
    
    return natality

In [6]:
natality = cleanNatality(natality)
natality

Unnamed: 0,State,State Code,Age of Mother,Age of Mother Code,Year,Gender,Infant Birth Weight,Births
0,Alabama,1,Under 15 years,15,1999,Female,2000 - 2499 grams,12
1,Alabama,1,Under 15 years,15,1999,Female,2500 - 2999 grams,30
2,Alabama,1,Under 15 years,15,1999,Female,3000 - 3499 grams,39
3,Alabama,1,Under 15 years,15,1999,Female,3500 - 3999 grams,13
4,Alabama,1,Under 15 years,15,1999,Male,2500 - 2999 grams,27
...,...,...,...,...,...,...,...,...
314520,Wyoming,56,39 years,39,2018,Male,3500 - 3999 grams,10
314521,Wyoming,56,40 years,40,2018,Female,2500 - 2999 grams,13
314522,Wyoming,56,40 years,40,2018,Female,3000 - 3499 grams,11
314523,Wyoming,56,41 years,41,2018,Female,3000 - 3499 grams,15


In [8]:
# check dtypes, check for any other NaN's and output the rows with missingness
print(natality.dtypes)
print('*' * 50)
print(natality.isna().sum())
print('*' * 50)
natality.loc[(natality.isna().any(axis=1))]

State                  object
State Code             object
Age of Mother          object
Age of Mother Code     object
Year                   object
Gender                 object
Infant Birth Weight    object
Births                  int64
dtype: object
**************************************************
State                  0
State Code             0
Age of Mother          0
Age of Mother Code     0
Year                   0
Gender                 0
Infant Birth Weight    0
Births                 0
dtype: int64
**************************************************


Unnamed: 0,State,State Code,Age of Mother,Age of Mother Code,Year,Gender,Infant Birth Weight,Births


# Take a Sample of the Large Dataset

In [9]:
natsample = natality.copy().sample(int(314525/10)) # 10 percent of the data
print(natsample.isnull().values.any()) # check for any remaining NaN's; Should be False
natsample

False


Unnamed: 0,State,State Code,Age of Mother,Age of Mother Code,Year,Gender,Infant Birth Weight,Births
237864,Ohio,39,25 years,25,2014,Female,2000 - 2499 grams,233
203930,South Dakota,46,18 years,18,2012,Male,4000 - 4499 grams,10
61821,California,6,47 years,47,2005,Female,3000 - 3499 grams,24
76329,Utah,49,26 years,26,2005,Female,1500 - 1999 grams,20
271089,Nebraska,31,31 years,31,2016,Female,4000 - 4499 grams,59
...,...,...,...,...,...,...,...,...
157216,Idaho,16,21 years,21,2010,Female,2500 - 2999 grams,128
202297,Oklahoma,40,28 years,28,2012,Female,3500 - 3999 grams,405
173808,Delaware,10,31 years,31,2011,Male,4000 - 4499 grams,29
139444,Illinois,17,38 years,38,2009,Female,1000 - 1499 grams,18


In [10]:
np.unique(natsample['Age of Mother']) # Youngest is 13, oldest is 50 and over

array(['13 years', '14 years', '15 years', '15-19 years', '16 years',
       '17 years', '18 years', '19 years', '20 years', '20-24 years',
       '21 years', '22 years', '23 years', '24 years', '25 years',
       '25-29 years', '26 years', '27 years', '28 years', '29 years',
       '30 years', '30-34 years', '31 years', '32 years', '33 years',
       '34 years', '35 years', '35-39 years', '36 years', '37 years',
       '38 years', '39 years', '40 years', '40-44 years', '41 years',
       '42 years', '43 years', '44 years', '45 years', '45-49 years',
       '46 years', '47 years', '48 years', '49 years',
       '50 years and over', 'Under 15 years'], dtype=object)

In [11]:
# Categorize mother's ages
def ageGroup(age):
    if (age in ['13 years', '14 years', 'Under 15 years']):
        age = 'Under 15'
    elif (age in ['15 years', '15-19 years', '16 years', '17 years', '18 years', '19 years']):
        age = '15-19'
    elif (age in ['20 years', '20-24 years', '21 years', '22 years', '23 years', '24 years']):
        age = '20-24'
    elif (age in ['25 years', '25-29 years', '26 years', '27 years', '28 years', '29 years']):
        age = '25-29'
    elif (age in ['30 years', '30-34 years', '31 years', '32 years', '33 years', '24 years']):
        age = '30-34'
    elif (age in ['35 years', '35-39 years', '36 years', '37 years', '38 years', '39 years']):
        age = '35-39'
    elif (age in ['40 years', '40-44 years', '41 years', '42 years', '43 years', '44 years']):
        age = '40-44'
    elif (age in ['45 years', '45-49 years', '46 years', '47 years', '48 years', '49 yeras']):
        age = '45-49'
    else:
        age = '50 and over'
        
    return age # series value

natsample2 = natsample.copy()
natsample2['Age of Mother'] = natsample2['Age of Mother'].apply(ageGroup)
natsample2
# natsample2[natsample2['Age of Mother'] == 'Under 15']

Unnamed: 0,State,State Code,Age of Mother,Age of Mother Code,Year,Gender,Infant Birth Weight,Births
237864,Ohio,39,25-29,25,2014,Female,2000 - 2499 grams,233
203930,South Dakota,46,15-19,18,2012,Male,4000 - 4499 grams,10
61821,California,6,45-49,47,2005,Female,3000 - 3499 grams,24
76329,Utah,49,25-29,26,2005,Female,1500 - 1999 grams,20
271089,Nebraska,31,30-34,31,2016,Female,4000 - 4499 grams,59
...,...,...,...,...,...,...,...,...
157216,Idaho,16,20-24,21,2010,Female,2500 - 2999 grams,128
202297,Oklahoma,40,25-29,28,2012,Female,3500 - 3999 grams,405
173808,Delaware,10,30-34,31,2011,Male,4000 - 4499 grams,29
139444,Illinois,17,35-39,38,2009,Female,1000 - 1499 grams,18


In [12]:
natality['Age of Mother'] = natality['Age of Mother'].apply(ageGroup)
natality

Unnamed: 0,State,State Code,Age of Mother,Age of Mother Code,Year,Gender,Infant Birth Weight,Births
0,Alabama,1,Under 15,15,1999,Female,2000 - 2499 grams,12
1,Alabama,1,Under 15,15,1999,Female,2500 - 2999 grams,30
2,Alabama,1,Under 15,15,1999,Female,3000 - 3499 grams,39
3,Alabama,1,Under 15,15,1999,Female,3500 - 3999 grams,13
4,Alabama,1,Under 15,15,1999,Male,2500 - 2999 grams,27
...,...,...,...,...,...,...,...,...
314520,Wyoming,56,35-39,39,2018,Male,3500 - 3999 grams,10
314521,Wyoming,56,40-44,40,2018,Female,2500 - 2999 grams,13
314522,Wyoming,56,40-44,40,2018,Female,3000 - 3499 grams,11
314523,Wyoming,56,40-44,41,2018,Female,3000 - 3499 grams,15


# Plots using Plotly

In [14]:
# rename columns for easier access when plotting
print(natality.columns)
natality.rename(columns = {'State Code':'State_Code', 'Age of Mother':'Age_of_Mother', 'Age of Mother Code':'Age_of_Mother_Code', 'Infant Birth Weight':'Infant_Birth_Weight'}, inplace = True)
print(natality.columns)

Index(['State', 'State_Code', 'Age_of_Mother', 'Age_of_Mother_Code', 'Year',
       'Gender', 'Infant_Birth_Weight', 'Births'],
      dtype='object')
Index(['State', 'State_Code', 'Age_of_Mother', 'Age_of_Mother_Code', 'Year',
       'Gender', 'Infant_Birth_Weight', 'Births'],
      dtype='object')


In [16]:
# create groupedby dataframe
import plotly.express as px

data_nat1 = natality.groupby(['State', 'Age_of_Mother', 'Infant_Birth_Weight']).agg({'Births': ['sum', 'mean', 'median', 'max', 'min']})
data_nat1.columns = data_nat1.columns.droplevel(0)
data_nat1 = data_nat1.reset_index()
data_nat1['mean'] = data_nat1['mean'].round()
data_nat1

Unnamed: 0,State,Age_of_Mother,Infant_Birth_Weight,sum,mean,median,max,min
0,Alabama,15-19,1000 - 1499 grams,1085,19.0,14.0,64,10
1,Alabama,15-19,1500 - 1999 grams,2790,28.0,22.0,135,10
2,Alabama,15-19,2000 - 2499 grams,10327,69.0,48.0,394,11
3,Alabama,15-19,2500 - 2999 grams,36481,218.0,154.0,1264,14
4,Alabama,15-19,3000 - 3499 grams,57706,343.0,261.0,1947,13
...,...,...,...,...,...,...,...,...
3616,Wyoming,50 and over,2000 - 2499 grams,50,12.0,11.0,18,10
3617,Wyoming,50 and over,2500 - 2999 grams,729,24.0,22.0,43,15
3618,Wyoming,50 and over,3000 - 3499 grams,1356,42.0,41.5,65,29
3619,Wyoming,50 and over,3500 - 3999 grams,895,29.0,27.0,52,15


In [18]:
# example query by state
import plotly.express as px

data_nat = natality.query("State == 'New York'")
fig = px.bar(data_nat, x='Infant_Birth_Weight', y='Births')
fig.show()

In [19]:
import plotly.express as px

# create dataframe with only moms under 15
natU15 = natality[natality['Age_of_Mother'] == 'Under 15']
fig = px.bar(natU15, x='Infant_Birth_Weight', y='Births')
fig.show()

In [23]:
import plotly.figure_factory as ff
import numpy as np

teens = natality[(natality['Age_of_Mother'] == 'Under 15') | (natality['Age_of_Mother'] == '15-19')].Births
hist_data = [teens]
group_labels = ['distplot'] # name of the dataset

fig = ff.create_distplot(hist_data, group_labels)
fig.show()

# Natality 2017

In [9]:
nat17_1.head()

Unnamed: 0,0,1,2,3,4
birth_year,2017,2017,2017,2017,2017
birth_month,1,1,1,1,1
birth_time,735,1818,345,837,219
birth_day_of_wk,2,4,4,6,3
birth_place,1,1,1,1,1
mothers_age_imputed,,,,,
mothers_age,31,22,33,33,35
mothers_nativity,1,1,1,1,1
mothers_residence_status,1,1,1,2,1
mothers_race,2,1,1,1,10


In [None]:
nat17_1.columns

# Mortality Data

In [31]:
mortality = pd.read_csv('../../data/Mortality rates AGE birthplace.csv')

In [32]:
def cleanerMortality(mort_df):
    mort_df = mort_df.drop(['Notes', 'State Code'], axis=1)
    mort_df = mort_df.loc[~(mort_df.isna().all(axis=1))]
    print(mort_df.loc[mort_df.isna().any(axis=1)])
    return mort_df

In [33]:
mortality = cleanerMortality(mortality)

Empty DataFrame
Columns: [State, Age of Mother, Age of Mother Code, Birthplace, Birthplace Code, ICD-10 130 Cause List (Infants), ICD-10 130 Cause List (Infants) Code, Age of Infant at Death, Age of Infant at Death Code, Deaths, Births, Death Rate]
Index: []


In [41]:
mortality['Age of Mother'] = mortality['Age of Mother'].apply(ageGroup)
mortality

Unnamed: 0,State,Age of Mother,Age of Mother Code,Birthplace,Birthplace Code,ICD-10 130 Cause List (Infants),ICD-10 130 Cause List (Infants) Code,Age of Infant at Death,Age of Infant at Death Code,Deaths,Births,Death Rate
0,Alabama,15-19,15-19,In Hospital,1,Certain infectious and parasitic diseases (A00...,GR130-001,28 - 364 days,5,10,28316,0.35 (Unreliable)
1,Alabama,15-19,15-19,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,Under 1 hour,1,23,28316,0.81
2,Alabama,15-19,15-19,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,1 - 23 hours,2,59,28316,2.08
3,Alabama,15-19,15-19,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,1 - 6 days,3,37,28316,1.31
4,Alabama,15-19,15-19,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,7 - 27 days,4,24,28316,0.85
...,...,...,...,...,...,...,...,...,...,...,...,...
27417,Wisconsin,35-39,35-39,In Hospital,1,"#Congenital malformations, deformations and ch...",GR130-118,28 - 364 days,5,14,32149,0.44 (Unreliable)
27418,Wisconsin,35-39,35-39,In Hospital,1,"Symptoms, signs and abnormal clinical and labo...",GR130-134,28 - 364 days,5,11,32149,0.34 (Unreliable)
27419,Wisconsin,35-39,35-39,In Hospital,1,"External causes of mortality (*U01,V01-Y84)",GR130-138,28 - 364 days,5,10,32149,0.31 (Unreliable)
27420,Wyoming,20-24,20-24,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,Under 1 hour,1,11,7335,1.50 (Unreliable)


In [43]:
np.unique(mortality['Age of Mother']) == np.unique(mortality['Age of Mother Code'])

array([ True,  True,  True,  True,  True,  True,  True])

In [50]:
mortality = mortality.drop(['Age of Mother Code'], axis=1)

In [51]:
np.unique(mortality['Age of Infant at Death'])

array(['Under 1 day', 'Under 1 hour', 'Under 1 month', 'Under 1 week',
       'Under 1 year'], dtype=object)

In [45]:
def ageGroup2(infant_age):
    if (infant_age == '1 - 23 hours'):
        infant_age = 'Under 1 day'
    elif (infant_age == '1 - 6 days'):
        infant_age = 'Under 1 week'
    elif (infant_age == '7 - 27 days'):
        infant_age = 'Under 1 month'
    elif (infant_age == '28 - 364 days'):
        infant_age = 'Under 1 year'
    else:
        pass
    
    return infant_age

In [48]:
mortality['Age of Infant at Death'] = mortality['Age of Infant at Death'].apply(ageGroup2)
np.unique(mortality['Age of Infant at Death'])

In [53]:
np.unique(mortality['ICD-10 130 Cause List (Infants)'])

array(['#Accidents (unintentional injuries) (V01-X59)',
       '#Assault (homicide) (*U01,X85-Y09)', '#Atelectasis (P28.0-P28.1)',
       '#Bacterial sepsis of newborn (P36)',
       '#Chronic respiratory disease originating in the perinatal period (P27)',
       '#Congenital malformations, deformations and chromosomal abnormalities (Q00-Q99)',
       '#Diarrhea and gastroenteritis of infectious origin (A09)',
       '#Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism (D50-D89)',
       '#Diseases of the circulatory system (I00-I99)',
       '#Disorders related to short gestation and low birth weight, not elsewhere classified (P07)',
       '#Gastritis, duodenitis, and noninfective enteritis and colitis (K29,K50-K55)',
       '#Hematological disorders (P60-P61)',
       '#Hydrops fetalis not due to hemolytic disease (P83.2)',
       '#Influenza and pneumonia (J09-J18)',
       '#Intrauterine hypoxia and birth asphyxia (P20-P21)',
      

In [62]:
mortality['Death Rate'] = mortality['Death Rate'].map(lambda d: d.replace(' (Unreliable)', ''))
mortality['Death Rate'] = mortality['Death Rate'].astype(float)

In [67]:
mortality.dtypes

State                                    object
Age of Mother                            object
Birthplace                               object
Birthplace Code                           int64
ICD-10 130 Cause List (Infants)          object
ICD-10 130 Cause List (Infants) Code     object
Age of Infant at Death                   object
Age of Infant at Death Code               int64
Deaths                                    int64
Births                                    int64
Death Rate                              float64
dtype: object

In [71]:
mortality

Unnamed: 0,State,Age of Mother,Birthplace,Birthplace Code,ICD-10 130 Cause List (Infants),ICD-10 130 Cause List (Infants) Code,Age of Infant at Death,Age of Infant at Death Code,Deaths,Births,Death Rate
0,Alabama,15-19,In Hospital,1,Certain infectious and parasitic diseases (A00...,GR130-001,Under 1 year,5,10,28316,0.35
1,Alabama,15-19,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,Under 1 hour,1,23,28316,0.81
2,Alabama,15-19,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,Under 1 day,2,59,28316,2.08
3,Alabama,15-19,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,Under 1 week,3,37,28316,1.31
4,Alabama,15-19,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,Under 1 month,4,24,28316,0.85
...,...,...,...,...,...,...,...,...,...,...,...
27417,Wisconsin,35-39,In Hospital,1,"#Congenital malformations, deformations and ch...",GR130-118,Under 1 year,5,14,32149,0.44
27418,Wisconsin,35-39,In Hospital,1,"Symptoms, signs and abnormal clinical and labo...",GR130-134,Under 1 year,5,11,32149,0.34
27419,Wisconsin,35-39,In Hospital,1,"External causes of mortality (*U01,V01-Y84)",GR130-138,Under 1 year,5,10,32149,0.31
27420,Wyoming,20-24,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,Under 1 hour,1,11,7335,1.50


In [76]:
mortality.rename(columns={'Age of Mother': 'Age_of_Mother', 'Birthplace Code': 'Birthplace_Code', 'ICD-10 130 Cause List (Infants)': 'Cause of Death', 'ICD-10 130 Cause List (Infants) Code': 'Cause of Death Code', 'Age of Infant at Death': 'Age_of_Infant_at_Death', 'Age of Infant at Death Code': 'Age_of_Infant_at_Death_Code', 'Death Rate': 'Death_Rate'}, inplace=True)

In [79]:
mortality.query("Age_of_Mother == '15-19'")

Unnamed: 0,State,Age_of_Mother,Birthplace,Birthplace_Code,Cause of Death,Cause of Death Code,Age_of_Infant_at_Death,Age_of_Infant_at_Death_Code,Deaths,Births,Death_Rate
0,Alabama,15-19,In Hospital,1,Certain infectious and parasitic diseases (A00...,GR130-001,Under 1 year,5,10,28316,0.35
1,Alabama,15-19,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,Under 1 hour,1,23,28316,0.81
2,Alabama,15-19,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,Under 1 day,2,59,28316,2.08
3,Alabama,15-19,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,Under 1 week,3,37,28316,1.31
4,Alabama,15-19,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,Under 1 month,4,24,28316,0.85
...,...,...,...,...,...,...,...,...,...,...,...
27325,Wisconsin,15-19,In Hospital,1,Disorders related to length of gestation and f...,GR130-086,Under 1 day,2,14,11745,1.19
27326,Wisconsin,15-19,In Hospital,1,#Disorders related to short gestation and low ...,GR130-088,Under 1 day,2,14,11745,1.19
27327,Wisconsin,15-19,In Hospital,1,"Symptoms, signs and abnormal clinical and labo...",GR130-134,Under 1 year,5,11,11745,0.94
27328,Wisconsin,15-19,In Hospital,1,"External causes of mortality (*U01,V01-Y84)",GR130-138,Under 1 year,5,13,11745,1.11


In [80]:
np.unique(mortality.Birthplace)

array(['In Hospital', 'Not in Hospital'], dtype=object)

In [86]:
mort1 = mortality.groupby(['Cause of Death']).agg({'Deaths' : 'sum'}).reset_index().sort_values(by=['Deaths'], ascending=False)

In [88]:
mort1.head(25)

Unnamed: 0,Cause of Death,Deaths
36,Certain conditions originating in the perinata...,223549
5,"#Congenital malformations, deformations and ch...",76004
46,Disorders related to length of gestation and f...,67646
9,#Disorders related to short gestation and low ...,66542
81,"Symptoms, signs and abnormal clinical and labo...",50739
50,Extremely low birth weight or extreme immaturi...,48942
60,Newborn affected by maternal factors and by co...,40109
24,#Sudden infant death syndrome (R95),31642
49,"External causes of mortality (*U01,V01-Y84)",21996
19,#Newborn affected by maternal complications of...,18735


In [99]:
mort1['Percentage of Total Deaths'] = (mort1.Deaths / mort1.Deaths.sum()).round(4) * 100

In [102]:
mort1[mort1['Percentage of Total Deaths'] > 0.1]

Unnamed: 0,Cause of Death,Deaths,Percentage of Total Deaths
36,Certain conditions originating in the perinata...,223549,27.89
5,"#Congenital malformations, deformations and ch...",76004,9.48
46,Disorders related to length of gestation and f...,67646,8.44
9,#Disorders related to short gestation and low ...,66542,8.3
81,"Symptoms, signs and abnormal clinical and labo...",50739,6.33
50,Extremely low birth weight or extreme immaturi...,48942,6.11
60,Newborn affected by maternal factors and by co...,40109,5.0
24,#Sudden infant death syndrome (R95),31642,3.95
49,"External causes of mortality (*U01,V01-Y84)",21996,2.74
19,#Newborn affected by maternal complications of...,18735,2.34


In [92]:
import plotly.graph_objects as go

fig = go.Figure(go.Bar(
            x=mort1['Deaths'].head(10),
            y=mort1['Cause of Death'].head(10),
            orientation='h'))

fig.show()

In [109]:
mortality.head()

Unnamed: 0,State,Age_of_Mother,Birthplace,Birthplace_Code,Cause of Death,Cause of Death Code,Age_of_Infant_at_Death,Age_of_Infant_at_Death_Code,Deaths,Births,Death_Rate
0,Alabama,15-19,In Hospital,1,Certain infectious and parasitic diseases (A00...,GR130-001,Under 1 year,5,10,28316,0.35
1,Alabama,15-19,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,Under 1 hour,1,23,28316,0.81
2,Alabama,15-19,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,Under 1 day,2,59,28316,2.08
3,Alabama,15-19,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,Under 1 week,3,37,28316,1.31
4,Alabama,15-19,In Hospital,1,Certain conditions originating in the perinata...,GR130-070,Under 1 month,4,24,28316,0.85


In [108]:
np.unique(mortality.Age_of_Infant_at_Death)

array(['Under 1 day', 'Under 1 hour', 'Under 1 month', 'Under 1 week',
       'Under 1 year'], dtype=object)

In [125]:
mort2 = mortality.groupby(['Age_of_Mother', 'Age_of_Infant_at_Death']).agg({'Deaths' : 'sum'})
mort2 = mort2.reset_index()
mort2

Index(['Under 1 day', 'Under 1 hour', 'Under 1 month', 'Under 1 week',
       'Under 1 year', 'Under 1 day', 'Under 1 hour', 'Under 1 month',
       'Under 1 week', 'Under 1 year', 'Under 1 day', 'Under 1 hour',
       'Under 1 month', 'Under 1 week', 'Under 1 year', 'Under 1 day',
       'Under 1 hour', 'Under 1 month', 'Under 1 week', 'Under 1 year',
       'Under 1 day', 'Under 1 hour', 'Under 1 month', 'Under 1 week',
       'Under 1 year', 'Under 1 day', 'Under 1 hour', 'Under 1 month',
       'Under 1 week', 'Under 1 year', 'Under 1 day'],
      dtype='object', name='Age_of_Infant_at_Death')

In [147]:
mort2.iloc[:30]

Unnamed: 0,Age_of_Mother,Age_of_Infant_at_Death,Deaths
0,15-19,Under 1 day,32867
1,15-19,Under 1 hour,16282
2,15-19,Under 1 month,6855
3,15-19,Under 1 week,6607
4,15-19,Under 1 year,32398
5,20-24,Under 1 day,76883
6,20-24,Under 1 hour,42102
7,20-24,Under 1 month,19410
8,20-24,Under 1 week,18562
9,20-24,Under 1 year,84591


In [150]:
mort3 = pd.DataFrame({"Age_of_Mother":['45-49', '45-49', '45-49', '45-49', '45-49'], 
                    "Age_of_Infant_at_Death":['Under 1 hour', 'Under 1 day', 'Under 1 week', 'Under 1 month', 'Under 1 year'],  
                    "Deaths":[0, 23, 0, 0, 0]}) 
mort3

Unnamed: 0,Age_of_Mother,Age_of_Infant_at_Death,Deaths
0,45-49,Under 1 hour,0
1,45-49,Under 1 day,23
2,45-49,Under 1 week,0
3,45-49,Under 1 month,0
4,45-49,Under 1 year,0


In [151]:
mort4 = mort2.iloc[:30].append(mort3, ignore_index = True)
mort4['Death_Percentage'] = mort4.Deaths

In [174]:
mort4.loc[0:4].Death_Percentage

0    32867
1    16282
2     6855
3     6607
4    32398
Name: Death_Percentage, dtype: int64

In [176]:
def deathperc(mort4):
    for i in mort4.index:
        if i in [0,1,2,3,4]:
            mort4.loc[i].Death_Percentage = mort4.loc[i].Death_Percentage/mort4.loc[0:4].Death_Percentage.sum()
        elif i in [5,6,7,8,9]:
            mort4.loc[i].Death_Percentage = mort4.loc[i].Death_Percentage/mort4.loc[5:9].Death_Percentage.sum()
        elif i in [10,11,12,13,14]:
            mort4.loc[i].Death_Percentage = mort4.loc[i].Death_Percentage/mort4.loc[10:14].Death_Percentage.sum()
        elif i in [15,16,17,18,19]:
            mort4.loc[i].Death_Percentage = mort4.loc[i].Death_Percentage/mort4.loc[15:19].Death_Percentage.sum()
        elif i in [20,21,22,23,24]:
            mort4.loc[i].Death_Percentage = mort4.loc[i].Death_Percentage/mort4.loc[20:24].Death_Percentage.sum()
        elif i in [25,26,27,28,29]:
            mort4.loc[i].Death_Percentage = mort4.loc[i].Death_Percentage/mort4.loc[25:29].Death_Percentage.sum()
        elif i in [30,31,32,33,34]:
            mort4.loc[i].Death_Percentage = mort4.loc[i].Death_Percentage/mort4.loc[30:34].Death_Percentage.sum()
        else:
            pass
        
    return mort4

In [188]:
mort4[mort4.Age_of_Mother == '15-19'][['Death_Percentage']] = mort4[mort4.Age_of_Mother == '15-19'].Death_Percentage/mort4.loc[0:4].Death_Percentage.sum()

In [189]:
mort4

Unnamed: 0,Age_of_Mother,Age_of_Infant_at_Death,Deaths,Death_Percentage
0,15-19,Under 1 day,32867,32867
1,15-19,Under 1 hour,16282,16282
2,15-19,Under 1 month,6855,6855
3,15-19,Under 1 week,6607,6607
4,15-19,Under 1 year,32398,32398
5,20-24,Under 1 day,76883,76883
6,20-24,Under 1 hour,42102,42102
7,20-24,Under 1 month,19410,19410
8,20-24,Under 1 week,18562,18562
9,20-24,Under 1 year,84591,84591


In [186]:
mort4.dtypes

Age_of_Mother             object
Age_of_Infant_at_Death    object
Deaths                     int64
Death_Percentage           int64
dtype: object

In [178]:
mort5

Unnamed: 0,Age_of_Mother,Age_of_Infant_at_Death,Deaths,Death_Percentage
0,15-19,Under 1 day,32867,32867
1,15-19,Under 1 hour,16282,16282
2,15-19,Under 1 month,6855,6855
3,15-19,Under 1 week,6607,6607
4,15-19,Under 1 year,32398,32398
5,20-24,Under 1 day,76883,76883
6,20-24,Under 1 hour,42102,42102
7,20-24,Under 1 month,19410,19410
8,20-24,Under 1 week,18562,18562
9,20-24,Under 1 year,84591,84591


In [None]:
def deathPercent(death_df):
    for num in death_df[death_df.Age_of_Mother == '15-19'].Deaths:
        num = num / death_df[death_df.Age_of_Mother == '15-19'].Deaths.sum()
    if (death_df.Age_of_Mother == '15-19'):
        death_df = [deathdf.Age_of_Mother == ]
    elif (age in ['15 years', '15-19 years', '16 years', '17 years', '18 years', '19 years']):
        age = '15-19'
    elif (age in ['20 years', '20-24 years', '21 years', '22 years', '23 years', '24 years']):
        age = '20-24'
    elif (age in ['25 years', '25-29 years', '26 years', '27 years', '28 years', '29 years']):
        age = '25-29'
    elif (age in ['30 years', '30-34 years', '31 years', '32 years', '33 years', '24 years']):
        age = '30-34'
    elif (age in ['35 years', '35-39 years', '36 years', '37 years', '38 years', '39 years']):
        age = '35-39'
    elif (age in ['40 years', '40-44 years', '41 years', '42 years', '43 years', '44 years']):
        age = '40-44'
    elif (age in ['45 years', '45-49 years', '46 years', '47 years', '48 years', '49 yeras']):
        age = '45-49'
    else:
        age = '50 and over'
        
    return age # series value

In [161]:
def deathPercent(df):
    if df.Age_of_Mother == '15-19':
        df.Death_Percentage = df.Death_Percentage/df.Death_Percentage.sum()

In [107]:
import plotly.graph_objects as go

top_labels = np.unique(mortality.Age_of_Infant_at_Death)

colors = ['rgba(38, 24, 74, 0.8)', 'rgba(71, 58, 131, 0.8)',
          'rgba(122, 120, 168, 0.8)', 'rgba(164, 163, 204, 0.85)',
          'rgba(190, 192, 213, 1)']

x_data = [[21, 30, 21, 16, 12],
          [24, 31, 19, 15, 11],
          [27, 26, 23, 11, 13],
          [29, 24, 15, 18, 14]]

# y_data = ['The course was effectively<br>organized',
#           'The course developed my<br>abilities and skills ' +
#           'for<br>the subject', 'The course developed ' +
#           'my<br>ability to think critically about<br>the subject',
#           'I would recommend this<br>course to a friend']

# fig = go.Figure()

# for i in range(0, len(x_data[0])):
#     for xd, yd in zip(x_data, y_data):
#         fig.add_trace(go.Bar(
#             x=[xd[i]], y=[yd],
#             orientation='h',
#             marker=dict(
#                 color=colors[i],
#                 line=dict(color='rgb(248, 248, 249)', width=1)
#             )
#         ))

# fig.update_layout(
#     xaxis=dict(
#         showgrid=False,
#         showline=False,
#         showticklabels=False,
#         zeroline=False,
#         domain=[0.15, 1]
#     ),
#     yaxis=dict(
#         showgrid=False,
#         showline=False,
#         showticklabels=False,
#         zeroline=False,
#     ),
#     barmode='stack',
#     paper_bgcolor='rgb(248, 248, 255)',
#     plot_bgcolor='rgb(248, 248, 255)',
#     margin=dict(l=120, r=10, t=140, b=80),
#     showlegend=False,
# )

# annotations = []

# for yd, xd in zip(y_data, x_data):
#     # labeling the y-axis
#     annotations.append(dict(xref='paper', yref='y',
#                             x=0.14, y=yd,
#                             xanchor='right',
#                             text=str(yd),
#                             font=dict(family='Arial', size=14,
#                                       color='rgb(67, 67, 67)'),
#                             showarrow=False, align='right'))
#     # labeling the first percentage of each bar (x_axis)
#     annotations.append(dict(xref='x', yref='y',
#                             x=xd[0] / 2, y=yd,
#                             text=str(xd[0]) + '%',
#                             font=dict(family='Arial', size=14,
#                                       color='rgb(248, 248, 255)'),
#                             showarrow=False))
#     # labeling the first Likert scale (on the top)
#     if yd == y_data[-1]:
#         annotations.append(dict(xref='x', yref='paper',
#                                 x=xd[0] / 2, y=1.1,
#                                 text=top_labels[0],
#                                 font=dict(family='Arial', size=14,
#                                           color='rgb(67, 67, 67)'),
#                                 showarrow=False))
#     space = xd[0]
#     for i in range(1, len(xd)):
#             # labeling the rest of percentages for each bar (x_axis)
#             annotations.append(dict(xref='x', yref='y',
#                                     x=space + (xd[i]/2), y=yd,
#                                     text=str(xd[i]) + '%',
#                                     font=dict(family='Arial', size=14,
#                                               color='rgb(248, 248, 255)'),
#                                     showarrow=False))
#             # labeling the Likert scale
#             if yd == y_data[-1]:
#                 annotations.append(dict(xref='x', yref='paper',
#                                         x=space + (xd[i]/2), y=1.1,
#                                         text=top_labels[i],
#                                         font=dict(family='Arial', size=14,
#                                                   color='rgb(67, 67, 67)'),
#                                         showarrow=False))
#             space += xd[i]

# fig.update_layout(annotations=annotations)

# fig.show()

array(['Under 1 day', 'Under 1 hour', 'Under 1 month', 'Under 1 week',
       'Under 1 year'], dtype=object)

In [33]:
mort16 = pd.read_csv('../../data/mortality_quart_2016.csv')

In [32]:
mort16.head().T

NameError: name 'mort16' is not defined

In [6]:
mort16.columns

Index(['Unnamed: 0', 'birth_year', 'birth_month', 'birth_time',
       'birth_day_of_wk', 'birth_place', 'mothers_age_imputed', 'mothers_age',
       'mothers_nativity', 'mothers_residence_status',
       ...
       'limb_reduc_defect', 'cleft_lip_or_palate', 'cleft_palate_only',
       'down_syndr', 'suspect_chromo_disorder', 'hypospadias',
       'no_cong_anamolies_checked', 'infant_transferred',
       'infant_living_at_report', 'infant_breastfed_at_discharge'],
      dtype='object', length=109)

In [None]:
nat