# Parser Script

In [39]:
import pandas as pd
import numpy as np
import sklearn

In [12]:
def nat2018Parser( line ):
    import pandas as pd
    ret_dict = dict(
        birth_year = line[8:12],
        birth_month = line[12:14],
        birth_time = line[18:22],
        birth_day_of_wk = line[22:23],
        birth_place = line[31:32],
        mothers_age_imputed = line[72:73],
        mothers_age = line[74:76],
        mothers_nativity = line[83:84],
        mothers_residence_status = line[103:104],
        mothers_race = line[104:106],
        mothers_race_imputed = line[110:111],
        mothers_hispanic_origin = line[111:112],
        mothers_hispanic_origin2 = line[116:117],
        paternity_acknow = line[118:119],
        mothers_marital_status = line[119:120],
        mothers_maristat_imputed = line[120:121],
        mothers_education = line[123:124],
        fathers_age = line[146:148],
        fathers_race = line[150:152],
        fathers_hispanic_origin = line[158:159],
        fathers_hispanic_origin2 = line[161:162],
        fathers_education = line[162:163],
        prior_living_births = line[170:172],
        prior_dead_births = line[172:174],
        prior_terminations = line[174:176],
        mo_since_last_live_birth = line[197:200],
        mo_since_last_other_birth = line[205:208],
        mo_prenatal_care_began = line[223:225],
        n_prenatal_visits = line[237:239],
        wic = line[250:250],
        cigs_tri1 = line[254:256],
        cigs_tri2 = line[256:258],
        cigs_tri3 = line[258:260],
        mothers_height = line[279:281],
        mothers_bmi = line[282:286],
        pre_preg_lbs = line[291:294],
        delivery_lbs = line[298:301],
        pre_preg_diab = line[312:313],
        gest_diab = line[313:314],
        pre_preg_hypten = line[314:315],
        gest_hypten = line[315:316],
        hypten_ecl = line[316:317],
        prev_preterm_birth = line[317:318],
        infertility_treatment = line[324:325],
        fertil_enhance = line[325:326],
        asst_repro_tech = line[326:327],
        n_prev_cesar = line[331:333],
        no_risk_reported = line[336:337],
        gonorrhea = line[342:343],
        syphilis = line[343:344],
        chlamydia = line[344:345],
        hepB = line[345:346],
        hepC = line[346:347],
        no_infection_reported = line[352:353],
        success_ext_cep = line[359:360],
        fail_ext_cep = line[360:361],
        induced_labor = line[382:383],
        aug_labor = line[383:384],
        steriods = line[384:385],
        antibiotics = line[385:386],
        chorioamnionitis = line[386:387],
        anesthesia = line[387:388],
        fetal_present_at_birth = line[400:401],
        final_delivery_method = line[401:402],
        trial_of_labor_attempt = line[402:403],
        maternal_transfusion = line[414:415],
        perineal_laceration = line[415:416],
        rupt_uterus = line[416:417],
        unplanned_hyster = line[417:418],
        admit_to_IC = line[418:419],
        attendant_at_birth = line[432:433],
        mother_transferred = line[433:434],
        delivery_payment_source = line[434:435],
        APGAR_score_5min = line[443:445],
        APGAR_score_10min = line[447:449],
        plurality = line[453:454],
        pluarlity_imputed = line[455:456],
        sex_of_infant = line[474:475],
        sex_of_infant_imputed = line[475:476],
        last_norm_menses_mo = line[476:478],
        last_norm_menses_yr = line[480:484],
        combined_gestation_imputed = line[487:488],
        obst_est_of_gestation_used = line[488:489],
        combined_gestation_wk = line[489:491],
        obst_est_edit_wk = line[498:500],
        birth_weight_gm = line[503:507],
        assist_vent_immed = line[516:517],
        assist_vent_after6 = line[517:518],
        admit_NICU = line[518:519],
        surfactant = line[519:520],
        antibiotics_for_newborn = line[520:521],
        seizures = line[521:522],
        anencephaly = line[536:537],
        meningo_spina_bif = line[537:538],
        cyn_cong_heart_disease = line[538:539],
        cong_diaph_hernia = line[539:540],
        omphalocele = line[540:541],
        gastroschisis = line[541:542],
        limb_reduc_defect = line[548:549],
        cleft_lip_or_palate = line[549:550],
        cleft_palate_only = line[550:551],
        down_syndr = line[551:552],
        suspect_chromo_disorder = line[552:553],
        hypospadias = line[553:554],
        no_cong_anamolies_checked = line[560:561],
        infant_transferred = line[566:567],
        infant_living_at_report = line[567:568],
        infant_breastfed_at_discharge = line[568:569]
    )
    return pd.Series( ret_dict )

def __single_df( idx_line ):
    import pandas as pd
    idx = idx_line[0]
    line = idx_line[1]
    return pd.DataFrame( nat2018Parser( line ), index = [idx] )

def createNat2018DF( lines ):
    import pandas as pd
    import multiprocessing as mp
    pool = mp.Pool( mp.cpu_count() - 1 )
    ret_df =  pd.concat(  pool.map( __single_df, enumerate(lines) ), axis = 0 )
    pool.close()
    return ret_df

def divvyNat2018OverCSV( nat18fwf_fp,
                         nfiles = 10,
                         output_dir = None,
                         output_fprefix = None ):
    from os.path import dirname, abspath, basename, join
    import numpy as np
    import pandas as pd
    if output_dir is None:
        output_dir = abspath( dirname( nat18fwf_fp ) )
    if output_fprefix is None:
        output_fprefix = basename( nat18fwf_fp )
    csv_files = pd.Series( [ join(output_dir, output_fprefix ) + ('_%d.csv' % i) for i in range(1, nfiles+1) ] )
    # open connections to all the output file (erasing any previous file)
    fps_conn = csv_files.apply( lambda x: open(x, 'w') )
    # write the column names
    idx_order = nat2018Parser(' '*600).index
    colnames = ','.join( idx_order )
    fps_conn.apply( lambda conn: conn.write( colnames + '\n' ) )
    # close and reopen to append
    fps_conn.apply( lambda conn: conn.close() )
    fps_conn = csv_files.apply( lambda x: open(x, 'a') )
    # write the lines
    with open( nat18fwf_fp, 'r' ) as fin:
        idx = 0
        line = fin.readline()
        while line:
            conn = fps_conn[ idx ]
            conn.write( ','.join( nat2018Parser(line)[idx_order] ) + '\n' )
            line = fin.readline()
            idx = (idx + 1)%nfiles
    # close the connections
    fps_conn.apply( lambda conn: conn.close() )

In [15]:
nat18fwf_fp = '../../data/Nat2018PublicUS.c20190509.r20190717.txt'

# divvyNat2018OverCSV(nat18fwf_fp)
print(nat18fwf_fp)

divvyNat2018OverCSV(nat18fwf_fp)

../../data/Nat2018PublicUS.c20190509.r20190717.txt


# Load Natality Data

In [194]:
natality = pd.read_csv('../../data/Natality AGE Data set_jason.csv')
natality

In [197]:
def cleanNatality(natality):
    natality = natality.drop(['Notes', 'Infant Birth Weight Code', 'Year Code', 'Gender Code'], axis=1)
    natality = natality.loc[~(natality.isna().all(axis=1))]
    print(natality.loc[natality.isna().any(axis=1)])
    natality['State Code'] = natality['State Code'].astype(int).astype(str)
    natality['Age of Mother'] = natality['Age of Mother'].astype(str)
    natality.Year = natality.Year.astype(int).astype(str)
    natality.Births = natality.Births.astype(int)
    
    return natality

natality = cleanNatality(natality)
natality

Unnamed: 0,State,State Code,Age of Mother,Age of Mother Code,Year,Gender,Infant Birth Weight,Births
0,Alabama,1,Under 15 years,15,1999,Female,2000 - 2499 grams,12
1,Alabama,1,Under 15 years,15,1999,Female,2500 - 2999 grams,30
2,Alabama,1,Under 15 years,15,1999,Female,3000 - 3499 grams,39
3,Alabama,1,Under 15 years,15,1999,Female,3500 - 3999 grams,13
4,Alabama,1,Under 15 years,15,1999,Male,2500 - 2999 grams,27
...,...,...,...,...,...,...,...,...
314520,Wyoming,56,39 years,39,2018,Male,3500 - 3999 grams,10
314521,Wyoming,56,40 years,40,2018,Female,2500 - 2999 grams,13
314522,Wyoming,56,40 years,40,2018,Female,3000 - 3499 grams,11
314523,Wyoming,56,41 years,41,2018,Female,3000 - 3499 grams,15


In [212]:
# check dtypes, check for any other NaN's and output the rows with missingness
print(natality.dtypes)
print('*' * 50)
print(natality.isna().sum())
print('*' * 50)
natality.loc[(natality.isna().any(axis=1))]

State                  object
State Code             object
Age of Mother          object
Age of Mother Code     object
Year                   object
Gender                 object
Infant Birth Weight    object
Births                  int64
dtype: object
**************************************************
State                  0
State Code             0
Age of Mother          0
Age of Mother Code     0
Year                   0
Gender                 0
Infant Birth Weight    0
Births                 0
dtype: int64
**************************************************


Unnamed: 0,State,State Code,Age of Mother,Age of Mother Code,Year,Gender,Infant Birth Weight,Births


# Take a Sample of the Large Dataset

In [225]:
natsample = natality.copy().sample(int(314525/10)) # 10 percent of the data
print(natsample.isnull().values.any()) # check for any remaining NaN's; Should be False
natsample

False


Unnamed: 0,State,State Code,Age of Mother,Age of Mother Code,Year,Gender,Infant Birth Weight,Births
303173,Kentucky,21,34 years,34,2018,Female,2000 - 2499 grams,60
20019,Utah,49,20-24 years,20-24,2001,Male,2500 - 2999 grams,1333
192814,Georgia,13,18 years,18,2012,Female,2000 - 2499 grams,134
79040,Arizona,4,20 years,20,2006,Female,500 - 999 grams,12
97044,Wyoming,56,36 years,36,2006,Female,3500 - 3999 grams,18
...,...,...,...,...,...,...,...,...
206682,West Virginia,54,35 years,35,2012,Male,4000 - 4499 grams,28
124148,Michigan,26,25 years,25,2008,Male,1000 - 1499 grams,25
267270,Kansas,20,32 years,32,2016,Female,1500 - 1999 grams,21
246677,Florida,12,32 years,32,2015,Female,4000 - 4499 grams,323


In [226]:
np.unique(natsample['Age of Mother']) # Youngest is 13, oldest is 50 and over

array(['13 years', '14 years', '15 years', '15-19 years', '16 years',
       '17 years', '18 years', '19 years', '20 years', '20-24 years',
       '21 years', '22 years', '23 years', '24 years', '25 years',
       '25-29 years', '26 years', '27 years', '28 years', '29 years',
       '30 years', '30-34 years', '31 years', '32 years', '33 years',
       '34 years', '35 years', '35-39 years', '36 years', '37 years',
       '38 years', '39 years', '40 years', '40-44 years', '41 years',
       '42 years', '43 years', '44 years', '45 years', '45-49 years',
       '46 years', '47 years', '48 years', '49 years',
       '50 years and over', 'Under 15 years'], dtype=object)

In [251]:
# Categorize mother's ages
def ageGroup(age):
    if (age in ['13 years', '14 years', 'Under 15 years']):
        age = 'Under 15'
    elif (age in ['15 years', '15-19 years', '16 years', '17 years', '18 years', '19 years']):
        age = '15-19'
    elif (age in ['20 years', '20-24 years', '21 years', '22 years', '23 years', '24 years']):
        age = '20-24'
    elif (age in ['25 years', '25-29 years', '26 years', '27 years', '28 years', '29 years']):
        age = '25-29'
    elif (age in ['30 years', '30-34 years', '31 years', '32 years', '33 years', '24 years']):
        age = '30-34'
    elif (age in ['35 years', '35-39 years', '36 years', '37 years', '38 years', '39 years']):
        age = '35-39'
    elif (age in ['40 years', '40-44 years', '41 years', '42 years', '43 years', '44 years']):
        age = '40-44'
    elif (age in ['45 years', '45-49 years', '46 years', '47 years', '48 years', '49 yeras']):
        age = '45-49'
    else:
        age = '50 and over'
        
    return age # series value

natsample2 = natsample.copy()
natsample2['Age of Mother'] = natsample2['Age of Mother'].apply(ageGroup)
natsample2
# natsample2[natsample2['Age of Mother'] == 'Under 15']

Unnamed: 0,State,State Code,Age of Mother,Age of Mother Code,Year,Gender,Infant Birth Weight,Births
303173,Kentucky,21,50 and over,34,2018,Female,2000 - 2499 grams,60
20019,Utah,49,20-24,20-24,2001,Male,2500 - 2999 grams,1333
192814,Georgia,13,15-19,18,2012,Female,2000 - 2499 grams,134
79040,Arizona,4,20-24,20,2006,Female,500 - 999 grams,12
97044,Wyoming,56,35-39,36,2006,Female,3500 - 3999 grams,18
...,...,...,...,...,...,...,...,...
206682,West Virginia,54,35-39,35,2012,Male,4000 - 4499 grams,28
124148,Michigan,26,25-29,25,2008,Male,1000 - 1499 grams,25
267270,Kansas,20,30-34,32,2016,Female,1500 - 1999 grams,21
246677,Florida,12,30-34,32,2015,Female,4000 - 4499 grams,323


In [252]:
natality['Age of Mother'] = natality['Age of Mother'].apply(ageGroup)
natality

Unnamed: 0,State,State Code,Age of Mother,Age of Mother Code,Year,Gender,Infant Birth Weight,Births
0,Alabama,1,Under 15,15,1999,Female,2000 - 2499 grams,12
1,Alabama,1,Under 15,15,1999,Female,2500 - 2999 grams,30
2,Alabama,1,Under 15,15,1999,Female,3000 - 3499 grams,39
3,Alabama,1,Under 15,15,1999,Female,3500 - 3999 grams,13
4,Alabama,1,Under 15,15,1999,Male,2500 - 2999 grams,27
...,...,...,...,...,...,...,...,...
314520,Wyoming,56,35-39,39,2018,Male,3500 - 3999 grams,10
314521,Wyoming,56,40-44,40,2018,Female,2500 - 2999 grams,13
314522,Wyoming,56,40-44,40,2018,Female,3000 - 3499 grams,11
314523,Wyoming,56,40-44,41,2018,Female,3000 - 3499 grams,15


# Plots using Plotly

In [None]:
# rename columns for easier access when plotting
print(natality.columns)
natality.rename(columns = {'State Code':'State_Code', 'Age of Mother':'Age_of_Mother', 'Age of Mother Code':'Age_of_Mother_Code', 'Infant Birth Weight':'Infant_Birth_Weight'}, inplace = True) 

In [300]:
# create groupedby dataframe
data_nat1 = natality.groupby(['State', 'Age_of_Mother', 'Infant_Birth_Weight']).agg({'Births': ['sum', 'mean', 'median', 'max', 'min']})
data_nat1.columns = data_nat1.columns.droplevel(0)
data_nat1 = data_nat1.reset_index()
data_nat1['mean'] = data_nat1['mean'].round()
data_nat1

In [308]:
# example query by state
data_nat = natality.query("State == 'New York'").sort_values(by=['Births'])
fig = px.bar(data_nat, x='Infant_Birth_Weight', y='Births')
fig.show()

TypeError: cannot perform reduce with flexible type

In [287]:
import plotly.express as px

# create dataframe with only moms under 15
natU15 = natality[natality['Age of Mother'] == 'Under 15']
fig = px.bar(natU15, x='Infant Birth Weight', y='Births')
fig.show()

In [294]:
import plotly.figure_factory as ff
import numpy as np

teens = natality[(natality['Age of Mother'] == 'Under 15') & (natality['Age of Mother'] == '15-19')]
hist_data = [teens]
group_labels = ['distplot'] # name of the dataset

fig = ff.create_distplot(hist_data, group_labels)
fig.show()

PlotlyError: Oops, this function was written to handle multiple datasets, if you want to plot just one, make sure your hist_data variable is still a list of lists, i.e. x = [1, 2, 3] -> x = [[1, 2, 3]]