# Parser Script

In [39]:
import pandas as pd
import numpy as np
import sklearn

In [12]:
def nat2018Parser( line ):
    import pandas as pd
    ret_dict = dict(
        birth_year = line[8:12],
        birth_month = line[12:14],
        birth_time = line[18:22],
        birth_day_of_wk = line[22:23],
        birth_place = line[31:32],
        mothers_age_imputed = line[72:73],
        mothers_age = line[74:76],
        mothers_nativity = line[83:84],
        mothers_residence_status = line[103:104],
        mothers_race = line[104:106],
        mothers_race_imputed = line[110:111],
        mothers_hispanic_origin = line[111:112],
        mothers_hispanic_origin2 = line[116:117],
        paternity_acknow = line[118:119],
        mothers_marital_status = line[119:120],
        mothers_maristat_imputed = line[120:121],
        mothers_education = line[123:124],
        fathers_age = line[146:148],
        fathers_race = line[150:152],
        fathers_hispanic_origin = line[158:159],
        fathers_hispanic_origin2 = line[161:162],
        fathers_education = line[162:163],
        prior_living_births = line[170:172],
        prior_dead_births = line[172:174],
        prior_terminations = line[174:176],
        mo_since_last_live_birth = line[197:200],
        mo_since_last_other_birth = line[205:208],
        mo_prenatal_care_began = line[223:225],
        n_prenatal_visits = line[237:239],
        wic = line[250:250],
        cigs_tri1 = line[254:256],
        cigs_tri2 = line[256:258],
        cigs_tri3 = line[258:260],
        mothers_height = line[279:281],
        mothers_bmi = line[282:286],
        pre_preg_lbs = line[291:294],
        delivery_lbs = line[298:301],
        pre_preg_diab = line[312:313],
        gest_diab = line[313:314],
        pre_preg_hypten = line[314:315],
        gest_hypten = line[315:316],
        hypten_ecl = line[316:317],
        prev_preterm_birth = line[317:318],
        infertility_treatment = line[324:325],
        fertil_enhance = line[325:326],
        asst_repro_tech = line[326:327],
        n_prev_cesar = line[331:333],
        no_risk_reported = line[336:337],
        gonorrhea = line[342:343],
        syphilis = line[343:344],
        chlamydia = line[344:345],
        hepB = line[345:346],
        hepC = line[346:347],
        no_infection_reported = line[352:353],
        success_ext_cep = line[359:360],
        fail_ext_cep = line[360:361],
        induced_labor = line[382:383],
        aug_labor = line[383:384],
        steriods = line[384:385],
        antibiotics = line[385:386],
        chorioamnionitis = line[386:387],
        anesthesia = line[387:388],
        fetal_present_at_birth = line[400:401],
        final_delivery_method = line[401:402],
        trial_of_labor_attempt = line[402:403],
        maternal_transfusion = line[414:415],
        perineal_laceration = line[415:416],
        rupt_uterus = line[416:417],
        unplanned_hyster = line[417:418],
        admit_to_IC = line[418:419],
        attendant_at_birth = line[432:433],
        mother_transferred = line[433:434],
        delivery_payment_source = line[434:435],
        APGAR_score_5min = line[443:445],
        APGAR_score_10min = line[447:449],
        plurality = line[453:454],
        pluarlity_imputed = line[455:456],
        sex_of_infant = line[474:475],
        sex_of_infant_imputed = line[475:476],
        last_norm_menses_mo = line[476:478],
        last_norm_menses_yr = line[480:484],
        combined_gestation_imputed = line[487:488],
        obst_est_of_gestation_used = line[488:489],
        combined_gestation_wk = line[489:491],
        obst_est_edit_wk = line[498:500],
        birth_weight_gm = line[503:507],
        assist_vent_immed = line[516:517],
        assist_vent_after6 = line[517:518],
        admit_NICU = line[518:519],
        surfactant = line[519:520],
        antibiotics_for_newborn = line[520:521],
        seizures = line[521:522],
        anencephaly = line[536:537],
        meningo_spina_bif = line[537:538],
        cyn_cong_heart_disease = line[538:539],
        cong_diaph_hernia = line[539:540],
        omphalocele = line[540:541],
        gastroschisis = line[541:542],
        limb_reduc_defect = line[548:549],
        cleft_lip_or_palate = line[549:550],
        cleft_palate_only = line[550:551],
        down_syndr = line[551:552],
        suspect_chromo_disorder = line[552:553],
        hypospadias = line[553:554],
        no_cong_anamolies_checked = line[560:561],
        infant_transferred = line[566:567],
        infant_living_at_report = line[567:568],
        infant_breastfed_at_discharge = line[568:569]
    )
    return pd.Series( ret_dict )

def __single_df( idx_line ):
    import pandas as pd
    idx = idx_line[0]
    line = idx_line[1]
    return pd.DataFrame( nat2018Parser( line ), index = [idx] )

def createNat2018DF( lines ):
    import pandas as pd
    import multiprocessing as mp
    pool = mp.Pool( mp.cpu_count() - 1 )
    ret_df =  pd.concat(  pool.map( __single_df, enumerate(lines) ), axis = 0 )
    pool.close()
    return ret_df

def divvyNat2018OverCSV( nat18fwf_fp,
                         nfiles = 10,
                         output_dir = None,
                         output_fprefix = None ):
    from os.path import dirname, abspath, basename, join
    import numpy as np
    import pandas as pd
    if output_dir is None:
        output_dir = abspath( dirname( nat18fwf_fp ) )
    if output_fprefix is None:
        output_fprefix = basename( nat18fwf_fp )
    csv_files = pd.Series( [ join(output_dir, output_fprefix ) + ('_%d.csv' % i) for i in range(1, nfiles+1) ] )
    # open connections to all the output file (erasing any previous file)
    fps_conn = csv_files.apply( lambda x: open(x, 'w') )
    # write the column names
    idx_order = nat2018Parser(' '*600).index
    colnames = ','.join( idx_order )
    fps_conn.apply( lambda conn: conn.write( colnames + '\n' ) )
    # close and reopen to append
    fps_conn.apply( lambda conn: conn.close() )
    fps_conn = csv_files.apply( lambda x: open(x, 'a') )
    # write the lines
    with open( nat18fwf_fp, 'r' ) as fin:
        idx = 0
        line = fin.readline()
        while line:
            conn = fps_conn[ idx ]
            conn.write( ','.join( nat2018Parser(line)[idx_order] ) + '\n' )
            line = fin.readline()
            idx = (idx + 1)%nfiles
    # close the connections
    fps_conn.apply( lambda conn: conn.close() )

In [15]:
nat18fwf_fp = '../../data/Nat2018PublicUS.c20190509.r20190717.txt'

# divvyNat2018OverCSV(nat18fwf_fp)
print(nat18fwf_fp)

divvyNat2018OverCSV(nat18fwf_fp)

../../data/Nat2018PublicUS.c20190509.r20190717.txt


# Load Natality Data

In [85]:
natality = pd.read_csv('../../data/Natality AGE Data set_jason.csv')

In [86]:
natality

Unnamed: 0,Notes,State,State Code,Age of Mother,Age of Mother Code,Year,Year Code,Gender,Gender Code,Infant Birth Weight,Infant Birth Weight Code,Births
0,,Alabama,1.0,Under 15 years,15,1999.0,1999.0,Female,F,2000 - 2499 grams,5.0,12.0
1,,Alabama,1.0,Under 15 years,15,1999.0,1999.0,Female,F,2500 - 2999 grams,6.0,30.0
2,,Alabama,1.0,Under 15 years,15,1999.0,1999.0,Female,F,3000 - 3499 grams,7.0,39.0
3,,Alabama,1.0,Under 15 years,15,1999.0,1999.0,Female,F,3500 - 3999 grams,8.0,13.0
4,,Alabama,1.0,Under 15 years,15,1999.0,1999.0,Male,M,2500 - 2999 grams,6.0,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...
314520,,Wyoming,56.0,39 years,39,2018.0,2018.0,Male,M,3500 - 3999 grams,8.0,10.0
314521,,Wyoming,56.0,40 years,40,2018.0,2018.0,Female,F,2500 - 2999 grams,6.0,13.0
314522,,Wyoming,56.0,40 years,40,2018.0,2018.0,Female,F,3000 - 3499 grams,7.0,11.0
314523,,Wyoming,56.0,41 years,41,2018.0,2018.0,Female,F,3000 - 3499 grams,7.0,15.0


In [87]:
natality = natality.drop(['Notes', 'Infant Birth Weight Code', 'Year Code'], axis=1)

In [88]:
natality.dtypes

State                   object
State Code             float64
Age of Mother           object
Age of Mother Code      object
Year                   float64
Gender                  object
Gender Code             object
Infant Birth Weight     object
Births                 float64
dtype: object

# Take a Sample of the Large Dataset

In [89]:
natsample = natality.sample(int(314525/10))

In [91]:
natsample = natsample.loc[natsample['State'].notna()]
natsample.isnull().values.any()

False

In [92]:
natsample

Unnamed: 0,State,State Code,Age of Mother,Age of Mother Code,Year,Gender,Gender Code,Infant Birth Weight,Births
210308,District of Columbia,11.0,39 years,39,2013.0,Male,M,2500 - 2999 grams,16.0
72219,North Carolina,37.0,29 years,29,2005.0,Female,F,4000 - 4499 grams,196.0
968,Arizona,4.0,25-29 years,25-29,1999.0,Male,M,500 - 999 grams,45.0
121428,Indiana,18.0,41 years,41,2008.0,Female,F,4000 - 4499 grams,15.0
283717,Idaho,16.0,35 years,35,2017.0,Male,M,3500 - 3999 grams,133.0
...,...,...,...,...,...,...,...,...,...
66851,Louisiana,22.0,39 years,39,2005.0,Male,M,3000 - 3499 grams,71.0
263405,Colorado,8.0,28 years,28,2016.0,Female,F,4000 - 4499 grams,75.0
246442,Florida,12.0,20 years,20,2015.0,Female,F,1500 - 1999 grams,60.0
124885,Mississippi,28.0,17 years,17,2008.0,Female,F,3500 - 3999 grams,78.0


In [93]:
np.unique(natsample.State)

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming'], dtype=object)

In [134]:
data_nat1 = data_nat1.reset_index()

Unnamed: 0_level_0,Infant Birth Weight,Births,Births
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean
0,1000 - 1499 grams,47776.0,26.900901
1,1500 - 1999 grams,119999.0,42.00175
2,2000 - 2499 grams,394066.0,96.325104
3,2500 - 2999 grams,1428344.0,294.9905
4,3000 - 3499 grams,3017695.0,594.619704
5,3500 - 3999 grams,2200910.0,449.256991
6,4000 - 4499 grams,553531.0,135.007561
7,4500 - 4999 grams,72936.0,38.837061
8,499 grams or less,4074.0,15.914062
9,500 - 999 grams,32852.0,24.516418


In [145]:
import plotly.express as px

data_nat1 = natsample.groupby('Infant Birth Weight').agg({'Births': ['sum', 'mean']})
data_nat1 = data_nat1.reset_index()
fig = px.box(data_nat1, x="Infant Birth Weight", y="data_nat1[('Births', 'sum')]")
fig.show()
# data_nat1.Births['sum']

ValueError: Value of 'y' is not the name of a column in 'data_frame'. Expected one of [('Infant Birth Weight', ''), ('Births', 'sum'), ('Births', 'mean')] but received: data_nat1[('Births', 'sum')]

In [None]:
fig = px.histogram(natsample, x="Infant Birth Weight", y="tip", color="sex", marginal="rug",
                   hover_data=df.columns)
fig.show()

In [151]:
type(hist_data)

pandas.core.series.Series

In [150]:
import plotly.figure_factory as ff
import numpy as np

hist_data = natsample['Births']
group_labels = [''] # name of the dataset

fig = ff.create_distplot(hist_data, group_labels)
fig.show()

KeyError: 0

In [146]:
data_nat1[('Births', 'sum')]

0       47776.0
1      119999.0
2      394066.0
3     1428344.0
4     3017695.0
5     2200910.0
6      553531.0
7       72936.0
8        4074.0
9       32852.0
10       3239.0
11       3472.0
Name: (Births, sum), dtype: float64

In [123]:
data_nat1[('Births', 'sum')]

0       47776.0
1      119999.0
2      394066.0
3     1428344.0
4     3017695.0
5     2200910.0
6      553531.0
7       72936.0
8        4074.0
9       32852.0
10       3239.0
11       3472.0
Name: (Births, sum), dtype: float64

In [103]:
data_nat = natsample.query("State == 'New York'").sort_values(by=['Infant Birth Weight'])
fig = px.bar(data_nat, x='Infant Birth Weight', y='Births')
fig.show()

In [100]:
data_nat

Unnamed: 0,State,State Code,Age of Mother,Age of Mother Code,Year,Gender,Gender Code,Infant Birth Weight,Births
41265,Alabama,1.0,16 years,16,2004.0,Male,M,4000 - 4499 grams,14.0
33,Alabama,1.0,15-19 years,15-19,1999.0,Female,F,4000 - 4499 grams,172.0
243362,Alabama,1.0,17 years,17,2015.0,Male,M,4000 - 4499 grams,13.0
60201,Alabama,1.0,42 years,42,2005.0,Male,M,4000 - 4499 grams,10.0
225700,Alabama,1.0,37 years,37,2014.0,Male,M,3000 - 3499 grams,156.0
...,...,...,...,...,...,...,...,...,...
261340,Alabama,1.0,18 years,18,2016.0,Female,F,1500 - 1999 grams,18.0
94,Alabama,1.0,15-19 years,15-19,2002.0,Male,M,1000 - 1499 grams,41.0
134433,Alabama,1.0,21 years,21,2009.0,Male,M,2000 - 2499 grams,125.0
115887,Alabama,1.0,23 years,23,2008.0,Male,M,1000 - 1499 grams,18.0
