In [4]:
from sklearn import metrics, linear_model
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from statsmodels.graphics.gofplots import qqplot
from sklearn import datasets
from scipy import stats
from scipy.stats import shapiro
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
ols = LinearRegression()

plt.style.use('ggplot')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [5]:
nat_2016 = pd.read_csv('nat2016.csv')

In [111]:
col_mapping_dict = {c[0]:c[1] for c in enumerate(nat_2016)}
col_mapping_dict

{0: 'Unnamed: 0',
 1: 'birth_year',
 2: 'birth_month',
 3: 'birth_time',
 4: 'birth_day_of_wk',
 5: 'birth_place',
 6: 'mothers_age_imputed',
 7: 'mothers_age',
 8: 'mothers_nativity',
 9: 'mothers_residence_status',
 10: 'mothers_race',
 11: 'mothers_race_imputed',
 12: 'mothers_hispanic_origin',
 13: 'mothers_hispanic_origin2',
 14: 'paternity_acknow',
 15: 'mothers_marital_status',
 16: 'mothers_maristat_imputed',
 17: 'mothers_education',
 18: 'fathers_age',
 19: 'fathers_race',
 20: 'fathers_hispanic_origin',
 21: 'fathers_hispanic_origin2',
 22: 'fathers_education',
 23: 'prior_living_births',
 24: 'prior_dead_births',
 25: 'prior_terminations',
 26: 'mo_since_last_live_birth',
 27: 'mo_since_last_other_birth',
 28: 'mo_prenatal_care_began',
 29: 'n_prenatal_visits',
 30: 'wic',
 31: 'cigs_tri1',
 32: 'cigs_tri2',
 33: 'cigs_tri3',
 34: 'mothers_height',
 35: 'mothers_bmi',
 36: 'pre_preg_lbs',
 37: 'delivery_lbs',
 38: 'pre_preg_diab',
 39: 'gest_diab',
 40: 'pre_preg_hypten',
 

# EDA

In [9]:
# APGAR Score by Mother's Age
nat_2016.groupby(nat_2016.APGAR_score_10min)[['mothers_age']].median()

Unnamed: 0_level_0,mothers_age
APGAR_score_10min,Unnamed: 1_level_1
0,28.0
1,29.0
2,30.0
3,28.5
4,29.0
5,28.0
6,28.0
7,29.0
8,29.0
9,28.5


In [75]:
# Higher the eduation better the APGAR Score
nat_2016.groupby(nat_2016.mothers_education)[['APGAR_score_10min']].mean()

Unnamed: 0_level_0,APGAR_score_10min
mothers_education,Unnamed: 1_level_1
1,87.291668
2,87.082088
3,87.100036
4,87.158277
5,87.289042
6,87.415308
7,87.436825
8,87.379128
9,87.175628


In [76]:
# Education level 9 (unknown) is higher than most of the 
nat_2016.groupby(nat_2016.mothers_education)[['APGAR_score_5min']].mean()

Unnamed: 0_level_0,APGAR_score_5min
mothers_education,Unnamed: 1_level_1
1,9.210168
2,9.204148
3,9.225831
4,9.183674
5,9.195208
6,9.2423
7,9.257865
8,9.196796
9,9.823385


In [77]:
# Living at report
nat_2016[nat_2016['infant_living_at_report'] == 'Y'].groupby(nat_2016.mothers_race).count()[['Unnamed: 0']]

Unnamed: 0_level_0,Unnamed: 0
mothers_race,Unnamed: 1_level_1
1,724231
2,118733
3,11061
4,97605
5,3692
6,8300
7,737
8,803
9,184
10,5765


In [78]:
# Not living at report
nat_2016[nat_2016['infant_living_at_report'] == 'N'].groupby(nat_2016.mothers_race).count()[['Unnamed: 0']]

Unnamed: 0_level_0,Unnamed: 0
mothers_race,Unnamed: 1_level_1
1,1436
2,509
3,30
4,180
5,9
6,22
7,1
8,2
9,2
10,6


In [79]:
# Living at report rate 
nat_2016[nat_2016['infant_living_at_report'] == 'N'].groupby(nat_2016.mothers_race).sum()[['Unnamed: 0']] / nat_2016[nat_2016['infant_living_at_report'] == 'Y'].groupby(nat_2016.mothers_race).sum()[['Unnamed: 0']]

Unnamed: 0_level_0,Unnamed: 0
mothers_race,Unnamed: 1_level_1
1,0.002069
2,0.004423
3,0.003063
4,0.001896
5,0.00133
6,0.002589
7,0.002282
8,0.001283
9,0.005886
10,0.001074


In [100]:
# Not living at report
nat_2016[nat_2016['combined_gestation_wk'].groupby(nat_2016.mothers_race).count()[['Unnamed: 0']]

SyntaxError: unexpected EOF while parsing (<ipython-input-100-26d8342cad17>, line 2)

In [101]:
# Birth Weight
nat_2016.birth_weight_gm.groupby(nat_2016.mothers_age).mean()

mothers_age
12    2989.947368
13    3124.309859
14    3084.997696
15    3150.704775
16    3147.961558
17    3156.882581
18    3168.118353
19    3179.783545
20    3205.869973
21    3215.039893
22    3229.735579
23    3237.246220
24    3256.786110
25    3263.759714
26    3278.268826
27    3280.492826
28    3293.985558
29    3295.949882
30    3298.936298
31    3301.695417
32    3300.890113
33    3299.857732
34    3299.145770
35    3296.212642
36    3284.896680
37    3273.835568
38    3277.382987
39    3272.175527
40    3256.782629
41    3240.869713
42    3226.219520
43    3206.442019
44    3170.720331
45    3082.330671
46    3126.689394
47    3034.788889
48    3011.445344
49    2829.500000
50    2939.021212
Name: birth_weight_gm, dtype: float64

In [78]:
nat_2016.combined_gestation_wk.groupby(nat_2016.mothers_age).value_counts()

mothers_age  combined_gestation_wk
12           38                          4
             39                          4
             40                          3
             35                          2
             37                          2
             30                          1
             34                          1
             42                          1
             46                          1
13           39                         20
             40                         13
             38                          9
             37                          6
             34                          4
             36                          4
             41                          4
             42                          3
             32                          2
             33                          2
             27                          1
             30                          1
             35                          1
             43    

In [36]:
nat_2016.groupby(nat_2016.combined_gestation_wk)[['mothers_age','mothers_education','mothers_race']].agg(lambda x: x.value_counts().index[0])

Unnamed: 0_level_0,mothers_age,mothers_education,mothers_race
combined_gestation_wk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
17,22,3,2
18,31,3,1
19,36,3,1
20,31,3,1
21,26,3,1
22,26,3,1
23,32,3,1
24,30,3,1
25,33,3,1
26,29,3,1


In [37]:
nat_2016.mothers_age.value_counts()

31    60788
30    60422
29    59160
28    57954
32    57477
27    55618
33    55079
26    53570
34    50175
25    50007
24    46220
35    44107
23    42068
22    38677
36    37079
21    33565
37    29532
20    29240
38    23429
19    22716
39    18020
18    13747
40    13125
41     9149
17     7486
42     5881
16     3876
43     3665
44     2174
15     1487
45     1252
46      660
14      434
47      360
50      330
48      247
49      162
13       71
12       19
Name: mothers_age, dtype: int64

In [None]:
nat_2016.mothers_race

In [96]:
nat_2016.combined_gestation_wk.groupby(nat_2016.mothers_age).value_counts()

mothers_age  combined_gestation_wk
12           38                          4
             39                          4
             40                          3
             35                          2
             37                          2
             30                          1
             34                          1
             42                          1
             46                          1
13           39                         20
             40                         13
             38                          9
             37                          6
             34                          4
             36                          4
             41                          4
             42                          3
             32                          2
             33                          2
             27                          1
             30                          1
             35                          1
             43    

In [115]:
nat_2016.groupby('wic')[["Mother's Education Code", "Father's Education Code", "Ten Minute APGAR Score"]].agg(lambda x: x.value_counts().index[0])

KeyError: 'Columns not found: \'Ten Minute APGAR Score\', "Father\'s Education Code", "Mother\'s Education Code"'

In [125]:
# Median birth weight by sex of infant
nat_2016.groupby(nat_2016.sex_of_infant)[['birth_weight_gm']].median()

Unnamed: 0_level_0,birth_weight_gm
sex_of_infant,Unnamed: 1_level_1
F,3246
M,3365


NameError: name 'value_counts' is not defined

In [11]:
nat_2016.groupby(nat_2016.sex_of_infant)[['birth_weight_gm']].agg(lambda x: x.value_counts().index[0])

Unnamed: 0_level_0,birth_weight_gm
sex_of_infant,Unnamed: 1_level_1
F,3260
M,3430


In [42]:
nat_2016.groupby(nat_2016.admit_NICU)[['mothers_age']].agg(lambda x: x.value_counts().index[0])

Unnamed: 0_level_0,mothers_age
admit_NICU,Unnamed: 1_level_1
N,31
U,28
Y,31


In [45]:
nat_2016.groupby(nat_2016.admit_NICU)[['mothers_race']].median()

Unnamed: 0_level_0,mothers_race
admit_NICU,Unnamed: 1_level_1
N,1
U,1
Y,1


In [57]:
nat_2016[nat_2016['admit_NICU'] == 'Y'].groupby(nat_2016.mothers_race).count()[['admit_NICU']]

Unnamed: 0_level_0,admit_NICU
mothers_race,Unnamed: 1_level_1
1,55997
2,13484
3,947
4,6797
5,333
6,775
7,91
8,74
9,20
10,561


In [60]:
nat_2016[nat_2016['admit_NICU'] == 'N'].groupby(nat_2016.mothers_race).count()[['admit_NICU']]

Unnamed: 0_level_0,admit_NICU
mothers_race,Unnamed: 1_level_1
1,672425
2,105945
3,10174
4,91416
5,3392
6,7585
7,654
8,733
9,166
10,5252


In [75]:
nat_2016.groupby(nat_2016.mothers_race)[['admit_NICU']].count()

Unnamed: 0_level_0,admit_NICU
mothers_race,Unnamed: 1_level_1
1,728807
2,119543
3,11138
4,98231
5,3725
6,8365
7,745
8,807
9,186
10,5814


In [76]:
nat_2016.groupby(nat_2016.mothers_education)[['admit_NICU']].count()

Unnamed: 0_level_0,admit_NICU
mothers_education,Unnamed: 1_level_1
1,35129
2,100721
3,252180
4,199391
5,74920
6,187136
7,83823
8,26342
9,29386


In [55]:
nat_2016.groupby(nat_2016.mothers_race)[nat_2016['admit_NICU'] == 'Y'].count()

KeyError: 'Columns not found: False, True'

In [49]:
nat_2016.admit_NICU.value_counts()

N    908426
Y     80056
U       546
Name: admit_NICU, dtype: int64

In [51]:
908426 + 80056 + 546

989028

In [129]:
nat_2016.admit_NICU.value_counts()

N    908426
Y     80056
U       546
Name: admit_NICU, dtype: int64

In [133]:
# Impute values into NICU
def nicu_ordinal( x ):
    if 'Y' in x:
        return 2
    elif 'N' in x:
        return 1
    else:
        return 0

In [132]:
nat_2016.admit_NICU.map(nicu_ordinal).value_counts()

1    908426
2     80056
0       546
Name: admit_NICU, dtype: int64

In [8]:
# total counts of babies admitted to NICU
nat_2016.groupby(['mothers_race', 'mothers_education', 'admit_NICU']).agg({'birth_year':'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,birth_year
mothers_race,mothers_education,admit_NICU,Unnamed: 3_level_1
1,1,N,28026
1,1,U,8
1,1,Y,2270
1,2,N,71564
1,2,U,39
1,2,Y,6402
1,3,N,171854
1,3,U,127
1,3,Y,14325
1,4,N,135364


In [112]:
nat_2016.groupby(nat_2016.sex_of_infant)[['birth_weight_gm','assist_vent_immed','assist_vent_after6','admit_NICU','antibiotics_for_newborn']].agg(lambda x: x.value_counts().index[0])

Unnamed: 0_level_0,birth_weight_gm,assist_vent_immed,assist_vent_after6,admit_NICU,antibiotics_for_newborn
sex_of_infant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,3260,N,N,N,N
M,3430,N,N,N,N


In [105]:
# change to boolean
nat_2016.assist_vent_immed

0         N
1         N
2         N
3         N
4         N
5         N
6         N
7         Y
8         N
9         N
10        N
11        N
12        N
13        N
14        N
15        N
16        N
17        N
18        Y
19        N
20        N
21        N
22        N
23        N
24        N
25        N
26        N
27        N
28        N
29        N
30        N
31        N
32        N
33        N
34        N
35        N
36        N
37        N
38        N
39        N
40        N
41        N
42        N
43        Y
44        Y
45        N
46        N
47        N
48        N
49        N
50        N
51        N
52        N
53        N
54        N
55        N
56        N
57        N
58        N
59        N
60        N
61        N
62        N
63        N
64        N
65        N
66        N
67        N
68        N
69        N
70        N
71        N
72        N
73        N
74        N
75        N
76        N
77        N
78        N
79        N
80        N
81        N
82        N
83  

In [106]:
# change to boolean
nat_2016.assist_vent_after6

0         N
1         N
2         N
3         N
4         N
5         N
6         N
7         Y
8         N
9         N
10        N
11        N
12        N
13        N
14        N
15        N
16        N
17        N
18        Y
19        N
20        N
21        N
22        N
23        N
24        N
25        N
26        N
27        N
28        N
29        N
30        N
31        N
32        N
33        N
34        N
35        N
36        N
37        N
38        N
39        N
40        N
41        N
42        N
43        Y
44        N
45        N
46        N
47        N
48        N
49        N
50        N
51        N
52        N
53        N
54        N
55        N
56        N
57        N
58        N
59        N
60        N
61        N
62        N
63        N
64        N
65        N
66        N
67        N
68        N
69        N
70        N
71        N
72        N
73        N
74        N
75        N
76        N
77        N
78        N
79        N
80        N
81        N
82        N
83  

In [107]:
# change to boolean
nat_2016.admit_NICU

0         N
1         N
2         N
3         N
4         N
5         N
6         N
7         Y
8         N
9         N
10        N
11        N
12        N
13        N
14        N
15        N
16        N
17        N
18        Y
19        N
20        N
21        N
22        N
23        N
24        N
25        N
26        N
27        Y
28        N
29        N
30        Y
31        N
32        N
33        N
34        N
35        N
36        N
37        Y
38        N
39        N
40        N
41        N
42        N
43        Y
44        N
45        N
46        N
47        N
48        N
49        N
50        N
51        N
52        N
53        N
54        N
55        N
56        N
57        N
58        N
59        N
60        N
61        N
62        N
63        N
64        N
65        N
66        N
67        N
68        N
69        N
70        N
71        N
72        N
73        N
74        N
75        N
76        Y
77        N
78        Y
79        N
80        N
81        N
82        N
83  

In [108]:
nat_2016.antibiotics_for_newborn

0         N
1         N
2         N
3         N
4         N
5         N
6         N
7         Y
8         N
9         N
10        N
11        N
12        N
13        N
14        N
15        N
16        N
17        N
18        Y
19        N
20        N
21        N
22        N
23        N
24        N
25        N
26        N
27        N
28        N
29        N
30        N
31        N
32        N
33        N
34        N
35        N
36        N
37        N
38        N
39        N
40        N
41        N
42        N
43        N
44        N
45        N
46        N
47        N
48        N
49        N
50        N
51        N
52        N
53        N
54        N
55        N
56        N
57        N
58        N
59        N
60        N
61        N
62        N
63        N
64        N
65        N
66        N
67        N
68        N
69        N
70        N
71        N
72        N
73        N
74        N
75        N
76        Y
77        N
78        N
79        N
80        N
81        N
82        N
83  

In [40]:
nat_2016.groupby(["no_infection_reported"])[["assist_vent_immed", "syphilis", "chlamydia", "hepB", "hepC"]]

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x0000020621471EF0>

In [42]:
nat_2016['pre_preg_diab'].groupby(nat_2016.mothers_education).value_counts()

KeyError: ('gonorrhea', 'syphilis', 'chlamydia', 'hepB', 'hepC')

In [82]:
# function to impute numbers into infection status
def no_infection(x):
    if 'Y' in x:
        return 1
    elif 'N' in x:
        return 2
    else:
        return 3

In [83]:
nat_2016.gonorrhea.map(no_infection).value_counts()

2    984500
3      2827
1      1701
Name: gonorrhea, dtype: int64

In [48]:
nat_2016.syphilis.map(no_infection).value_counts()    

2    985578
3      2827
1       623
Name: syphilis, dtype: int64

In [49]:
nat_2016.chlamydia.map(no_infection).value_counts()    

2    973659
1     12542
3      2827
Name: chlamydia, dtype: int64

In [50]:
nat_2016.hepB.map(no_infection).value_counts()    

2    984109
3      2827
1      2092
Name: hepB, dtype: int64

In [51]:
nat_2016.hepC.map(no_infection).value_counts()    

2    983949
3      2827
1      2252
Name: hepC, dtype: int64

In [84]:
# Impute numerical values into infection columns
infections = ["gonorrhea", "syphilis", "chlamydia", "hepB", "hepC"]

In [85]:
for infection in infections:
    nat_2016[infection] = nat_2016[infection].map(no_infection)

In [70]:
nat_2016.hepC

0         2
1         2
2         2
3         2
4         2
5         2
6         2
7         2
8         2
9         2
10        2
11        2
12        2
13        2
14        2
15        2
16        2
17        2
18        2
19        1
20        2
21        2
22        2
23        1
24        2
25        2
26        2
27        2
28        2
29        2
30        2
31        2
32        2
33        2
34        2
35        2
36        2
37        2
38        2
39        2
40        2
41        2
42        2
43        1
44        2
45        2
46        2
47        2
48        2
49        2
50        2
51        2
52        2
53        2
54        2
55        2
56        2
57        2
58        2
59        2
60        2
61        2
62        2
63        2
64        2
65        2
66        2
67        2
68        2
69        2
70        2
71        2
72        2
73        2
74        2
75        2
76        2
77        2
78        2
79        2
80        2
81        2
82        2
83  

In [63]:
nat_2016['gonorrhea'].map(no_infection)

0         2
1         2
2         2
3         2
4         2
5         2
6         2
7         2
8         2
9         2
10        2
11        2
12        2
13        2
14        2
15        2
16        2
17        2
18        2
19        2
20        2
21        2
22        2
23        2
24        2
25        2
26        2
27        2
28        2
29        2
30        2
31        2
32        2
33        2
34        2
35        2
36        2
37        2
38        2
39        2
40        2
41        2
42        2
43        2
44        2
45        2
46        2
47        2
48        2
49        2
50        2
51        2
52        2
53        2
54        2
55        2
56        2
57        2
58        2
59        2
60        2
61        2
62        2
63        2
64        2
65        2
66        2
67        2
68        2
69        2
70        2
71        2
72        2
73        2
74        2
75        2
76        2
77        2
78        2
79        2
80        2
81        2
82        2
83  

In [87]:
nat_2016.gonorrhea.value_counts()

2    984500
3      2827
1      1701
Name: gonorrhea, dtype: int64