In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [6]:
# Data Extraction

# Extract data from Nat2021us_small_200000.txt and construct a dataframe

col_mapping = []
# 9-12 becomes 8,12
col_mapping.append([9,  12, 'birth_year']) # 2021
col_mapping.append([13, 14, 'birth_month']) # 01=Jan, 12=Dec
col_mapping.append([19, 22, 'time_of_birth']) # HHMM, 9999=Not Staetd
col_mapping.append([23, 23, 'birth_day_of_week']) # 1=Sunday, 7=Saturday
col_mapping.append([32, 32, 'birth_place']) # 1=hospital, 2=freestanding birth center, 3=home(intended), ... 9=Unknown
col_mapping.append([75, 76, 'mother_age']) # 12=10-12years-old, 13=13, ... 50=50 and over
col_mapping.append([84, 84, 'mother_nativity']) # 1=born in the US, 2=born outside the US, 3=Unknown
col_mapping.append([104,104,'residence_status'])
col_mapping.append([105,106,'mother_race1']) # detailed breakdown, 31 values
col_mapping.append([107,107,'mother_race2']) # only 6 groups
col_mapping.append([108,109,'mother_race3']) # 15 groups
col_mapping.append([115,115,'mother_hispanic_origin']) # non-hispanic, mexican, puerto rican, etc
col_mapping.append([117,117,'mother_hispanic_race']) # hispanic, non-hispanic-white, non-hispanic black, etc
col_mapping.append([119,119,'paternity_acknowledged']) # Y/N, U=unknonw, X=N/A
col_mapping.append([120,120,'marital_status']) # 1=Married, 2=Unmarried, 3=Unmarried not living together, 9=Unknown
col_mapping.append([124,124,'mother_education']) # 1=8th grade or less, 2=9th-12th grade, ... 9=Unknown
col_mapping.append([147,148,'father_age']) # 09-98, 99-unknown or not stated
col_mapping.append([151,152,'father_race1']) # detailed breakdown, 31 values
col_mapping.append([153,153,'father_race2']) # only 6 groups
col_mapping.append([154,155,'father_race3']) # 15 groups
col_mapping.append([159,159,'father_hispanic_origin']) # non-hispanic, mexican, puerto rican, etc
col_mapping.append([162,162,'father_hispanic_race']) # hispanic, non-hispanic-white, non-hispanic black, etc
col_mapping.append([163,163,'father_education']) # 1=8th grade or less, 2=9th-12th grade, ... 9=Unknown
col_mapping.append([171,172,'prior_births_now_living']) # 0-30 number of children living from previous births, 99=Unknown
col_mapping.append([173,174,'prior_births_now_dead']) # 0-30 number of children dead from previous births, 99=Unknown
col_mapping.append([182,182,'total_birth_order']) # 1-7 total birth order, 8 or more total births, 9=Unknown
col_mapping.append([201,202,'interval_since_last_live_birth']) # 00(zero-to-3months plural delivery), 01=04-11month, 02=12-17months
col_mapping.append([227,227,'month_prenatal_care_began']) # 1=1st-3rd month, 2=4th-6th month, 3=7th-final, 4=no prenatal, 5=unknown
col_mapping.append([242,243,'number_of_prenatal_visits']) # 00-98, 99=Unknown
col_mapping.append([251,251,'wic']) # Y/N/U, Women Infant Care Program
col_mapping.append([261,261,'cigarettes_before_pregnancy']) # 0=Non-smoker, 1=1-5, 2=6-10, .. 5=41 or more, 6=Unknown
col_mapping.append([262,262,'cigarettes_1st_trimester']) # 0=Non-smoker, 1=1-5, 2=6-10, .. 5=41 or more, 6=Unknown
col_mapping.append([263,263,'cigarettes_2nd_trimester']) # 0=Non-smoker, 1=1-5, 2=6-10, .. 5=41 or more, 6=Unknown
col_mapping.append([264,264,'cigarettes_3rd_trimester']) # 0=Non-smoker, 1=1-5, 2=6-10, .. 5=41 or more, 6=Unknown
col_mapping.append([280,281,'mother_height_in_total_inches']) # 30-78, 99=Unknown
col_mapping.append([287,287,'bmi']) # 1=Underweigth<18.5, 2=Normal18.5-24.9, ... 9=Unknown
col_mapping.append([292,294,'prepregnancy_weight']) # 075-375
col_mapping.append([299,301,'delivery_weight']) # 100-400, 999=Unknown
col_mapping.append([304,305,'weight_gain']) # 00-97, 98=98 and over, 99=Unknown
col_mapping.append([306,306,'weight_gain_group']) # 1=less than 11lbs, 2=11-20, 9=Unknown

col_mapping.append([313,313,'prepregnancy_diabetes']) # Y/N/U
col_mapping.append([314,314,'gestational_diabetes']) # Y/N/U
col_mapping.append([315,315,'prepregnancy_hypertension']) # Y/N/U
col_mapping.append([316,316,'gestational_hypertension']) # Y/N/U
col_mapping.append([317,317,'hypertension_eclampsia']) # Y/N/U
col_mapping.append([318,318,'previous_preterm_birth']) # Y/N/U
col_mapping.append([325,325,'infertility_treatment_used']) # Y/N/U
col_mapping.append([326,326,'fertility_enhancing_drugs']) # Y/N/X/U, X=N/A
col_mapping.append([327,327,'asst_reproductive_technology']) # Y/N/X/U, X=N/A
col_mapping.append([331,331,'previous_cesarean']) # Y/N/U
col_mapping.append([332,333,'number_of_previous_cesareans']) # 00=None, 0-30, 99=Unknowns
col_mapping.append([337,337,'no_risk_factors_reported']) #1=True, 0=False, 9=Not reported

col_mapping.append([343,343,'gonorrhea']) # Y/N/U
col_mapping.append([344,344,'syphilis']) # Y/N/U
col_mapping.append([345,345,'chlamydia']) # Y/N/U
col_mapping.append([346,346,'hepatitis_b']) # Y/N/U
col_mapping.append([347,347,'hepatitis_c']) # Y/N/U

col_mapping.append([360,360,'successful_external_cephalic_version']) # Y/N/U
col_mapping.append([361,361,'failed_external_cephalic_version']) # Y/N/U

col_mapping.append([383,383,'induction_of_labor']) # Y/N/U
col_mapping.append([384,384,'augmentation_of_labor']) # Y/N/U
col_mapping.append([385,385,'steriods']) # Y/N/U
col_mapping.append([386,386,'antibiotics']) # Y/N/U
col_mapping.append([387,387,'chorioamnionitis']) # Y/N/U
col_mapping.append([388,388,'anesthesia']) # Y/N/U
col_mapping.append([395,395,'no_characteristics_of_labor_reported']) # 0=True, 1=False, 9=Not-Reported
col_mapping.append([401,401,'fetal_presentation_at_delivery']) # 1=Cephalic, 2=Breech, 3=Other, 9=Unknown
col_mapping.append([408,408,'delivery_method']) # 1=Vaginal, 2=C-Section, 9=Unknown

col_mapping.append([433,433,'attendant_at_birth']) # 1=MD, 2=OD, 3=CNM, 4=Other Midwife, 5=Other, 9=Unknown

col_mapping.append([454,454,'pluarality']) # 1=single, 2=twin, 3=tuplet, 4=quadruplet or higher

col_mapping.append([475,475,'sex_of_infant']) # M, F
col_mapping.append([477,478,'last_normal_menses_month']) # 01-12, 99=Unknown
col_mapping.append([481,484,'last_normal_menses_year']) # YYYY

col_mapping.append([492,493,'combined_gestation']) # 01=under 20 weeks, 02=20-27, 03=28-31, 04=32-33, ... 99=Unknown

col_mapping.append([501,502,'obstetric_estimate']) # 01=under 20 weeks, 02=20-27, 03=28-31, 04=32-33, 99=Unknown

col_mapping.append([509,510,'birth_weight1']) # 1=0227-0499g, 2=0500-0999g, etc
col_mapping.append([509,510,'birth_weight2']) # 1=0227-1499g, 2=1500-2499g, 3=2500-8165g, 4=Unknown
col_mapping.append([568,568,'infant_living_at_time_of_report']) # Y/N
col_mapping.append([569,569,'infant_breastfed_at_discharge']) # Y/N


col_mapping_df = pd.DataFrame(data=col_mapping, columns=['start', "end", 'field_name'])    
col_mapping_df['start'] = col_mapping_df['start'] - 1 # must minus because read_fwf expects start-1 to end
col_mapping_df['col_specs'] = list(zip(col_mapping_df.start, col_mapping_df.end))

col_specs=col_mapping_df.col_specs.values.tolist()
field_names=col_mapping_df.field_name.values.tolist()
#col_mapping_df

df = pd.read_fwf('Nat2021us/Nat2021us_small_200000.txt', colspecs=col_specs, names=field_names)
print(df.shape)



(194621, 79)


In [7]:
df

Unnamed: 0,birth_year,birth_month,time_of_birth,birth_day_of_week,birth_place,mother_age,mother_nativity,residence_status,mother_race1,mother_race2,...,pluarality,sex_of_infant,last_normal_menses_month,last_normal_menses_year,combined_gestation,obstetric_estimate,birth_weight1,birth_weight2,infant_living_at_time_of_report,infant_breastfed_at_discharge
0,2021,1,503,1,1,20,2,1,5,5,...,1,F,3,2020,7,6,7,7,Y,N
1,2021,1,2359,6,1,24,1,1,3,3,...,1,F,4,2020,6,6,9,9,Y,Y
2,2021,1,619,3,1,27,1,1,1,1,...,1,M,5,2020,5,5,6,6,Y,Y
3,2021,1,1152,6,1,34,1,2,1,1,...,1,F,3,2020,9,9,7,7,Y,Y
4,2021,1,1408,3,1,21,1,1,1,1,...,1,F,3,2020,8,8,8,8,Y,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194616,2021,12,1837,2,1,35,2,3,2,2,...,1,M,3,2021,7,7,8,8,Y,Y
194617,2021,12,649,4,1,19,1,1,1,1,...,1,M,4,2021,6,6,5,5,Y,N
194618,2021,12,353,2,1,35,1,1,2,2,...,1,F,3,2021,9,7,7,7,Y,Y
194619,2021,12,913,3,3,26,1,1,2,2,...,1,M,3,2021,8,8,7,7,Y,Y


In [51]:
#!pip install pandas_profiling

Collecting pandas_profiling
  Downloading pandas_profiling-3.6.3-py2.py3-none-any.whl (328 kB)
Collecting typeguard<2.14,>=2.13.2
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Collecting statsmodels<0.14,>=0.13.2
  Downloading statsmodels-0.13.5-cp38-cp38-win_amd64.whl (9.2 MB)
Collecting pydantic<1.11,>=1.8.1
  Downloading pydantic-1.10.4-cp38-cp38-win_amd64.whl (2.2 MB)
Collecting htmlmin==0.1.12

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.11.0 requires keras<2.12,>=2.11.0, but you have keras 2.8.0 which is incompatible.
tensorflow-intel 2.11.0 requires tensorboard<2.12,>=2.11, but you have tensorboard 2.8.0 which is incompatible.



  Using cached htmlmin-0.1.12-py3-none-any.whl
Collecting multimethod<1.10,>=1.4
  Downloading multimethod-1.9.1-py3-none-any.whl (10 kB)
Collecting phik<0.13,>=0.11.1
  Downloading phik-0.12.3-cp38-cp38-win_amd64.whl (663 kB)
Collecting visions[type_image_path]==0.7.5
  Downloading visions-0.7.5-py3-none-any.whl (102 kB)
Collecting tangled-up-in-unicode>=0.0.4
  Downloading tangled_up_in_unicode-0.2.0-py3-none-any.whl (4.7 MB)
Collecting imagehash
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl (296 kB)
Collecting scipy<1.10,>=1.4.1
  Downloading scipy-1.9.3-cp38-cp38-win_amd64.whl (39.8 MB)
Collecting typing-extensions>=4.2.0
  Downloading typing_extensions-4.4.0-py3-none-any.whl (26 kB)
Collecting packaging>=21.3
  Downloading packaging-23.0-py3-none-any.whl (42 kB)
Collecting patsy>=0.5.2
  Downloading patsy-0.5.3-py2.py3-none-any.whl (233 kB)
Installing collected packages: tangled-up-in-unicode, scipy, multimethod, visions, typing-extensions, patsy, packaging, imagehash, typeg

In [8]:

X = df
y = df['birth_weight1']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2


In [9]:
from pandas_profiling import ProfileReport

# Note that the following EDA profile takes over 2 minutes to generate
#def generate_features_profile():
"""Function to generate html files containing profiles of the
training and test data sets for EDA purposes."""

# Create and save training profile to an html file
train_profile = ProfileReport(
  X_train,
  minimal=True,
  title="Training Data Report",
  html={"style": {"full_width": True}})
train_profile.to_file(output_file="train_profile.html")
#files.download("train_profile.html")

#files.download("combined_profile.html")

#generate_features_profile()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]