## This notebook trains different models to predict newborn age

In [1]:
from datetime import datetime

import pickle
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split

from pandas_profiling import ProfileReport

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import RandomizedSearchCV

import tensorflow as tf
from childbirth_common_util import *


In [2]:
print(datetime.now())

2023-02-14 19:29:24.508435


In [3]:
# laod CSV data

X_train_from_file = pd.read_csv("data_files/x_train_orig.csv")
y_train_from_file = pd.read_csv("data_files/y_train_orig.csv")
y_train_weight = y_train_from_file['birth_weight_in_g'].squeeze()
y_train_age = y_train_from_file['combined_gestation_week'].squeeze()

X_val_from_file = pd.read_csv("data_files/x_val_orig.csv")
y_val_from_file = pd.read_csv("data_files/y_val_orig.csv")
y_val_weight = y_val_from_file['birth_weight_in_g'].squeeze()
y_val_age = y_val_from_file['combined_gestation_week'].squeeze()

X_test_from_file = pd.read_csv("data_files/x_test_orig.csv")
y_test_from_file = pd.read_csv("data_files/y_test_orig.csv")
y_test_weight = y_test_from_file['birth_weight_in_g'].squeeze()
y_test_age = y_test_from_file['combined_gestation_week'].squeeze()

print(f"Train feature shape: {X_train_from_file.shape}, output shape: {y_train_age.shape}")
print(f"Val feature shape: {X_val_from_file.shape}, output shape: {y_val_age.shape}")
print(f"Test feature shape: {X_test_from_file.shape}, output shape: {y_test_age.shape}")



Train feature shape: (116615, 81), output shape: (116615,)
Val feature shape: (38872, 81), output shape: (38872,)
Test feature shape: (38872, 81), output shape: (38872,)


In [4]:
# TODO: marital_status has NA and its type becomes float
# X_train_from_file.dtypes.to_csv("DEBUG_dtypes.csv")

In [5]:
# X_train_correct_type = util_change_column_type(X_train_from_file)
# X_train_correct_type.dtypes.to_csv("DEBUG_dtypes.csv")

In [6]:
util_calc_baseline(y_train_age, "gestation week")

the gestation week's mean in training is 38.50819362860695
gestation week: rmse=2.50879856327732


In [7]:
# list of regressor model: 
# https://scikit-learn.org/stable/supervised_learning.htmlFor 

# Train Age Models


In [8]:
# load feature list from file
column_list = np.loadtxt(f'models/feature_list_age.txt', dtype="object")

print(column_list)
print(column_list.shape)


['birth_month' 'mother_age' 'mother_nativity' 'residence_status'
 'mother_race1' 'mother_hispanic_race' 'paternity_acknowledged'
 'marital_status' 'mother_education' 'father_age'
 'prior_births_now_living' 'total_birth_order'
 'interval_since_last_live_birth' 'month_prenatal_care_began'
 'number_of_prenatal_visits' 'wic' 'cigarettes_3rd_trimester'
 'mother_height_in_total_inches' 'bmi' 'prepregnancy_weight'
 'weight_gain_group' 'gestational_diabetes' 'prepregnancy_hypertension'
 'gestational_hypertension' 'previous_preterm_birth'
 'infertility_treatment_used' 'fertility_enhancing_drugs'
 'previous_cesarean' 'number_of_previous_cesareans'
 'no_risk_factors_reported' 'chlamydia' 'attendant_at_birth' 'pluarality'
 'sex_of_infant' 'last_normal_menses_month' 'combined_gestation_week'
 'birth_weight_in_g' 'infant_breastfed_at_discharge']
(38,)


In [9]:
# calculate and saved the scaler for each feature
X_train_scaled = util_handle_na_and_type(X_train_from_file[column_list].copy(), "age")
X_train_scaled = util_calc_save_scaler(X_train_scaled, "age")

# sanity check. mean = 0, std dev = 1
display(X_train_scaled.describe())

       birth_month  mother_age mother_race1  father_age  bmi
0                1        25.0            1        32.0  3.0
1               12        24.0            1        30.0  3.0
2               10        26.0           10        25.0  2.0
3               10        28.0            1        28.0  9.0
4                9        35.0            4        41.0  3.0
...            ...         ...          ...         ...  ...
116610          12        39.0            4        39.0  4.0
116611           8        31.0            4        36.0  2.0
116612           1        19.0            2        21.0  2.0
116613          12        25.0            1        25.0  2.0
116614          12        22.0            1        23.0  2.0

[116615 rows x 5 columns]


Unnamed: 0,birth_month,mother_age,mother_nativity,residence_status,mother_race1,mother_hispanic_race,paternity_acknowledged,marital_status,mother_education,father_age,...,fertility_enhancing_drugs,previous_cesarean,number_of_previous_cesareans,no_risk_factors_reported,chlamydia,attendant_at_birth,pluarality,sex_of_infant,last_normal_menses_month,infant_breastfed_at_discharge
count,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,...,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0
mean,3.302746e-16,3.087832e-16,-2.144728e-15,-5.438403e-16,1.258963e-15,7.297302e-16,-1.627991e-17,1.585223e-15,6.178748e-18,-1.14276e-15,...,7.050747000000001e-17,-2.038079e-15,-3.101718e-15,8.206749e-16,-4.179366e-16,1.414786e-15,-1.840255e-15,-1.196499e-15,1.077311e-16,-1.516061e-15
std,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,...,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004
min,-1.620642,-2.837408,-0.5190669,-0.6576646,-17.29659,-0.7687025,-3.734086,-0.7439477,-1.932785,-2.855741,...,-7.876871,-3.179457,-7.066151,-3.468656,-0.146831,-1.688013,-11.47718,-1.021016,-1.553354,-2.163216
25%,-0.7578119,-0.761894,-0.5190669,-0.6576646,-0.2303981,-0.7687025,-0.273186,-0.7439477,-0.8362043,-0.6354396,...,0.08431215,-0.4193374,-0.3675159,-1.374388,-0.146831,-0.4740207,-0.1705951,-1.021016,-1.00721,-0.7986342
50%,0.1050185,0.1029036,-0.5190669,-0.6576646,0.4016831,-0.7687025,-0.273186,-0.7439477,-0.2879141,0.03036734,...,0.08431215,-0.4193374,-0.3675159,0.7198793,-0.146831,-0.4740207,-0.1705951,0.9794169,0.08507691,0.5659479
75%,0.9678488,0.7947417,-0.5190669,1.198648,0.4016831,1.56911,1.457264,1.34418,0.8086663,0.4747111,...,0.08431215,-0.4193374,-0.3675159,0.7198793,-0.146831,-0.4740207,-0.1705951,0.9794169,0.9042922,0.5659479
max,1.543069,3.562094,4.285933,4.911274,0.4016831,1.958745,1.457264,1.34418,2.453537,8.880137,...,4.064904,2.340782,8.005778,0.7198793,7.150293,4.381949,5.482699,0.9794169,1.723508,0.5659479


In [10]:
# scaled the data to mean=0 and std dev=1
X_val_scaled = util_scale(X_val_from_file[column_list], 'age')
print(X_val_scaled.shape)

# sanity check. mean = 0, std dev = 1
display(X_val_scaled.describe())

      birth_month  mother_age mother_race1  father_age  bmi
0              12        22.0            2   32.006733  3.0
1               3        30.0            4   33.000000  3.0
2               8        20.0            2   20.000000  9.0
3              11        33.0            1   37.000000  2.0
4              10        37.0            1   44.000000  2.0
...           ...         ...          ...         ...  ...
38867           2        34.0            1   36.000000  3.0
38868           1        27.0            2   25.000000  3.0
38869          10        38.0            1   32.000000  2.0
38870          10        39.0            1   39.000000  4.0
38871           6        24.0            1   24.000000  2.0

[38872 rows x 5 columns]
(38872, 36)


Unnamed: 0,birth_month,mother_age,mother_nativity,residence_status,mother_race1,mother_hispanic_race,paternity_acknowledged,marital_status,mother_education,father_age,...,fertility_enhancing_drugs,previous_cesarean,number_of_previous_cesareans,no_risk_factors_reported,chlamydia,attendant_at_birth,pluarality,sex_of_infant,last_normal_menses_month,infant_breastfed_at_discharge
count,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,...,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0
mean,-0.004574,-0.005014,0.007577,0.001061,0.003543,0.003021,0.001615,0.001337,-0.011301,-0.003536,...,-0.007645,-0.008288,-0.005365,0.002413,-0.011014,-0.006844,0.018469,0.003491,0.002548,0.011578
std,0.999667,1.001992,1.007157,0.999944,0.976724,1.000519,1.001144,1.000413,0.999371,1.00725,...,1.025373,0.994746,0.999923,1.0008,0.962506,0.986896,1.043563,0.999934,1.001116,0.990842
min,-1.620642,-2.837408,-0.519067,-0.657665,-16.66451,-0.768702,-3.734086,-0.743948,-1.932785,-2.855741,...,-7.876871,-3.179457,-5.391492,-3.468656,-0.146831,-1.688013,-11.477184,-1.021016,-1.553354,-2.163216
25%,-0.757812,-0.761894,-0.519067,-0.657665,-0.230398,-0.768702,-0.273186,-0.743948,-0.836204,-0.63544,...,0.084312,-0.419337,-0.367516,-1.374388,-0.146831,-0.474021,-0.170595,-1.021016,-1.00721,-0.798634
50%,0.105018,0.102904,-0.519067,-0.657665,0.401683,-0.768702,-0.273186,-0.743948,-0.287914,0.0,...,0.084312,-0.419337,-0.367516,0.719879,-0.146831,-0.474021,-0.170595,0.979417,0.085077,0.565948
75%,0.967849,0.621782,-0.519067,1.198648,0.401683,1.56911,1.457264,1.34418,0.808666,0.474711,...,0.084312,-0.419337,-0.367516,0.719879,-0.146831,-0.474021,-0.170595,0.979417,0.904292,0.565948
max,1.543069,3.562094,4.285933,4.911274,0.401683,1.958745,1.457264,1.34418,2.453537,6.34265,...,4.064904,2.340782,8.005778,0.719879,7.150293,4.381949,5.482699,0.979417,1.723508,0.565948


## Train individual model and evaluate its accuracy

In [11]:
# train individual models and use validation dataset to compare the results
# It helps determine the proportion of each model for ensemble modeling
# Please refer to childbirth_model_parameter_tuning.ipynb for parameters tuning code
def train_and_evaluate_several_models_for_age_prediction(predict_output_type, X_train, y_train, X_val, y_val):
    # linear regression
    model = LinearRegression() # rmse=2.17(200K)
    util_train_and_evaluate("linear", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")
 
    # Gradient Boosting Regressor, rmse=1.67(200K)
    # tuned on 2/8/2023
    params = {"n_estimators": 500, "max_depth": 4, "min_samples_split": 5, "learning_rate": 0.05, "loss": "squared_error"}
    model = GradientBoostingRegressor(**params)
    util_train_and_evaluate("gb", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")

    # SGDRegressor, rmse=2.17(200K), tuned on 2/7/2023
    params = {'penalty': 'l2', 'loss': 'squared_error', 'learning_rate': 'adaptive', 'eta0': 100, 'alpha': 0.001}
    model = SGDRegressor(**params)
    util_train_and_evaluate("sgd", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")

    # LGBMRegressor, rmse=2.17(200K)
    #model = LGBMRegressor()
    util_train_and_evaluate("lgbm", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")

    # XGBRegressor, rmse=2.17(200K)
    #model = XGBRegressor()
    util_train_and_evaluate("xgb", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")

    # RandomForestRegressor, rmse=2.08(200K)
    # it takes 6-10 minutes, tuned on 2/7/2023
    params = {'n_estimators': 800, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 10}
    model = RandomForestRegressor(**params)
    util_train_and_evaluate("rf", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")

    # KNN is very bad. Don't use
    #model = KNeighborsRegressor(n_neighbors=20)
    #train_and_evaluate("KNN", model, X_train_from_file, y_train, X_val_from_file, y_val)
    #print("")

    # SVR is slow and bad, Dont Use.Take 30 minutes to run
    #params = {'kernel': 'linear', 'gamma': 1e-07, 'epsilon': 0.1, 'degree': 2, 'coef0': 1, 'C': 100, 'max_iter': 150000}}
    #model = SVR(**params)
    #train_and_evaluate("svr", predict_output_type, model, X_train, y_train, X_val, y_val)
    #print("")
    


In [12]:
# it takes over 30 minutes to train and evaluate the above models
train_and_evaluate_several_models_for_age_prediction("age", X_train_scaled, y_train_age, X_val_scaled, y_val_age)


Start training model linear for age at 2023-02-14 19:29:52.173381
Saving linear to file: models/model_linear_age.sav
End time = 2023-02-14 19:29:52.734365, elapsed time = 0.5609846115112305
linear for age: rmse=2.213753071043989

Start training model gb for age at 2023-02-14 19:29:52.734365
Saving gb to file: models/model_gb_age.sav
End time = 2023-02-14 19:32:45.638847, elapsed time = 172.90448117256165
gb for age: rmse=1.5588196480837289

Start training model sgd for age at 2023-02-14 19:32:45.639846
Saving sgd to file: models/model_sgd_age.sav
End time = 2023-02-14 19:32:48.630954, elapsed time = 2.9911081790924072
sgd for age: rmse=2.213754719164824

Start training model lgbm for age at 2023-02-14 19:32:48.630954
Saving lgbm to file: models/model_lgbm_age.sav
End time = 2023-02-14 19:32:51.724262, elapsed time = 3.0933074951171875
lgbm for age: rmse=2.213726876943967

Start training model xgb for age at 2023-02-14 19:32:51.724262
Saving xgb to file: models/model_xgb_age.sav
End tim

In [13]:
# Neural network, rmse=2.416. It takes 5 minutes to run
def nn_train_and_evaluate_for_age_prediction(predict_output_type, X_train, y_train, X_val, y_val):
    model_name="nn"
    start_time = time.time()
    print(f"Start training model {model_name} for {predict_output_type} at {datetime.now()}")

    tf.keras.backend.clear_session()
    tf.random.set_seed(0)

    nn_model = tf.keras.Sequential()
    nn_model.add(tf.keras.layers.Dense(16))
    nn_model.add(tf.keras.layers.Dense(units=1))

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

    # We specify the MSE loss.
    nn_model.compile(loss='mse', optimizer=optimizer)
    history = nn_model.fit(
      x = X_train,
      y = y_train,
      epochs=100,
      batch_size=32,
      validation_split=0.2,
      verbose=0)

    y_pred = nn_model.predict(X_val)

    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)

    model_filename = f"models/model_{model_name}_{predict_output_type}"
    print(f"Saving {model_name} to directory: {model_filename}")
    
    nn_model.save(model_filename)
    end_time = time.time()
    print(f"End time = {datetime.now()}, elapsed time = {end_time - start_time}")
    
    print(f"{model_name} for {predict_output_type}: rmse={rmse}")
    



In [14]:
nn_train_and_evaluate_for_age_prediction("age", X_train_scaled, y_train_age, X_val_scaled, y_val_age)


Start training model nn for age at 2023-02-14 19:39:56.699932
Saving nn to directory: models/model_nn_age
INFO:tensorflow:Assets written to: models/model_nn_age\assets
End time = 2023-02-14 19:45:21.122896, elapsed time = 324.42296504974365
nn for age: rmse=2.5644553121692675


# Ensemble Models for Gestation Age Prediction

In [15]:
### Ensemble model for age prediction

column_list = util_load_x_columns_list_from_file("age")
models = util_load_models_from_file("age")

def scale_predict_compare_save(X_input_from_file, y, input_type):
    predict_output_type = "age"
    X_scaled = util_scale(X_input_from_file[column_list], predict_output_type)
    print(f"{input_type} feature shape: {X_scaled.shape}, output shape: {y.shape}")
    y_pred = util_ensemble_predict_age(X_scaled, column_list, models)
    print(y_pred)
    print(y_pred.shape)
    mse = mean_squared_error(y, y_pred)
    rmse = np.sqrt(mse)
    print(rmse) 
    np.savetxt(f"data_files/pred_y_{input_type}_{predict_output_type}.csv", y_pred, delimiter=",")

# rmse=1.99
scale_predict_compare_save(X_input_from_file = X_val_from_file, y = y_val_age, input_type = "val")

# rmse=2.00
scale_predict_compare_save(X_input_from_file = X_train_from_file, y = y_train_age, input_type = "train")

# rmse=2.02
scale_predict_compare_save(X_input_from_file = X_test_from_file, y = y_test_age, input_type = "test")


      birth_month  mother_age mother_race1  father_age  bmi
0              12        22.0            2   32.006733  3.0
1               3        30.0            4   33.000000  3.0
2               8        20.0            2   20.000000  9.0
3              11        33.0            1   37.000000  2.0
4              10        37.0            1   44.000000  2.0
...           ...         ...          ...         ...  ...
38867           2        34.0            1   36.000000  3.0
38868           1        27.0            2   25.000000  3.0
38869          10        38.0            1   32.000000  2.0
38870          10        39.0            1   39.000000  4.0
38871           6        24.0            1   24.000000  2.0

[38872 rows x 5 columns]
val feature shape: (38872, 36), output shape: (38872,)
predicting using linear and its proportion is 0.05
predicting using gb and its proportion is 0.25
predicting using sgd and its proportion is 0.05
predicting using lgbm and its proportion is 0.25
pred