## This notebook trains different models to predict newborn age

In [3]:
from datetime import datetime

import pickle
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split

from pandas_profiling import ProfileReport

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import RandomizedSearchCV

import tensorflow as tf
from childbirth_common_util import *


In [4]:
print(datetime.now())

2023-02-09 22:08:36.671478


In [5]:
# laod CSV data

X_train_from_file = pd.read_csv("data_files/x_train_orig.csv")
y_train_from_file = pd.read_csv("data_files/y_train_orig.csv")
y_train_weight = y_train_from_file['birth_weight_in_g'].squeeze()
y_train_age = y_train_from_file['combined_gestation_week'].squeeze()

X_val_from_file = pd.read_csv("data_files/x_val_orig.csv")
y_val_from_file = pd.read_csv("data_files/y_val_orig.csv")
y_val_weight = y_val_from_file['birth_weight_in_g'].squeeze()
y_val_age = y_val_from_file['combined_gestation_week'].squeeze()

X_test_from_file = pd.read_csv("data_files/x_test_orig.csv")
y_test_from_file = pd.read_csv("data_files/y_test_orig.csv")
y_test_weight = y_test_from_file['birth_weight_in_g'].squeeze()
y_test_age = y_test_from_file['combined_gestation_week'].squeeze()

print(f"Train feature shape: {X_train_from_file.shape}, output shape: {y_train_age.shape}")
print(f"Val feature shape: {X_val_from_file.shape}, output shape: {y_val_age.shape}")
print(f"Test feature shape: {X_test_from_file.shape}, output shape: {y_test_age.shape}")


Train feature shape: (116615, 81), output shape: (116615,)
Val feature shape: (38872, 81), output shape: (38872,)
Test feature shape: (38872, 81), output shape: (38872,)


In [6]:
util_calc_baseline(y_train_age, "gestation week")

the gestation week's mean in training is 38.50819362860695
gestation week: rmse=2.50879856327732


In [7]:
# list of regressor model: 
# https://scikit-learn.org/stable/supervised_learning.htmlFor 

# Train Age Models


In [9]:
# calculate and saved the scaler for each feature
X_train_scaled = util_handle_na(X_train_from_file[column_list].copy())
X_train_scaled = util_calc_save_scaler(X_train_scaled, "age")
print(X_train_scaled.shape)

# sanity check. mean = 0, std dev = 1
display(X_train_scaled.describe())

(116615, 36)


Unnamed: 0,birth_month,mother_age,mother_nativity,residence_status,mother_race1,mother_hispanic_race,paternity_acknowledged,marital_status,mother_education,father_age,...,fertility_enhancing_drugs,previous_cesarean,number_of_previous_cesareans,no_risk_factors_reported,chlamydia,attendant_at_birth,pluarality,sex_of_infant,last_normal_menses_month,infant_breastfed_at_discharge
count,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,...,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0
mean,2.782693e-16,-2.17351e-17,2.133346e-15,9.670284e-16,3.008717e-15,1.046718e-15,1.522374e-15,3.427292e-16,-1.510737e-16,1.7330960000000002e-17,...,-2.327075e-15,-5.772322e-17,-1.862832e-15,8.206749e-16,1.715268e-15,-2.026208e-15,-5.123558e-16,1.196499e-15,-3.1782e-16,-1.516061e-15
std,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,...,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004
min,-1.582814,-3.377988,-4.285933,-4.172658,-3.159746,-2.059279,-1.916987,-1.267869,-2.146877,-3.101819,...,-8.407923,-5.042865,-12.82082,-3.468656,-10.93861,-1.439982,-16.10491,-0.9794169,-1.743532,-2.163216
25%,-0.7220785,-0.7527683,0.5190669,-0.9889307,0.3188995,-0.4296012,-0.9933853,-1.267869,-0.7752022,-0.6590888,...,0.1489556,0.4252991,0.3452675,-1.374388,0.1426313,-0.4694775,0.1791575,-0.9794169,-0.9087457,-0.7986342
50%,-0.1482548,0.1660587,0.5190669,0.6029329,0.5121576,0.7926575,0.8538175,0.8946347,0.1392474,0.2864842,...,0.1489556,0.4252991,0.3452675,0.7198793,0.1426313,-0.4694775,0.1791575,-0.9794169,-0.07395907,0.5659479
75%,0.9993927,0.9536247,0.5190669,0.6029329,0.5121576,0.7926575,0.8538175,0.8946347,1.053697,0.8380685,...,0.1489556,0.4252991,0.3452675,0.7198793,0.1426313,-0.4694775,0.1791575,1.021016,0.7608275,0.5659479
max,1.573216,1.478669,0.5190669,0.6029329,2.25148,0.7926575,0.8538175,0.8946347,1.510922,1.704844,...,0.1489556,0.4252991,0.3452675,0.7198793,0.1426313,3.412542,0.1791575,1.021016,1.595614,0.5659479


In [10]:
# scaled the data to mean=0 and std dev=1
X_val_scaled = util_scale(X_val_from_file[column_list], 'age')
print(X_val_scaled.shape)

# sanity check. mean = 0, std dev = 1
display(X_val_scaled.describe())

(38872, 36)


Unnamed: 0,birth_month,mother_age,mother_nativity,residence_status,mother_race1,mother_hispanic_race,paternity_acknowledged,marital_status,mother_education,father_age,...,fertility_enhancing_drugs,previous_cesarean,number_of_previous_cesareans,no_risk_factors_reported,chlamydia,attendant_at_birth,pluarality,sex_of_infant,last_normal_menses_month,infant_breastfed_at_discharge
count,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,...,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0
mean,0.003025,0.000912,-0.007577,-0.00151,0.004045,0.001506,-0.000333,-0.00092,-0.01184,-0.001011,...,-0.004988,0.005255,0.004899,0.002413,0.010644,-0.004248,-0.016057,-0.003491,0.000209,0.011578
std,0.999852,0.998402,1.007157,1.001951,0.994723,0.997047,0.99971,1.000435,1.001036,0.999434,...,1.011328,0.997739,0.99648,1.0008,0.962873,0.9957,1.0359,0.999934,0.998624,0.990842
min,-1.582814,-3.377988,-4.285933,-4.172658,-2.966488,-2.059279,-1.916987,-1.267869,-2.146877,-3.101819,...,-8.407923,-5.042865,-11.357923,-3.468656,-10.938614,-1.439982,-16.104906,-0.979417,-1.743532,-2.163216
25%,-0.722079,-0.752768,0.519067,-0.988931,0.318899,-0.429601,-0.993385,-1.267869,-0.775202,-0.659089,...,0.148956,0.425299,0.345267,-1.374388,0.142631,-0.469477,0.179158,-0.979417,-0.908746,-0.798634
50%,-0.148255,0.166059,0.519067,0.602933,0.512158,0.792658,0.853817,0.894635,0.139247,0.286484,...,0.148956,0.425299,0.345267,0.719879,0.142631,-0.469477,0.179158,-0.979417,-0.073959,0.565948
75%,0.999393,0.953625,0.519067,0.602933,0.512158,0.792658,0.853817,0.894635,1.053697,0.838068,...,0.148956,0.425299,0.345267,0.719879,0.142631,-0.469477,0.179158,1.021016,0.760828,0.565948
max,1.573216,1.478669,0.519067,0.602933,2.25148,0.792658,0.853817,0.894635,1.510922,1.626046,...,0.148956,0.425299,0.345267,0.719879,0.142631,3.412542,0.179158,1.021016,1.595614,0.565948


## Train individual model and evaluate its accuracy

In [11]:
# train individual models and use validation dataset to compare the results
# It helps determine the proportion of each model for ensemble modeling
# Please refer to childbirth_model_parameter_tuning.ipynb for parameters tuning code
def train_and_evaluate_several_models_for_age_prediction(predict_output_type, X_train, y_train, X_val, y_val):
    # linear regression
    model = LinearRegression() # rmse=2.17(200K)
    util_train_and_evaluate("linear", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")
 
    # Gradient Boosting Regressor, rmse=1.67(200K)
    # tuned on 2/8/2023
    params = {"n_estimators": 500, "max_depth": 4, "min_samples_split": 5, "learning_rate": 0.05, "loss": "squared_error"}
    model = GradientBoostingRegressor(**params)
    util_train_and_evaluate("gb", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")

    # SGDRegressor, rmse=2.17(200K), tuned on 2/7/2023
    params = {'penalty': 'l2', 'loss': 'squared_error', 'learning_rate': 'adaptive', 'eta0': 100, 'alpha': 0.001}
    model = SGDRegressor(**params)
    util_train_and_evaluate("sgd", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")

    # LGBMRegressor, rmse=2.17(200K)
    #model = LGBMRegressor()
    util_train_and_evaluate("lgbm", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")

    # XGBRegressor, rmse=2.17(200K)
    #model = XGBRegressor()
    util_train_and_evaluate("xgb", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")

    # RandomForestRegressor, rmse=2.08(200K)
    # it takes 10+ minutes, tuned on 2/7/2023
    params = {'n_estimators': 800, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 10}
    model = RandomForestRegressor(**params)
    util_train_and_evaluate("rf", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")

    # KNN is very bad. Don't use
    #model = KNeighborsRegressor(n_neighbors=20)
    #train_and_evaluate("KNN", model, X_train_from_file, y_train, X_val_from_file, y_val)
    #print("")

    # SVR is slow and bad, Dont Use.Take 30 minutes to run
    #params = {'kernel': 'linear', 'gamma': 1e-07, 'epsilon': 0.1, 'degree': 2, 'coef0': 1, 'C': 100, 'max_iter': 150000}}
    #model = SVR(**params)
    #train_and_evaluate("svr", predict_output_type, model, X_train, y_train, X_val, y_val)
    #print("")
    


In [12]:
# it takes over 30 minutes to traing and evaluate the above models
train_and_evaluate_several_models_for_age_prediction("age", X_train_scaled, y_train_age, X_val_scaled, y_val_age)


Start training model linear for age at 2023-02-09 22:08:48.053142
Saving linear to file: models/model_linear_age.sav
End time = 2023-02-09 22:08:48.329565, elapsed time = 0.27642321586608887
linear for age: rmse=2.174567488420935

Start training model gb for age at 2023-02-09 22:08:48.332202
Saving gb to file: models/model_gb_age.sav
End time = 2023-02-09 22:11:08.380307, elapsed time = 140.04810500144958
gb for age: rmse=1.6773869551818055

Start training model sgd for age at 2023-02-09 22:11:08.381274
Saving sgd to file: models/model_sgd_age.sav
End time = 2023-02-09 22:11:11.042373, elapsed time = 2.661099672317505
sgd for age: rmse=2.1745470324030793

Start training model lgbm for age at 2023-02-09 22:11:11.042373
Saving lgbm to file: models/model_lgbm_age.sav
End time = 2023-02-09 22:11:13.884751, elapsed time = 2.8423776626586914
lgbm for age: rmse=2.1745523573239187

Start training model xgb for age at 2023-02-09 22:11:13.884751
Saving xgb to file: models/model_xgb_age.sav
End t

In [13]:
# Neural network, rmse=2.416
def nn_train_and_evaluate_for_age_prediction(predict_output_type, X_train, y_train, X_val, y_val):
    model_name="nn"
    start_time = time.time()
    print(f"Start training model {model_name} for {predict_output_type} at {datetime.now()}")

    tf.keras.backend.clear_session()
    tf.random.set_seed(0)

    nn_model = tf.keras.Sequential()
    nn_model.add(tf.keras.layers.Dense(16))
    nn_model.add(tf.keras.layers.Dense(units=1))

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

    # We specify the MSE loss.
    nn_model.compile(loss='mse', optimizer=optimizer)
    history = nn_model.fit(
      x = X_train,
      y = y_train,
      epochs=100,
      batch_size=32,
      validation_split=0.2,
      verbose=0)

    y_pred = nn_model.predict(X_val)

    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)

    model_filename = f"models/model_{model_name}_{predict_output_type}"
    print(f"Saving {model_name} to directory: {model_filename}")
    
    nn_model.save(model_filename)
    end_time = time.time()
    print(f"End time = {datetime.now()}, elapsed time = {end_time - start_time}")
    
    print(f"{model_name} for {predict_output_type}: rmse={rmse}")
    



Start training model nn for age at 2023-02-09 22:17:56.310535
Saving nn to directory: models/model_nn_age
INFO:tensorflow:Assets written to: models/model_nn_age\assets
End time = 2023-02-09 22:23:28.866046, elapsed time = 332.5555112361908
nn for age: rmse=2.416773472051171


In [None]:
nn_train_and_evaluate_for_age_prediction("age", X_train_scaled, y_train_age, X_val_scaled, y_val_age)


# Ensemble Models for Gestation Age Prediction

In [None]:
### Ensemble model for age prediction

column_list = util_load_x_columns_list_from_file("age")
models = util_load_models_from_file("age")

def scale_predict_compare_save(X_input_from_file, y, input_type):
    predict_output_type = "age"
    X_scaled = util_scale(X_input_from_file[column_list], predict_output_type)
    print(f"{input_type} feature shape: {X_scaled.shape}, output shape: {y.shape}")
    y_pred = util_ensemble_predict_age(X_scaled, column_list, models)
    print(y_pred)
    print(y_pred.shape)
    mse = mean_squared_error(y, y_pred)
    rmse = np.sqrt(mse)
    print(rmse) 
    np.savetxt(f"data_files/pred_y_{input_type}_{predict_output_type}.csv", y_pred, delimiter=",")

# rmse=1.99
scale_predict_compare_save(X_input_from_file = X_val_from_file, y = y_val_age, input_type = "val")

# rmse=2.00
scale_predict_compare_save(X_input_from_file = X_train_from_file, y = y_train_age, input_type = "train")

# rmse=2.02
scale_predict_compare_save(X_input_from_file = X_test_from_file, y = y_test_age, input_type = "test")
