## This notebook trains different models to predict newborn weight

In [1]:
from datetime import datetime

import pickle
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split

from pandas_profiling import ProfileReport

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import RandomizedSearchCV

import tensorflow as tf
from childbirth_common_util import *


In [2]:
# Print the wall-clock time at which this notebook run started.
print(datetime.now())

2023-02-14 18:51:30.498922


In [3]:
# Load the pre-split train / validation / test CSV files.
# Every y_*_orig.csv file holds both targets; each one is pulled out of the
# frame as its own object via squeeze(): 'birth_weight_in_g' and
# 'combined_gestation_week'.

X_train_from_file = pd.read_csv("data_files/x_train_orig.csv")
y_train_from_file = pd.read_csv("data_files/y_train_orig.csv")
y_train_weight, y_train_age = (
    y_train_from_file[target_column].squeeze()
    for target_column in ("birth_weight_in_g", "combined_gestation_week")
)

X_val_from_file = pd.read_csv("data_files/x_val_orig.csv")
y_val_from_file = pd.read_csv("data_files/y_val_orig.csv")
y_val_weight, y_val_age = (
    y_val_from_file[target_column].squeeze()
    for target_column in ("birth_weight_in_g", "combined_gestation_week")
)

X_test_from_file = pd.read_csv("data_files/x_test_orig.csv")
y_test_from_file = pd.read_csv("data_files/y_test_orig.csv")
y_test_weight, y_test_age = (
    y_test_from_file[target_column].squeeze()
    for target_column in ("birth_weight_in_g", "combined_gestation_week")
)

# Report the shapes so a mismatch between features and targets is visible at once.
print(f"Train feature shape: {X_train_from_file.shape}, output shape: {y_train_weight.shape}")
print(f"Val feature shape: {X_val_from_file.shape}, output shape: {y_val_weight.shape}")
print(f"Test feature shape: {X_test_from_file.shape}, output shape: {y_test_weight.shape}")


Train feature shape: (116615, 81), output shape: (116615,)
Val feature shape: (38872, 81), output shape: (38872,)
Test feature shape: (38872, 81), output shape: (38872,)


In [4]:
# Baseline for the weight target. Per the cell output, this prints the training
# mean of the column and an RMSE -- presumably the error of always predicting
# that mean (TODO confirm in childbirth_common_util).
util_calc_baseline(y_train_weight, "birth_weight_in_g")

the birth_weight_in_g's mean in training is 3249.156257771299
birth_weight_in_g: rmse=588.133500299761


In [5]:
# List of regressor models:
# https://scikit-learn.org/stable/supervised_learning.html

# Train Weight Models


In [6]:
# Read the column names used for weight prediction from a text file into an
# object-dtype NumPy array, then show the names and the array shape.
column_list = np.loadtxt("models/feature_list_weight.txt", dtype=object)

print(column_list, column_list.shape, sep="\n")


['birth_month' 'mother_age' 'mother_nativity' 'residence_status'
 'mother_race1' 'mother_hispanic_race' 'paternity_acknowledged'
 'marital_status' 'mother_education' 'father_age'
 'prior_births_now_living' 'total_birth_order'
 'interval_since_last_live_birth' 'month_prenatal_care_began'
 'number_of_prenatal_visits' 'wic' 'cigarettes_3rd_trimester'
 'mother_height_in_total_inches' 'bmi' 'prepregnancy_weight'
 'weight_gain_group' 'gestational_diabetes' 'prepregnancy_hypertension'
 'gestational_hypertension' 'previous_preterm_birth'
 'infertility_treatment_used' 'fertility_enhancing_drugs'
 'previous_cesarean' 'number_of_previous_cesareans'
 'no_risk_factors_reported' 'chlamydia' 'attendant_at_birth' 'pluarality'
 'sex_of_infant' 'last_normal_menses_month' 'combined_gestation_week'
 'birth_weight_in_g' 'infant_breastfed_at_discharge']
(38,)


In [10]:
# Two steps on a copy of the selected training columns (helpers come from
# childbirth_common_util; descriptions follow their names -- confirm there):
#   1. util_handle_na_and_type: NA / dtype handling
#   2. util_calc_save_scaler:   compute and save the per-feature scaler, return scaled data
X_train_scaled = util_calc_save_scaler(
    util_handle_na_and_type(X_train_from_file[column_list].copy(), "weight"),
    "weight",
)

# Sanity check: each column should report mean = 0 and std dev = 1.
display(X_train_scaled.describe())

       birth_month  mother_age  father_age  bmi
0                1        25.0        32.0  3.0
1               12        24.0        30.0  3.0
2               10        26.0        25.0  2.0
3               10        28.0        28.0  9.0
4                9        35.0        41.0  3.0
...            ...         ...         ...  ...
116610          12        39.0        39.0  4.0
116611           8        31.0        36.0  2.0
116612           1        19.0        21.0  2.0
116613          12        25.0        25.0  2.0
116614          12        22.0        23.0  2.0

[116615 rows x 4 columns]


Unnamed: 0,birth_month,mother_age,mother_nativity,residence_status,mother_race1,mother_hispanic_race,paternity_acknowledged,marital_status,mother_education,father_age,...,fertility_enhancing_drugs,previous_cesarean,number_of_previous_cesareans,no_risk_factors_reported,chlamydia,attendant_at_birth,pluarality,sex_of_infant,last_normal_menses_month,infant_breastfed_at_discharge
count,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,...,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0,116615.0
mean,3.302746e-16,3.087832e-16,-2.144728e-15,-5.438403e-16,-2.229634e-15,7.297302e-16,-1.044723e-15,-1.14879e-15,6.178748e-18,-1.996031e-16,...,7.050747000000001e-17,-1.242725e-15,-3.101718e-15,7.937835e-16,-4.179366e-16,1.414786e-15,1.271522e-15,-1.196499e-15,1.077311e-16,-1.516061e-15
std,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,...,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004
min,-1.620642,-2.837408,-0.5190669,-0.6576646,-15.12533,-0.7687025,-2.563486,-1.932069,-1.932785,-2.754331,...,-7.876871,-0.4267251,-7.066151,-1.385474,-0.146831,-1.688013,-5.639229,-1.021016,-1.553354,-2.163216
25%,-0.7578119,-0.761894,-0.5190669,-0.6576646,-0.1502924,-0.7687025,-0.07567021,-0.3757291,-0.8362043,-0.7207452,...,0.08431215,-0.4267251,-0.3675159,-1.385474,-0.146831,-0.4740207,-0.1790937,-1.021016,-1.00721,-0.7986342
50%,0.1050185,0.1029036,-0.5190669,-0.6576646,0.4043388,-0.7687025,-0.07567021,-0.3757291,-0.2879141,0.005535422,...,0.08431215,-0.4267251,-0.3675159,0.7140808,-0.146831,-0.4740207,-0.1790937,0.9794169,0.08507691,0.5659479
75%,0.9678488,0.7947417,-0.5190669,1.198648,0.4043388,1.56911,1.168238,1.180611,0.8086663,0.731816,...,0.08431215,-0.4267251,-0.3675159,0.7140808,-0.146831,-0.4740207,-0.1790937,0.9794169,0.9042922,0.5659479
max,1.543069,3.562094,4.285933,4.911274,0.4043388,1.958745,1.168238,1.180611,2.453537,7.994622,...,4.064904,2.348886,8.005778,2.813635,7.150293,4.381949,10.74118,0.9794169,1.723508,0.5659479


In [11]:
# Scale the validation features with util_scale (target mean=0, std dev=1),
# then sanity-check the result: print the shape and display the summary
# statistics, where mean should be close to 0 and std dev close to 1.
X_val_scaled = X_val_from_file[column_list].pipe(util_scale, 'weight')

print(X_val_scaled.shape)
display(X_val_scaled.describe())

      birth_month  mother_age  father_age  bmi
0              12        22.0   32.961892  3.0
1               3        30.0   33.000000  3.0
2               8        20.0   20.000000  9.0
3              11        33.0   37.000000  2.0
4              10        37.0   44.000000  2.0
...           ...         ...         ...  ...
38867           2        34.0   36.000000  3.0
38868           1        27.0   25.000000  3.0
38869          10        38.0   32.000000  2.0
38870          10        39.0   39.000000  4.0
38871           6        24.0   24.000000  2.0

[38872 rows x 4 columns]
(38872, 36)


Unnamed: 0,birth_month,mother_age,mother_nativity,residence_status,mother_race1,mother_hispanic_race,paternity_acknowledged,marital_status,mother_education,father_age,...,fertility_enhancing_drugs,previous_cesarean,number_of_previous_cesareans,no_risk_factors_reported,chlamydia,attendant_at_birth,pluarality,sex_of_infant,last_normal_menses_month,infant_breastfed_at_discharge
count,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,...,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0,38872.0
mean,-0.004574,-0.005014,0.007577,0.001061,0.002305,0.003021,0.00177,0.001665,-0.011301,-0.126598,...,-0.007645,-0.006835,-0.005365,0.004687,-0.011014,-0.006844,0.016714,0.003491,0.002548,0.011578
std,0.999667,1.001992,1.007157,0.999944,0.983065,1.000519,0.999554,0.999641,0.999371,0.923627,...,1.025373,0.99305,0.999923,1.000016,0.962506,0.986896,1.039186,0.999934,1.001116,0.990842
min,-1.620642,-2.837408,-0.519067,-0.657665,-14.570702,-0.768702,-2.563486,-1.932069,-1.932785,-2.754331,...,-7.876871,-0.426725,-5.391492,-1.385474,-0.146831,-1.688013,-5.639229,-1.021016,-1.553354,-2.163216
25%,-0.757812,-0.761894,-0.519067,-0.657665,-0.150292,-0.768702,-0.07567,-0.375729,-0.836204,-0.720745,...,0.084312,-0.426725,-0.367516,-1.385474,-0.146831,-0.474021,-0.179094,-1.021016,-1.00721,-0.798634
50%,0.105018,0.102904,-0.519067,-0.657665,0.404339,-0.768702,-0.07567,-0.375729,-0.287914,0.0,...,0.084312,-0.426725,-0.367516,0.714081,-0.146831,-0.474021,-0.179094,0.979417,0.085077,0.565948
75%,0.967849,0.621782,-0.519067,1.198648,0.404339,1.56911,1.168238,1.180611,0.808666,0.296048,...,0.084312,-0.426725,-0.367516,0.714081,-0.146831,-0.474021,-0.179094,0.979417,0.904292,0.565948
max,1.543069,3.562094,4.285933,4.911274,0.404339,1.958745,1.168238,1.180611,2.453537,5.670524,...,4.064904,2.348886,8.005778,2.813635,7.150293,4.381949,10.741176,0.979417,1.723508,0.565948


## Train individual models and evaluate their accuracy

In [12]:
# Train the individual models and use the validation dataset to compare the results.
# The scores help determine the proportion of each model for ensemble modeling.
# Please refer to childbirth_model_parameter_tuning.ipynb for the parameter tuning code.
def train_and_evaluate_several_models_for_weight_prediction(
    predict_output_type, X_train, y_train, X_val, y_val, random_state=0
):
    """Train every candidate regressor in turn and report its validation score.

    Each model is handed to ``util_train_and_evaluate`` (from
    ``childbirth_common_util``). Judging by the logged cell output, that helper
    fits the model, saves it under ``models/`` and prints its RMSE -- confirm
    the details in the helper itself.

    Args:
        predict_output_type: label of the target being predicted (e.g. "weight");
            passed straight through to ``util_train_and_evaluate``.
        X_train: training features.
        y_train: training target.
        X_val: validation features.
        y_val: validation target.
        random_state: seed passed to the scikit-learn estimators configured
            below that accept one (gradient boosting, SGD, random forest), so
            that repeated runs can be compared. Pass ``None`` to get the
            previous, unseeded behaviour.

    Returns:
        None. Results are reported by ``util_train_and_evaluate``.
    """
    # (name, estimator class, constructor kwargs), trained in this order.
    # The rmse figures are the author's recorded results; "(30K)" / "(200K)"
    # presumably refer to the size of the data set used -- TODO confirm.
    candidates = [
        # LinearRegression: rmse=515(30K), 504(200K)
        ("linear", LinearRegression, {}),
        # GradientBoostingRegressor: rmse=493(30K), 468(200K)
        ("gb", GradientBoostingRegressor, {
            "n_estimators": 500, "max_depth": 4, "min_samples_split": 5,
            "learning_rate": 0.05, "loss": "squared_error",
            "random_state": random_state,
        }),
        # SGDRegressor: rmse=515(30K), 503(200K)
        ("sgd", SGDRegressor, {
            'penalty': 'l2', 'loss': 'squared_error', 'learning_rate': 'adaptive',
            'eta0': 10, 'alpha': 0.01,
            "random_state": random_state,
        }),
        # LGBMRegressor (library defaults): rmse=504(30K), 476(200K)
        ("lgbm", LGBMRegressor, {}),
        # XGBRegressor (library defaults): rmse=505(30K), 461(200K)
        ("xgb", XGBRegressor, {}),
        # RandomForestRegressor: rmse=500(200K); takes 10+ minutes.
        # A max_depth of 100 was also tried; 10 is the value in use.
        ("rf", RandomForestRegressor, {
            'n_estimators': 600, 'min_samples_split': 10, 'min_samples_leaf': 2,
            'max_depth': 10, 'bootstrap': True,
            "random_state": random_state,
        }),
    ]

    # Models tried and rejected by the author (kept as notes, not as dead code):
    #   - SVR(kernel='linear', gamma=1e-07, epsilon=0.1, degree=2, coef0=1, C=100,
    #     max_iter=150000): slow (about 30 minutes) and no better than linear,
    #     rmse=517(30K), 515-516(200K).
    #   - KNeighborsRegressor(n_neighbors=20): very bad results.

    for model_name, estimator_cls, params in candidates:
        # Build each estimator just before it is trained so that only one
        # model object is alive at a time.
        model = estimator_cls(**params)
        util_train_and_evaluate(model_name, predict_output_type, model, X_train, y_train, X_val, y_val)
        print("")



In [13]:
# Training and evaluating all of the models above takes over 30 minutes.
train_and_evaluate_several_models_for_weight_prediction(
    predict_output_type="weight",
    X_train=X_train_scaled,
    y_train=y_train_weight,
    X_val=X_val_scaled,
    y_val=y_val_weight,
)


Start training model linear for weight at 2023-02-14 18:55:50.686945
Saving linear to file: models/model_linear_weight.sav
End time = 2023-02-14 18:55:50.941578, elapsed time = 0.25463294982910156
linear for weight: rmse=508.07785504531887

Start training model gb for weight at 2023-02-14 18:55:50.942581
Saving gb to file: models/model_gb_weight.sav
End time = 2023-02-14 18:58:41.938441, elapsed time = 170.99586009979248
gb for weight: rmse=462.6097582239098

Start training model sgd for weight at 2023-02-14 18:58:41.938441
Saving sgd to file: models/model_sgd_weight.sav
End time = 2023-02-14 18:58:45.273750, elapsed time = 3.3353095054626465
sgd for weight: rmse=508.08276511613684

Start training model lgbm for weight at 2023-02-14 18:58:45.273750
Saving lgbm to file: models/model_lgbm_weight.sav
End time = 2023-02-14 18:58:46.076328, elapsed time = 0.8025772571563721
lgbm for weight: rmse=478.58934388570384

Start training model xgb for weight at 2023-02-14 18:58:46.076328
Saving xgb

In [14]:
# Neural network; author's recorded result: rmse=505 (200K)
def nn_train_and_evaluate_for_weight_prediction(predict_output_type, X_train, y_train, X_val, y_val):
    """Fit a small Keras regressor, score it on the validation data and save it.

    Prints start/end timestamps, the elapsed time and the validation RMSE, and
    writes the trained model to ``models/model_nn_<predict_output_type>``.

    Args:
        predict_output_type: label of the target (e.g. "weight"); used in the
            log messages and in the saved model's path.
        X_train: training features.
        y_train: training target.
        X_val: validation features used for the reported RMSE.
        y_val: validation target.

    Returns:
        None.
    """
    model_name = "nn"
    start_time = time.time()
    print(f"Start training model {model_name} for {predict_output_type} at {datetime.now()}")

    # Start from a clean Keras state with a fixed TensorFlow seed.
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)

    # NOTE(review): neither Dense layer is given an activation, so with the
    # Keras default the network is a stack of linear maps -- confirm this is
    # intended.
    nn_model = tf.keras.Sequential([
        tf.keras.layers.Dense(16),
        tf.keras.layers.Dense(units=1),
    ])

    # Mean squared error loss, optimised with Adam.
    adam = tf.keras.optimizers.Adam(learning_rate=0.1)
    nn_model.compile(loss='mse', optimizer=adam)

    # 20% of the training rows are held out by Keras for its own validation
    # metrics; X_val / y_val are only used for the RMSE below.
    nn_model.fit(
        x=X_train,
        y=y_train,
        epochs=100,
        batch_size=32,
        validation_split=0.2,
        verbose=0,
    )

    val_predictions = nn_model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_predictions))

    model_filename = f"models/model_{model_name}_{predict_output_type}"
    print(f"Saving {model_name} to directory: {model_filename}")
    nn_model.save(model_filename)

    # The elapsed time covers training, prediction and saving.
    end_time = time.time()
    print(f"End time = {datetime.now()}, elapsed time = {end_time - start_time}")

    print(f"{model_name} for {predict_output_type}: rmse={rmse}")


nn_train_and_evaluate_for_weight_prediction(
    predict_output_type="weight",
    X_train=X_train_scaled,
    y_train=y_train_weight,
    X_val=X_val_scaled,
    y_val=y_val_weight,
)


Start training model nn for weight at 2023-02-14 19:04:27.988447
Saving nn to directory: models/model_nn_weight
INFO:tensorflow:Assets written to: models/model_nn_weight\assets
End time = 2023-02-14 19:10:04.936366, elapsed time = 336.9479191303253
nn for weight: rmse=510.8327781803669


# Ensemble Models for Weight Prediction

In [15]:
# Ensemble model for weight prediction

# Load the feature column list and the saved models for the "weight" target via
# the childbirth_common_util helpers.
# NOTE: this rebinds `column_list`, which an earlier cell loaded with
# np.loadtxt; earlier cells re-run after this one will see this value instead.
column_list = util_load_x_columns_list_from_file("weight")
models = util_load_models_from_file("weight")

def scale_predict_compare_save(X_input_from_file, y, input_type):
    """Scale one data split, predict weight with the ensemble, score and save it.

    Uses the notebook-level ``column_list`` and ``models``. Prints the shapes,
    the predictions, their shape and the RMSE against ``y``, then writes the
    predictions to ``data_files/pred_y_<input_type>_weight.csv``.

    Args:
        X_input_from_file: unscaled feature frame as read from file.
        y: true target values for the same rows.
        input_type: split label (e.g. "val", "train", "test"); used in the
            log line and the output file name.

    Returns:
        None.
    """
    predict_output_type = "weight"

    scaled_features = util_scale(X_input_from_file[column_list], predict_output_type)
    print(f"{input_type} feature shape: {scaled_features.shape}, output shape: {y.shape}")

    ensemble_prediction = util_ensemble_predict_weight(scaled_features, column_list, models)
    print(ensemble_prediction, ensemble_prediction.shape, sep="\n")

    rmse = np.sqrt(mean_squared_error(y, ensemble_prediction))
    print(rmse)

    output_path = f"data_files/pred_y_{input_type}_{predict_output_type}.csv"
    np.savetxt(output_path, ensemble_prediction, delimiter=",")

# Run the ensemble on each split; the rmse figures are the author's recorded values.

# validation split: rmse=469
scale_predict_compare_save(X_val_from_file, y_val_weight, "val")

# training split: rmse=454
scale_predict_compare_save(X_train_from_file, y_train_weight, "train")

# test split: rmse=472
scale_predict_compare_save(X_test_from_file, y_test_weight, "test")


      birth_month  mother_age  father_age  bmi
0              12        22.0   32.961892  3.0
1               3        30.0   33.000000  3.0
2               8        20.0   20.000000  9.0
3              11        33.0   37.000000  2.0
4              10        37.0   44.000000  2.0
...           ...         ...         ...  ...
38867           2        34.0   36.000000  3.0
38868           1        27.0   25.000000  3.0
38869          10        38.0   32.000000  2.0
38870          10        39.0   39.000000  4.0
38871           6        24.0   24.000000  2.0

[38872 rows x 4 columns]
val feature shape: (38872, 36), output shape: (38872,)
predicting using linear and its proportion is 0.05
predicting using gb and its proportion is 0.25
predicting using sgd and its proportion is 0.05
predicting using lgbm and its proportion is 0.25
predicting using xgb and its proportion is 0.25
predicting using rf and its proportion is 0.05
predicting using nn and its proportion is 0.1
[3241.86740419 3475