## This notebook trains different models to predict the gestation age (in weeks)

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import sys
sys.path.append('/content/drive/My Drive/w210-capstone/Colab')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from datetime import datetime

import pickle
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split

from pandas_profiling import ProfileReport

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import RandomizedSearchCV

import tensorflow as tf
from childbirth_common_util import *


In [3]:
print(datetime.now())

2023-02-16 15:49:10.495134


In [4]:
# load CSV data

path="/content/drive/MyDrive/w210-capstone/Colab/data_files"
# local jupyter path path="data_files"

X_train_from_file = pd.read_csv(f"{path}/x_train_orig.csv")
y_train_from_file = pd.read_csv(f"{path}/y_train_orig.csv")
y_train_weight = y_train_from_file['birth_weight_in_g'].squeeze()
y_train_age = y_train_from_file['combined_gestation_week'].squeeze()

X_val_from_file = pd.read_csv(f"{path}/x_val_orig.csv")
y_val_from_file = pd.read_csv(f"{path}/y_val_orig.csv")
y_val_weight = y_val_from_file['birth_weight_in_g'].squeeze()
y_val_age = y_val_from_file['combined_gestation_week'].squeeze()

print(f"Train feature shape: {X_train_from_file.shape}, output shape: {y_train_age.shape}")
print(f"Val feature shape: {X_val_from_file.shape}, output shape: {y_val_age.shape}")


Train feature shape: (274848, 81), output shape: (274848,)
Val feature shape: (91616, 81), output shape: (91616,)


In [5]:
X_test_from_file = pd.read_csv(f"{path}/x_test_orig.csv")
y_test_from_file = pd.read_csv(f"{path}/y_test_orig.csv")
y_test_weight = y_test_from_file['birth_weight_in_g'].squeeze()
y_test_age = y_test_from_file['combined_gestation_week'].squeeze()

print(f"Test feature shape: {X_test_from_file.shape}, output shape: {y_test_age.shape}")

Test feature shape: (3298177, 81), output shape: (3298177,)


In [6]:
# TODO: marital_status has NA and its type becomes float
# X_train_from_file.dtypes.to_csv("DEBUG_dtypes.csv")

In [7]:
# X_train_correct_type = util_change_column_type(X_train_from_file)
# X_train_correct_type.dtypes.to_csv("DEBUG_dtypes.csv")

In [8]:
util_calc_baseline(y_train_age, "gestation week")

the gestation week's mean in training is 38.50885580393527
gestation week: rmse=2.5114767422193225


In [9]:
# list of regressor model: 
# https://scikit-learn.org/stable/supervised_learning.html

# Train Age Models


In [10]:
# load feature list from file
# to add or remove features, please modify the feature_list file

model_path = "/content/drive/MyDrive/w210-capstone/Colab/models"
# local jupyer note book model_path="models"
column_list = np.loadtxt(f'{model_path}/feature_list_age.txt', dtype="object")

print(column_list)
print(column_list.shape)


['birth_month' 'mother_age' 'mother_nativity' 'residence_status'
 'mother_race1' 'mother_hispanic_race' 'paternity_acknowledged'
 'marital_status' 'mother_education' 'father_age'
 'prior_births_now_living' 'total_birth_order'
 'interval_since_last_live_birth' 'month_prenatal_care_began'
 'number_of_prenatal_visits' 'wic' 'cigarettes_3rd_trimester'
 'mother_height_in_total_inches' 'bmi' 'prepregnancy_weight'
 'weight_gain_group' 'gestational_diabetes' 'prepregnancy_hypertension'
 'gestational_hypertension' 'previous_preterm_birth'
 'infertility_treatment_used' 'fertility_enhancing_drugs'
 'previous_cesarean' 'number_of_previous_cesareans'
 'no_risk_factors_reported' 'chlamydia' 'attendant_at_birth' 'pluarality'
 'sex_of_infant' 'last_normal_menses_month' 'combined_gestation_week'
 'birth_weight_in_g' 'infant_breastfed_at_discharge']
(38,)


In [11]:
# calculate and saved the scaler for each feature
X_train_scaled = util_handle_na_and_type(X_train_from_file[column_list].copy(), "age")
X_train_scaled = util_calc_save_scaler(X_train_scaled, "age")

# sanity check. mean = 0, std dev = 1
display(X_train_scaled.describe())

Unnamed: 0,birth_month,mother_age,mother_nativity,residence_status,mother_race1,mother_hispanic_race,paternity_acknowledged,marital_status,mother_education,father_age,...,fertility_enhancing_drugs,previous_cesarean,number_of_previous_cesareans,no_risk_factors_reported,chlamydia,attendant_at_birth,pluarality,sex_of_infant,last_normal_menses_month,infant_breastfed_at_discharge
count,274848.0,274848.0,274848.0,274848.0,274848.0,274848.0,274848.0,274848.0,274848.0,274848.0,...,274848.0,274848.0,274848.0,274848.0,274848.0,274848.0,274848.0,274848.0,274848.0,274848.0
mean,-7.698787000000001e-17,1.337076e-16,-1.4813310000000002e-17,-9.074125e-17,1.924438e-16,-3.1849920000000004e-17,3.490048e-18,-4.3431710000000004e-18,7.342027e-17,2.361341e-16,...,-1.037449e-16,-7.574697e-17,6.36533e-16,6.364813e-17,-3.833882e-17,-1.9066e-17,1.884109e-16,-4.671494e-17,-1.333974e-17,1.319884e-16
std,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,...,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002
min,-1.621693,-3.013879,-0.5218567,-0.6571469,-5.477088,-0.7705311,-2.558393,-0.743125,-1.93665,-3.643561,...,-7.95129,-0.4279772,-7.004636,-1.383111,-0.1481115,-0.4743925,-5.675584,-1.019391,-1.553312,-2.174128
25%,-0.7580854,-0.7641085,-0.5218567,-0.6571469,-0.08252063,-0.7705311,-0.07295407,-0.743125,-0.8368186,-0.6342258,...,0.08164514,-0.4279772,-0.3679826,-1.383111,-0.1481115,-0.4743925,-0.1802468,-1.019391,-1.007331,-0.8049463
50%,0.1055221,0.101188,-0.5218567,-0.6571469,0.5399295,-0.7705311,-0.07295407,-0.743125,-0.2869027,4.263099e-06,...,0.08164514,-0.4279772,-0.3679826,0.7149254,-0.1481115,-0.4743925,-0.1802468,0.9809776,0.08463012,0.5642356
75%,0.9691296,0.7934252,-0.5218567,1.198841,0.5399295,1.565738,1.169766,1.345669,0.8129291,0.4744765,...,0.08164514,-0.4279772,-0.3679826,0.7149254,-0.1481115,-0.4743925,-0.1802468,0.9809776,0.9036013,0.5642356
max,1.544868,3.562374,4.255205,4.910818,0.5399295,1.955117,1.169766,1.345669,2.462677,7.918621,...,4.098113,2.342266,7.927834,2.812962,7.072849,5.558972,10.81043,0.9809776,1.722572,0.5642356


In [12]:
# scaled the data to mean=0 and std dev=1
X_val_scaled = util_scale(X_val_from_file[column_list], 'age')
print(X_val_scaled.shape)

# sanity check. mean = 0, std dev = 1
display(X_val_scaled.describe())

(91616, 36)


Unnamed: 0,birth_month,mother_age,mother_nativity,residence_status,mother_race1,mother_hispanic_race,paternity_acknowledged,marital_status,mother_education,father_age,...,fertility_enhancing_drugs,previous_cesarean,number_of_previous_cesareans,no_risk_factors_reported,chlamydia,attendant_at_birth,pluarality,sex_of_infant,last_normal_menses_month,infant_breastfed_at_discharge
count,91616.0,91616.0,91616.0,91616.0,91616.0,91616.0,91616.0,91616.0,91616.0,91616.0,...,91616.0,91616.0,91616.0,91616.0,91616.0,91616.0,91616.0,91616.0,91616.0,91616.0
mean,-0.005562,-0.004955,-0.000408,-0.002944,0.001819,8.2e-05,0.003224,-0.002052,-0.003517,-0.003738,...,-0.006561,0.001048,-0.000129,-0.004511,-0.00092,-0.005505,-0.004499,0.003151,-0.001028,0.006406
std,1.000239,0.999137,0.998879,0.998237,0.999479,1.000929,0.996193,0.999385,1.00424,1.001827,...,1.015961,1.001168,0.992705,1.000766,0.997054,0.993085,0.996791,0.99994,1.001275,0.995966
min,-1.621693,-3.013879,-0.521857,-0.657147,-5.477088,-0.770531,-2.558393,-0.743125,-1.93665,-2.85163,...,-7.95129,-0.427977,-7.004636,-1.383111,-0.148112,-0.474392,-5.675584,-1.019391,-1.553312,-2.174128
25%,-0.758085,-0.764108,-0.521857,-0.657147,-0.082521,-0.770531,-0.072954,-0.743125,-0.836819,-0.634226,...,0.081645,-0.427977,-0.367983,-1.383111,-0.148112,-0.474392,-0.180247,-1.019391,-1.007331,-0.804946
50%,0.105522,0.101188,-0.521857,-0.657147,0.539929,-0.770531,-0.072954,-0.743125,-0.286903,0.0,...,0.081645,-0.427977,-0.367983,0.714925,-0.148112,-0.474392,-0.180247,0.980978,0.08463,0.564236
75%,0.96913,0.793425,-0.521857,1.198841,0.539929,1.565738,1.169766,1.345669,0.812929,0.474477,...,0.081645,-0.427977,-0.367983,0.714925,-0.148112,-0.474392,-0.180247,0.980978,0.903601,0.564236
max,1.544868,3.562374,4.255205,4.910818,0.539929,1.955117,1.169766,1.345669,2.462677,6.809918,...,4.098113,2.342266,7.927834,2.812962,7.072849,5.558972,10.810428,0.980978,1.722572,0.564236


## Train individual model and evaluate its accuracy

In [13]:
# train individual models and use validation dataset to compare the results
# It helps determine the proportion of each model for ensemble modeling
# Please refer to childbirth_model_parameter_tuning.ipynb for parameters tuning code
def train_and_evaluate_several_models_for_age_prediction(predict_output_type, X_train, y_train, X_val, y_val):
    # linear regression
    model = LinearRegression() # rmse=2.17(200K)
    util_train_and_evaluate("linear", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")
 
    # Gradient Boosting Regressor, rmse=1.67(200K)
    # tuned on 2/8/2023
    params = {"n_estimators": 500, "max_depth": 4, "min_samples_split": 5, "learning_rate": 0.05, "loss": "squared_error"}
    model = GradientBoostingRegressor(**params)
    util_train_and_evaluate("gb", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")

    # SGDRegressor, rmse=2.17(200K), tuned on 2/7/2023
    params = {'penalty': 'l2', 'loss': 'squared_error', 'learning_rate': 'adaptive', 'eta0': 100, 'alpha': 0.001}
    model = SGDRegressor(**params)
    util_train_and_evaluate("sgd", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")

    # LGBMRegressor, rmse=2.17(200K)
    #model = LGBMRegressor()
    util_train_and_evaluate("lgbm", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")

    # XGBRegressor, rmse=2.17(200K)
    #model = XGBRegressor()
    util_train_and_evaluate("xgb", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")

    # RandomForestRegressor, rmse=2.08(200K)
    # it takes 6-10 minutes, tuned on 2/7/2023
    params = {'n_estimators': 800, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 10}
    model = RandomForestRegressor(**params)
    util_train_and_evaluate("rf", predict_output_type, model, X_train, y_train, X_val, y_val)
    print("")

    # KNN is very bad. Don't use
    #model = KNeighborsRegressor(n_neighbors=20)
    #train_and_evaluate("KNN", model, X_train_from_file, y_train, X_val_from_file, y_val)
    #print("")

    # SVR is slow and bad, Dont Use.Take 30 minutes to run
    #params = {'kernel': 'linear', 'gamma': 1e-07, 'epsilon': 0.1, 'degree': 2, 'coef0': 1, 'C': 100, 'max_iter': 150000}}
    #model = SVR(**params)
    #train_and_evaluate("svr", predict_output_type, model, X_train, y_train, X_val, y_val)
    #print("")
    


In [14]:
# it takes over 30 minutes to train and evaluate the above models
train_and_evaluate_several_models_for_age_prediction("age", X_train_scaled, y_train_age, X_val_scaled, y_val_age)


Start training model linear for age at 2023-02-16 15:51:09.475869
Saving linear to file: /content/drive/MyDrive/w210-capstone/Colab/models/model_linear_age.sav
End time = 2023-02-16 15:51:10.210986, elapsed time = 0.7351138591766357
linear for age: rmse=2.2296643662121287

Start training model gb for age at 2023-02-16 15:51:10.213929
Saving gb to file: /content/drive/MyDrive/w210-capstone/Colab/models/model_gb_age.sav
End time = 2023-02-16 15:59:48.340857, elapsed time = 518.1269197463989
gb for age: rmse=1.5563079239403748

Start training model sgd for age at 2023-02-16 15:59:48.345825
Saving sgd to file: /content/drive/MyDrive/w210-capstone/Colab/models/model_sgd_age.sav
End time = 2023-02-16 16:00:01.712195, elapsed time = 13.366365432739258
sgd for age: rmse=2.229690263279757

Start training model lgbm for age at 2023-02-16 16:00:01.712649
Saving lgbm to file: /content/drive/MyDrive/w210-capstone/Colab/models/model_lgbm_age.sav
End time = 2023-02-16 16:00:14.843950, elapsed time = 

In [15]:
# Neural network, rmse=2.416. 
def nn_train_and_evaluate_for_age_prediction(predict_output_type, X_train, y_train, X_val, y_val):
    model_name="nn"
    start_time = time.time()
    print(f"Start training model {model_name} for {predict_output_type} at {datetime.now()}")

    tf.keras.backend.clear_session()
    tf.random.set_seed(0)

    nn_model = tf.keras.Sequential()
    nn_model.add(tf.keras.layers.Dense(16))
    nn_model.add(tf.keras.layers.Dense(units=1))

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

    # We specify the MSE loss.
    nn_model.compile(loss='mse', optimizer=optimizer)
    history = nn_model.fit(
      x = X_train,
      y = y_train,
      epochs=100,
      batch_size=32,
      validation_split=0.2,
      verbose=0)

    y_pred = nn_model.predict(X_val)

    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)

    model_filename = f"models/model_{model_name}_{predict_output_type}"
    print(f"Saving {model_name} to directory: {model_filename}")
    
    nn_model.save(model_filename)
    end_time = time.time()
    print(f"End time = {datetime.now()}, elapsed time = {end_time - start_time}")
    
    print(f"{model_name} for {predict_output_type}: rmse={rmse}")
    



In [16]:
# It takes 20 minutes to run
nn_train_and_evaluate_for_age_prediction("age", X_train_scaled, y_train_age, X_val_scaled, y_val_age)


Start training model nn for age at 2023-02-16 16:22:16.844626
Saving nn to directory: models/model_nn_age




End time = 2023-02-16 16:45:49.211481, elapsed time = 1412.3668501377106
nn for age: rmse=3.166348255075131


# Ensemble Models for Gestation Age Prediction

In [17]:
### Ensemble model for age prediction

column_list = util_load_x_columns_list_from_file("age")
models = util_load_models_from_file("age")

def scale_predict_compare_save(X_input_from_file, y, input_type):
    predict_output_type = "age"
    X_scaled = util_scale(X_input_from_file[column_list], predict_output_type)
    print(f"{input_type} feature shape: {X_scaled.shape}, output shape: {y.shape}")
    y_pred = util_ensemble_predict_age(X_scaled, column_list, models)
    print(y_pred)
    print(y_pred.shape)
    mse = mean_squared_error(y, y_pred)
    rmse = np.sqrt(mse)
    print(rmse) 

    data_files_path = "/content/drive/MyDrive/w210-capstone/Colab/data_files"
    # local jupyer note book model_path="models"
    np.savetxt(f"{data_files_path}/pred_y_{input_type}_{predict_output_type}.csv", y_pred, delimiter=",")

# rmse=1.99
print("predicting using validation data set...\n\n")
scale_predict_compare_save(X_input_from_file = X_val_from_file, y = y_val_age, input_type = "val")

# rmse=2.00
print("predicting using train data set...\n\n")
scale_predict_compare_save(X_input_from_file = X_train_from_file, y = y_train_age, input_type = "train")

# rmse=2.02
print("predicting using test data set...\n\n")
scale_predict_compare_save(X_input_from_file = X_test_from_file, y = y_test_age, input_type = "test")


predicting using validation data set...


val feature shape: (91616, 36), output shape: (91616,)
predicting using linear and its proportion is 0.05
predicting using gb and its proportion is 0.25
predicting using sgd and its proportion is 0.05
predicting using lgbm and its proportion is 0.25
predicting using xgb and its proportion is 0.25
predicting using rf and its proportion is 0.05
predicting using nn and its proportion is 0.1
[38.84877048 38.42063305 38.19295879 ... 38.42317374 39.06997501
 38.40272319]
(91616,)
1.9940830734066033
predicting using train data set...


train feature shape: (274848, 36), output shape: (274848,)
predicting using linear and its proportion is 0.05
predicting using gb and its proportion is 0.25
predicting using sgd and its proportion is 0.05
predicting using lgbm and its proportion is 0.25
predicting using xgb and its proportion is 0.25
predicting using rf and its proportion is 0.05
predicting using nn and its proportion is 0.1
[38.31530635 38.36263594 39.