# Modeling

description here

## Imports

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1"

In [3]:
from enum import Enum
from pathlib import Path
from typing import Dict

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from tqdm import tqdm

print(f"{tf.__version__ = }")
pd.set_option("display.max_columns", 500)

tf.__version__ = '2.6.1'


## Inputs

In [4]:
# path to csv file
data_file_path = Path("data/COVID-19_Case_Surveillance_Public_Use_Data.csv")

# feature types
feature_type = Enum("feature_type", "categorical continuous date")

# target name
target_name = "death_yn"

# these are my initial features, I will update this map
# as I add or remove features
feature_name=str
feature_to_type_map: Dict[feature_name, feature_type] = {
    "cdc_report_dt"                 : feature_type.date,
    "pos_spec_dt"                   : feature_type.date,
    "onset_dt"                      : feature_type.date,
    "current_status"                : feature_type.categorical,
    "sex"                           : feature_type.categorical,
    "age_group"                     : feature_type.categorical,
    "Race and ethnicity (combined)" : feature_type.categorical,
    "hosp_yn"                       : feature_type.categorical,
    "icu_yn"                        : feature_type.categorical,
    "death_yn"                      : feature_type.categorical,
    "medcond_yn"                    : feature_type.categorical,
}

## Explore data

In [5]:
df = pd.read_csv(data_file_path)
print(f"{df.shape = }")

  df = pd.read_csv(data_file_path)


df.shape = (8405079, 11)


## Data Prep

### Race or ethnicity

In [6]:
# copied from the original workshop notebook
aux = df["Race and ethnicity (combined)"].str.split(",", n = 1, expand = True)
df["race"] = aux[0]
df["ethnicity"] = aux[1]

# update feature map
feature_to_type_map["race"] = feature_type.categorical
feature_to_type_map["ethnicity"] = feature_type.categorical

### Date imputation

In [7]:
def to_date(s: pd.Series, **kwargs) -> pd.Series:
    """Thin wrapper around pd.to_datetime to only return date portion
    """
    return pd.to_datetime(s, **kwargs).dt.date

In [8]:
cdc_report_dt = to_date(df["cdc_report_dt"])
pos_spec_dt = to_date(df["pos_spec_dt"])
onset_dt = to_date(df["onset_dt"])

In [9]:
cdc_report_dt.isna().any()

False

In [10]:
# the fact that a date is missing can be a (binary) feature in inself
pos_spec_dt_is_missing = pos_spec_dt.isna()
onset_dt_is_missing = onset_dt.isna()

# str to make it consistent with every other categorical in this ds
df["pos_spec_dt_is_missing"]=np.where(pos_spec_dt_is_missing, "True", "False")
df["onset_dt_is_missing"]=np.where(onset_dt_is_missing, "True", "False")

feature_to_type_map["pos_spec_dt_is_missing"]=feature_type.categorical
feature_to_type_map["onset_dt_is_missing"]=feature_type.categorical

In [11]:
# compute median difference with cdc_report when date is not missing
# if we are being strict, I should compute the median with only samples from the triaing set
pos_spec_dt_median_diff = (pos_spec_dt[~pos_spec_dt_is_missing] - cdc_report_dt[~pos_spec_dt_is_missing]).median()
onset_dt_median_diff = (onset_dt[~onset_dt_is_missing] - cdc_report_dt[~onset_dt_is_missing]).median()

# impute cdc_report date + median difference
pos_spec_dt[pos_spec_dt_is_missing] = cdc_report_dt[pos_spec_dt_is_missing] + pos_spec_dt_median_diff
onset_dt[onset_dt_is_missing] = cdc_report_dt[onset_dt_is_missing] + onset_dt_median_diff

# sanity check: assert no missing values left
assert not pos_spec_dt.isna().any()
assert not onset_dt.isna().any()

  pos_spec_dt[pos_spec_dt_is_missing] = cdc_report_dt[pos_spec_dt_is_missing] + pos_spec_dt_median_diff
  onset_dt[onset_dt_is_missing] = cdc_report_dt[onset_dt_is_missing] + onset_dt_median_diff


In [12]:
# set all 3 back in df (as pd dates)
df["cdc_report_dt"] = cdc_report_dt
df["pos_spec_dt"] = pos_spec_dt
df["onset_dt"] = onset_dt

### Date encoding

In [13]:
def process_date_column(df: pd.DataFrame, column_name: str, feature_to_type_map: Dict) -> None:
    """
    Process date column in-place
    Modifies feature_to_type_map inplace as well to reflect new features
    """
    # pop column and transform it to datetime 
    date_column = pd.to_datetime(df.pop(column_name), errors='raise')
    _ = feature_to_type_map.pop(column_name)

    # decompose date
    date_column_year       = date_column.dt.year
    date_column_month      = date_column.dt.month
    date_column_week       = date_column.dt.isocalendar().week
    date_column_dayofmonth = date_column.dt.day
    date_column_dayofyear  = date_column.dt.dayofyear
    date_column_dayofweek  = date_column.dt.dayofweek #Monday=0, Sunday=6
    date_column_elapsed    = (date_column - date_column.min()).dt.days

    # encode cyclical features with sin/cos encoding
    def encode_cyclical(values: pd.Series, feature_name: str) -> None:
        """Encode cyclical 
        """
        df[f"{column_name}_{feature_name}_sin"] = np.sin(2 * np.pi * values / values.max())
        df[f"{column_name}_{feature_name}_cos"] = np.cos(2 * np.pi * values / values.max())

        feature_to_type_map[f"{column_name}_{feature_name}_sin"]=feature_type.continuous
        feature_to_type_map[f"{column_name}_{feature_name}_cos"]=feature_type.continuous

    encode_cyclical(date_column_month, feature_name="month")
    encode_cyclical(date_column_week, feature_name="week")
    encode_cyclical(date_column_dayofmonth, feature_name="dayofmonth")
    encode_cyclical(date_column_dayofyear, feature_name="dayofyear")
    encode_cyclical(date_column_dayofweek, feature_name="dayofweek")

    # in addition, add month and year as categorical
    df[f"{column_name}_year"] = date_column_year
    df[f"{column_name}_month"] = date_column_month

    feature_to_type_map[f"{column_name}_year"]=feature_type.continuous
    feature_to_type_map[f"{column_name}_month"]=feature_type.continuous

    # and elapsed as continuous
    df[f"{column_name}_elapsed"] = date_column_elapsed
    feature_to_type_map[f"{column_name}_elapsed"]=feature_type.continuous


In [14]:
date_features = [f for f,t in feature_to_type_map.items() if t == feature_type.date]
for cname in date_features:
    process_date_column(df, column_name=cname, feature_to_type_map=feature_to_type_map)

### set dtypes

In [15]:
continuous_features = [f for f,t in feature_to_type_map.items() if t == feature_type.continuous]
categorical_features = [f for f,t in feature_to_type_map.items() if t == feature_type.categorical]

# sanity check
assert len(df.columns) == len(continuous_features) + len(categorical_features)

In [16]:
for c in continuous_features:
    assert not df[c].isna().any(), f"Na found in {c}"
    df[c]=df[c].astype(np.float32)

for c in categorical_features:
    # NA will be just one more category
    df[c]=df[c].fillna("#NA#").astype("category")

    #sanity check
    assert not df[c].isna().any(), f"Na found in {c}"

### show final

In [17]:
df.head()

Unnamed: 0,current_status,sex,age_group,Race and ethnicity (combined),hosp_yn,icu_yn,death_yn,medcond_yn,race,ethnicity,pos_spec_dt_is_missing,onset_dt_is_missing,cdc_report_dt_month_sin,cdc_report_dt_month_cos,cdc_report_dt_week_sin,cdc_report_dt_week_cos,cdc_report_dt_dayofmonth_sin,cdc_report_dt_dayofmonth_cos,cdc_report_dt_dayofyear_sin,cdc_report_dt_dayofyear_cos,cdc_report_dt_dayofweek_sin,cdc_report_dt_dayofweek_cos,cdc_report_dt_year,cdc_report_dt_month,cdc_report_dt_elapsed,pos_spec_dt_month_sin,pos_spec_dt_month_cos,pos_spec_dt_week_sin,pos_spec_dt_week_cos,pos_spec_dt_dayofmonth_sin,pos_spec_dt_dayofmonth_cos,pos_spec_dt_dayofyear_sin,pos_spec_dt_dayofyear_cos,pos_spec_dt_dayofweek_sin,pos_spec_dt_dayofweek_cos,pos_spec_dt_year,pos_spec_dt_month,pos_spec_dt_elapsed,onset_dt_month_sin,onset_dt_month_cos,onset_dt_week_sin,onset_dt_week_cos,onset_dt_dayofmonth_sin,onset_dt_dayofmonth_cos,onset_dt_dayofyear_sin,onset_dt_dayofyear_cos,onset_dt_dayofweek_sin,onset_dt_dayofweek_cos,onset_dt_year,onset_dt_month,onset_dt_elapsed
0,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,Unknown,No,No,Black,Non-Hispanic,False,True,-1.133108e-15,1.0,-0.133287,0.991077,0.897805,-0.440394,-0.1736482,0.984808,0.8660254,0.5,2020.0,11.0,314.0,-0.5,0.866025,-0.375267,0.926917,0.897805,-0.440394,-0.758306,0.651899,0.866025,0.5,2020.0,11.0,316.0,-0.5,0.866025,-0.748511,0.663123,0.394356,0.918958,-0.789565,0.613667,0.0,1.0,2020.0,11.0,313.0
1,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No,No,No,Black,Non-Hispanic,False,False,-1.133108e-15,1.0,-0.133287,0.991077,0.299363,-0.954139,-0.09681087,0.995303,-0.8660254,0.5,2020.0,11.0,318.0,-0.5,0.866025,-0.375267,0.926917,0.897805,-0.440394,-0.758306,0.651899,0.866025,0.5,2020.0,11.0,316.0,-0.5,0.866025,-0.663123,0.748511,0.897805,-0.440394,-0.696196,0.717852,0.866025,0.5,2020.0,11.0,321.0
2,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No,No,No,Black,Non-Hispanic,False,False,-1.133108e-15,1.0,6.43249e-16,1.0,-0.651372,-0.758758,-2.449294e-16,1.0,1.224647e-16,-1.0,2020.0,11.0,323.0,-0.5,0.866025,-0.375267,0.926917,0.897805,-0.440394,-0.758306,0.651899,0.866025,0.5,2020.0,11.0,316.0,-0.5,0.866025,-0.663123,0.748511,0.968077,-0.250653,-0.708652,0.705558,0.0,1.0,2020.0,11.0,320.0
3,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",Missing,Missing,No,Missing,Black,Non-Hispanic,False,True,-1.133108e-15,1.0,-0.133287,0.991077,0.299363,-0.954139,-0.09681087,0.995303,-0.8660254,0.5,2020.0,11.0,318.0,-0.5,0.866025,-0.375267,0.926917,0.897805,-0.440394,-0.758306,0.651899,0.866025,0.5,2020.0,11.0,316.0,-0.5,0.866025,-0.748511,0.663123,0.937752,0.347305,-0.744704,0.667395,-0.866025,-0.5,2020.0,11.0,317.0
4,Laboratory-confirmed case,Male,10 - 19 Years,"Black, Non-Hispanic",No,No,No,Yes,Black,Non-Hispanic,False,False,-1.133108e-15,1.0,-0.133287,0.991077,0.485302,-0.874347,-0.1160929,0.993238,-0.8660254,-0.5,2020.0,11.0,317.0,-0.5,0.866025,-0.375267,0.926917,0.897805,-0.440394,-0.758306,0.651899,0.866025,0.5,2020.0,11.0,316.0,-0.5,0.866025,-0.663123,0.748511,0.897805,-0.440394,-0.696196,0.717852,0.866025,0.5,2020.0,11.0,321.0


## Modeling

### train/test split

In [18]:
train_df, valid_df = train_test_split(df, test_size=.33, random_state=42) # important note about this at the end

print(f"{len(train_df) = }")
print(f"{len(valid_df) = }")

len(train_df) = 5631402
len(valid_df) = 2773677


### define tf dataset

In [19]:
def df_to_dataset(dataframe: pd.DataFrame, target_name: str, shuffle: bool=True, batch_size: int=32):
  df = dataframe.copy()
  labels = df.pop(target_name)
  df_dict = {key: value.to_numpy()[:,None] for key, value in dataframe.items()}
  ds = tf.data.Dataset.from_tensor_slices((df_dict, labels))
  if shuffle:
    # set max buffer size of 100k to avoid blowing up the memory
    # this may result in not perfect shuffles
    ds = ds.shuffle(buffer_size=min(len(dataframe), 100_00))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [20]:
train_ds = df_to_dataset(train_df, target_name=target_name, shuffle=True)
valid_ds = df_to_dataset(valid_df, target_name=target_name, shuffle=False)

2022-04-03 02:19:00.385576: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-04-03 02:19:00.385627: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: 88654ae2549f
2022-04-03 02:19:00.385639: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: 88654ae2549f
2022-04-03 02:19:00.385782: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 470.82.1
2022-04-03 02:19:00.385815: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.82.1
2022-04-03 02:19:00.385825: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 470.82.1
2022-04-03 02:19:00.386170: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instruct

### build model preprocessing layers

In [23]:
#helpers
def get_normalization_layer(name: str, dataframe: pd.DataFrame):
  """prep layer for continuous inputs
  """
  # I prefer computing these from pandas instead of calling
  # layer.adapt because it's way faster
  mean = dataframe[name].mean()
  var = dataframe[name].var()
  
  # Create a Normalization layer for the feature.
  normalizer = layers.Normalization(axis=None, mean=mean, variance=var)

  return normalizer

def get_lookup_layer(name: str, dataframe: pd.DataFrame, max_tokens=None):
  """prep layer for categorical inputs (str->int)
  """
  # I prefer computing these from pandas instead of calling
  # layer.adapt because it's way faster
  vocab = dataframe[name].unique()

  # create StringLookup layer for the feature
  index = layers.StringLookup(max_tokens=max_tokens)

  return index

def get_embedding_layer(lookup_layer, size_multiplier=1):
  """learned dimensionality reduction layer for categorical inputs
  """
  def emb_sz_rule(n_cat:int)->int: 
    """
    fast-ais rule of thumb for embedding size
    https://forums.fast.ai/t/size-of-embedding-for-categorical-variables/42608/2
    """
    return min(600, round(1.6 * n_cat**0.56))
  
  vocab_size = lookup_layer.vocabulary_size()
  output_size = size_multiplier * emb_sz_rule(vocab_size)
  embedding_layer = layers.Embedding(vocab_size, output_size)
  return embedding_layer

In [24]:
all_inputs = []
encoded_features = []

for f in tqdm(categorical_features):
    # define input
    feature_input = keras.Input(shape=(1,), name=f)
    all_inputs.append(feature_input)

    # add lookup+embedding layers
    lookup_layer = get_lookup_layer(f, train_df)
    embedding_layer = get_embedding_layer(lookup_layer)

    encoded_feature = embedding_layer(lookup_layer(feature_input))
    encoded_features.append(encoded_feature)

for f in tqdm(continuous_features):
    # define input
    feature_input = keras.Input(shape=(1,), name=f)
    all_inputs.append(feature_input)
    
    # add normalization layer
    normalization_layer = get_normalization_layer(f, train_df)
    encoded_feature = normalization_layer(feature_input)
    encoded_features.append(encoded_feature)


100%|██████████| 12/12 [00:00<00:00, 24.88it/s]
100%|██████████| 39/39 [00:02<00:00, 17.28it/s]
