# Modeling

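This notebook prepares the CDC COVID-19 Case Surveillance public-use data (date imputation and encoding, categorical clean-up) and builds the Keras preprocessing layers for a model that predicts `death_yn`.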

## Imports

In [1]:
from enum import Enum
from pathlib import Path
from typing import Dict

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print(f"{tf.__version__ = }")
pd.set_option("display.max_columns", 500)

tf.__version__ = '2.6.1'


## Inputs

In [2]:
# location of the raw csv export
data_file_path = Path("data") / "COVID-19_Case_Surveillance_Public_Use_Data.csv"

# the kinds of feature this notebook distinguishes
feature_type = Enum("feature_type", ["categorical", "continuous", "date"])

# column the model is trained to predict
target_name = "death_yn"

# initial feature -> type map; later cells add and remove entries
# as features are engineered
feature_name = str
_initial_date_columns = (
    "cdc_report_dt",
    "pos_spec_dt",
    "onset_dt",
)
_initial_categorical_columns = (
    "current_status",
    "sex",
    "age_group",
    "Race and ethnicity (combined)",
    "hosp_yn",
    "icu_yn",
    "death_yn",
    "medcond_yn",
)
feature_to_type_map: Dict[feature_name, feature_type] = {
    **dict.fromkeys(_initial_date_columns, feature_type.date),
    **dict.fromkeys(_initial_categorical_columns, feature_type.categorical),
}

## Explore data

In [3]:
# load the whole csv into memory and report its (rows, columns) shape
# NOTE(review): the recorded output below echoes this read_csv line, which looks
# like a pandas warning whose message was lost in the export -- confirm and
# address it (e.g. by passing explicit dtypes) if so
df = pd.read_csv(data_file_path)
print(f"{df.shape = }")

  df = pd.read_csv(data_file_path)


df.shape = (8405079, 11)


## Data Prep

### Race or ethnicity

In [4]:
# copied from the original workshop notebook:
# split the combined column at the first comma into a race part and an ethnicity part
race_ethnicity_parts = df["Race and ethnicity (combined)"].str.split(",", n=1, expand=True)
df["race"], df["ethnicity"] = race_ethnicity_parts[0], race_ethnicity_parts[1]

# register both new columns as categorical features
for new_feature in ("race", "ethnicity"):
    feature_to_type_map[new_feature] = feature_type.categorical

### Date imputation

In [5]:
def to_date(s: pd.Series, **kwargs) -> pd.Series:
    """Parse ``s`` with ``pd.to_datetime`` and keep only the date part.

    Extra keyword arguments are forwarded unchanged to ``pd.to_datetime``.
    """
    parsed = pd.to_datetime(s, **kwargs)
    return parsed.dt.date

In [6]:
# date-only copies of the three raw date columns
cdc_report_dt, pos_spec_dt, onset_dt = (
    to_date(df[raw_column])
    for raw_column in ("cdc_report_dt", "pos_spec_dt", "onset_dt")
)

In [7]:
cdc_report_dt.isna().any()

False

In [8]:
# the fact that a date is missing can be a (binary) feature in itself
pos_spec_dt_is_missing = pos_spec_dt.isna()
onset_dt_is_missing = onset_dt.isna()

# stored as the strings "True"/"False" to stay consistent with every other
# categorical in this dataset
for flag_name, flag_mask in (
    ("pos_spec_dt_is_missing", pos_spec_dt_is_missing),
    ("onset_dt_is_missing", onset_dt_is_missing),
):
    df[flag_name] = np.where(flag_mask, "True", "False")
    feature_to_type_map[flag_name] = feature_type.categorical

In [9]:
# Impute missing pos_spec_dt / onset_dt values from cdc_report_dt.
# compute median difference with cdc_report when date is not missing
# if we are being strict, I should compute the median with only samples from the training set
# NOTE(review): as written the medians use every row, including rows that later
# end up in the validation split
pos_spec_dt_median_diff = (pos_spec_dt[~pos_spec_dt_is_missing] - cdc_report_dt[~pos_spec_dt_is_missing]).median()
onset_dt_median_diff = (onset_dt[~onset_dt_is_missing] - cdc_report_dt[~onset_dt_is_missing]).median()

# impute cdc_report date + median difference
# (boolean-mask assignment: only the rows flagged as missing are overwritten)
pos_spec_dt[pos_spec_dt_is_missing] = cdc_report_dt[pos_spec_dt_is_missing] + pos_spec_dt_median_diff
onset_dt[onset_dt_is_missing] = cdc_report_dt[onset_dt_is_missing] + onset_dt_median_diff

# sanity check: assert no missing values left
assert not pos_spec_dt.isna().any()
assert not onset_dt.isna().any()

  pos_spec_dt[pos_spec_dt_is_missing] = cdc_report_dt[pos_spec_dt_is_missing] + pos_spec_dt_median_diff
  onset_dt[onset_dt_is_missing] = cdc_report_dt[onset_dt_is_missing] + onset_dt_median_diff


In [None]:
# write the three date series (as returned by to_date, with the imputed values)
# back into df
for date_column, date_values in (
    ("cdc_report_dt", cdc_report_dt),
    ("pos_spec_dt", pos_spec_dt),
    ("onset_dt", onset_dt),
):
    df[date_column] = date_values

### Date encoding

In [None]:
def process_date_column(df: pd.DataFrame, column_name: str, feature_to_type_map: Dict) -> None:
    """Replace a date column with derived numeric features, in place.

    The column ``column_name`` is removed from ``df`` and from
    ``feature_to_type_map``. In its place the following columns are added to
    ``df`` and registered in ``feature_to_type_map`` as continuous features:

    * ``{column_name}_{part}_sin`` / ``{column_name}_{part}_cos`` for each of
      ``month``, ``week``, ``dayofmonth``, ``dayofyear`` and ``dayofweek``
    * ``{column_name}_year`` and ``{column_name}_month`` (raw values)
    * ``{column_name}_elapsed``: days since the earliest date in the column

    Args:
        df: frame holding the date column; modified in place.
        column_name: name of the date column to process.
        feature_to_type_map: feature name -> feature type map; modified in place.

    Raises:
        KeyError: if ``column_name`` is missing from ``df`` or from the map.
        Exception: whatever ``pd.to_datetime(..., errors="raise")`` raises for
            values it cannot parse.
    """
    # pop column and transform it to datetime
    date_column = pd.to_datetime(df.pop(column_name), errors="raise")
    feature_to_type_map.pop(column_name)

    def encode_cyclical(values: pd.Series, part_name: str, period: int) -> None:
        """Add sin/cos encodings of ``values`` on a circle of length ``period``."""
        # Use the fixed calendar period, not values.max(): the observed maximum
        # depends on which dates are present, and for the 0-based dayofweek it
        # is 6, which would map Monday (0) and Sunday (6) to the same angle.
        angle = 2 * np.pi * values / period
        sin_name = f"{column_name}_{part_name}_sin"
        cos_name = f"{column_name}_{part_name}_cos"

        df[sin_name] = np.sin(angle)
        df[cos_name] = np.cos(angle)

        feature_to_type_map[sin_name] = feature_type.continuous
        feature_to_type_map[cos_name] = feature_type.continuous

    # (part name, values, cycle length)
    cyclical_parts = (
        ("month", date_column.dt.month, 12),
        # ISO week numbers run up to 53 in long years
        ("week", date_column.dt.isocalendar().week, 53),
        ("dayofmonth", date_column.dt.day, 31),
        # 366 so that leap years stay within a single turn
        ("dayofyear", date_column.dt.dayofyear, 366),
        # Monday=0, Sunday=6 -> 7 distinct positions
        ("dayofweek", date_column.dt.dayofweek, 7),
    )
    for part_name, part_values, period in cyclical_parts:
        encode_cyclical(part_values, part_name, period)

    # in addition, keep the raw year and month
    # NOTE(review): the previous comment called these categorical, but they have
    # always been registered as continuous -- confirm which is intended
    df[f"{column_name}_year"] = date_column.dt.year
    df[f"{column_name}_month"] = date_column.dt.month

    feature_to_type_map[f"{column_name}_year"] = feature_type.continuous
    feature_to_type_map[f"{column_name}_month"] = feature_type.continuous

    # and elapsed days since the earliest date in this column, as continuous
    df[f"{column_name}_elapsed"] = (date_column - date_column.min()).dt.days
    feature_to_type_map[f"{column_name}_elapsed"] = feature_type.continuous


In [None]:
# snapshot the date features first: process_date_column edits the map while we loop
date_features = [
    name for name, kind in feature_to_type_map.items() if kind is feature_type.date
]
for cname in date_features:
    process_date_column(df, cname, feature_to_type_map)

### set dtypes

In [None]:
# split the feature names by type in a single pass over the map
continuous_features = []
categorical_features = []
for feat, kind in feature_to_type_map.items():
    if kind == feature_type.continuous:
        continuous_features.append(feat)
    elif kind == feature_type.categorical:
        categorical_features.append(feat)

# sanity check: every column of df is accounted for by exactly these two lists
assert len(continuous_features) + len(categorical_features) == df.shape[1]

In [None]:
# continuous features: must already be fully populated, then stored as float32
for column in continuous_features:
    values = df[column]
    assert not values.isna().any(), f"Na found in {column}"
    df[column] = values.astype(np.float32)

# categorical features: a missing value becomes just one more category ("#NA#")
for column in categorical_features:
    df[column] = df[column].fillna("#NA#").astype("category")

    # sanity check: nothing missing after the fill
    assert not df[column].isna().any(), f"Na found in {column}"

### show final

In [None]:
df.head()

## Modeling

### train/test split

In [None]:
# hold out 33% of the rows for validation; the fixed seed keeps the split reproducible
# (important note about this at the end)
train_df, valid_df = train_test_split(
    df,
    test_size=0.33,
    random_state=42,
)

print(f"{len(train_df) = }")
print(f"{len(valid_df) = }")

### define tf dataset

In [None]:
def df_to_dataset(dataframe: pd.DataFrame, target_name: str, shuffle: bool = True, batch_size: int = 32):
    """Build a batched ``tf.data.Dataset`` of ``(features, labels)`` from a DataFrame.

    Args:
        dataframe: source rows; not modified (a copy is taken).
        target_name: column used as the label. It is removed from the
            features so the label cannot leak into the model inputs.
        shuffle: whether to shuffle the examples before batching.
        batch_size: number of examples per batch.

    Returns:
        A dataset yielding ``(features, labels)`` batches, where ``features``
        is a dict mapping each remaining column name to an array with a
        trailing axis of length 1.

    Raises:
        KeyError: if ``target_name`` is not a column of ``dataframe``.
    """
    features = dataframe.copy()
    labels = features.pop(target_name)

    # build the feature dict from the copy the target was popped from, and give
    # every column a trailing axis so each feature has shape (n_rows, 1)
    feature_arrays = {name: column.to_numpy()[:, None] for name, column in features.items()}
    ds = tf.data.Dataset.from_tensor_slices((feature_arrays, labels))
    if shuffle:
        # set max buffer size of 100k to avoid blowing up the memory
        # this may result in not perfect shuffles
        # (at least 1, because shuffle rejects a non-positive buffer size)
        ds = ds.shuffle(buffer_size=max(1, min(len(dataframe), 100_000)))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

In [None]:
# build the tf datasets: only the training set is shuffled,
# the validation set is built with shuffle=False
train_ds = df_to_dataset(train_df, target_name=target_name, shuffle=True)
valid_ds = df_to_dataset(valid_df, target_name=target_name, shuffle=False)

### build model preprocessing layers

In [None]:
#helpers
def get_normalization_layer(name, dataset):
    """Return a ``layers.Normalization`` layer adapted to one feature.

    ``dataset`` is expected to yield ``(features, labels)`` pairs where
    ``features`` can be indexed by ``name``; the layer is adapted on that
    feature only.
    """
    # keep just the requested feature, dropping the labels
    single_feature_ds = dataset.map(lambda features, _labels: features[name])

    # axis=None: one scalar mean/variance for the whole feature
    layer = layers.Normalization(axis=None)
    layer.adapt(single_feature_ds)
    return layer

def get_lookup_layer(name, dataset, max_tokens=None):
    """Return a ``layers.StringLookup`` layer adapted to one feature.

    ``dataset`` is expected to yield ``(features, labels)`` pairs where
    ``features`` can be indexed by ``name``. ``max_tokens`` is passed straight
    through to ``StringLookup``.
    """
    # keep just the requested feature, dropping the labels
    single_feature_ds = dataset.map(lambda features, _labels: features[name])

    # adapting learns the vocabulary and assigns each value a fixed integer index
    lookup = layers.StringLookup(max_tokens=max_tokens)
    lookup.adapt(single_feature_ds)
    return lookup

def get_embedding_layer(lookup_layer):
    """Placeholder for building an embedding layer from a lookup layer.

    Unfinished: only the embedding-size heuristic is defined. Nothing is built
    from ``lookup_layer`` (it is currently unused) and ``None`` is returned.
    """
    def emb_sz_rule(n_cat: int) -> int:
        """
        fast.ai's rule of thumb for embedding size, capped at 600
        https://forums.fast.ai/t/size-of-embedding-for-categorical-variables/42608/2
        """
        suggested_size = round(1.6 * n_cat ** 0.56)
        return min(600, suggested_size)

    # TODO: create and return the embedding layer using emb_sz_rule
    return None
  

In [None]:
# Build one keras.Input per feature and pass it through a preprocessing layer
# adapted on the training set.
all_inputs = []
encoded_features = []

for f in continuous_features:
    # define input
    feature_input = keras.Input(shape=(1,), name=f)
    all_inputs.append(feature_input)

    # add input prep layers
    normalization_layer = get_normalization_layer(f, train_ds)
    encoded_feature = normalization_layer(feature_input)
    encoded_features.append(encoded_feature)

for f in categorical_features:
    # the target is the label, so it must never be fed to the model as an input
    if f == target_name:
        continue

    # define input; categorical values are strings, so the input must be declared
    # as a string tensor for StringLookup (the default dtype is float)
    feature_input = keras.Input(shape=(1,), name=f, dtype="string")
    all_inputs.append(feature_input)

    # add input prep layers
    lookup_layer = get_lookup_layer(f, train_ds)
    encoded_feature = lookup_layer(feature_input)

    # NOTE(review): this appends the raw integer index from the lookup; an
    # embedding or one-hot step presumably still needs to be added here
    encoded_features.append(encoded_feature)