# Covid-19 Risk 2022
Predict patient outcomes in 44M reported US Covid-19 cases from basic demographics.

## Goal
Predict the risk of death from basic demographic information.

## Data
This dataset was downloaded from the CDC website and contains most confirmed Covid-19 cases in the US since the beginning of the pandemic. The anonymized patient data contains a number of demographic features and risk factors, along with whether the patient died.

## Train/Test Split
The training dataset consists of all confirmed Covid-19 patients before Oct 1st, 2021. The test dataset contains patients from Oct 2021 to Jan 18, 2022. Half of the test set is used to calculate the leaderboard, while the other half is used for the final evaluation.

## Missing Data
The data is not fully sanitized. Many patients have missing data. 

## Evaluation Metric
Predictions are evaluated using the area under the receiver operating characteristic curve (AUROC or AUC).

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV;
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier;

In [9]:
# Columns kept from the CDC extract; the last entry, 'death_yn', is the label.
# Not loaded: state_fips_code, res_county, county_fips_code,
# case_positive_specimen_interval, case_onset_interval, process, exposure_yn.
fields = [
    'case_month', 'res_state', 'age_group', 'sex', 'race', 'ethnicity',
    'labconfirmed_yn', 'symptomatic_yn', 'hosp_yn', 'icu_yn',
    'underlying_conditions_yn', 'death_yn',
]

# Explicit dtypes shared by both files.
column_dtypes = {'labconfirmed_yn': bool, 'symptomatic_yn': object, 'icu_yn': object}

# Training data (read_csv can also read from zip files directly).
df_train = pd.read_csv('train.csv', usecols=fields, dtype=column_dtypes)

# Test data: the same columns without the label.
df_test = pd.read_csv('test.csv', usecols=fields[:-1], dtype=column_dtypes)

for frame in (df_train, df_test):
    print(frame.shape)
df_train.head(1)

(36225855, 12)
(2594412, 11)


Unnamed: 0,case_month,res_state,age_group,sex,race,ethnicity,labconfirmed_yn,symptomatic_yn,hosp_yn,icu_yn,death_yn,underlying_conditions_yn
0,2020-01,NY,,,,,True,,,,0.0,


## Data Analysis

In [4]:
# How many distinct values each column holds.
distinct_counts = df_train.nunique()
print(f"nunique\n{distinct_counts}\n")

# The distinct values themselves, column by column.
for name in df_train.columns:
    values = df_train[name].unique()
    print(f"{name}\n{values}\n")

nunique
case_month                  21
res_state                   54
age_group                    5
sex                          4
race                         8
ethnicity                    4
labconfirmed_yn              2
symptomatic_yn               3
hosp_yn                      2
icu_yn                       3
death_yn                     2
underlying_conditions_yn     2
dtype: int64

case_month
['2020-01' '2020-02' '2020-03' '2020-04' '2020-05' '2020-06' '2020-07'
 '2020-08' '2020-09' '2020-10' '2020-11' '2020-12' '2021-01' '2021-02'
 '2021-03' '2021-04' '2021-05' '2021-06' '2021-07' '2021-08' '2021-09']

res_state
['NY' 'NC' 'NJ' 'IA' 'GA' 'NV' 'TX' 'FL' 'CA' 'TN' 'SC' 'UT' 'MO' 'WI'
 'OH' 'WA' 'MI' nan 'CO' 'CT' 'IN' 'MA' 'PR' 'MD' 'AL' 'ME' 'SD' 'AZ' 'KY'
 'NM' 'KS' 'NE' 'PA' 'VA' 'IL' 'DC' 'LA' 'AR' 'MS' 'OR' 'MN' 'VT' 'MT'
 'ID' 'AK' 'OK' 'HI' 'NH' 'ND' 'WY' 'RI' 'DE' 'WV' 'VI' 'GU']

age_group
[nan '65+ years' '18 to 49 years' '50 to 64 years' '0 - 17 years'
 'Missing']

s

In [5]:
# Death rate (in %) among the rows whose outcome is recorded.
labeled = df_train.dropna(subset=['death_yn'])
total = len(labeled)
deaths = labeled.loc[labeled['death_yn'] == 1]  # rows with a recorded death
print(len(deaths) / total * 100)

2.629269342229879


In [6]:
# Where the recorded deaths concentrate.
# Ten most common values for the high-cardinality columns ...
for col in ('case_month', 'res_state'):
    print(f"{col}\n{deaths[col].value_counts().iloc[:10]}\n")
# ... and the full breakdown for the demographic columns.
for col in ('age_group', 'sex', 'race', 'ethnicity'):
    print(f"{col}\n{deaths[col].value_counts()}\n")

case_month
2020-12    64781
2020-04    56784
2020-11    42270
2021-01    42146
2020-03    18865
2020-05    17978
2021-08    17879
2020-07    15139
2020-10    14932
2021-09    12845
Name: case_month, dtype: int64

res_state
CA    60407
FL    44609
NY    36881
IL    29430
NJ    16839
PA    16192
OH    15592
MA    14455
AZ    14377
MI     7713
Name: res_state, dtype: int64

age_group
65+ years         312110
50 to 64 years     32812
18 to 49 years      6559
0 - 17 years          13
Missing                4
Name: age_group, dtype: int64

sex
Male       188259
Female     162556
Unknown       133
Missing         1
Name: sex, dtype: int64

race
White                                     263440
Black                                      39095
Asian                                      10906
Unknown                                    10122
Missing                                     8335
Multiple/Other                              1812
American Indian/Alaska Native                929
Native Hawa

In [7]:
# Share of NaN entries per column, expressed in percent.
n_rows = len(df_train)
nan_counts = df_train.isna().sum()
missing = nan_counts / n_rows * 100
print(missing)

case_month                   0.000000
res_state                    0.002542
age_group                    1.097434
sex                          3.007832
race                        15.793333
ethnicity                   18.604105
labconfirmed_yn              0.000000
symptomatic_yn              51.848121
hosp_yn                     52.816078
icu_yn                      94.681083
death_yn                    63.024028
underlying_conditions_yn    94.207518
dtype: float64


## Data Preprocessing

In [10]:
# Keep only the most recent training month, then drop the now-constant column.
# Chaining .drop() onto the filter returns a fresh frame, so none of the steps
# below mutates a slice of the full frame (which triggers SettingWithCopyWarning).
df_train = df_train.loc[df_train['case_month'] == '2021-09'].drop(columns=['case_month'])
print(f"shape: {df_train.shape}")

# Keep rows with a usable age_group, sex and res_state: not NaN, and not one of
# the 'Missing' / 'Unknown' placeholder strings.
usable = (
    df_train[['age_group', 'sex', 'res_state']].notna().all(axis=1)
    & (df_train['age_group'] != 'Missing')
    & ~df_train['sex'].isin(['Unknown', 'Missing'])
)
df_train = df_train.loc[usable].copy()

# Impute / encode column by column.
df_train = df_train.replace({
    # Assume no info means survived.
    # NOTE(review): a large share of death_yn is NaN in the raw data; treating all
    # of those rows as survivors is a strong assumption — confirm it is intended.
    'death_yn': {np.nan: 0},
    'symptomatic_yn': {np.nan: 0, 'nul': 0, '0': 0, '1': 1},
    'hosp_yn': {np.nan: 0},
    'icu_yn': {np.nan: 0, 'nul': 0, '0': 0, '1': 1},
    'underlying_conditions_yn': {np.nan: 0},
    # ethnicity: 1 if Hispanic/Latino, else 0 (including unknown / missing).
    'ethnicity': {'Hispanic/Latino': 1, 'Non-Hispanic/Latino': 0, 'Unknown': 0, 'Missing': 0, np.nan: 0},
    # sex: Male -> 1, Female -> 0.
    'sex': {'Male': 1, 'Female': 0},
    # race: placeholders become NaN so the mode fill below handles them.
    'race': {'Unknown': np.nan, 'Missing': np.nan},
})

# Fill whatever is still NaN with each column's most frequent value.
df_train = df_train.fillna(df_train.mode().iloc[0])

shape: (3219293, 11)


In [11]:
# Target vector; taken before the label column is removed from df_train.
y = df_train.loc[:, 'death_yn']

In [13]:
# Repeat for test data
df_test.drop(columns=['case_month'], inplace=True)
df_test.replace({
    'age_group': {'Missing': np.nan},
    'sex': {'Missing': np.nan, 'Unknown': np.nan},
    'race': {'Unknown': np.nan, 'Missing': np.nan}
}, inplace=True)
df_test.replace(
    {'symptomatic_yn':{np.nan:0, 'nul':0, '0':0, '1':1},
     'hosp_yn':{np.nan:0},
     'icu_yn':{np.nan:0, 'nul':0, '0':0, '1':1},
     'underlying_conditions_yn':{np.nan:0}},
    inplace=True
) 

df_test.replace({'ethnicity': {'Hispanic/Latino': 1, 'Non-Hispanic/Latino': 0, 'Unknown': 0, 'Missing': 0, np.nan: 0} }, inplace=True)
df_test.replace({'sex':{'Male':1, 'Female':0}}, inplace=True)
df_test.fillna(df_test.mode().iloc[0], inplace=True) # fill nan w/ most frequent

In [12]:
# Encode state and race as one-hot, age_group as ordinal
column_trans = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore', sparse=False, dtype=bool), ['res_state', 'race']),
    (OrdinalEncoder(dtype=np.int8), ['age_group']),
    remainder='passthrough',
    n_jobs=8,
    verbose=True
)
df_train.drop(columns=['death_yn'], inplace=True)
X = column_trans.fit_transform(df_train)

print(X.shape)

(3062393, 68)


In [14]:
# Repeat for test data
X_test = column_trans.transform(df_test)
print(X_test.shape)

(2594412, 68)
[ColumnTransformer] ..... (3 of 3) Processing remainder, total=   0.0s
[ColumnTransformer]  (2 of 3) Processing ordinalencoder, total=   0.6s
[ColumnTransformer] . (1 of 3) Processing onehotencoder, total=   1.2s


In [15]:
# train, test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.25, random_state=13); 
print(X_train.shape)
print(X_val.shape)

(2296794, 68)
(765599, 68)


## Hyperparameter Tuning

In [6]:
# Randomized search over gradient-boosting hyperparameters:
# learning_rate, n_estimators, max_depth, min_samples_split.
parameters = {
    'learning_rate': [0.5, 0.25, 0.1, 0.05, 0.01],
    'n_estimators': [8, 16, 32, 64, 100, 200],
    # Tree depth has to be an integer: 1, 3, ..., 15. (np.linspace would yield
    # floats, which scikit-learn's parameter validation rejects.)
    'max_depth': list(range(1, 16, 2)),
    # Floats are interpreted as a fraction of the samples: 0.1, 0.2, ..., 1.0.
    'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
}
clf = GradientBoostingClassifier(random_state=13)

# Select on ROC AUC: it is the competition metric, and with so few positive
# cases plain accuracy is dominated by the majority class.
random_search = RandomizedSearchCV(
    clf,
    param_distributions=parameters,
    n_iter=40,
    cv=3,
    scoring='roc_auc',
    random_state=13,
    n_jobs=8,
)
random_search.fit(X_train, y_train)

# .score() uses the scorer configured above, so both numbers are AUC values.
print("Optimal parameters:", random_search.best_params_)
print("Train AUC: %.3f" % random_search.score(X_train, y_train))
print("Validation AUC: %.3f" % random_search.score(X_val, y_val))

In [None]:
# Final model: fit on ALL encoded training rows (X, y), i.e. train + validation.
# RandomizedSearchCV's built-in refit only uses X_train, hence the separate fit.
# NOTE(review): the hyperparameters below are hard-coded, presumably copied from
# an earlier search run — update them if the search is re-run.
gbc = GradientBoostingClassifier(
    n_estimators=100,
    min_samples_split=0.1,  # float -> fraction of the samples
    max_depth=11,           # must be an integer, not 11.0
    learning_rate=0.25,
    verbose=2,
)
gbc.fit(X, y)

## Submission

In [9]:
# Predicted probability of the positive class (column 1) for every test row.
# (random_search.predict_proba(X_test)[:, 1] would score with the searched model instead.)
ypred = gbc.predict_proba(X_test)[:, 1]

# Build the submission: Kaggle expects two columns, Id and prediction.
submission = pd.DataFrame({'prediction': ypred})
submission['Id'] = submission.index  # row position doubles as the Id
submission.to_csv('ps2_submission.csv', index=False)