In [12]:
# make sure to install these packages before running:
# pip install pandas
# pip install sodapy

import pandas as pd
from sodapy import Socrata

# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.cdc.gov", None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata(data.cdc.gov,
#                  MyAppToken,
#                  userame="user@example.com",
#                  password="AFakePassword")

# First 2000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get_all("vbim-akqf", where="(hosp_yn != 'Unknown') and (hosp_yn != 'Missing') and (icu_yn != 'Unknown') and (icu_yn != 'Missing') and (death_yn != 'Unknown') and (death_yn != 'Missing') and (sex != 'Unknown') and (sex != 'Missing') and (sex != 'Other') and (sex != 'NA') and (age_group != 'Missing') and (age_group != 'NA') and (race_ethnicity_combined != 'Missing') and  (race_ethnicity_combined != 'NA') and (race_ethnicity_combined != 'Unknown') and (medcond_yn != 'Missing') and (medcond_yn != 'Unknown')", limit=22500000)
# Convert to pandas DataFrame
df = pd.DataFrame.from_records(results)



In [13]:
new_df = df.iloc[:,5:12]

print(new_df.head())
print(new_df.shape)

      sex    age_group race_ethnicity_combined hosp_yn icu_yn death_yn  \
0  Female  0 - 9 Years     White, Non-Hispanic      No     No       No   
1  Female  0 - 9 Years     White, Non-Hispanic      No     No       No   
2  Female  0 - 9 Years     White, Non-Hispanic      No     No       No   
3  Female  0 - 9 Years     White, Non-Hispanic      No     No       No   
4  Female  0 - 9 Years     White, Non-Hispanic      No     No       No   

  medcond_yn  
0        Yes  
1         No  
2         No  
3         No  
4        Yes  
(642408, 7)


In [14]:
print(new_df.sex.value_counts(), '\n')
print(new_df.age_group.value_counts(), '\n')
print(new_df.race_ethnicity_combined.value_counts(), '\n')
print(new_df.medcond_yn.value_counts(), '\n')

Female    342353
Male      300055
Name: sex, dtype: int64 

20 - 29 Years    103062
50 - 59 Years     97649
40 - 49 Years     91822
30 - 39 Years     91432
60 - 69 Years     82583
10 - 19 Years     61168
70 - 79 Years     53846
80+ Years         39577
0 - 9 Years       21269
Name: age_group, dtype: int64 

White, Non-Hispanic                                     399689
Hispanic/Latino                                         127430
Black, Non-Hispanic                                      66657
Multiple/Other, Non-Hispanic                             23417
Asian, Non-Hispanic                                      19514
Native Hawaiian/Other Pacific Islander, Non-Hispanic      3535
American Indian/Alaska Native, Non-Hispanic               2166
Name: race_ethnicity_combined, dtype: int64 

No     332902
Yes    309506
Name: medcond_yn, dtype: int64 



In [15]:
print(df.hosp_yn.value_counts(), '\n')
print(df.icu_yn.value_counts(), '\n')
print(df.death_yn.value_counts(), '\n')

No     523988
Yes    118420
Name: hosp_yn, dtype: int64 

No     603347
Yes     39061
Name: icu_yn, dtype: int64 

No     601211
Yes     41197
Name: death_yn, dtype: int64 



In [16]:
features = new_df.iloc[:,[0,1,2,6]]
labels = new_df.iloc[:,[3,4,5]]

print(features.head(), '\n')
print(labels.head(), '\n')

      sex    age_group race_ethnicity_combined medcond_yn
0  Female  0 - 9 Years     White, Non-Hispanic        Yes
1  Female  0 - 9 Years     White, Non-Hispanic         No
2  Female  0 - 9 Years     White, Non-Hispanic         No
3  Female  0 - 9 Years     White, Non-Hispanic         No
4  Female  0 - 9 Years     White, Non-Hispanic        Yes 

  hosp_yn icu_yn death_yn
0      No     No       No
1      No     No       No
2      No     No       No
3      No     No       No
4      No     No       No 



In [17]:
pd.options.mode.chained_assignment = None
features["sex"] = features["sex"].astype('category').cat.codes
features["age_group"] = features["age_group"].astype('category').cat.codes
features = pd.get_dummies(features, columns=["race_ethnicity_combined"], prefix=["race"])
features["medcond_yn"] = features["medcond_yn"].astype('category').cat.codes
print(features.head())


   sex  age_group  medcond_yn  \
0    0          0           1   
1    0          0           0   
2    0          0           0   
3    0          0           0   
4    0          0           1   

   race_American Indian/Alaska Native, Non-Hispanic  race_Asian, Non-Hispanic  \
0                                                 0                         0   
1                                                 0                         0   
2                                                 0                         0   
3                                                 0                         0   
4                                                 0                         0   

   race_Black, Non-Hispanic  race_Hispanic/Latino  \
0                         0                     0   
1                         0                     0   
2                         0                     0   
3                         0                     0   
4                         0                     0

In [18]:
labels["hosp_yn"] = labels["hosp_yn"].astype('category').cat.codes
labels["icu_yn"] = labels["icu_yn"].astype('category').cat.codes
labels["death_yn"] = labels["death_yn"].astype('category').cat.codes
print(labels.head())

   hosp_yn  icu_yn  death_yn
0        0       0         0
1        0       0         0
2        0       0         0
3        0       0         0
4        0       0         0


In [19]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features.to_numpy(), labels.to_numpy(), test_size=0.2, random_state=42)

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.utils import normalize
x_train = normalize(x_train, axis=1)
x_test = normalize(x_test, axis=1)

model = Sequential()
model.add(Dense(10, input_dim=features.shape[1], activation='relu')) # Hidden 1
model.add(Dense(10, activation='relu')) # Hidden 2
model.add(Dense(3, activation='sigmoid')) # Output
model.compile(loss='mean_squared_error', optimizer='adam', metrics=["accuracy"])

model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x2356be26ca0>

In [21]:
test_loss, test_acc = model.evaluate(x=x_test, y=y_test) 
print(f"Test Loss: {test_loss}, Test Accuracy: {test_acc}")

Test Loss: 0.07053086161613464, Test Accuracy: 0.9912205338478088


In [22]:
import os
model.save(os.path.join("./model/","covid_model.h5"))

In [25]:
print(features.age_group.value_counts(), '\n')

2    103062
5     97649
4     91822
3     91432
6     82583
1     61168
7     53846
8     39577
0     21269
Name: age_group, dtype: int64 

