## Data Preparation
#### Tristan Chen, Jessica Nguyen, Hera Chan

In [None]:
# Installing covidcast
!pip install covidcast



In [None]:
from datetime import date
import covidcast

In [None]:
# Look up every FIPS code that starts with "06" and keep the first element of
# the result, which is keyed by FIPS code.
ca_counties = covidcast.fips_to_name("^06.*", ties_method="all")
ca_counties_fips = ca_counties[0]

# 06000 stands for California as a whole rather than for a single county,
# so that entry is removed before querying county-level data.
del ca_counties_fips["06000"]

# Query the claims-based hospital-admissions signal for the remaining codes.
# The geo_values that actually come back become the county list that every
# later query in this notebook is restricted to.
ad = covidcast.signal(
    'hospital-admissions',
    'smoothed_adj_covid19_from_claims',
    date(2020, 2, 20),
    date(2020, 9, 27),
    geo_type="county",
    geo_values=ca_counties_fips,
)
ca_counties_1 = ad['geo_value'].unique()

In [None]:
# 'indicator-combination' / 'confirmed_incidence_prop' for the counties in
# ca_counties_1, trimmed to the columns used downstream. This signal is later
# renamed to 'label' (unlagged) and 'confirmed_cases_prop' (lagged).
labels = covidcast.signal(
    data_source='indicator-combination',
    signal='confirmed_incidence_prop',
    start_day=date(2020, 2, 20),
    end_day=date(2020, 9, 27),
    geo_type="county",
    geo_values=ca_counties_1,
)[['geo_value', 'signal', 'time_value', 'value', 'geo_type', 'data_source']]

We drop FIPS code 06000 because it represents the state of California as a whole, not an individual county within California.

In [None]:
# 'doctor-visits' / 'smoothed_adj_cli' for the counties in ca_counties_1,
# trimmed to the columns used downstream.
doctor_visits = covidcast.signal(
    data_source='doctor-visits',
    signal='smoothed_adj_cli',
    start_day=date(2020, 2, 20),
    end_day=date(2020, 9, 27),
    geo_type="county",
    geo_values=ca_counties_1,
)[['geo_value', 'signal', 'time_value', 'value', 'geo_type', 'data_source']]

In [None]:
# 'hospital-admissions' / 'smoothed_adj_covid19' for the counties in
# ca_counties_1, trimmed to the columns used downstream.
hospital_admissions = covidcast.signal(
    data_source='hospital-admissions',
    signal='smoothed_adj_covid19',
    start_day=date(2020, 2, 20),
    end_day=date(2020, 9, 27),
    geo_type="county",
    geo_values=ca_counties_1,
)[['geo_value', 'signal', 'time_value', 'value', 'geo_type', 'data_source']]

In [None]:
# 'hospital-admissions' / 'smoothed_adj_covid19_from_claims' for the counties
# in ca_counties_1, trimmed to the columns used downstream. This is the same
# signal that was queried earlier into `ad` to build the county list.
hospital_admissionsclaims = covidcast.signal(
    data_source='hospital-admissions',
    signal='smoothed_adj_covid19_from_claims',
    start_day=date(2020, 2, 20),
    end_day=date(2020, 9, 27),
    geo_type="county",
    geo_values=ca_counties_1,
)[['geo_value', 'signal', 'time_value', 'value', 'geo_type', 'data_source']]

In [None]:
# 'chng' / 'smoothed_adj_outpatient_covid' for the counties in ca_counties_1,
# trimmed to the columns used downstream.
outpatient_covid = covidcast.signal(
    data_source='chng',
    signal='smoothed_adj_outpatient_covid',
    start_day=date(2020, 2, 20),
    end_day=date(2020, 9, 27),
    geo_type="county",
    geo_values=ca_counties_1,
)[['geo_value', 'signal', 'time_value', 'value', 'geo_type', 'data_source']]

## Merging Data

In [None]:
# Merge the signals into one wide frame keyed by county and date.
# Positions 0-4 are the five signals shifted by one day, positions 5-9 are the
# same five signals shifted by two days, and position 10 is the unshifted
# confirmed-incidence signal that serves as the prediction label. With a shift
# of k days, the row for day t carries the value observed on day t-k.
dataset = covidcast.aggregate_signals(
    [
        # lag 1
        doctor_visits, outpatient_covid, hospital_admissions, hospital_admissionsclaims, labels,
        # lag 2
        doctor_visits, outpatient_covid, hospital_admissions, hospital_admissionsclaims, labels,
        # lag 0 (label)
        labels,
    ],
    dt=[1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0],
)

# The merged value columns embed each signal's position in the list above.
# errors="raise" makes the rename fail loudly if any expected column is absent,
# instead of silently leaving the frame with its original column names.
dataset = dataset.rename(
    columns={
        'doctor-visits_smoothed_adj_cli_0_value': 'doctor_visits (t-1)',
        'chng_smoothed_adj_outpatient_covid_1_value': 'outpatient_covid (t-1)',
        'hospital-admissions_smoothed_adj_covid19_2_value': 'hospital_admissions (t-1)',
        'hospital-admissions_smoothed_adj_covid19_from_claims_3_value': 'hospital_admissionsclaims (t-1)',
        'indicator-combination_confirmed_incidence_prop_4_value': 'confirmed_cases_prop (t-1)',
        'doctor-visits_smoothed_adj_cli_5_value': 'doctor_visits (t-2)',
        'chng_smoothed_adj_outpatient_covid_6_value': 'outpatient_covid (t-2)',
        'hospital-admissions_smoothed_adj_covid19_7_value': 'hospital_admissions (t-2)',
        'hospital-admissions_smoothed_adj_covid19_from_claims_8_value': 'hospital_admissionsclaims (t-2)',
        'indicator-combination_confirmed_incidence_prop_9_value': 'confirmed_cases_prop (t-2)',
        'indicator-combination_confirmed_incidence_prop_10_value': 'label',
    },
    errors="raise",
)

In [None]:
# Preview the merged frame. Rows at the start of the date range have no lagged
# features yet, and rows at the end have lagged features but no label.
dataset

Unnamed: 0,geo_value,time_value,doctor_visits (t-1),outpatient_covid (t-1),hospital_admissions (t-1),hospital_admissionsclaims (t-1),confirmed_cases_prop (t-1),doctor_visits (t-2),outpatient_covid (t-2),hospital_admissions (t-2),hospital_admissionsclaims (t-2),confirmed_cases_prop (t-2),label,geo_type
0,06001,2020-02-20,,,,,,,,,,,0.0,county
1,06007,2020-02-20,,,,,,,,,,,0.0,county
2,06013,2020-02-20,,,,,,,,,,,0.0,county
3,06019,2020-02-20,,,,,,,,,,,0.0,county
4,06029,2020-02-20,,,,,,,,,,,0.0,county
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6685,06097,2020-09-29,,,,,,3.670377,0.247314,,,30.343734,,county
6686,06099,2020-09-29,,,,,,10.490177,0.330599,,0.343012,0.000000,,county
6687,06107,2020-09-29,,,,,,6.624808,0.199940,,,15.658684,,county
6688,06111,2020-09-29,,,,,,3.494010,0.081933,1.210626,1.184979,0.000000,,county


In [None]:
# Lagging by up to two days leaves incomplete rows at both ends of the date
# range: the first two days lack lagged features and the two days past the
# query window lack a label. Remove those four dates.
edge_dates = ['2020-02-20', '2020-02-21', '2020-09-28', '2020-09-29']
keep_rows = dataset['time_value'] != edge_dates[0]
for edge_date in edge_dates[1:]:
    keep_rows &= dataset['time_value'] != edge_date
dataset = dataset.loc[keep_rows]

# Count the missing values that remain in each column.
dataset.isna().sum()

geo_value                             0
time_value                            0
doctor_visits (t-1)                   0
outpatient_covid (t-1)                0
hospital_admissions (t-1)          1561
hospital_admissionsclaims (t-1)    2475
confirmed_cases_prop (t-1)            0
doctor_visits (t-2)                   0
outpatient_covid (t-2)                0
hospital_admissions (t-2)          1554
hospital_admissionsclaims (t-2)    2481
confirmed_cases_prop (t-2)            0
label                                 0
geo_type                              0
dtype: int64

In [None]:
# Drop every county that has at least one missing
# 'hospital_admissionsclaims (t-1)' value.
# WARNING: only ~2K rows of data remain once this is done.
rows_missing_claims = dataset[dataset['hospital_admissionsclaims (t-1)'].isnull()]
counties_missing_claims = rows_missing_claims['geo_value'].unique()

# Show the counties about to be removed next to the full county list.
print(counties_missing_claims)
print(ca_counties_1)

# One vectorized membership test replaces re-filtering the frame once per
# county; `dataset` itself is left untouched.
df = dataset[~dataset['geo_value'].isin(counties_missing_claims)]
df

['06007' '06029' '06031' '06047' '06053' '06057' '06077' '06079' '06083'
 '06087' '06095' '06097' '06099' '06107' '06113' '06019' '06041' '06111'
 '06061']
['06001' '06013' '06019' '06037' '06041' '06059' '06061' '06065' '06067'
 '06071' '06073' '06075' '06081' '06085' '06111' '06029' '06083' '06077'
 '06031' '06099' '06047' '06095' '06079' '06097' '06053' '06107' '06057'
 '06007' '06113' '06087']


Unnamed: 0,geo_value,time_value,doctor_visits (t-1),outpatient_covid (t-1),hospital_admissions (t-1),hospital_admissionsclaims (t-1),confirmed_cases_prop (t-1),doctor_visits (t-2),outpatient_covid (t-2),hospital_admissions (t-2),hospital_admissionsclaims (t-2),confirmed_cases_prop (t-2),label,geo_type
60,06001,2020-02-22,0.000000,0.011169,0.100903,0.113524,0.0,0.000000,0.012182,0.120272,0.110992,0.000000,0.000000,county
62,06013,2020-02-22,0.000000,0.011528,0.097979,0.118647,0.0,0.000000,0.012669,0.098062,0.118745,0.000000,0.000000,county
66,06037,2020-02-22,0.102249,0.002221,0.092971,0.104829,0.0,0.102826,0.002446,0.100812,0.111187,0.000000,0.000000,county
71,06059,2020-02-22,0.098592,0.003301,0.079441,0.093063,0.0,0.098592,0.003528,0.079474,0.091819,0.000000,0.000000,county
73,06065,2020-02-22,0.012824,0.011176,0.087820,0.090121,0.0,0.000000,0.012328,0.086867,0.090382,0.000000,0.000000,county
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6615,06071,2020-09-27,3.669759,0.209799,6.422002,3.880961,0.0,3.662651,0.183888,5.491649,3.657069,14.219629,37.108645,county
6616,06073,2020-09-27,3.232160,0.178928,5.407488,1.984030,0.0,3.434456,0.153158,5.308291,1.876802,12.131814,18.242654,county
6617,06075,2020-09-27,7.035193,0.286158,2.876024,1.154249,0.0,7.350537,0.271349,3.212283,1.512510,10.436175,11.910852,county
6620,06081,2020-09-27,3.026974,0.163244,5.329470,2.697026,0.0,3.209329,0.155770,6.374249,3.207936,9.262001,7.566142,county


In [None]:
# Count missing values per column in the county-filtered frame.
df.isna().sum()

geo_value                          0
time_value                         0
doctor_visits (t-1)                0
outpatient_covid (t-1)             0
hospital_admissions (t-1)          0
hospital_admissionsclaims (t-1)    0
confirmed_cases_prop (t-1)         0
doctor_visits (t-2)                0
outpatient_covid (t-2)             0
hospital_admissions (t-2)          0
hospital_admissionsclaims (t-2)    0
confirmed_cases_prop (t-2)         0
label                              0
geo_type                           0
dtype: int64

In [None]:
# Display the county-filtered frame.
df

Unnamed: 0,geo_value,time_value,doctor_visits (t-1),outpatient_covid (t-1),hospital_admissions (t-1),hospital_admissionsclaims (t-1),confirmed_cases_prop (t-1),doctor_visits (t-2),outpatient_covid (t-2),hospital_admissions (t-2),hospital_admissionsclaims (t-2),confirmed_cases_prop (t-2),label,geo_type
60,06001,2020-02-22,0.000000,0.011169,0.100903,0.113524,0.0,0.000000,0.012182,0.120272,0.110992,0.000000,0.000000,county
62,06013,2020-02-22,0.000000,0.011528,0.097979,0.118647,0.0,0.000000,0.012669,0.098062,0.118745,0.000000,0.000000,county
66,06037,2020-02-22,0.102249,0.002221,0.092971,0.104829,0.0,0.102826,0.002446,0.100812,0.111187,0.000000,0.000000,county
71,06059,2020-02-22,0.098592,0.003301,0.079441,0.093063,0.0,0.098592,0.003528,0.079474,0.091819,0.000000,0.000000,county
73,06065,2020-02-22,0.012824,0.011176,0.087820,0.090121,0.0,0.000000,0.012328,0.086867,0.090382,0.000000,0.000000,county
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6615,06071,2020-09-27,3.669759,0.209799,6.422002,3.880961,0.0,3.662651,0.183888,5.491649,3.657069,14.219629,37.108645,county
6616,06073,2020-09-27,3.232160,0.178928,5.407488,1.984030,0.0,3.434456,0.153158,5.308291,1.876802,12.131814,18.242654,county
6617,06075,2020-09-27,7.035193,0.286158,2.876024,1.154249,0.0,7.350537,0.271349,3.212283,1.512510,10.436175,11.910852,county
6620,06081,2020-09-27,3.026974,0.163244,5.329470,2.697026,0.0,3.209329,0.155770,6.374249,3.207936,9.262001,7.566142,county


In [None]:
# Spot-check the lag alignment for one county (FIPS 06001): on each row,
# the (t-1) column should match the previous day's label and the (t-2)
# column the label from two days earlier.
is_county_06001 = df['geo_value'] == '06001'
df.loc[
    is_county_06001,
    ['time_value', 'label', 'confirmed_cases_prop (t-1)', 'confirmed_cases_prop (t-2)'],
]

Unnamed: 0,time_value,label,confirmed_cases_prop (t-1),confirmed_cases_prop (t-2)
60,2020-02-22,0.000000,0.000000,0.000000
90,2020-02-23,0.000000,0.000000,0.000000
120,2020-02-24,0.000000,0.000000,0.000000
150,2020-02-25,0.000000,0.000000,0.000000
180,2020-02-26,0.000000,0.000000,0.000000
...,...,...,...,...
6480,2020-09-23,5.444769,6.402091,4.966108
6510,2020-09-24,6.701254,5.444769,6.402091
6540,2020-09-25,4.607112,6.701254,5.444769
6570,2020-09-26,0.000000,4.607112,6.701254


In [None]:
# Exclude the four edge dates of the lagged merge from df.
# NOTE(review): the same four dates were already removed from `dataset`, which
# df is derived from, so this is expected to leave the rows unchanged.
time_col = df['time_value']
not_first_day = time_col != '2020-02-20'
not_second_day = time_col != '2020-02-21'
not_day_after_end = time_col != '2020-09-28'
not_two_days_after_end = time_col != '2020-09-29'
df1 = df.loc[not_first_day & not_second_day & not_day_after_end & not_two_days_after_end]

In [None]:
# List the columns of df1.
df1.columns

Index(['geo_value', 'time_value', 'doctor_visits (t-1)',
       'outpatient_covid (t-1)', 'hospital_admissions (t-1)',
       'hospital_admissionsclaims (t-1)', 'confirmed_cases_prop (t-1)',
       'doctor_visits (t-2)', 'outpatient_covid (t-2)',
       'hospital_admissions (t-2)', 'hospital_admissionsclaims (t-2)',
       'confirmed_cases_prop (t-2)', 'label', 'geo_type'],
      dtype='object')

In [None]:
# Keep the county and date identifiers, the lagged features and the label.
# 'geo_type' is the only df1 column that is not carried over.
columns = [
    'geo_value', 'time_value',
    'doctor_visits (t-1)', 'outpatient_covid (t-1)',
    'hospital_admissions (t-1)', 'hospital_admissionsclaims (t-1)',
    'confirmed_cases_prop (t-1)',
    'doctor_visits (t-2)', 'outpatient_covid (t-2)',
    'hospital_admissions (t-2)', 'hospital_admissionsclaims (t-2)',
    'confirmed_cases_prop (t-2)',
    'label',
]
df2 = df1[columns]
df2

Unnamed: 0,geo_value,time_value,doctor_visits (t-1),outpatient_covid (t-1),hospital_admissions (t-1),hospital_admissionsclaims (t-1),confirmed_cases_prop (t-1),doctor_visits (t-2),outpatient_covid (t-2),hospital_admissions (t-2),hospital_admissionsclaims (t-2),confirmed_cases_prop (t-2),label
60,06001,2020-02-22,0.000000,0.011169,0.100903,0.113524,0.0,0.000000,0.012182,0.120272,0.110992,0.000000,0.000000
62,06013,2020-02-22,0.000000,0.011528,0.097979,0.118647,0.0,0.000000,0.012669,0.098062,0.118745,0.000000,0.000000
66,06037,2020-02-22,0.102249,0.002221,0.092971,0.104829,0.0,0.102826,0.002446,0.100812,0.111187,0.000000,0.000000
71,06059,2020-02-22,0.098592,0.003301,0.079441,0.093063,0.0,0.098592,0.003528,0.079474,0.091819,0.000000,0.000000
73,06065,2020-02-22,0.012824,0.011176,0.087820,0.090121,0.0,0.000000,0.012328,0.086867,0.090382,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6615,06071,2020-09-27,3.669759,0.209799,6.422002,3.880961,0.0,3.662651,0.183888,5.491649,3.657069,14.219629,37.108645
6616,06073,2020-09-27,3.232160,0.178928,5.407488,1.984030,0.0,3.434456,0.153158,5.308291,1.876802,12.131814,18.242654
6617,06075,2020-09-27,7.035193,0.286158,2.876024,1.154249,0.0,7.350537,0.271349,3.212283,1.512510,10.436175,11.910852
6620,06081,2020-09-27,3.026974,0.163244,5.329470,2.697026,0.0,3.209329,0.155770,6.374249,3.207936,9.262001,7.566142


In [None]:
import pandas as pd

# Discard the old row labels, make sure time_value is a datetime column, and
# use it as the index of the final training frame.
# NOTE: re-running this cell on its own fails with a KeyError, because
# time_value has already been moved into the index; re-run the previous cell
# first.
df2 = (
    df2
    .reset_index(drop=True)
    .assign(time_value=lambda frame: pd.to_datetime(frame['time_value']))
    .set_index('time_value')
)

In [None]:
from google.colab import files

# Write the training frame to a CSV file and hand that file to
# google.colab's download helper.
csv_name = 'training_data.csv'
df2.to_csv(csv_name)
files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>