# COVID-19 Search Trends & Hospitalizations


We need a mixture of code files and notebooks, because they want to be able to run some of our work easily.
I'm guessing we can simply put the polished code here and experiment in the code files.


In [None]:
import numpy as np
import pandas as pd

First, load the datasets directly into 2D NumPy arrays. Two different search-data arrays are used: one holding string values (for the feature names) and the other holding floats (for preprocessing).


In [None]:
# The symptom-search CSV is parsed twice: once with genfromtxt's default float
# dtype for numeric preprocessing, and once as text (dtype=None) so that the
# header names and the region/date labels are preserved.
search_data = np.genfromtxt(
    '2020_US_weekly_symptoms_dataset.csv',
    delimiter=',',
)
search_data_info = np.genfromtxt(
    '2020_US_weekly_symptoms_dataset.csv',
    delimiter=',',
    dtype=None,
    encoding='utf-8',
)
hospitalization_data = np.genfromtxt(
    'aggregated_cc_by.csv',
    delimiter=',',
    dtype=None,
    encoding='utf-8',
)

# Row 0 of each table is treated as the header and skipped here.
search_data_no_head = search_data[1:]
hospitalization_data_no_head = hospitalization_data[1:]

# Label columns 0-7 (header row included) and the header names from column 9 on.
# NOTE(review): column 8 falls in neither slice - confirm that skipping it is
# intended and not an off-by-one in the symptom offset.
search_data_name = search_data_info[:, :8]
search_data_symptoms = search_data_info[0, 9:]

OSError: ignored

**Cleaning up the datasets**: Any symptom with less than 50% of its data filled in was removed. The sub-region 2 columns were also removed, since they did not identify any smaller sub-regions.


In [None]:
# Flag every column of the float table in which more than half of the rows are
# NaN.  Columns that could not be parsed as numbers are NaN throughout, so they
# are flagged as well.
mostly_empty = (np.count_nonzero(np.isnan(search_data_no_head), axis=0) >
                0.5 * np.shape(search_data_no_head)[0])

# Boolean indexing is used instead of np.delete: np.delete only interprets a
# boolean array as a mask from NumPy 1.19 on; older releases cast it to the
# integer indices 0 and 1 and removed the wrong columns.
search_data = search_data_no_head[:, ~mostly_empty]

# search_data_symptoms holds the header names from column 9 onwards, so the
# matching tail of the mask selects the names of the surviving columns.
# NOTE(review): column 8 is covered by the data filter above but has no entry
# in search_data_symptoms; if that column is ever kept, names and data will
# differ in width - confirm the intended symptom offset.
search_data_symptoms = search_data_symptoms[~mostly_empty[9:]]

[['open_covid_region_code' 'country_region_code' 'country_region' ...
  'symptom:Ventricular fibrillation' 'symptom:Viral pneumonia'
  'symptom:Yawn']
 ['US-AK' 'US' 'United States' ... '9.91' '' '14.28']
 ['US-AK' 'US' 'United States' ... '8.41' '' '16.26']
 ...
 ['US-WY' 'US' 'United States' ... '' '' '4.02']
 ['US-WY' 'US' 'United States' ... '' '' '4.37']
 ['US-WY' 'US' 'United States' ... '3.23' '' '4.76']]


**Normalizing the Data:** Since the dataset was normalized within each specific region, we re-normalize it so that it is comparable across regions. To do this, we subtract each symptom's median value from every data point of that symptom (i.e., we "de-median" the data).

In [None]:
# Missing search values are replaced by zero before the medians are taken.
# NOTE(review): the zeros take part in the median, so sparsely filled columns
# get a median pulled towards 0 - confirm this is the intended normalisation.
missing = np.isnan(search_data)
search_data[missing] = 0

# De-median: subtract each column's median from every entry of that column.
# The 1-D median vector broadcasts across the rows; the update is in place.
median = np.median(search_data, axis=0)
search_data -= median

# Drop label columns 5 and 6 (the sub-region 2 columns, per the notebook text).
search_data_name = np.delete(search_data_name, [5, 6], axis=1)

# Put the symptom names back on top of the numeric table as a header row, then
# attach the label columns on the left to form the cleaned search table.
search_data_with_head = np.concatenate(
    (search_data_symptoms[np.newaxis, :], search_data),
    axis=0,
)
clean_search_data = np.concatenate(
    (search_data_name, search_data_with_head),
    axis=1,
)

**Cleaning up the datasets**: The hospitalization data contained entries for several countries and American states. Only the states included in the search symptoms dataset were kept, and any states whose hospitalization data was too sparse to reflect their COVID-19 case numbers were removed as well. These were the District of Columbia, Delaware and West Virginia.

In [None]:
# Columns kept from the hospitalization table.  In the printed sample they are
# the region code, region name, date and two hospitalization counts.
hospital_columns = [0, 1, 2, 16, 17]
american_states = ['AK', 'HI', 'ID', 'ME', 'MT', 'ND', 'NE', 'NH', 'NM', 'RI', 'SD', 'VT', 'WY']

hospitalization_data = np.take(hospitalization_data, hospital_columns, axis=1)

# Gather the rows of each state in the order of `american_states` and join
# them once at the end.  This replaces the earlier pattern of seeding the
# result with an uninitialised np.empty row (sliced off afterwards) and growing
# it with np.append, which re-copied the whole array on every iteration.
state_blocks = []
for state in american_states:
    covid_region_code = f"US-{state}"
    # Rows in which any cell equals the region code.  Boolean indexing avoids
    # np.delete, which only treats boolean arrays as masks from NumPy 1.19 on.
    state_rows = np.any(hospitalization_data == covid_region_code, axis=1)
    state_data = hospitalization_data[state_rows]
    state_blocks.append(state_data)

us_hospitalization_data = np.concatenate(state_blocks, axis=0)
print(us_hospitalization_data)

[['US-AK' 'Alaska' '2020-03-06' '0.0' '0.0']
 ['US-AK' 'Alaska' '2020-03-07' '0.0' '0.0']
 ['US-AK' 'Alaska' '2020-03-08' '0.0' '0.0']
 ...
 ['US-WY' 'Wyoming' '2020-10-04' '1.0' '282.0']
 ['US-WY' 'Wyoming' '2020-10-05' '12.0' '294.0']
 ['US-WY' 'Wyoming' '2020-10-06' '10.0' '304.0']]


**Cleaning up the datasets**: Remove from the search data the states and dates that have no hospitalization data.

In [None]:
# Drop every row that mentions one of the excluded regions.  Rows are removed
# with boolean indexing rather than np.delete: np.delete only treats a boolean
# array as a mask from NumPy 1.19 on, and older releases cast it to the integer
# indices 0 and 1, which silently removed the wrong rows.
for region_code in ("US-WV", "US-DC", "US-DE"):
    clean_search_data = clean_search_data[~np.any(clean_search_data == region_code, axis=1)]

# Weeks of 2020 (month-day) to remove from the search data.
remove_dates = ['01-06', '01-13', '01-20', '01-27', '02-03', '02-10', '02-17', '02-24']
for date in remove_dates:
    full_date = f"2020-{date}"
    clean_search_data = clean_search_data[~np.any(clean_search_data == full_date, axis=1)]

print(clean_search_data)

[['open_covid_region_code' 'country_region_code' 'country_region' ...
  'symptom:Ventricular fibrillation' 'symptom:Viral pneumonia'
  'symptom:Yawn']
 ['US-AK' 'US' 'United States' ... '10.46' '14.12' '9.3']
 ['US-AK' 'US' 'United States' ... '' '18.85' '10.77']
 ...
 ['US-WY' 'US' 'United States' ... '' '' '4.02']
 ['US-WY' 'US' 'United States' ... '' '' '4.37']
 ['US-WY' 'US' 'United States' ... '3.23' '' '4.76']]


**Merging the datasets**: The hospitalization data is reported daily and therefore needs to be converted to weekly. The daily values were summed and assigned to the Monday of their week, from March 2, 2020 to September 21, 2020.

In [None]:
# Wrap the selected hospitalization rows in a DataFrame with named columns.
us_hospitalization_data = pd.DataFrame(
    us_hospitalization_data,
    columns=['open_covid_region_code', 'sub_region_1', 'date',
             'hospitalized_new', 'hospitalized_cumulative'],
)

# Convert the date column to timestamps and both count columns to floats.
us_hospitalization_data = us_hospitalization_data.astype({
    'date': 'datetime64[ns]',
    'hospitalized_new': 'float',
    'hospitalized_cumulative': 'float',
})

# Per region, sum the daily rows into weekly bins anchored on Mondays; each bin
# is closed on its left edge and labelled with that edge.
# NOTE(review): .sum() is applied to hospitalized_cumulative too, so the weekly
# value is the sum of the daily cumulative figures rather than an end-of-week
# total - confirm this is what the analysis expects.
weekly_hospitalization_data = (
    us_hospitalization_data
    .groupby(by=["open_covid_region_code", 'sub_region_1'])
    .resample("W-MON", label='left', closed='left', on='date')
    .sum()
    .reset_index()
)

weekly_hospitalization_data

Unnamed: 0,open_covid_region_code,sub_region_1,date,hospitalized_new,hospitalized_cumulative
0,US-AK,Alaska,2020-03-02,0.0,0.0
1,US-AK,Alaska,2020-03-09,0.0,0.0
2,US-AK,Alaska,2020-03-16,0.0,0.0
3,US-AK,Alaska,2020-03-23,6.0,18.0
4,US-AK,Alaska,2020-03-30,14.0,83.0
...,...,...,...,...,...
412,US-WY,Wyoming,2020-09-07,9.0,1625.0
413,US-WY,Wyoming,2020-09-14,15.0,1656.0
414,US-WY,Wyoming,2020-09-21,19.0,1784.0
415,US-WY,Wyoming,2020-09-28,20.0,1920.0


**Cleaning the datasets:** The hospitalization data included extra weeks that are not in the search data, so they were excluded.

In [None]:
# Mark the weekly rows whose date is one of the three weeks to discard, keep
# everything else, and renumber the rows from zero.
week_start = weekly_hospitalization_data.date
outside_search_range = ((week_start == '2020-09-28') |
                        (week_start == '2020-10-05') |
                        (week_start == '2020-02-24'))
weekly_hospitalization_data = (
    weekly_hospitalization_data
    .loc[~outside_search_range]
    .reset_index(drop=True)
)
weekly_hospitalization_data

Unnamed: 0,open_covid_region_code,sub_region_1,date,hospitalized_new,hospitalized_cumulative
0,US-AK,Alaska,2020-03-02,0.0,0.0
1,US-AK,Alaska,2020-03-09,0.0,0.0
2,US-AK,Alaska,2020-03-16,0.0,0.0
3,US-AK,Alaska,2020-03-23,6.0,18.0
4,US-AK,Alaska,2020-03-30,14.0,83.0
...,...,...,...,...,...
385,US-WY,Wyoming,2020-08-24,8.0,1497.0
386,US-WY,Wyoming,2020-08-31,4.0,1543.0
387,US-WY,Wyoming,2020-09-07,9.0,1625.0
388,US-WY,Wyoming,2020-09-14,15.0,1656.0


**Merging the datasets:** Now that both datasets have 390 rows aligned by state and week, we can take the last two columns of the hospitalization data and append them to the search data.

In [None]:
# The two hospitalization count columns that get attached to the search data.
hospital_counts = weekly_hospitalization_data[['hospitalized_new', 'hospitalized_cumulative']]

# Row 0 of clean_search_data holds the column names; the rest are data rows.
search_dataframe = pd.DataFrame(data=clean_search_data[1:], columns=clean_search_data[0])

# DataFrame version: pd.concat pairs the rows by index label.
final_covid_dataframe = pd.concat([search_dataframe, hospital_counts], axis=1)

# ndarray version: rows are paired purely by position, so this relies on both
# tables having the same number of rows in the same region/week order.
final_covid_dataset = np.concatenate(
    (clean_search_data[1:], np.array(hospital_counts)),
    axis=1,
)
final_covid_dataset
final_covid_dataframe

Unnamed: 0,open_covid_region_code,country_region_code,country_region,sub_region_1,sub_region_1_code,date,symptom:Adrenal crisis,symptom:Ageusia,symptom:Allergic conjunctivitis,symptom:Amblyopia,...,symptom:Trichoptilosis,symptom:Upper respiratory tract infection,symptom:Urethritis,symptom:Urinary urgency,symptom:Vasculitis,symptom:Ventricular fibrillation,symptom:Viral pneumonia,symptom:Yawn,hospitalized_new,hospitalized_cumulative
0,US-AK,US,United States,Alaska,US-AK,2020-03-02,14.62,,,,...,12.96,21.76,11.29,,13.29,10.46,14.12,9.3,0.0,0.0
1,US-AK,US,United States,Alaska,US-AK,2020-03-09,10.6,,11.61,,...,13.97,46.95,10.27,,9.26,,18.85,10.77,0.0,0.0
2,US-AK,US,United States,Alaska,US-AK,2020-03-16,11.69,,6.39,,...,9.04,78.71,12.0,,11.53,,24.94,14.34,0.0,0.0
3,US-AK,US,United States,Alaska,US-AK,2020-03-23,11.15,16.57,,,...,8.36,37.54,8.8,,6.75,7.04,13.79,15.69,6.0,18.0
4,US-AK,US,United States,Alaska,US-AK,2020-03-30,8.96,8.96,7.11,,...,14.79,24.46,9.81,7.96,7.54,7.11,8.82,17.63,14.0,83.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,US-WY,US,United States,Wyoming,US-WY,2020-08-24,4.42,3.31,3.42,5.19,...,2.26,3.98,4.09,,3.64,2.48,,3.87,8.0,1497.0
386,US-WY,US,United States,Wyoming,US-WY,2020-08-31,5.03,2.79,,5.75,...,2.57,2.68,,,5.08,,,3.63,4.0,1543.0
387,US-WY,US,United States,Wyoming,US-WY,2020-09-07,3.36,3.25,2.2,3.63,...,,3.19,3.19,,4.51,,,4.02,9.0,1625.0
388,US-WY,US,United States,Wyoming,US-WY,2020-09-14,4.15,4.97,,5.14,...,3.06,4.59,3.22,2.68,3.88,,,4.37,15.0,1656.0


In [None]:
# Columns 6-126 of the merged array for every row (columns 0-5 hold the
# region/date labels).  final_covid_dataset is built from the header-free rows
# of clean_search_data, so slicing rows with `1:` here dropped the first real
# observation instead of a header; all rows are kept now.
# NOTE(review): 127 presumably stops just before the hospitalization columns -
# confirm against the actual column count.
final_covid_dataset[:, 6:127]

array([['10.6', '', '11.61', ..., '', '18.85', '10.77'],
       ['11.69', '', '6.39', ..., '', '24.94', '14.34'],
       ['11.15', '16.57', '', ..., '7.04', '13.79', '15.69'],
       ...,
       ['3.36', '3.25', '2.2', ..., '', '', '4.02'],
       ['4.15', '4.97', '', ..., '', '', '4.37'],
       ['5.64', '3.67', '', ..., '3.23', '', '4.76']], dtype='<U49')