In [267]:
from math import sqrt
import numpy as np
from numpy import concatenate
from matplotlib import pyplot
import pandas as pd
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from operator import add

### Loading hospitalization data 

In [99]:
df = read_csv('truth-Incident_Hospitalizations.csv')
df['date'] = pd.to_datetime(df['date'])
print(f'Earlies Date: {min(df["date"])}')
print(f'Latest Date: {max(df["date"])}')

states = list(df['location_name'].unique())
states.remove('US')
states.remove('Virgin Islands')
states.remove('American Samoa')
states.remove('Alaska')
states.remove('Hawaii')
states.remove('Puerto Rico')
print(f'Number of States (No U.S.): {len(states)}')

Earlies Date: 2020-01-01 00:00:00
Latest Date: 2021-11-13 00:00:00
Number of States (No U.S.): 49


In [100]:
# States have less data later than 2020-07-14
for state in states:
    temp_df = df[df['location_name']==state]
    if min(temp_df['date']) > pd.Timestamp("2020-07-14"):
        print(state, '----', min(temp_df['date']))

North Dakota ---- 2020-07-27 00:00:00


In [101]:
# As North Dakota's ealiest date of hospitalization data 
# available is 2020-07-27
# All the data will truncate with this date
df = df[df['date'] >= pd.Timestamp('2020-07-27')]
df = df[df['location_name'].isin(states)]

print(f"Each State has {int(len(df)/len(states))} days data available.\
        \nTotal states is {len(states)}")

Each State has 475 days data available.        
Total states is 49


### Loading Cases Data 

In [102]:
case_df = read_csv('truth-Incident Cases.csv', low_memory=False)
case_df['date'] = pd.to_datetime(case_df['date'])
# Align Date
case_df = case_df[(case_df['date'] >= pd.Timestamp('2020-07-27')) & \
                  (case_df['date'] <= pd.Timestamp('2021-11-13'))]
# Filter States
case_df = case_df[case_df['location_name'].isin(states)]

print(f"Each State has {int(len(case_df)/len(states))} days data available.\
        \nTotal states is {len(states)}")

Each State has 475 days data available.        
Total states is 49


### Loading population data 

In [103]:
import geopandas as gpd

In [20]:
filename = 'Conterminous_US_counties.geojson'
pop_df = gpd.read_file(filename)

  aout[:] = out
  aout[:] = out


In [54]:
pop_df = DataFrame(pop_df)
pop_df = pop_df[['State_Name', 'POPULATION']]
pop_df = pop_df.groupby('State_Name').sum()

In [62]:
# Check Alignment of states of main df and pop_df
set(pop_df.index) == set(states)

True

In [65]:
pop_dict = pop_df['POPULATION'].to_dict()
pop_dict

{'Alabama': 4864680,
 'Arizona': 6946685,
 'Arkansas': 2990671,
 'California': 39148760,
 'Colorado': 5531141,
 'Connecticut': 3581504,
 'Delaware': 949495,
 'District of Columbia': 684498,
 'Florida': 20598139,
 'Georgia': 10297484,
 'Idaho': 1687809,
 'Illinois': 12821497,
 'Indiana': 6637426,
 'Iowa': 3132499,
 'Kansas': 2908776,
 'Kentucky': 4440204,
 'Louisiana': 4663616,
 'Maine': 1332813,
 'Maryland': 6003435,
 'Massachusetts': 6819092,
 'Michigan': 9957488,
 'Minnesota': 5527358,
 'Mississippi': 2988762,
 'Missouri': 6090062,
 'Montana': 1041732,
 'Nebraska': 1904760,
 'Nevada': 2922849,
 'New Hampshire': 1343622,
 'New Jersey': 8881845,
 'New Mexico': 2092434,
 'New York': 12807220,
 'North Carolina': 10155624,
 'North Dakota': 752201,
 'Ohio': 11641879,
 'Oklahoma': 3918137,
 'Oregon': 4081943,
 'Pennsylvania': 12791181,
 'Rhode Island': 1056611,
 'South Carolina': 4955925,
 'South Dakota': 864289,
 'Tennessee': 6651089,
 'Texas': 27885195,
 'Utah': 3045350,
 'Vermont': 62497

### Smooth the data with a 7-day window 

In [104]:
smoothed_hosp_df = pd.DataFrame()
smoothed_case_df = pd.DataFrame()

for state in states:
    temp_df = df[df['location_name']==state].sort_values(by='date')
    temp_df['smoothed_value'] = temp_df.rolling(window=7).mean()
    smoothed_hosp_df = smoothed_hosp_df.append(temp_df)
    
    state_case = case_df[case_df['location_name']==state].sort_values(by='date')
    state_case['smoothed_value'] = state_case.rolling(window=7).mean()
    smoothed_case_df = smoothed_case_df.append(state_case)

In [105]:
len_ = len(smoothed_hosp_df)
smoothed_hosp_df = smoothed_hosp_df.dropna(subset=['smoothed_value'])
smoothed_case_df = smoothed_case_df.dropna(subset=['smoothed_value'])

print('Actual drop = ',len_ - len(smoothed_hosp_df), ', Num of states * 7 =', 
      len(states)*6)
print('Actual drop = ',len_ - len(smoothed_case_df), ', Num of states * 7 =', 
      len(states)*6)
print(len(smoothed_hosp_df)==len(smoothed_case_df))

Actual drop =  294 , Num of states * 7 = 294
Actual drop =  294 , Num of states * 7 = 294
True


In [106]:
main_df = smoothed_hosp_df.copy()
main_df = main_df.rename(columns={'smoothed_value': 'smoothed_hosp', 
                                  'value': 'hospitalizations'})
main_df = pd.merge(main_df, smoothed_case_df, how='inner',
                   on=['date', 'location', 'location_name'])

In [110]:
main_df = main_df.rename(columns={'value': 'cases',
                                  'smoothed_value': 'smoothed_cases'})

In [111]:
main_df

Unnamed: 0,date,location,location_name,hospitalizations,smoothed_hosp,cases,smoothed_cases
0,2020-08-02,54,West Virginia,15,13.571429,120,138.428571
1,2020-08-03,54,West Virginia,16,12.714286,120,139.857143
2,2020-08-04,54,West Virginia,17,12.857143,81,134.142857
3,2020-08-05,54,West Virginia,10,13.285714,103,126.428571
4,2020-08-06,54,West Virginia,15,12.571429,116,129.000000
...,...,...,...,...,...,...,...
22976,2021-11-09,12,Florida,209,197.571429,0,1499.714286
22977,2021-11-10,12,Florida,197,196.571429,0,1499.714286
22978,2021-11-11,12,Florida,172,193.000000,0,1499.714286
22979,2021-11-12,12,Florida,173,188.285714,10302,1471.714286


### Calculate Spatial Proximity to Cases (SPC) and Spatial Proximity to Hospitalizations(SPH) 

In [122]:
sci_df = pd.read_csv('gadm1_nuts2-gadm1_nuts2-fb-social-connectedness-index-october-2021/gadm1_nuts2_gadm1_nuts2.tsv', 
                  delimiter="\t", low_memory=False)

temp1_idx = sci_df[sci_df['user_loc'].str.contains('USA')].index
temp2_idx = sci_df[sci_df['fr_loc'].str.contains('USA')].index
idx = temp1_idx.intersection(temp2_idx)

sci_df = sci_df.loc[idx]

In [139]:
sci_df

Unnamed: 0,user_loc,fr_loc,scaled_sci
5666699,USA1,USA1,1017421
5666700,USA1,USA10,12276
5666701,USA1,USA11,39255
5666702,USA1,USA12,4767
5666703,USA1,USA13,2627
...,...,...,...
5790045,USA9,USA51,6464
5790046,USA9,USA6,16001
5790047,USA9,USA7,19639
5790048,USA9,USA8,28032


In [153]:
gadm_us = read_csv('gadm1_nuts2-gadm1_nuts2-fb-social-connectedness-index-october-2021/gadm_us_states.csv')
gadm_us = gadm_us.set_index('gadm_id')
gadm_us_dict = gadm_us['Name'].to_dict()

In [154]:
# Check States alignment
set(gadm_us['Name']) - set(states)

{'Alaska', 'Hawaii'}

In [155]:
gadm_us_dict.pop('USA50')
gadm_us_dict.pop('USA51')

'Hawaii'

In [263]:
inv_gadm_us_dict = {v: k for k, v in gadm_us_dict.items()}

In [160]:
sci_df = sci_df[(sci_df['user_loc'].isin(gadm_us_dict))
                &(sci_df['fr_loc'].isin(gadm_us_dict))]

In [164]:
sci_df

Unnamed: 0,user_loc,fr_loc,scaled_sci
5666699,USA1,USA1,1017421
5666700,USA1,USA10,12276
5666701,USA1,USA11,39255
5666702,USA1,USA12,4767
5666703,USA1,USA13,2627
...,...,...,...
5790043,USA9,USA5,12773
5790046,USA9,USA6,16001
5790047,USA9,USA7,19639
5790048,USA9,USA8,28032


In [202]:
## Calculate fixed weights
sci_weight_dict = {}

for gadm_id in gadm_us_dict:
    temp_df = sci_df[(sci_df['user_loc']==gadm_id)&
                     (sci_df['fr_loc']!=gadm_id)]
    
    temp_df['weight'] = temp_df['scaled_sci']/temp_df['scaled_sci'].sum()
    temp_df = temp_df.set_index('fr_loc')
    sci_weight_dict[gadm_id] = temp_df['weight'].to_dict()
    print(f"{gadm_id.replace('USA','')} / {len(gadm_us_dict)}")

1 / 49
2 / 49
3 / 49
4 / 49
5 / 49
6 / 49
7 / 49
8 / 49
9 / 49
10 / 49
11 / 49
12 / 49
13 / 49
14 / 49
15 / 49
16 / 49
17 / 49
18 / 49
19 / 49
20 / 49
21 / 49
22 / 49
23 / 49
24 / 49
25 / 49
26 / 49
27 / 49
28 / 49
29 / 49
30 / 49
31 / 49
32 / 49
33 / 49
34 / 49
35 / 49
36 / 49
37 / 49
38 / 49
39 / 49
40 / 49
41 / 49
42 / 49
43 / 49
44 / 49
45 / 49
46 / 49
47 / 49
48 / 49
49 / 49


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [266]:
new_main_df = DataFrame()

for state in states:
    temp_df = main_df[main_df['location_name']==state]
    
    i = 1
    for gadm_id in sci_weight_dict[inv_gadm_us_dict[state]]:
        if i == 1:
            temp_df2 = main_df[main_df['location_name']==gadm_us_dict[gadm_id]]
            spc_temp = list(temp_df2['smoothed_cases'] / pop_dict[gadm_us_dict[gadm_id]] \
                        * 10000 * sci_weight_dict[inv_gadm_us_dict[state]][gadm_id])
            sph_temp = list(temp_df2['smoothed_hosp'] / pop_dict[gadm_us_dict[gadm_id]] \
                        * 10000 * sci_weight_dict[inv_gadm_us_dict[state]][gadm_id])
        else:
            temp_df2 = main_df[main_df['location_name']==gadm_us_dict[gadm_id]]
            spc_temp = list(map(add, spc_temp, 
                                list(list(temp_df2['smoothed_cases'] / pop_dict[gadm_us_dict[gadm_id]] \
                        * 10000 * sci_weight_dict[inv_gadm_us_dict[state]][gadm_id]))))
            sph_temp = list(map(add, sph_temp, 
                                list(list(temp_df2['smoothed_hosp'] / pop_dict[gadm_us_dict[gadm_id]] \
                        * 10000 * sci_weight_dict[inv_gadm_us_dict[state]][gadm_id]))))
        i += 1
    temp_df['spc'] = spc_temp
    temp_df['sph'] = sph_temp
    
    new_main_df = new_main_df.append(temp_df)
    
    print(states.index(state)+1,'/', len(states), ':', state)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


1 / 49 : West Virginia
2 / 49 : District of Columbia
3 / 49 : New Hampshire
4 / 49 : Montana
5 / 49 : North Dakota
6 / 49 : Maryland
7 / 49 : Missouri
8 / 49 : Minnesota
9 / 49 : Delaware
10 / 49 : Oregon
11 / 49 : Idaho
12 / 49 : New Mexico
13 / 49 : Kansas
14 / 49 : Rhode Island
15 / 49 : South Dakota
16 / 49 : Louisiana
17 / 49 : New Jersey
18 / 49 : Kentucky
19 / 49 : Maine
20 / 49 : Utah
21 / 49 : Connecticut
22 / 49 : Iowa
23 / 49 : Pennsylvania
24 / 49 : Tennessee
25 / 49 : Oklahoma
26 / 49 : Nebraska
27 / 49 : Nevada
28 / 49 : Wyoming
29 / 49 : Massachusetts
30 / 49 : Colorado
31 / 49 : Arkansas
32 / 49 : North Carolina
33 / 49 : Ohio
34 / 49 : New York
35 / 49 : Vermont
36 / 49 : Michigan
37 / 49 : Wisconsin
38 / 49 : Mississippi
39 / 49 : South Carolina
40 / 49 : Illinois
41 / 49 : Virginia
42 / 49 : Texas
43 / 49 : Arizona
44 / 49 : Indiana
45 / 49 : Washington
46 / 49 : Georgia
47 / 49 : California
48 / 49 : Alabama
49 / 49 : Florida


In [268]:
new_main_df

Unnamed: 0,date,location,location_name,hospitalizations,smoothed_hosp,cases,smoothed_cases,spc,sph
0,2020-08-02,54,West Virginia,15,13.571429,120,138.428571,1.667824,0.135794
1,2020-08-03,54,West Virginia,16,12.714286,120,139.857143,1.613695,0.130553
2,2020-08-04,54,West Virginia,17,12.857143,81,134.142857,1.594363,0.130621
3,2020-08-05,54,West Virginia,10,13.285714,103,126.428571,1.578601,0.129031
4,2020-08-06,54,West Virginia,15,12.571429,116,129.000000,1.553070,0.126456
...,...,...,...,...,...,...,...,...,...
22976,2021-11-09,12,Florida,209,197.571429,0,1499.714286,2.640175,0.186948
22977,2021-11-10,12,Florida,197,196.571429,0,1499.714286,2.699973,0.188298
22978,2021-11-11,12,Florida,172,193.000000,0,1499.714286,2.558361,0.188551
22979,2021-11-12,12,Florida,173,188.285714,10302,1471.714286,2.821568,0.189012


In [271]:
new_main_df.to_csv('procecssed_df.csv')