## Time interval plots

In [407]:
import pandas as pd
import numpy as np
from numpy import cov
from scipy.stats import pearsonr
import datetime
# Timestamp taken once when this cell runs; the plotting cells format it into the name of the HTML output file.
t = datetime.datetime.now()
import plotly.express as px
import plotly.graph_objects as go
import plotly.express as px
# Display at most 40 rows when a frame is rendered.
pd.options.display.max_rows = 40

# Local input and output directories (Windows paths, each ending in a separator).
folder = r'C:\Users\Ensheng\Desktop\Coronavirus\COVID_MMR\data\\'
output_folder = r'C:\Users\Ensheng\Desktop\Coronavirus\COVID_MMR\output\\'

In [426]:
# pre-config: analysis parameters read by the cells below
threshold_value = 1   # a county's start date is the first day its 7-day case average reaches this value
pop_line = 1000       # minimum county population kept for the correlation
case_line = 100       # minimum cumulative cases on the latest date

# Province_State values removed from the case table
ex_list = [
    'Alaska', 'American Samoa', 'Guam', 'Northern Mariana Islands',
    'Puerto Rico', 'Virgin Islands', 'Diamond Princess', 'Grand Princess',
]

### MMR data

In [427]:
# Load the county-level MMR sheet.
in_table = folder + r'US_MMR.xlsx'
mmr_df = pd.read_excel(in_table, sheet_name='county')
print(len(mmr_df))

# Find the states whose counties all carry one identical MMR value;
# every such state is printed and then removed from the table.
state_list = list(set(mmr_df['State']))
one_vr_list = []
for state in state_list:
    state_rates = mmr_df.loc[mmr_df['State'] == state, 'MMR']
    if len(set(state_rates)) != 1:
        continue
    print(state)
    one_vr_list.append(state)

keep_rows = ~mmr_df['State'].isin(one_vr_list)
mmr_df = mmr_df[keep_rows].drop(columns=['logMMR'])

print(len(mmr_df))
mmr_df.head(3)

3101
NV
NH
HI
OH
WV
DE
NE
MS
AR
2673


Unnamed: 0,FIPS,County,State,MMR,Population
0,1001,Autauga,AL,0.9536,55504
1,1003,Baldwin,AL,0.97,212628
2,1005,Barbour,AL,0.9283,25270


### Cases

In [428]:
# Province_State values removed from the case table
ex_list = ['Alaska','American Samoa','Guam','Northern Mariana Islands',
           'Puerto Rico','Virgin Islands','Diamond Princess','Grand Princess']

# data: confirmed cases at the county level (CSSE COVID-19 US time series, one column per date)
in_table = r'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv'
cases_df = pd.read_csv(in_table)
cases_df = cases_df.drop(columns=['UID', 'iso2','iso3','code3','Country_Region','Lat','Long_','Combined_Key'])

# clean the table
cases_df = cases_df[~cases_df['Province_State'].isin(ex_list)]
cases_df = cases_df[~cases_df['Admin2'].isin(['Unassigned'])]
# na=False: a missing Admin2 would otherwise produce NaN in the mask, which `~` cannot negate
cases_df = cases_df[~cases_df['Admin2'].str.startswith('Out of', na=False)]
# .copy() gives an independent frame, so the dtype conversion below is not a write into a slice
cases_df = cases_df[cases_df['FIPS'].notnull()].copy()
# every remaining FIPS is non-null, so the whole column can be converted from float to int
cases_df['FIPS'] = cases_df['FIPS'].astype(int)

# FIPS -> county / state names, used later to label results
name_df = cases_df[['FIPS','Admin2','Province_State']]
print("Cases! Number of counties with FIPS: " + str(len(cases_df)))
cases_df.tail(3)

Cases! Number of counties with FIPS: 3113


Unnamed: 0,FIPS,Admin2,Province_State,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,8/15/20,8/16/20,8/17/20,8/18/20,8/19/20,8/20/20,8/21/20,8/22/20,8/23/20,8/24/20
3336,56041,Uinta,Wyoming,0,0,0,0,0,0,0,...,276,276,277,278,283,283,283,283,283,283
3338,56043,Washakie,Wyoming,0,0,0,0,0,0,0,...,97,97,100,102,104,106,106,107,108,108
3339,56045,Weston,Wyoming,0,0,0,0,0,0,0,...,7,7,8,8,12,0,11,11,11,11


### Apply filter by cases

In [429]:
# Histogram of the counts in the last date column, limited to counties at or above case_line.
latest_date = cases_df.columns[-1]
df = cases_df.loc[cases_df[latest_date] >= case_line]
print(len(df))

fig = px.histogram(df, x=latest_date, width=600, height=400)
print("Cum cases on: " + latest_date)
fig.show()

2185
Cum cases on: 8/24/20


In [430]:
# Keep only counties whose count on the latest date reaches case_line. The configured
# cut-off is used instead of a repeated literal 100, so this filter cannot drift away
# from the one shown in the histogram cell.
cases_df = cases_df[cases_df[latest_date] >= case_line]

### Total cases moving average

In [431]:
# Reshape the wide case table (one column per date) into long form:
# one row per (FIPS, date) with the case count in 'cases'.
time_df = (
    cases_df
    .drop(columns=['Admin2', 'Province_State'])
    .set_index('FIPS')
    .stack()
    .reset_index()
    .rename(columns={"level_1": "date", 0: "cases"})
)
time_df['date'] = pd.to_datetime(time_df['date'])
print(len(time_df))
time_df.head()

471960


Unnamed: 0,FIPS,date,cases
0,1001,2020-01-22,0
1,1001,2020-01-23,0
2,1001,2020-01-24,0
3,1001,2020-01-25,0
4,1001,2020-01-26,0


In [432]:
# 7-row moving average of 'cases', computed separately for every FIPS
# ref: https://stackoverflow.com/questions/53339021/python-pandas-calculate-moving-average-within-group
def _trailing_week_mean(county_cases):
    """Rolling mean over a window of 7 rows; a single row is enough to produce a value."""
    return county_cases.rolling(window=7, min_periods=1).mean()


time_df['cases_7ave'] = time_df.groupby('FIPS')['cases'].transform(_trailing_week_mean)
print(len(time_df))
time_df.tail()

471960


Unnamed: 0,FIPS,date,cases,cases_7ave
471955,56043,2020-08-20,106,100.285714
471956,56043,2020-08-21,106,101.714286
471957,56043,2020-08-22,107,103.142857
471958,56043,2020-08-23,108,104.714286
471959,56043,2020-08-24,108,105.857143


In [433]:
# Rows whose moving average is on or beyond the threshold, reduced to the
# first (non-null) entry per FIPS via groupby().first().
reached_threshold = time_df['cases_7ave'] >= threshold_value
start_df = (
    time_df[reached_threshold]
    .groupby('FIPS')
    .first()
    .reset_index()
)
print(len(start_df))
start_df.head()

2185


Unnamed: 0,FIPS,date,cases,cases_7ave
0,1001,2020-03-26,6,1.571429
1,1003,2020-03-20,2,1.0
2,1005,2020-04-06,2,1.0
3,1007,2020-04-01,3,1.142857
4,1009,2020-03-27,4,1.0


In [434]:
# Start dates ordered by date, labelled with county / state names.
df = pd.merge(
    start_df.sort_values(by='date', ascending=True).reset_index(drop=True),
    name_df,
    how='left',
    on='FIPS',
)

plot_title = 'Dates starting with ' + str(threshold_value) + ' or more cases (moving average)'
fig = px.scatter(df, x="Province_State", y="date", color="Province_State", title=plot_title)
fig.update_layout(showlegend=False)

# Append the figure (as an HTML fragment) to the timestamped output file.
report_path = 'covid_mmr_' + t.strftime('%m%d%y%H%M') + '.html'
with open(report_path, 'a') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))
fig.show()

### Integrated code

In [435]:
# Column correlated against log(MMR): 'incidence' or 'cases_7ave1'
field_name = 'incidence'
title_name = 'Corr - log(MMR) vs incidence (moving average, log). Starting case(s): {}'.format(threshold_value)
# Alternative title for field_name = 'cases_7ave1':
# title_name = 'Corr - log(MMR) vs total cases (moving average, log). Starting case(s): ' + str(threshold_value)

In [436]:
# Correlation between log(MMR) and log(field_name) for a series of follow-up
# intervals (days after each county's start date), computed for all counties
# together ('US') and for every state with at least 4 usable counties.
result_columns = ['state', 'intervals', 'tested_counties', 'total_counties', 'pearsons', 'r_squared']


def _correlation_row(label, interval, sample, total_counties, value_col):
    """Return one result row (ordered like result_columns) for `sample`.

    label          -- value of the 'state' column (a state code or 'US')
    interval       -- follow-up interval in days
    sample         -- frame holding the log-transformed 'MMR' and `value_col` columns
    total_counties -- number reported in the 'total_counties' column
    value_col      -- name of the column correlated against 'MMR'
    """
    log_mmr = sample['MMR']
    log_value = sample[value_col]

    # Pearson's correlation
    corr, _ = pearsonr(log_mmr, log_value)

    # R-squared, from the correlation matrix as before so the formatted value is unchanged
    r_squared = np.corrcoef(log_mmr, log_value)[0, 1] ** 2

    # counts and r_squared are stored as strings; the cty_pct cell casts the counts back to int
    return [label, interval, str(len(sample)), str(total_counties), corr, '%.5f' % r_squared]


# Loop-invariant inputs, prepared once instead of on every iteration.
time_df1 = time_df[['FIPS', 'date', 'cases_7ave']].rename(
    columns={"date": "date1", "cases_7ave": "cases_7ave1"})
counties_per_state = mmr_df['State'].value_counts()  # counties per state in the MMR table

interval_frames = []

for day_interval in [0,3,5,7,10,14,21,30,40,50,60,90]:
    # add time interval: date1 = start date + interval.
    # assign() returns a new frame, so start_df itself is not modified.
    threshold_df = start_df.assign(date1=start_df['date'] + pd.DateOffset(days=day_interval))

    # cases after interval days (NaN when date1 has no row in time_df)
    threshold_df = pd.merge(threshold_df, time_df1, how='left', on=['FIPS', 'date1'])

    # attach MMR / population and keep only counties that have an MMR value;
    # .copy() makes the column assignments below writes to an independent frame
    merged_df = pd.merge(threshold_df, mmr_df, how='left', on='FIPS')
    merged_df = merged_df[merged_df['MMR'].notnull()].copy()
    merged_df['incidence'] = merged_df['cases_7ave1'] * 100000 / merged_df['Population']

    # strictly positive values only (the log is taken next) and a minimum population
    df = merged_df[(merged_df[field_name] > 0) & (merged_df['Population'] >= pop_line)].copy()
    df['MMR'] = np.log(df['MMR'])
    df[field_name] = np.log(df[field_name])

    rows = []

    # state corr: groupby yields the states in sorted order
    for state, df_state in df.groupby('State'):
        if len(df_state) >= 4:
            rows.append(_correlation_row(state, day_interval, df_state,
                                         counties_per_state[state], field_name))

    # nation level corr: pearsonr needs at least two observations
    if len(df) >= 2:
        rows.append(_correlation_row('US', day_interval, df, len(df), field_name))

    corr_df = (pd.DataFrame(rows, columns=result_columns)
               .sort_values(by='pearsons', ascending=True)
               .reset_index(drop=True))
    interval_frames.append(corr_df)

# A single concat replaces DataFrame.append, which pandas 2.0 removed;
# each interval keeps its own 0..k index, as before.
us_df = pd.concat(interval_frames)

us_df

Unnamed: 0,state,intervals,tested_counties,total_counties,pearsons,r_squared
0,RI,0,5,5,-0.706797,0.49956
1,VT,0,4,14,-0.490467,0.24056
2,CO,0,29,63,-0.398821,0.15906
3,PA,0,53,67,-0.364734,0.13303
4,UT,0,7,29,-0.329904,0.10884
...,...,...,...,...,...,...
36,CT,90,8,8,0.348224,0.12126
37,ID,90,24,44,0.352970,0.12459
38,CA,90,52,58,0.410918,0.16885
39,RI,90,5,5,0.445509,0.19848


In [437]:
# Percentage of each state's counties that entered the correlation.
tested_counts = us_df['tested_counties'].astype(int)
total_counts = us_df['total_counties'].astype(int)
us_df['cty_pct'] = (tested_counts / total_counts) * 100

# The plotting cells below read the table through the name `df`.
df = us_df
us_df

Unnamed: 0,state,intervals,tested_counties,total_counties,pearsons,r_squared,cty_pct
0,RI,0,5,5,-0.706797,0.49956,100.000000
1,VT,0,4,14,-0.490467,0.24056,28.571429
2,CO,0,29,63,-0.398821,0.15906,46.031746
3,PA,0,53,67,-0.364734,0.13303,79.104478
4,UT,0,7,29,-0.329904,0.10884,24.137931
...,...,...,...,...,...,...,...
36,CT,90,8,8,0.348224,0.12126,100.000000
37,ID,90,24,44,0.352970,0.12459,54.545455
38,CA,90,52,58,0.410918,0.16885,89.655172
39,RI,90,5,5,0.445509,0.19848,100.000000


In [438]:
# Pearson's r per state against the follow-up interval; the figure is also
# appended as an HTML fragment to the timestamped output file.
report_path = 'covid_mmr_' + t.strftime('%m%d%y%H%M') + '.html'
fig = px.line(
    df,
    x="intervals",
    y="pearsons",
    color='state',
    title=title_name,
)
with open(report_path, 'a') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))
fig.show()

In [439]:
# R-squared per state against the follow-up interval; the figure is also
# appended as an HTML fragment to the timestamped output file.
report_path = 'covid_mmr_' + t.strftime('%m%d%y%H%M') + '.html'
fig = px.line(
    df,
    x="intervals",
    y="r_squared",
    color='state',
    title=title_name,
)
with open(report_path, 'a') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))
fig.show()

In [440]:
# Share of counties used per state against the follow-up interval; the figure
# is also appended as an HTML fragment to the timestamped output file.
report_path = 'covid_mmr_' + t.strftime('%m%d%y%H%M') + '.html'
fig = px.line(
    df,
    x="intervals",
    y="cty_pct",
    color='state',
    title=title_name,
)
with open(report_path, 'a') as f:
    f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))
fig.show()

### --- loop begin ---

In [230]:
# add time interval
# NOTE: day_interval is not assigned in this cell; it must already exist
# (e.g. set it by hand: day_interval = 7).
# assign() returns a new frame, so start_df itself does not gain a date1 column.
threshold_df = start_df.assign(date1=start_df['date'] + pd.DateOffset(days=day_interval))
print(len(threshold_df))
threshold_df.head()

3010


Unnamed: 0,FIPS,date,cases,cases_7ave,date1
0,1001,2020-03-30,6,5.0,2020-04-13
1,1003,2020-03-29,15,6.571429,2020-04-12
2,1005,2020-04-12,10,5.714286,2020-04-26
3,1007,2020-04-07,8,5.0,2020-04-21
4,1009,2020-04-02,6,5.0,2020-04-16


In [192]:
# Look up each county's moving average on date1 (start date + interval).
time_df1 = (
    time_df[['FIPS', 'date', 'cases_7ave']]
    .rename(columns={"date": "date1", "cases_7ave": "cases_7ave1"})
)
threshold_df = pd.merge(threshold_df, time_df1, how='left', on=['FIPS', 'date1'])
print(len(threshold_df))
threshold_df.head()

2514


Unnamed: 0,FIPS,date,cases,cases_7ave,date1,cases_7ave1
0,1001,2020-05-07,61,51.428571,2020-07-06,569.714286
1,1003,2020-04-12,71,53.714286,2020-06-11,318.857143
2,1005,2020-05-10,59,51.428571,2020-07-09,354.285714
3,1007,2020-05-21,52,50.571429,2020-07-20,251.857143
4,1009,2020-05-28,58,51.142857,2020-07-27,543.571429


### Merge with MMR

In [193]:
# MMR rows whose FIPS does not occur in threshold_df (counties with MMR but no cases)
fips_without_cases = set(mmr_df['FIPS']) - set(threshold_df['FIPS'])
mmr_df[mmr_df['FIPS'].isin(fips_without_cases)]

Unnamed: 0,FIPS,County,State,MMR,Population
158,6003,Alpine,CA,0.965,1120
181,6049,Modoc,CA,0.991,8859
188,6063,Plumas,CA,0.862,18742
202,6091,Sierra,CA,0.972,2999
209,6105,Trinity,CA,0.911,12709
...,...,...,...,...,...
3086,56019,Johnson,WY,0.940,8476
3090,56027,Niobrara,WY,0.680,2397
3092,56031,Platte,WY,0.880,8562
3094,56035,Sublette,WY,0.770,9799


In [194]:
# threshold_df rows whose FIPS does not occur in mmr_df (counties without MMR),
# labelled with county / state names.
fips_without_mmr = set(threshold_df['FIPS']) - set(mmr_df['FIPS'])
temp_df = threshold_df.loc[threshold_df['FIPS'].isin(fips_without_mmr), ['FIPS']]
temp_df = pd.merge(temp_df, name_df, how='left', on='FIPS')
print(set(temp_df['Province_State']))
temp_df

{'Mississippi', 'West Virginia', 'Texas', 'Arkansas', 'New Hampshire', 'Hawaii', 'Ohio', 'Nevada', 'Nebraska', 'District of Columbia', 'Delaware', 'Colorado'}


Unnamed: 0,FIPS,Admin2,Province_State
0,5001,Arkansas,Arkansas
1,5003,Ashley,Arkansas
2,5005,Baxter,Arkansas
3,5007,Benton,Arkansas
4,5009,Boone,Arkansas
...,...,...,...
334,54083,Randolph,West Virginia
335,54091,Taylor,West Virginia
336,54099,Wayne,West Virginia
337,54107,Wood,West Virginia


In [259]:
# Attach MMR and population to every county and keep only those with an MMR value.
merged_df = pd.merge(threshold_df, mmr_df, how='left', on='FIPS')
# .copy() makes the filtered result an independent frame, so adding a column
# below is not an assignment into a slice (SettingWithCopyWarning).
merged_df = merged_df[merged_df['MMR'].notnull()].copy()
# moving-average cases on date1 per 100,000 population
merged_df['incidence'] = merged_df['cases_7ave1'] * 100000 / merged_df['Population']
print(len(merged_df))
merged_df.head(3)

2594


Unnamed: 0,FIPS,date,cases,cases_7ave,date1,cases_7ave1,County,State,MMR,Population,incidence
0,1001,2020-03-30,6,5.0,2020-05-09,58.285714,Autauga,AL,0.9536,55504.0,105.011737
1,1003,2020-03-29,15,6.571429,2020-05-08,193.428571,Baldwin,AL,0.97,212628.0,90.970414
2,1005,2020-04-12,10,5.714286,2020-05-22,90.571429,Barbour,AL,0.9283,25270.0,358.414834


### Calculate correlation

In [425]:
df = merged_df
state_list = sorted(set(df['State']))

# NOTE: this overrides the notebook-level field_name for the cells that follow.
field_name = 'cases_7ave1'

# Strictly positive values only (the log is taken next) and a minimum population.
# .copy() makes the filtered result an independent frame, so the two column
# writes below are not assignments into a slice (SettingWithCopyWarning).
df = df[(df[field_name]>0) & (df['Population']>=pop_line)].copy()
df['MMR'] = np.log(df['MMR'])
df[field_name] = np.log(df[field_name])
df

Unnamed: 0,FIPS,date,cases,cases_7ave,date1,cases_7ave1,County,State,MMR,Population,incidence
0,1001,2020-03-26,6,1.571429,2020-06-24,6.059791,Autauga,AL,-0.047511,55504.0,771.630359
1,1003,2020-03-20,2,1.000000,2020-06-18,5.910797,Baldwin,AL,-0.030459,212628.0,173.542525
2,1005,2020-04-06,2,1.000000,2020-07-05,5.811141,Barbour,AL,-0.074400,25270.0,1321.725366
3,1007,2020-04-01,3,1.142857,2020-06-30,5.038806,Bibb,AL,-0.058477,22668.0,680.632232
4,1009,2020-03-27,4,1.000000,2020-06-25,5.025758,Blount,AL,-0.026344,58013.0,262.502740
...,...,...,...,...,...,...,...,...,...,...,...
2180,56033,2020-03-17,2,1.000000,2020-06-15,2.936892,Sheridan,WY,-0.127833,30210.0,62.420201
2181,56037,2020-03-31,2,1.142857,2020-06-29,4.317488,Sweetwater,WY,-0.223144,43534.0,172.279138
2182,56039,2020-03-22,2,1.000000,2020-06-20,4.671493,Teton,WY,-0.328504,23265.0,459.304289
2183,56041,2020-04-06,3,1.285714,2020-07-05,5.189777,Uinta,WY,-0.328504,20495.0,875.474854


In [197]:
# nation level corr: one row for all counties in df together

df_state = df

data1 = df_state['MMR']
data2 = df_state[field_name]

# Pearsons correlation
corr, _ = pearsonr(data1, data2)

# R-squared
correlation_matrix = np.corrcoef(data1, data2)
correlation_xy = correlation_matrix[0,1]
r_squared = correlation_xy**2

# outputs: the frame is built directly, because DataFrame.append was removed in pandas 2.0
corr_df = pd.DataFrame(
    [['US',day_interval, str(len(df_state)),str(len(df_state)),corr,'%.5f' % r_squared]],
    columns=['state','intervals','tested_counties', 'total_counties','pearsons','r_squared'],
)
corr_df1 = corr_df

In [198]:
# state corr: one row per state with at least 4 counties, plus the national row (corr_df1)

result_columns = ['state','intervals','tested_counties', 'total_counties','pearsons','r_squared']
state_rows = []

for state in state_list:

    df_state = df[(df['State']==state)]

    # too few counties for a meaningful correlation
    if len(df_state) < 4:
        continue

    data1 = df_state['MMR']
    data2 = df_state[field_name]

    # Pearsons correlation
    corr, _ = pearsonr(data1, data2)

    # R-squared
    correlation_matrix = np.corrcoef(data1, data2)
    correlation_xy = correlation_matrix[0,1]
    r_squared = correlation_xy**2

    # outputs: total_counties is the state's county count in the MMR table
    state_rows.append([state,day_interval,str(len(df_state)),str(len(mmr_df[mmr_df['State'] == state])),corr,'%.5f' % r_squared])

# Rows are collected in a list and turned into a frame once: DataFrame.append was
# removed in pandas 2.0, and naming the columns here also works when no state qualified.
corr_df = pd.DataFrame(state_rows, columns=result_columns)
if state_rows:
    corr_df = pd.concat([corr_df, corr_df1])
else:
    corr_df = corr_df1.copy()
corr_df = corr_df.sort_values(by='pearsons', ascending=True).reset_index(drop=True)
corr_df

Unnamed: 0,state,intervals,tested_counties,total_counties,pearsons,r_squared
0,ME,60,6,16,-0.534573,0.28577
1,KY,60,43,120,-0.355796,0.12659
2,LA,60,62,64,-0.301857,0.09112
3,MO,60,37,115,-0.26712,0.07135
4,IN,60,71,92,-0.238107,0.0567
5,TN,60,51,95,-0.232783,0.05419
6,NC,60,77,100,-0.170098,0.02893
7,VA,60,84,133,-0.166865,0.02784
8,FL,60,60,67,-0.14575,0.02124
9,MN,60,47,87,-0.124193,0.01542


### --- loop end ---