# Survey Analysis: Summary Tables and Statistical Tests

This notebook summarizes survey responses from the original and amended flu surveys. It generates Tables 1, 2, 3 and S1 and reproduces the statistical tests reported in the paper.

In [0]:
import pandas as pd
import numpy as np

import scipy.stats as stats

### Load datasets

Qualified researchers may download these datasets from Synapse: https://www.synapse.org/20192020ilisurveillanceprogram

In [0]:
#NOTE: UPDATE THIS CELL WITH THE APPROPRIATE PATHNAMES

#define pathnames of datasets
ili_events_path = '/data/covid19/manuscript_data_delivery_for_synapse/event-level_survey_data.csv.gz'
ili_days_path = '/data/covid19/manuscript_data_delivery_for_synapse/daily_symptom_survey_data.csv.gz'
demographics_path = '/data/covid19/manuscript_data_delivery_for_synapse/participant_characteristics.csv.gz'

In [0]:
ili_events = pd.read_csv(ili_events_path)
ili_events.tail()

Unnamed: 0,user_id,cohort,valid_user__steps,valid_user__heart,valid_user__sleep,user_event_n_merged,date_onset_merged,date_recovery_merged,date_survey_concat,survey_id,is_overlapping_flu_event,diagnosed__flu,diagnosed__covid,symptoms__chills_shiver,symptoms__cough,symptoms__fatigue,symptoms__fever,symptoms__headache,symptoms__bodyache,symptoms__no_symptoms,symptoms__nasal,symptoms__other,symptoms__sneezing,symptoms__sore_throat,symptoms__sweats,symptoms__anosmia,symptoms__none,symptoms__chest_pain,symptoms__short_breath,medical__sought_attention,medical__sought_attention_where,medical__hospitalized,medical__medication,covid__told_to_quarantine,covid__behavior__large_gatherings,covid__behavior__air_travel,covid__behavior__social_distancing__did_not
6921,243b95793eee4540,Non-COVID-19 Flu,1.0,0.0,0.0,1.0,2020-04-06,2020-04-10,2020-04-10__2020-04-16,280161684,1.0,yes,no,,cough,,,headache,,,nasal congestion or runny nose,,sneezing,sore throat or itchy scratchy throat,sweats,,none of the above,,,yes,primary care clinic e g family medicine intern...,no,yes,no,no,no,i did not practice social distancing in the la...
6922,131c39ea5d4e8f0e,Pre-COVID-19 Flu,0.0,0.0,0.0,1.0,2020-02-12,2020-03-04,2020-03-04,273462458,0.0,yes,,chills or shivering,cough,feeling more tired than usual,fever or feeling feverish,headache,body muscle ache,,nasal congestion or runny nose,,sneezing,,sweats,,,,,yes,urgent care facility,no,no,,,,
6923,61c200aad265d315,Non-COVID-19 Flu,1.0,1.0,1.0,1.0,2020-04-05,2020-04-19,2020-04-21__2020-04-14,280161684,1.0,yes,no,chills or shivering,cough,feeling more tired than usual,fever or feeling feverish,headache,body muscle ache,,nasal congestion or runny nose,,,sore throat or itchy scratchy throat,sweats,,none of the above,,,yes,primary care clinic e g family medicine intern...,no,no,yes,no,no,
6924,7d9839d123a2cb2a,Pre-COVID-19 Flu,0.0,0.0,0.0,1.0,2020-01-22,2020-01-27,2020-01-28,273462458,0.0,yes,,,,,,,,,,,,sore throat or itchy scratchy throat,,,,,,yes,urgent care facility,yes,yes,,,,
6925,185840c0245922e3,Pre-COVID-19 Flu,0.0,0.0,0.0,1.0,2020-02-07,2020-02-14,2020-02-14,273462458,0.0,yes,,chills or shivering,cough,feeling more tired than usual,fever or feeling feverish,headache,body muscle ache,,nasal congestion or runny nose,Nausea,sneezing,sore throat or itchy scratchy throat,sweats,,,,,yes,urgent care facility,no,yes,,,,


In [0]:
ili_days = pd.read_csv(ili_days_path)
ili_days.tail()

Unnamed: 0,user_id,cohort,days_since_symptoms_onset,study_date,date_onset_merged,date_recovery_merged,date_survey_for_daily_symptoms,dif_survey_symptom,daily_bhvr__covid__behavior__social_distancing,daily_symp__anosmia,daily_symp__chest_pain,daily_symp__short_breath,daily_symp__fever,daily_symp__other,daily_symp__cough,daily_symp__nasal,daily_symp__sneezing,daily_symp__sore_throat,daily_symp__bodyache,daily_symp__chills_shiver,daily_symp__fatigue,daily_symp__headache,daily_symp__sweats
574853,5b94d1dddfcc7821,Pre-COVID-19 Flu,71.0,2020-04-24,2020-02-13,2020-02-21,,,,,,,,,,,,,,,,,
574854,5b94d1dddfcc7821,Pre-COVID-19 Flu,72.0,2020-04-25,2020-02-13,2020-02-21,,,,,,,,,,,,,,,,,
574855,5b94d1dddfcc7821,Pre-COVID-19 Flu,73.0,2020-04-26,2020-02-13,2020-02-21,,,,,,,,,,,,,,,,,
574856,5b94d1dddfcc7821,Pre-COVID-19 Flu,74.0,2020-04-27,2020-02-13,2020-02-21,,,,,,,,,,,,,,,,,
574857,5b94d1dddfcc7821,Pre-COVID-19 Flu,75.0,2020-04-28,2020-02-13,2020-02-21,,,,,,,,,,,,,,,,,


In [0]:
demographics = pd.read_csv(demographics_path)
demographics.tail()

Unnamed: 0,user_id,cohort,valid_user__steps,valid_user__heart,valid_user__sleep,hs_completed,gender,age_group,race,education,bmi_group,alzheimers,anxiety,arrhythmia,arthritis,asthma,cancer,chronic_pain,copd,coronary_heart_disease,depression,diabetes_t1,diabetes_t2,fibromyalgia,gerd,gestational_diabetes,heart_attack,heart_failure,high_colesterol,hypertension,hypo_hyper_thyroidism,ibd,ibs,ibs_ibd,insomnia,lupus,mental_health_not_depression_anxiety,migraines,multiple_sclerosis,neurodegenerative,obstructive_coronary_heart_disease,osteoporosis,pcos,psoriasis,restless_leg,rheumatoid_arthritis,seasonal_allergies,sleep_apnea,stroke
6921,243b95793eee4540,Non-COVID-19 Flu,1.0,0.0,0.0,True,male,35-44,white / caucasian,trade/technical/vocational training,,,,,,,,,,,,,,,,,,,,True,,,,,,,,,,,,,,,,,,,
6922,131c39ea5d4e8f0e,Pre-COVID-19 Flu,0.0,0.0,0.0,True,female,25-34,white / caucasian,"high school graduate, diploma or the equivalen...",25.0 - 29.9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6923,61c200aad265d315,Non-COVID-19 Flu,1.0,1.0,1.0,True,female,35-44,white / caucasian,"some college, no degree",25.0 - 29.9,,,,,,,,,,True,,,,,,,,,,True,,,,,,,,,,,,,,,,,,
6924,7d9839d123a2cb2a,Pre-COVID-19 Flu,0.0,0.0,0.0,True,male,25-34,white / caucasian,"college graduate, associate or bachelor's degree",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6925,185840c0245922e3,Pre-COVID-19 Flu,0.0,0.0,0.0,True,female,35-44,hispanic or latino,"some college, no degree",30 +,,True,,,,,True,,,True,,,,,,,,,,,,,,,,,True,,,,,,,,,,,


## Create a dictionary with cohort labels and users

In [0]:
#define dictionary with labels for each cohort
dense_activity_columns = ['valid_user__steps','valid_user__heart','valid_user__sleep']
dense_activity_users = set(demographics.loc[demographics[dense_activity_columns].sum(axis=1)>0,'user_id'])
     
cohort_dict = {}
cohort_dict['All COVID-19'] = set(demographics.loc[demographics['cohort']=='COVID-19','user_id'])
cohort_dict['All Non-COVID-19 Flu'] = set(demographics.loc[demographics['cohort']=='Non-COVID-19 Flu','user_id'])
cohort_dict['All Pre-COVID-19 Flu'] = set(demographics.loc[demographics['cohort']=='Pre-COVID-19 Flu','user_id'])
cohort_dict['Dense COVID-19'] = dense_activity_users.intersection(cohort_dict['All COVID-19'])
cohort_dict['Dense Non-COVID-19 Flu'] = dense_activity_users.intersection(cohort_dict['All Non-COVID-19 Flu'])
cohort_dict['Dense Pre-COVID-19 Flu'] = dense_activity_users.intersection(cohort_dict['All Pre-COVID-19 Flu'])
                                                                
#print size of each cohort
for cohort_label, users in cohort_dict.items():
  print('Number of unique users in', cohort_label, ':', len(users))

## Define functions for summarizing data values

In [0]:
#create a function that formats the count data in the way specified in the SAP
def format_count_data_for_table(x):
  formatted_summary = pd.DataFrame({'count': x.value_counts(), 
                                    'percent': x.value_counts(normalize=True)*100})
  #make sure that the counts are equal to the cohort size
  assert(formatted_summary['count'].sum()==len(x))
  return formatted_summary['count'].astype(str) + ' (' + formatted_summary['percent'].round(1).astype(str) + '%)'

#create an additional function that formats percentage data only to save space in the tables
def format_precentages_for_table(x):
  formatted_summary = pd.DataFrame({'count': x.value_counts(), 
                                    'percent': x.value_counts(normalize=True)*100})
  #make sure that the counts are equal to the cohort size
  assert(formatted_summary['count'].sum()==len(x))
  return formatted_summary['percent'].round(1).astype(str) + '%'

#create a function that formats continous data
def format_continous_data_for_table(x):
  formatted_summary = pd.Series({'Mean ± SD': '{:.1f} ± {:.1f}'.format(x.mean(), x.std()), 
                                  'Median': '{}'.format(x.median()),
                                  'Min - Max': '{} - {}'.format(x.min(), x.max())})
  return formatted_summary


def create_summary_table(melted_response_df, user_col, question_col, response_col, cohort_dict, format_fxn):
  
  """
  Create a table of summary statistics for categorical or continuous data
  This function takes in a melted (longform) dataframe with survey data that will be summarized for the 
  cohorts defined in the cohort dictionary, as well as a function that maps a series to a formatted summary (string)
    
  Parameters
  ----------
  melted_response_df : pandas dataframe, long format
      Columns for user id, question, and responses. 
      All responses should have the same data type (categorical or continuous)
  user_col : str
      Name of column with user ids
  question_col : str
      Name of column with questions
  response_col : str
      Name of column with user responses
  cohort_dict: dict
      Dictionary with cohort definitions. Keys should be cohort labels (strings) and values 
      should be lists of user ids that belong to the cohort.
  format_fxn: function
      Function that takes in a series and returns a formatted series where the index corresponds
      to the labeled rows in the summary table and the values correspond to the formatted table entries.
  
  Returns
  ----------
  summary_table : pandas dataframe
     A pandas dataframe with a multi-index corresponding to the question (level 0) and the appropriate 
     formatted data rows (level 1) and columns corresponding to cohorts.
  
  """

  summary_table = pd.DataFrame()
  for cohort_label, cohort_users in cohort_dict.items():
    summary_table = pd.concat([summary_table,
                               melted_response_df.loc[melted_response_df[user_col].isin(cohort_users)]
                                                 .groupby(by=question_col)[response_col]
                                                 .apply(format_fxn)
                                                 .rename(cohort_label+' (N={})'.format(len(cohort_users)))],
                               axis=1)
    
    summary_table.index = pd.MultiIndex.from_tuples(summary_table.index)

  
  return summary_table

## TABLE 1: Demographic Summary

In [0]:
count_cols = ['gender','education','age_group','bmi_group','race']

melted_demogs = demographics[['user_id']+count_cols].melt(id_vars='user_id')
melted_demogs['value'] = melted_demogs['value'].str.title()
melted_demogs = melted_demogs.fillna('Unavailable')

demog_summary = create_summary_table(melted_response_df=melted_demogs, 
                     user_col='user_id', 
                     question_col='variable', 
                     response_col='value', 
                     cohort_dict=cohort_dict, 
                     format_fxn=format_precentages_for_table)

#reorder the rows and make them more readable
manual_renaming = {'College Graduate, Associate Or Bachelor\'S Degree':'College Degree',
                   'Did Not Finish High School, No Diploma':'Did Not Finish High School',
                   'Doctorate Degree, Md':'Doctorate Degree or MD',
                   'High School Graduate, Diploma Or The Equivalent (For Example, Ged)':'High School Diploma or GED',
                   'I\'D Prefer Not To Answer':'Prefer Not To Answer',
                   }

edu_order = ['Did Not Finish High School','High School Diploma or GED','Some College, No Degree','Trade/Technical/Vocational Training',
             'College Degree', 'Graduate Degree', 'Doctorate Degree or MD']
gender_order = ['Female','Male','Other']
race_order = ['White / Caucasian', 'Hispanic Or Latino', 'Black Or African American', 'Asian Or Pacific Islander',
              'American Indian Or Alaskan Native', 'Prefer Not To Answer']
age_bin_labels = ['< 25','25-34','35-44','45-54','55 +']
bmi_bin_labels = ['< 18.5','18.5 - 24.9','25.0 - 29.9','30 +']

demog_summary = demog_summary.rename(index=manual_renaming)
demog_summary = demog_summary.reindex(age_bin_labels+bmi_bin_labels+edu_order+gender_order+race_order+['Unavailable'], level=1)
demog_summary = demog_summary.reindex(['gender','race','education','age_group','bmi_group'], level=0)


demog_summary = demog_summary.fillna('0.0%')
demog_summary

Unnamed: 0,Unnamed: 1,All COVID-19 (N=230),All Non-COVID-19 Flu (N=426),All Pre-COVID-19 Flu (N=6270),Dense COVID-19 (N=41),Dense Non-COVID-19 Flu (N=85),Dense Pre-COVID-19 Flu (N=1226)
gender,Female,70.0%,74.4%,78.2%,80.5%,76.5%,82.8%
gender,Male,28.7%,24.6%,20.8%,17.1%,21.2%,16.4%
gender,Other,0.4%,0.7%,0.3%,0.0%,1.2%,0.2%
gender,Unavailable,0.9%,0.2%,0.6%,2.4%,1.2%,0.7%
race,White / Caucasian,63.9%,66.4%,70.0%,56.1%,75.3%,74.8%
race,Hispanic Or Latino,7.0%,6.8%,8.3%,9.8%,2.4%,5.2%
race,Black Or African American,3.5%,7.0%,6.0%,7.3%,5.9%,3.2%
race,Asian Or Pacific Islander,9.6%,8.0%,4.6%,9.8%,7.1%,3.1%
race,American Indian Or Alaskan Native,1.3%,0.7%,0.8%,0.0%,1.2%,0.6%
race,Prefer Not To Answer,4.8%,1.6%,1.4%,7.3%,0.0%,1.1%


#### Check for statistically significant demographic differences (full cohorts only)

In [0]:
full_cohort_dict = {key: value for key, value in cohort_dict.items() if key.startswith('All')}

In [0]:
demog_counts = create_summary_table(melted_response_df=melted_demogs, 
                     user_col='user_id', 
                     question_col='variable', 
                     response_col='value', 
                     cohort_dict=full_cohort_dict, 
                     format_fxn=(lambda x: x.value_counts()))
demog_counts = demog_counts.fillna(0)

for col in count_cols:
  chi2, p, _, _ = stats.chi2_contingency(demog_counts.loc[col].values)
  print(col, ': chi2=', chi2.round(2), '  p=', p.round(5)) 

print('\nbonferroni threshold for', len(count_cols), 'comparisons:', 0.05/len(count_cols))

In [0]:
def two_proportion_z_test(melted_df, variable, cohort1, cohort2):
  
  proportions = pd.DataFrame(index=melted_df.loc[melted_df['variable']==variable,'value'].unique(), columns=['p1','p2','p_all'])

  proportions['p1'] = melted_df.loc[(melted_df['user_id'].isin(cohort1))&(melted_df['variable']==variable), 'value'].value_counts(normalize=True)
  proportions['p2'] = melted_df.loc[(melted_df['user_id'].isin(cohort2))&(melted_df['variable']==variable), 'value'].value_counts(normalize=True)
  proportions['p_all'] = melted_df.loc[(melted_df['user_id'].isin(cohort1.union(cohort2)))&(melted_df['variable']==variable), 'value'].value_counts(normalize=True)
  
  proportions = proportions.fillna(0)

  proportions['z'] = proportions.apply(lambda x: (x['p1'] - x['p2']) / np.sqrt(x['p_all']*(1-x['p_all'])*(1/len(cohort1)+1/len(cohort2))), axis=1)
  proportions['p_value'] = proportions['z'].apply(lambda z: stats.norm.sf(abs(z))*2)

  return proportions

In [0]:
#follow-up two-proportion z-tests
variables = ['race','age_group']

for variable in variables:
  
  print('\n\n', variable, ': Covid vs. Non-Covid')
  print(two_proportion_z_test(melted_demogs, variable, cohort_dict['All COVID-19'], cohort_dict['All Non-COVID-19 Flu']))
  
  print('\n\n', variable, ': Covid vs. Pre-Covid')
  print(two_proportion_z_test(melted_demogs, variable, cohort_dict['All COVID-19'], cohort_dict['All Pre-COVID-19 Flu']))
  
  print('\n\n', variable, ': Non-Covid vs. Pre-Covid')
  print(two_proportion_z_test(melted_demogs, variable, cohort_dict['All Non-COVID-19 Flu'], cohort_dict['All Pre-COVID-19 Flu']))


### Check for statistically significant demographic differences between users with dense vs. non-dense activity

In [0]:
#first create lists of all users with dense activity data and all users without dense activity data
activity_dict = {'Dense Activity': dense_activity_users,
                 'Non-dense Activity': set(demographics['user_id']).difference(dense_activity_users)}

#print size of each cohort
for cohort_label, users in activity_dict.items():
  print('Number of unique users in', cohort_label, ':', len(users))

In [0]:
demog_counts = create_summary_table(melted_response_df=melted_demogs, 
                     user_col='user_id', 
                     question_col='variable', 
                     response_col='value', 
                     cohort_dict=activity_dict, 
                     format_fxn=(lambda x: x.value_counts()))
demog_counts = demog_counts.fillna(0)

for col in count_cols:
  chi2, p, _, _ = stats.chi2_contingency(demog_counts.loc[col].values)
  print(col, ': chi2=', chi2.round(2), '  p=', p.round(5)) 

print('\nbonferroni threshold for', len(count_cols), 'comparisons:', 0.05/len(count_cols))

In [0]:
#follow-up two-proportion z-tests
variables = ['gender','age_group','bmi_group','race']

for variable in variables:
  
  print('\n\n', variable, ': Dense Activity vs. Not')
  print(two_proportion_z_test(melted_demogs, variable, activity_dict['Dense Activity'], activity_dict['Non-dense Activity']))


## Table 2: Healthcare-related behaviors and outcomes

In [0]:
#clean response options for the question about seeking medical care
care_location_options = {'primary care clinic e g family medicine internal medicine':'Primary care clinic',
                         'urgent care facility':'Urgent care facility', 
                         'emergency room er':'Emergency room', 
                         'ear nose and throat otolaryngology clinic':'Ear, nose, and throat clinic', 
                         'infectious disease clinic':'Infectious disease clinic', 
                         'other':'Other',
                         'multiple':'Multiple locations',
                         'unavailable':'Unavailable',}

ili_events['care_location__cleaned'] = ili_events['medical__sought_attention_where'].copy()
ili_events.loc[ili_events['care_location__cleaned'].str.contains('__'), 'care_location__cleaned'] = 'multiple'
ili_events.loc[~ili_events['care_location__cleaned'].isin(care_location_options.keys()), 'care_location__cleaned'] = 'other'

ili_events['care_location__cleaned'].value_counts(dropna=False)

In [0]:
healthcare_cols = ['care_location__cleaned', 'medical__hospitalized', 'medical__medication']

melted_events = ili_events[['user_id']+healthcare_cols].melt(id_vars='user_id')
melted_events['value'] = melted_events['value'].fillna('unavailable')

med_care_summary = create_summary_table(melted_response_df=melted_events, 
                     user_col='user_id', 
                     question_col='variable', 
                     response_col='value', 
                     cohort_dict=full_cohort_dict, 
                     format_fxn=format_precentages_for_table)
med_care_summary = med_care_summary.fillna('0.0%')

#make index more readable
med_care_mappings = {'care_location__cleaned':'Medical care location', 'medical__hospitalized':'Hospitalized', 'medical__medication':'Prescribed medication'}
med_care_response_mappings = {'yes':'Yes', 'no':'No', 'dont know or remember':'Do not know/remember', 'unavailable':'Unavailable'}
med_care_summary = med_care_summary.rename(index=med_care_mappings)
med_care_summary = med_care_summary.rename(index=med_care_response_mappings)
med_care_summary = med_care_summary.rename(index=care_location_options)

med_care_summary = med_care_summary.reindex(pd.Series(list(care_location_options.values())+
                                                      list(med_care_response_mappings.values())).drop_duplicates(keep='last'), level=1)
med_care_summary

Unnamed: 0,Unnamed: 1,All COVID-19 (N=230),All Non-COVID-19 Flu (N=426),All Pre-COVID-19 Flu (N=6270)
Medical care location,Primary care clinic,37.4%,50.2%,45.7%
Medical care location,Urgent care facility,16.1%,23.5%,39.1%
Medical care location,Emergency room,17.0%,8.2%,6.9%
Medical care location,"Ear, nose, and throat clinic",2.2%,2.1%,0.8%
Medical care location,Infectious disease clinic,1.7%,1.2%,0.4%
Medical care location,Other,10.9%,4.7%,4.3%
Medical care location,Multiple locations,14.8%,10.1%,2.8%
Hospitalized,Yes,36.1%,15.7%,7.1%
Hospitalized,No,63.9%,83.6%,92.6%
Hospitalized,Unavailable,0.0%,0.7%,0.3%


### Check for differences between healthcare questions

In [0]:
care_counts = create_summary_table(melted_response_df=melted_events, 
                     user_col='user_id', 
                     question_col='variable', 
                     response_col='value', 
                     cohort_dict=full_cohort_dict, 
                     format_fxn=(lambda x: x.value_counts()))
care_counts = care_counts.fillna(0)

for col in healthcare_cols:
  chi2, p, _, _ = stats.chi2_contingency(care_counts.loc[col].values)
  print(col, ': chi2=', chi2.round(2), '  p=', p.round(5)) 
  
print('\nbonferroni threshold for', len(healthcare_cols), 'comparisons:', 0.05/len(healthcare_cols))

In [0]:
#follow-up two-proportion z-tests
variables = healthcare_cols

for variable in variables:
  
  print('\n\n', variable, ': Covid vs. Non-Covid')
  print(two_proportion_z_test(melted_events, variable, cohort_dict['All COVID-19'], cohort_dict['All Non-COVID-19 Flu']))
  
  print('\n\n', variable, ': Covid vs. Pre-Covid')
  print(two_proportion_z_test(melted_events, variable, cohort_dict['All COVID-19'], cohort_dict['All Pre-COVID-19 Flu']))
  
  print('\n\n', variable, ': Non-Covid vs. Pre-Covid')
  print(two_proportion_z_test(melted_events, variable, cohort_dict['All Non-COVID-19 Flu'], cohort_dict['All Pre-COVID-19 Flu']))


## Table 3: Symptom Summary

In [0]:
#pull out list of symptoms
symptom_cols = ili_events.columns[ili_events.columns.str.contains('symptoms__')].tolist()  
symptom_cols.remove('symptoms__other')
symptom_cols.remove('symptoms__none')
symptom_cols.remove('symptoms__no_symptoms')

#melt symptoms
melted_symptoms = ili_events[['user_id']+symptom_cols].melt(id_vars='user_id')
melted_symptoms['value'] = melted_symptoms['value'].fillna('')

#create summary table of symptom prevalances
symptoms_summary = create_summary_table(melted_response_df=melted_symptoms, 
                     user_col='user_id', 
                     question_col='variable', 
                     response_col='value', 
                     cohort_dict=full_cohort_dict, 
                     format_fxn=format_precentages_for_table)
symptoms_summary = symptoms_summary.loc[symptoms_summary.index.get_level_values(1)!=''].droplevel(1)

#use daily symptom data to find peak symptom day across each cohort 
daily_symp_cols = [col.replace('symptoms','daily_symp') for col in symptom_cols]
symp_counts_by_day = ili_days.groupby(by=['cohort','days_since_symptoms_onset'])[daily_symp_cols].sum()
#record peak symptom day for each cohort (use a centered 5-day rolling window)
peak_symptom_days = (symp_counts_by_day.groupby('cohort')
                                       .apply(lambda x: x.rolling(5, center=True).mean().idxmax())
                                       .applymap(lambda x: x[1])
                                       .transpose()
                                       .replace(-8, np.nan))

#format the table
symptom_map =  {'chills_shiver':'Chills or Shivering', 
                'cough':'Cough',
                'fatigue':'Fatigue', 
                'fever':'Fever',
                'headache':'Headache', 
                'bodyache':'Body Muscle Ache',
                'nasal':'Nasal Congestion',
                'sneezing':'Sneezing', 
                'sore_throat':'Sore Throat',
                'sweats':'Sweats',
                'anosmia':'Anosmia',
                'chest_pain':'Chest Pain/Pressure', 
                'short_breath':'Shortness of Breath'}
symptoms_summary.index = symptoms_summary.reset_index()['index'].apply(lambda x: x.split('__')[-1])
symptoms_summary = symptoms_summary.rename(index=symptom_map)
peak_symptom_days.index = peak_symptom_days.reset_index()['index'].apply(lambda x: x.split('__')[-1])
peak_symptom_days = peak_symptom_days.rename(index=symptom_map)

#merge tables
symptoms_summary = symptoms_summary.merge(peak_symptom_days, left_index=True, right_index=True)

#sort from most to least common in covid cases
symptoms_summary = symptoms_summary.loc[symptoms_summary['All COVID-19 (N=230)'].sort_values(ascending=False).index]

#fill in NaNs
symptoms_summary = symptoms_summary.fillna('NA')
symptoms_summary

#print(symptoms_summary.to_latex())

Unnamed: 0_level_0,All COVID-19 (N=230),All Non-COVID-19 Flu (N=426),All Pre-COVID-19 Flu (N=6270),COVID-19,Non-COVID-19 Flu,Pre-COVID-19 Flu
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Cough,84.3%,71.6%,85.1%,5.0,3.0,3.0
Headache,71.3%,68.1%,74.3%,4.0,3.0,3.0
Body Muscle Ache,66.1%,67.1%,80.8%,4.0,2.0,2.0
Shortness of Breath,65.7%,24.2%,,6.0,8.0,-5.0
Fatigue,61.7%,54.7%,70.9%,5.0,3.0,3.0
Fever,61.3%,62.0%,74.6%,5.0,2.0,2.0
Chills or Shivering,53.5%,55.4%,69.3%,4.0,3.0,2.0
Sore Throat,51.7%,48.8%,61.1%,3.0,2.0,3.0
Nasal Congestion,49.6%,49.3%,65.4%,5.0,2.0,3.0
Chest Pain/Pressure,49.6%,19.7%,,6.0,2.0,-5.0


### Symptom statistical testing

In [0]:
#chi-squared
symptom_counts = create_summary_table(melted_response_df=melted_symptoms, 
                     user_col='user_id', 
                     question_col='variable', 
                     response_col='value', 
                     cohort_dict=full_cohort_dict, 
                     format_fxn=(lambda x: x.value_counts()))
symptom_counts = symptom_counts.fillna(0)
symptom_counts = symptom_counts.loc[symptom_counts.index.get_level_values(1)!=''].droplevel(1)

chi2, p, _, _ = stats.chi2_contingency(symptom_counts.values)
print('symptoms', ': chi2=', chi2.round(2), '  p=', p.round(5)) 

In [0]:
#follow-up two-proportion z-tests  
variables = symptom_cols

for variable in variables:
  
  print('\n\n', variable, ': Covid vs. Non-Covid')
  print(two_proportion_z_test(melted_symptoms, variable, cohort_dict['All COVID-19'], cohort_dict['All Non-COVID-19 Flu']))
  
  print('\n\n', variable, ': Covid vs. Pre-Covid')
  print(two_proportion_z_test(melted_symptoms, variable, cohort_dict['All COVID-19'], cohort_dict['All Pre-COVID-19 Flu']))
  
  print('\n\n', variable, ': Non-Covid vs. Pre-Covid')
  print(two_proportion_z_test(melted_symptoms, variable, cohort_dict['All Non-COVID-19 Flu'], cohort_dict['All Pre-COVID-19 Flu']))


## Supplement Table: Self-reported medication conditions

In [0]:
#IBD and IBS are listed in separate columns, but also in a column called ibs_ibd --> investigate
print(demographics[['user_id','ibd','ibs','ibs_ibd']].fillna(False).groupby(['ibs_ibd','ibd','ibs']).nunique())

#it looks disjoint -- no one has ibs_ibd AND ibd or ibs --> collapse ibd and ibs into ibs_ibd
demographics['ibs_ibd_collapsed'] = demographics[['ibd','ibs','ibs_ibd']].any(axis=1)

In [0]:
demographics.columns

In [0]:
conditions = ['alzheimers', 'anxiety', 'arrhythmia', 'arthritis', 'asthma', 'cancer',
       'chronic_pain', 'copd', 'coronary_heart_disease', 'depression',
       'diabetes_t1', 'diabetes_t2', 'fibromyalgia', 'gerd',
       'gestational_diabetes', 'heart_attack', 'heart_failure',
       'high_colesterol', 'hypertension', 'hypo_hyper_thyroidism', 'ibs_ibd_collapsed', 
       'insomnia', 'lupus',
       'mental_health_not_depression_anxiety', 'migraines',
       'multiple_sclerosis', 'neurodegenerative',
       'obstructive_coronary_heart_disease', 'osteoporosis', 'pcos',
       'psoriasis', 'restless_leg', 'rheumatoid_arthritis',
       'seasonal_allergies', 'sleep_apnea', 'stroke',]
#sort by prevalance in covid cohort
condition_order = (demographics.loc[demographics['user_id'].isin(cohort_dict['All COVID-19']), conditions]
                               .sum()
                               .sort_values(ascending=False)
                               .index.tolist())

#for people who took the health survey, fill in "False" for conditions they did not select 
demographics.loc[demographics['hs_completed']==True,conditions].fillna(False, inplace=True)

melted_conditions = demographics[['user_id']+conditions].melt(id_vars='user_id')
melted_conditions['value'] = melted_conditions['value'].fillna('unavailable')
#melted_fluvey = melted_fluvey.loc[melted_fluvey['value']!='']

conditions_summary = create_summary_table(melted_response_df=melted_conditions, 
                     user_col='user_id', 
                     question_col='variable', 
                     response_col='value', 
                     cohort_dict=full_cohort_dict, 
                     format_fxn=format_count_data_for_table)
conditions_summary = conditions_summary.loc[conditions_summary.index.get_level_values(1)==True].droplevel(1).fillna('0 (0.0%)')

#reorder symptoms based on prevalance
symptoms_summary = symptoms_summary.loc[[cond for cond in condition_order if cond in symptoms_summary.index]]

#clean up condition names for publication
conditions_summary.index = conditions_summary.index.str.title().str.replace('_',' ')
manual_condition_naming = {'Pcos':'PCOS', 'Mental Health Not Depression Anxiety':'Mental Health (Excluding Depression/Anxiety)',
                           'Gerd':'GERD', 'Restless Leg':'Restless Leg Syndrome', 'Hypo Hyper Thyroidism':'Hypo- or Hyperthyrodism',
                           'High Colesterol':'High Cholesterol', 'Ibs Ibd Collapsed':'IBS or IBD', 'Alzheimers':'Alzheimer\'s Disease', 
                           'Copd':'COPD', 'Diabetes T1':'Type 1 Diabetes', 'Diabetes T2':'Type 2 Diabetes'}
conditions_summary = conditions_summary.rename(index=manual_condition_naming)

conditions_summary

Unnamed: 0,All COVID-19 (N=230),All Non-COVID-19 Flu (N=426),All Pre-COVID-19 Flu (N=6270)
Alzheimer's Disease,1 (0.4%),1 (0.2%),3 (0.0%)
Anxiety,65 (28.3%),122 (28.6%),1915 (30.5%)
Arrhythmia,6 (2.6%),5 (1.2%),165 (2.6%)
Asthma,56 (24.3%),79 (18.5%),1247 (19.9%)
Cancer,6 (2.6%),13 (3.1%),165 (2.6%)
Chronic Pain,30 (13.0%),40 (9.4%),572 (9.1%)
COPD,2 (0.9%),4 (0.9%),61 (1.0%)
Coronary Heart Disease,1 (0.4%),0 (0.0%),13 (0.2%)
Depression,62 (27.0%),104 (24.4%),1868 (29.8%)
Type 1 Diabetes,4 (1.7%),6 (1.4%),63 (1.0%)
