# Parse US Delphi dataset

The data can be accessed with the `epidatpy` package (documentation: https://cmu-delphi.github.io/epidatpy/).  
To avoid rate limits, obtain an API key by following https://github.com/cmu-delphi/epidatpy/blob/dev/docs/index.rst#api-keys or by registering at https://api.delphi.cmu.edu/epidata/admin/registration_form.

In [None]:
from epidatpy import EpiDataContext, EpiRange

# Instantiate the Epidata client with its default settings.
epidata = EpiDataContext()

# Smoke test: national cumulative confirmed cases (JHU-CSSE) over a short date window.
query_args = {
    "data_source": "jhu-csse",
    "signals": "confirmed_cumulative_num",
    "geo_type": "nation",
    "time_type": "day",
    "geo_values": "us",
    "time_values": EpiRange(20210405, 20210410),
}
apicall = epidata.pub_covidcast(**query_args)

# Execute the query and show the resulting table.
print(apicall.df())


In [None]:
from epidatpy import CovidcastEpidata, EpiDataContext, EpiRange

# Client with on-disk caching enabled: per the arguments below, results are
# cached for 7 days, which avoids downloading the same data repeatedly.
epidata = EpiDataContext(use_cache=True, cache_max_age_days=7)

# Date window of the example query.
date_window = EpiRange(20210405, 20210410)

# `pub_covidcast` only builds an `EpiDataCall`: a query description that has
# not been executed yet and can be inspected (printed) before fetching.
apicall = epidata.pub_covidcast(
    data_source="jhu-csse",
    signals="confirmed_cumulative_num",
    geo_type="nation",
    time_type="day",
    geo_values="us",
    time_values=date_window,
)
print(apicall)

# Run the query; as the last expression of the cell the table is displayed.
apicall.df()

In [None]:
# Signals to download. They are the same indicators as those used from the
# UMD-CTIS dataset; trailing comments give the corresponding short name.

# Behaviour and beliefs.
behaviour_signals = [
    "smoothed_wwearing_mask_7d",           # mask
    "smoothed_wworried_catch_covid",       # worried_catch_covid
    "smoothed_wbelief_masking_effective",  # belief_masking_effective
]

# Sources from which respondents received news.
news_source_signals = [
    "smoothed_wreceived_news_local_health",  # received_news_local_health
    "smoothed_wreceived_news_experts",
    "smoothed_wreceived_news_cdc",  # beware: this refers to the CDC, not the WHO
    "smoothed_wreceived_news_govt_health",
    "smoothed_wreceived_news_politicians",
    "smoothed_wreceived_news_journalists",  # received_news_journalists
    "smoothed_wreceived_news_friends",
]

# Sources respondents trust for COVID information.
trust_signals = [
    "smoothed_wtrust_covid_info_doctors",
    "smoothed_wtrust_covid_info_experts",
    "smoothed_wtrust_covid_info_cdc",  # beware: this refers to the CDC, not the WHO
    "smoothed_wtrust_covid_info_govt_health",
    "smoothed_wtrust_covid_info_politicians",
    "smoothed_wtrust_covid_info_journalists",
    "smoothed_wtrust_covid_info_friends",
]

# Full list, in the order the signals are downloaded and merged.
indicators = behaviour_signals + news_source_signals + trust_signals

In [None]:
import os

# Download every survey signal at US state level and store one CSV per signal.
# The output folder is created up front: `DataFrame.to_csv` does not create
# missing directories and would otherwise fail on a fresh checkout.
os.makedirs("Dataus", exist_ok=True)

for indicator in indicators:
    # One query per signal: all states ("*"), daily values over the study period.
    df = epidata.pub_covidcast(
        data_source="fb-survey",
        signals=indicator,
        geo_type="state",
        time_type="day",
        geo_values="*",
        time_values=EpiRange(20210521, 20220625),
    ).df()

    # Keep only the columns used downstream, ordered by state and date.
    df = df[['signal', 'geo_value', 'time_value', 'value', 'sample_size']]
    df_sorted = df.sort_values(by=['geo_value', 'time_value'])

    # Scale the value by 1/100.
    # NOTE(review): presumably converts a percentage to a proportion so that it
    # matches the other dataset - confirm.
    df_sorted['value'] = df_sorted['value'] / 100

    filename = f"Dataus/{indicator}.csv"
    df_sorted.to_csv(filename, index=False)

In [None]:
from functools import reduce
import pandas as pd
def _merge_on_state_and_date(left, right):
    """Outer-join two per-signal tables on state and date."""
    return pd.merge(left, right, on=['geo_value', 'time_value'], how='outer')


# Load each per-signal CSV and give its value / sample-size columns
# signal-specific names so that the tables can be joined side by side.
df_list = []
for indicator in indicators:
    raw = pd.read_csv(f"Dataus/{indicator}.csv")
    signal = raw['signal'].iloc[0]  # signal name as stored in the file
    per_signal = raw[['geo_value', 'time_value', 'value', 'sample_size']].rename(
        columns={
            'value': signal,
            'sample_size': f'sample_size_{signal}',
        }
    )
    df_list.append(per_signal)

# Combine everything on geo_value and time_value, then order by state and date.
df_merged = reduce(_merge_on_state_and_date, df_list)
df_merged = df_merged.sort_values(by=['geo_value', 'time_value'])

# Lookup from lower-case two-letter code to full name
# (the 50 states plus the District of Columbia).
us_state_abbrev_lower = {
    "al": "Alabama", "ak": "Alaska", "az": "Arizona", "ar": "Arkansas",
    "ca": "California", "co": "Colorado", "ct": "Connecticut",
    "dc": "District of Columbia", "de": "Delaware",
    "fl": "Florida",
    "ga": "Georgia",
    "hi": "Hawaii",
    "id": "Idaho", "il": "Illinois", "in": "Indiana", "ia": "Iowa",
    "ks": "Kansas", "ky": "Kentucky",
    "la": "Louisiana",
    "me": "Maine", "md": "Maryland", "ma": "Massachusetts", "mi": "Michigan",
    "mn": "Minnesota", "ms": "Mississippi", "mo": "Missouri", "mt": "Montana",
    "ne": "Nebraska", "nv": "Nevada", "nh": "New Hampshire", "nj": "New Jersey",
    "nm": "New Mexico", "ny": "New York", "nc": "North Carolina", "nd": "North Dakota",
    "oh": "Ohio", "ok": "Oklahoma", "or": "Oregon",
    "pa": "Pennsylvania",
    "ri": "Rhode Island",
    "sc": "South Carolina", "sd": "South Dakota",
    "tn": "Tennessee", "tx": "Texas",
    "ut": "Utah",
    "vt": "Vermont", "va": "Virginia",
    "wa": "Washington", "wv": "West Virginia", "wi": "Wisconsin", "wy": "Wyoming",
}

# Replace two-letter codes with full state names; any code missing from the
# lookup keeps its original value.
state_codes = df_merged['geo_value']
df_merged['geo_value'] = state_codes.map(us_state_abbrev_lower).fillna(state_codes)

# Save the complete US table.
df_merged.to_csv("Dataus/datiusa_unificati.csv", index=False)

# Report how many and which distinct geo_value entries are present.
num_unici = df_merged['geo_value'].nunique()
valori_unici = df_merged['geo_value'].unique()
print(f"Number unique 'geo_value' values: {num_unici}")
print("Valori unici in 'geo_value':")
print(valori_unici)


## Temporal coverage and continuity


In [None]:
# Count the non-missing observations of every indicator for each state in one
# pass, then flag the states where any indicator has fewer than 300 of them.
observation_counts = df_merged.groupby('geo_value')[indicators].count()
too_few = (observation_counts < 300).any(axis=1)
paesimeno300 = set(observation_counts.index[too_few])

print("States with less than 300 lines on at least one indicator:")
print(sorted(paesimeno300))

In [None]:
# Parse the dates so that differences between them can be measured in days.
df_merged['time_value'] = pd.to_datetime(df_merged['time_value'])

# States in which at least one indicator has two consecutive non-missing
# observations more than 7 days apart.
paesibuchi7giorni = set()
for state, rows in df_merged.groupby('geo_value'):
    rows = rows.sort_values('time_value')  # guarantee chronological order
    for col in indicators:
        # Dates on which this indicator actually has a value.
        observed_dates = rows.loc[rows[col].notna(), 'time_value']
        gap_days = observed_dates.diff().dt.days
        if (gap_days > 7).any():
            paesibuchi7giorni.add(state)
            break  # one offending indicator is enough to flag the state

print("States with > 7 days gaps:")
print(sorted(paesibuchi7giorni))

In [None]:
# States to discard: those failing either the coverage or the continuity check.
paesidascartare = list(paesimeno300 | paesibuchi7giorni)
paesidascartare

# Keep only the rows of the remaining states.
rows_to_keep = ~df_merged['geo_value'].isin(paesidascartare)
df_merged2 = df_merged[rows_to_keep].copy()

# Final CSV dataset for the US.
df_merged2.to_csv("Dataus/dati1.csv", index=False)


## Merge the US data and the other countries into a single dataset

In [None]:
import pandas as pd
# Load the existing multi-country table (df1) and the filtered US table (df2),
# then list the columns of each so they can be compared.
df1 = pd.read_csv('csv/FULL_DATA1.csv')
df2 = pd.read_csv('Dataus/dati1.csv')
for label, frame in (("Colonne csv1:", df1), ("Colonne csv2:", df2)):
    print(label, frame.columns.tolist())

In [None]:

# Column correspondence: US (Delphi) column name -> column name of the other dataset.
# Each signal contributes two columns, its value and its sample size, so the
# mapping is generated from one table of signal suffixes.
signal_suffix_to_target = {
    'wearing_mask_7d': 'mask',
    'worried_catch_covid': 'worried_catch_covid',
    'belief_masking_effective': 'belief_masking_effective',

    'received_news_local_health': 'received_news_local_health',
    'received_news_experts': 'received_news_experts',
    'received_news_cdc': 'received_news_who',  # CDC columns are stored under the "who" names
    'received_news_govt_health': 'received_news_govt_health',
    'received_news_politicians': 'received_news_politicians',
    'received_news_journalists': 'received_news_journalists',
    'received_news_friends': 'received_news_friends',

    'trust_covid_info_doctors': 'trust_covid_info_local_health',  # doctors -> local_health
    'trust_covid_info_experts': 'trust_covid_info_experts',
    'trust_covid_info_cdc': 'trust_covid_info_who',  # CDC columns are stored under the "who" names
    'trust_covid_info_govt_health': 'trust_covid_info_govt_health',
    'trust_covid_info_politicians': 'trust_covid_info_politicians',
    'trust_covid_info_journalists': 'trust_covid_info_journalists',
    'trust_covid_info_friends': 'trust_covid_info_friends',
}

rename_map = {
    'geo_value': 'country',
    'time_value': 'survey_date',
}
for source_suffix, target_suffix in signal_suffix_to_target.items():
    # Sample-size column first, then the value column.
    rename_map[f'sample_size_smoothed_w{source_suffix}'] = f'sample_size_{target_suffix}'
    rename_map[f'smoothed_w{source_suffix}'] = f'pct_{target_suffix}'

# Rename the US columns to the naming used by the first dataset.
df2_renamed = df2.rename(columns=rename_map)

# Select the columns in exactly the same order as df1, so both tables line up.
shared_layout = df1.columns.tolist()
df2_aligned = df2_renamed[shared_layout]

# Stack the two tables into one and renumber the rows.
df_finale = pd.concat((df1, df2_aligned), ignore_index=True)
df_finale
df_finale.to_csv("csv/usa_mondo.csv", index=False)


# List every country / state present in the combined dataset.
print(df_finale['country'].unique())