In [1]:
from pathlib import Path
import sys

# Path to Yu group's github repo
YU_REPO = Path('../data/external/covid19-severity-prediction/')

# Add their modules to python path.
# Guarded so that re-running this cell does not append the same entry again.
if str(YU_REPO) not in sys.path:
    sys.path.append(str(YU_REPO))

# Path to our data folder
DATA = Path('../data/')

# Flag to indicate whether to use cached or uncached files
cached = True

In [2]:
from collections import defaultdict
import pickle as pkl
import data
import numpy as np
import pandas as pd
from datetime import datetime as dt
from nltk import ngrams
import addfips
# Shared addfips lookup object; used further down (mobility section) to resolve
# a county name plus state name into a county FIPS code via af.get_county_fips
af = addfips.AddFIPS()

In [3]:
# Load the abridged county-level table through the Yu group's loader (returns wide format).
# The unabridged variant comes from the same call with abridged=False.
df_abridged = data.load_county_data(
    data_dir=str(YU_REPO / 'data'),
    cached=cached,
    abridged=True,
)

loaded and merged COVID-19 cases/deaths data successfully


In [4]:
# Header-less lookup file: the mapping below uses column 1 as key (state
# abbreviation) and column 0 as value (full state name)
state_abb_table = pd.read_csv(DATA / 'interim/state-abbreviations.csv', header=None)
state_abb = dict(zip(state_abb_table[1], state_abb_table[0]))

# Derive the full state name of every row from its abbreviation in 'StateName'.
# An abbreviation that is absent from the lookup raises KeyError.
df_abridged['State'] = df_abridged['StateName'].map(state_abb.__getitem__)

## Abridged Dataset

In [5]:
# countyFIPS is about to become the index, so it must identify every row uniquely
n_counties = len(df_abridged)
assert df_abridged['countyFIPS'].nunique() == n_counties, "Non unique values for countyFIPS"

# Set FIPS to be the index (the column itself is dropped, which is the default)
df_abridged.set_index('countyFIPS', inplace=True)

# Mapping from county FIPS to its identifiers: {fips: {identifier column: value}}
identifier_cols = ['STATEFP', 'COUNTYFP', 'CountyName', 'StateName', 'State']
countyfips2identifier = df_abridged[identifier_cols].T.to_dict()

# Delete redundant identifiers; 'StateName' is the only one kept in the dataframe
redundant_cols = [name for name in identifier_cols if name != 'StateName']
df_abridged.drop(columns=redundant_cols, inplace=True)

### Stationary features

In [6]:
# List of stationary features, one column name per line.
# Path.read_text() opens and closes the file in a single call, so no file
# handle is left dangling. Blank lines (e.g. produced by a trailing newline)
# are skipped so they are not treated as a column named ''.
stationary_feature_file = DATA/'interim/abridged_stationary_feature_list.txt'
stationary_features = [
    line for line in stationary_feature_file.read_text().splitlines() if line.strip()
]

# Subset dataframe and save to disk
df_abridged[stationary_features].to_csv(DATA/'processed/abridged_stationary_features.tsv', sep='\t', encoding='utf-8')

# Remove the stationary columns from the working dataframe
df_abridged.drop(stationary_features, inplace=True, axis=1)

### Time varying features

#### Cases and Deaths

In [7]:
# One record per (county, day) window; written out below as the time varying features table
time_varying_features = []

# County wise date of first death
first_death_date = {}

# How far back to look (every record carries lags 0..K)
K = 5

# Date-wise columns, picked out by their name prefix
is_death_col = df_abridged.columns.str.startswith('#Deaths')
is_case_col = df_abridged.columns.str.startswith('#Case')
deathcols = df_abridged.columns.values[is_death_col]
casecols = df_abridged.columns.values[is_case_col]

for county, row in df_abridged.iterrows():

    death_series = row['deaths']

    # Skip counties which have not had any deaths
    if death_series.max() == 0:
        continue

    # Position of the first day with a positive death count
    first_death_day = np.flatnonzero(np.array(death_series) > 0)[0]

    # Index according to the common time scale (days since first death)
    modts_deaths = death_series[first_death_day:]
    modts_cases = row['cases'][first_death_day:]  # not used further in this cell

    # Skip counties which have not had K days since first death
    if len(modts_deaths) <= K:
        continue

    # Subset from the day of first death onwards
    datewise_deaths = row[deathcols[first_death_day:]]
    datewise_cases = row[casecols[first_death_day:]]

    # Sliding windows of K+1 consecutive days over both series
    death_windows = ngrams(datewise_deaths, K + 1)
    case_windows = ngrams(datewise_cases, K + 1)

    for i, (lag_deaths, lag_cases) in enumerate(zip(death_windows, case_windows)):
        features = {'countyFIPS': county}

        # Reversing a window puts its most recent day first, i.e. at lag 0
        for j, (n_deaths, n_cases) in enumerate(zip(lag_deaths[::-1], lag_cases[::-1])):
            features[f'deaths-{j}'] = n_deaths
            features[f'cases-{j}'] = n_cases
            # Only inserts on the first pass, right after the lag-0 pair: the key
            # order of the record decides the column order of the saved table
            features.setdefault('days_since_first_death', K + i)

        time_varying_features.append(features)

    # The date is parsed from the column name after its first 8 characters
    # (presumably the '#Deaths_' prefix -- confirm against the column names)
    first_death_date[county] = dt.strptime(deathcols[first_death_day][8:], '%m-%d-%Y')


first_death_date = pd.DataFrame(
    list(first_death_date.items()),
    columns=['countyFIPS', 'first_death_date'],
).set_index('countyFIPS')

# Save to disk
pd.DataFrame(time_varying_features).to_csv(
    DATA/'processed/abridged_time_varying_features.tsv',
    sep='\t',
    encoding='utf-8',
    index=None,
)

#### Mobility

In [8]:
google_mobility = pd.read_csv(DATA/'external/Global_Mobility_Report.csv', low_memory=False)

# Keep only US rows that have both sub_region_1 (used as the state below) and
# sub_region_2 (used as the county below).
# .copy() yields an independent frame, so the column assignments that follow
# do not run into pandas' SettingWithCopyWarning on a filtered frame.
has_both_regions = google_mobility[['sub_region_1', 'sub_region_2']].notna().all(axis=1)
is_us = google_mobility['country_region'] == 'United States'
google_mobility = google_mobility[has_both_regions & is_us].copy()

# Add countyFIPS. Every distinct (state, county) pair is resolved once and
# the result is reused for all of its rows, instead of one addfips call per row.
region_pairs = list(zip(google_mobility['sub_region_1'], google_mobility['sub_region_2']))
pair2fips = {
    (state, county): af.get_county_fips(county, state=state)
    for state, county in set(region_pairs)
}
google_mobility['countyFIPS'] = [pair2fips[pair] for pair in region_pairs]

# Ensure everything got mapped.
# A value that is not a string (presumably None when addfips finds no match --
# TODO confirm) fails the assertion with its message instead of raising
# TypeError from len().
fips_is_valid = google_mobility['countyFIPS'].map(lambda fips: isinstance(fips, str) and len(fips) == 5)
assert fips_is_valid.all(), "Some FIPS codes are wrong"

# Set index to county fips
google_mobility = google_mobility.set_index('countyFIPS')

# Convert to datetime
google_mobility['date'] = pd.to_datetime(google_mobility['date'], format="%Y-%m-%d")

# Only keep counties which have experienced K days since first death and are present in the mobility dataset.
# Sorted so that the row order does not depend on set iteration order.
common_fips = sorted(set(first_death_date.index) & set(google_mobility.index))
missing_fips = set(first_death_date.index) - set(google_mobility.index)
print(f"There are {len(missing_fips)} countyFIPS not present in mobility dataset.")
google_mobility = google_mobility.loc[common_fips]

There are 17 countyFIPS not present in mobility dataset.


In [9]:
# Lookup: county FIPS -> date of first death
countyFIPS2first_death_date = first_death_date['first_death_date'].to_dict()

# Convert each row's date to days since its county's first death
# (the county FIPS is the index of google_mobility at this point)
google_mobility['days_since_first_death'] = [
    (date - countyFIPS2first_death_date[fips]).days
    for fips, date in zip(google_mobility.index, google_mobility['date'])
]

# Remove redundant identifiers
redundant_mobility_cols = ['country_region_code', 'country_region', 'sub_region_1', 'sub_region_2', 'date']
google_mobility.drop(columns=redundant_mobility_cols, inplace=True)

# Groupby county FIPS: one row per county, every remaining column collapsed into a list
mobility_rows = google_mobility.reset_index()
google_mobility = mobility_rows.groupby('countyFIPS').agg(list)

In [10]:
# Add google mobility to time varying features dataset

# NOTE: K is rebound here. The mobility loop emits lags 0..K-1 (range(K)),
# whereas the cases/deaths cell emits lags 0..K (range(K+1)) with K = 5.
K = 6

# From here on the records are held as a dataframe
time_varying_features = pd.DataFrame(time_varying_features)

# Mobility columns to lag; all share the same '_percent_change_from_baseline' suffix
mobility_colnames = [
    f'{category}_percent_change_from_baseline'
    for category in (
        'retail_and_recreation',
        'grocery_and_pharmacy',
        'parks',
        'transit_stations',
        'workplaces',
        'residential',
    )
]

In [11]:
mobility_features = list()

# Per-county cache: countyFIPS -> (days with mobility data, {column -> {day -> value}}),
# or None when the county has no mobility data at all. Built on first use so
# the per-county lists are only converted once.
mobility_lookup = dict()

for i, row in time_varying_features.iterrows():

    features = {
        'countyFIPS': row['countyFIPS'],
        'days_since_first_death': row['days_since_first_death']
    }
    fips = features['countyFIPS']
    day = features['days_since_first_death']

    # Retrieve (and cache) mobility data for that specific county
    if fips not in mobility_lookup:
        try:
            county_mobility = google_mobility.loc[fips]
        except KeyError:
            # No mobility data at all for that county (everything gets set to null)
            mobility_lookup[fips] = None
        else:
            county_days = county_mobility['days_since_first_death']
            # Key the values by day rather than by list position. The lists are
            # zipped in reverse so that, should a day be listed twice, its first
            # listing wins.
            values_by_day = {
                col: dict(zip(reversed(county_days), reversed(county_mobility[col])))
                for col in mobility_colnames
            }
            mobility_lookup[fips] = (set(county_days), values_by_day)

    county_lookup = mobility_lookup[fips]

    # All lags are null when the county has no mobility row for the current day
    has_current_day = county_lookup is not None and day in county_lookup[0]

    for col in mobility_colnames:
        for j in range(K):
            if has_current_day:
                # Look the lagged day up by value: positional indexing (idx - j)
                # would wrap around to the end of the list for idx - j < 0 and
                # would be misaligned whenever the county has missing days.
                features[f'{col}-{j}'] = county_lookup[1][col].get(day - j, np.nan)
            else:
                features[f'{col}-{j}'] = np.nan

    mobility_features.append(features)

mobility_features = pd.DataFrame(mobility_features)

In [12]:
# Persist the mobility lag features as a tab-separated file
mobility_features_path = DATA / 'processed/mobility_time_varying_features.tsv'
mobility_features.to_csv(mobility_features_path, sep='\t', encoding='utf-8', index=None)

### Time-based features

In [13]:
# Columns whose values are dates stored as ordinals (they are decoded with dt.fromordinal below)
time_based_colnames = ['stay at home', '>50 gatherings', '>500 gatherings', 'public schools',
                       'restaurant dine-in', 'entertainment/gym', 'federal guidelines']
time_based_features = df_abridged[time_based_colnames]

# Remove counties which have not experienced any deaths or K days since first death
time_based_features = time_based_features.loc[first_death_date.index]


def _ordinal_to_datetime(value):
    """Turn a date ordinal into a datetime; a missing value (value != value) is passed through."""
    if value == value:
        return dt.fromordinal(int(value))
    return value


def _timedelta_to_days(delta):
    """Return the whole days of a time difference, or NaN when the difference is missing."""
    if delta == delta:
        return delta.days
    return np.nan


# Convert to datetime
time_based_features = time_based_features.applymap(_ordinal_to_datetime)

for col in time_based_features.columns:
    # Offset of each date from the county's first death (aligned on the countyFIPS index)
    offset_from_first_death = time_based_features[col] - first_death_date['first_death_date']

    # Convert to integer
    time_based_features[col] = offset_from_first_death.map(_timedelta_to_days)

# Write to disk
time_based_features.to_csv(
    DATA/'processed/abridged_time_based_features.tsv',
    sep='\t',
    encoding='utf-8',
)