In [441]:
from pathlib import Path
import sys

# Root of our data folder
DATA = Path('..') / 'data'

# Checkout of the Yu group's github repo, kept under our external data
YU_REPO = DATA / 'external' / 'covid19-severity-prediction'

# Make the Yu group's modules importable from this notebook
sys.path.append(str(YU_REPO))

# True -> load cached files, False -> load uncached files
cached = True

In [442]:
import pickle as pkl
from collections import defaultdict
from datetime import datetime as dt

import numpy as np
import pandas as pd
from nltk import ngrams

import data

In [443]:
# Load the abridged county table from the Yu group's repo (returned in wide format).
# Pass abridged=False instead to load the unabridged table.
df_abridged = data.load_county_data(
    data_dir=str(YU_REPO / 'data'),
    cached=cached,
    abridged=True,
)

loaded and merged COVID-19 cases/deaths data successfully


## Abridged Dataset

In [466]:
# countyFIPS is about to become the index, so every row must carry a distinct value
n_distinct_fips = df_abridged['countyFIPS'].nunique()
assert n_distinct_fips == df_abridged.shape[0], "Non unique values for countyFIPS"

# Index the table by county FIPS (the column itself is removed from the frame)
df_abridged.set_index('countyFIPS', inplace=True)

# Lookup: county FIPS -> {identifier column: value}
identifier_cols = ['STATEFP', 'COUNTYFP', 'CountyName', 'StateName', 'State']
countyfips2identifier = df_abridged[identifier_cols].T.to_dict()

# The identifiers now live in the lookup, so remove them from the table
# (note that 'StateName' is not part of the dropped columns and stays in the frame)
df_abridged.drop(columns=['STATEFP', 'COUNTYFP', 'CountyName', 'State'], inplace=True)

### Stationary features

In [230]:
# List of stationary features, one column name per line in the list file.
# read_text() closes the file handle for us; empty lines (e.g. produced by a
# trailing newline) are skipped because an empty name would raise a KeyError
# when used as a column label below.
stationary_feature_file = DATA / 'interim/abridged_stationary_feature_list.txt'
stationary_features = [name for name in stationary_feature_file.read_text().split('\n') if name]

# Subset dataframe and save to disk (the countyFIPS index is written as the first column)
df_abridged[stationary_features].to_csv(DATA/'processed/abridged_stationary_features.tsv', sep='\t', encoding='utf-8')

# Remove the saved columns so that only the remaining features are left in the frame
df_abridged.drop(columns=stationary_features, inplace=True)

### Time varying features

#### Cases and Deaths

In [503]:
# Builds one record per (county, day) holding the death and case counts for that
# day and the K preceding days, on a time scale counted from the county's first
# death. Counties with no deaths, or with fewer than K+1 observations since the
# first death, are skipped. Also records each retained county's first-death date.

# One dict per (county, day) record; turned into a DataFrame when saving
time_varying_features = list()

# County FIPS -> date of the first death (only for counties that are kept)
first_death_by_county = dict()

# How far back to look: each record carries lags 0..K
K = 5

# Names of the per-date death / case columns, in frame column order
all_columns = df_abridged.columns
deathcols = all_columns.values[np.flatnonzero(all_columns.str.startswith('#Deaths'))]
casecols = all_columns.values[np.flatnonzero(all_columns.str.startswith('#Case'))]

for county, row in df_abridged.iterrows():
    county_deaths = row['deaths']

    # Skip counties which have not had any deaths
    if county_deaths.max() == 0:
        continue

    # Position of the first day with a positive death count
    first_death_day = np.flatnonzero(np.array(county_deaths) > 0)[0]

    # Skip counties which have not had K days since first death
    if len(county_deaths[first_death_day:]) < (K + 1):
        continue

    # Per-date values from the day of the first death onwards
    # NOTE(review): assumes the '#Deaths'/'#Case' columns line up position-by-position
    # with the entries of row['deaths'] — confirm against the data loader.
    datewise_deaths = row[deathcols[first_death_day:]]
    datewise_cases = row[casecols[first_death_day:]]

    # Sliding windows of K+1 consecutive days; the last element of a window is lag 0
    windows = zip(ngrams(datewise_deaths, K + 1), ngrams(datewise_cases, K + 1))
    for offset, (lag_deaths, lag_cases) in enumerate(windows):
        features = {'countyFIPS': county}
        for lag in range(K + 1):
            features[f'deaths-{lag}'] = lag_deaths[K - lag]
            features[f'cases-{lag}'] = lag_cases[K - lag]
            if lag == 0:
                # Set once, right after the lag-0 values, so the column order of the
                # saved file stays: countyFIPS, deaths-0, cases-0, days_since_first_death, ...
                features['days_since_first_death'] = K + offset

        time_varying_features.append(features)

    # The date is parsed from the column name after skipping its first 8 characters
    # (presumably the '#Deaths_' prefix — TODO confirm)
    first_death_by_county[county] = dt.strptime(deathcols[first_death_day][8:], '%m-%d-%Y')

# Single-column frame indexed by countyFIPS; the time-based feature cell reads it
first_death_date = pd.DataFrame(
    list(first_death_by_county.items()), columns=['countyFIPS', 'first_death_date']
).set_index('countyFIPS')

# Save to disk (no index column: countyFIPS is already a regular column)
pd.DataFrame(time_varying_features).to_csv(DATA/'processed/abridged_time_varying_features.tsv', sep='\t',
                                           encoding='utf-8', index=False)

### Time based features

In [530]:
# Expresses each intervention date as a number of days relative to the county's
# first death and writes the result to disk.

# Columns holding intervention dates.
# NOTE(review): the values are fed to datetime.fromordinal, so they are presumably
# proleptic Gregorian ordinals — confirm against the data source.
policy_cols = ['stay at home', '>50 gatherings', '>500 gatherings', 'public schools',
               'restaurant dine-in', 'entertainment/gym', 'federal guidelines']

# Keep only the counties that have a recorded first-death date, i.e. drop counties
# which have not experienced any deaths or K days since first death.
# .copy() makes the column assignments below act on an independent frame.
time_based_features = df_abridged.loc[first_death_date.index, policy_cols].copy()


def _ordinal_to_datetime(value):
    """Turn a date ordinal into a datetime; missing values (NaN) become NaT."""
    # value == value is False only for NaN
    return dt.fromordinal(int(value)) if value == value else pd.NaT


for col in policy_cols:
    # Explicit datetime64 conversion keeps missing entries as NaT, so the
    # subtraction below is well defined regardless of how pandas infers dtypes
    effective_date = pd.to_datetime(time_based_features[col].map(_ordinal_to_datetime))

    # Days between the intervention and the first death (negative when the
    # intervention came first); NaT yields NaN
    time_based_features[col] = (effective_date - first_death_date['first_death_date']).dt.days

# Write to disk (the countyFIPS index is written as the first column)
time_based_features.to_csv(DATA/'processed/abridged_time_based_features.tsv', sep='\t',
                           encoding='utf-8')