# COVID-19 Data Processing: Daily Reports

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [12]:
from datetime import datetime
from glob import glob
import numpy as np
import pandas as pd

In [28]:
out_dir = 'data'
data_dir = 'covid19'

daily_reports_paths = glob(f'{out_dir}/{data_dir}/csse_covid_19_daily_reports/*.csv')
daily_reports_paths.sort(key=lambda x: datetime.strptime(x.split('/')[-1].split('.')[0], '%m-%d-%Y'))

In [144]:
regions = None
cases = []
deaths = []
recoveries = []

columns_by_count = {
    6: ['state', 'country', 'last_update', 'cases', 'deaths', 'recoveries'],
    8: ['state', 'country', 'last_update', 'cases', 'deaths', 'recoveries', 'lat', 'long'],
    12: ['fips', 'county', 'state', 'country', 'last_update', 'lat', 'long', 'cases', 'deaths', 'recoveries', 'active', 'key']
}

us_state_abbr_to_full = {
    'U.S.': None,
    'AL': 'Alabam',
    'AK': 'Alaska',
    'AZ': 'Arizon',
    'AR': 'Arkansas',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticu',
    'DE': 'Delaware',
    'FL': 'Florid',
    'GA': 'Georgi',
    'HI': 'Hawaii',
    'ID': 'Idah',
    'IL': 'Illinois',
    'IN': 'Indian',
    'IA': 'Iowa',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisian',
    'ME': 'Main',
    'MD': 'Maryland',
    'MA': 'Massachusett',
    'MI': 'Michigan',
    'MN': 'Minnesot',
    'MS': 'Mississipp',
    'MO': 'Missouri',
    'MT': 'Montan',
    'NE': 'Nebraska',
    'NV': 'Nevada',
    'NH': 'New Hampshir',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NY': 'New York',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennesse',
    'TX': 'Texa',
    'UT': 'Utah',
    'VT': 'Vermon',
    'VA': 'Virginia',
    'WA': 'Washington',
    'WV': 'West Virgini',
    'WI': 'Wisconsi',
    'WY': 'Wyomin',
    'DC': 'District of Columbia',
    'MH': 'Marshall Islands',
}

def fix_us_state_names(state):
    county, state = state.split(', ')
    full_state = us_state_abbr_to_full[state] if state in us_state_abbr_to_full else None
    
    if full_state is not None:
        return county + ', ' + full_state
    
    return county

for daily_reports_path in reversed(daily_reports_paths):
    report = pd.read_table(daily_reports_path, sep=',')
    num_cols = len(report.columns)
    
    report.columns = columns_by_count[num_cols]
    
    date = datetime.strptime(daily_reports_path.split('/')[-1].split('.')[0], '%m-%d-%Y').strftime('%Y-%m-%d')
    
    if regions is None:
        regions = report[['key', 'country', 'state', 'county', 'lat', 'long']]
    
    if num_cols == 12:
        report = report[report.country.str.lower().str.contains('diamond princess') == False]
        report = report[report.state.str.lower().str.contains('diamond princess') == False]
        report = report[report.state.str.lower().str.contains('recovered') == False]
        
        report['country'] = report['country'].replace({
            'Taiwan*': 'Taiwan'
        })
        
        daily_cases = report[['key', 'cases']]
        daily_deaths = report[['key', 'deaths']]
        daily_recoveries = report[['key', 'recoveries']]
        
    else:
        report = report[report.country.str.lower().str.contains('diamond princess') == False]
        report = report[report.state.str.lower().str.contains('diamond princess') == False]
        report = report[report.state.str.lower().str.contains('recovered') == False]
        
        report['country'] = report['country'].replace({
            'Mainland China': 'China',
            'Taiwan*': 'Taiwan'
        })
        report['state'] = report['state'].replace({
            'Taiwan': None
        })
        
        is_us = report['country'] == 'US'
        has_county = report['state'].str.contains(',')
        report.loc[is_us & has_county, 'state'] = report.loc[is_us & has_county, 'state'].apply(lambda r: fix_us_state_names(r))
        
        daily_cases = report[['state', 'country', 'cases']].copy()
        daily_cases['key'] = ''
        daily_cases.loc[daily_cases['state'].notnull(), 'key'] = daily_cases['state'] + ', ' + daily_cases['country']
        daily_cases.loc[daily_cases['state'].isnull(), 'key'] = daily_cases['country']
        daily_cases = daily_cases[['key', 'cases']]
        
        daily_deaths = report[['state', 'country', 'deaths']].copy()
        daily_deaths['key'] = ''
        daily_deaths.loc[daily_deaths['state'].notnull(), 'key'] = daily_deaths['state'] + ', ' + daily_deaths['country']
        daily_deaths.loc[daily_deaths['state'].isnull(), 'key'] = daily_deaths['country']
        daily_deaths = daily_deaths[['key', 'deaths']]
        
        daily_recoveries = report[['state', 'country', 'recoveries']].copy()
        daily_recoveries['key'] = ''
        daily_recoveries.loc[daily_recoveries['state'].notnull(), 'key'] = daily_recoveries['state'] + ', ' + daily_recoveries['country']
        daily_recoveries.loc[daily_recoveries['state'].isnull(), 'key'] = daily_recoveries['country']
        daily_recoveries = daily_recoveries[['key', 'recoveries']]
        
    daily_cases.columns = ['key', date]
    daily_deaths.columns = ['key', date]
    daily_recoveries.columns = ['key', date]
    
    daily_cases = daily_cases.set_index('key')
    daily_deaths = daily_deaths.set_index('key')
    daily_recoveries = daily_recoveries.set_index('key')
    
    cases.append(daily_cases)
    deaths.append(daily_deaths)
    recoveries.append(daily_recoveries)

In [338]:
regions = None
cases_by_region = {}

columns_by_count = {
    6: ['state', 'country', 'last_update', 'cases', 'deaths', 'recoveries'],
    8: ['state', 'country', 'last_update', 'cases', 'deaths', 'recoveries', 'lat', 'long'],
    12: ['fips', 'county', 'state', 'country', 'last_update', 'lat', 'long', 'cases', 'deaths', 'recoveries', 'active', 'key']
}

us_state_abbr_to_full = {
    'u.s.': None,
    'al': 'alabama',
    'ak': 'alaska',
    'az': 'arizona',
    'ar': 'arkansas',
    'ca': 'california',
    'co': 'colorado',
    'ct': 'connecticut',
    'de': 'delaware',
    'fl': 'florida',
    'ga': 'georgia',
    'hi': 'hawaii',
    'id': 'idaho',
    'il': 'illinois',
    'in': 'indiana',
    'ia': 'iowa',
    'ks': 'kansas',
    'ky': 'kentucky',
    'la': 'louisiana',
    'me': 'maine',
    'md': 'maryland',
    'ma': 'massachusetts',
    'mi': 'michigan',
    'mn': 'minnesota',
    'ms': 'mississippi',
    'mo': 'missouri',
    'mt': 'montana',
    'ne': 'nebraska',
    'nv': 'nevada',
    'nh': 'new hampshire',
    'nj': 'new jersey',
    'nm': 'new mexico',
    'ny': 'new york',
    'nc': 'north carolina',
    'nd': 'north dakota',
    'oh': 'ohio',
    'ok': 'oklahoma',
    'or': 'oregon',
    'pa': 'pennsylvania',
    'ri': 'rhode island',
    'sc': 'south carolina',
    'sd': 'south dakota',
    'tn': 'tennessee',
    'tx': 'texas',
    'ut': 'utah',
    'vt': 'vermond',
    'va': 'virginia',
    'wa': 'washington',
    'wv': 'west virginia',
    'wi': 'wisconsin',
    'wy': 'wyoming',
    'dc': 'district of columbia',
    'mh': 'marshall islands',
}

def to_us_state(state):
    county, state = state.split(', ')
    state = state.replace('.', '').strip()
    full_state = us_state_abbr_to_full[state] if state in us_state_abbr_to_full else None
    
    if full_state is not None:
        return full_state
    
    return county.strip()

ca_state_abbr_to_full = {
    'nl': 'newfoundland and labrador',
    'pe': 'prince edward island',
    'ns': 'nova scotia',
    'nb': 'new brunswick',
    'qc': 'quebec',
    'on': 'ontario',
    'mb': 'manitoba',
    'sk': 'saskatchewan',
    'ab': 'alberta',
    'bc': 'british columbia',
    'yt': 'yukon',
    'nt': 'northwest territories',
    'nu': 'nunavut',
}

def to_ca_state(state):
    county, state = state.split(', ')
    state = state.replace('.', '').strip()
    full_state = ca_state_abbr_to_full[state] if state in ca_state_abbr_to_full else None
    
    if full_state is not None:
        return full_state
    
    return county.strip()

def to_county(state):
    return state.split(',')[0].strip().replace('county', '').strip()

def nan_to_num(num):
    return int(np.nan_to_num(num))

num_days = len(daily_reports_paths)

for i, daily_reports_path in enumerate(reversed(daily_reports_paths)):
    report = pd.read_table(daily_reports_path, sep=',')
    num_cols = len(report.columns)
    
    report.columns = columns_by_count[num_cols]
    
    date = datetime.strptime(daily_reports_path.split('/')[-1].split('.')[0], '%m-%d-%Y').strftime('%Y-%m-%d')
    
    report['country'] = report['country'].replace({
        'Mainland China': 'China',
        'Taiwan*': 'Taiwan',
        'Hong Kong SAR': 'Hong Kong',
        'Macao SAR': 'Macau',
        'Taipei and environs': 'Taiwan',
        'UK': 'United Kingdom',
        'Bahamas, The': 'Bahamas',
        'Gambia, The': 'Gambia',
        'The Bahamas': 'Bahamas',
        'The Gambia': 'Gambia',
        'Republic of Korea': 'South Korea',
        'Korea, South': 'South Korea',
        'Czechia': 'Czech Republic',
        'Russian Federation': 'Russia',
        'Iran (Islamic Republic of)': 'Iran',
        'Viet Nam': 'Vietnam',
        'Republic of Moldova': 'Moldova',
        'Occupied Palestinian territory': 'Palestine',
        'Republic of Ireland': 'Ireland'
    })
                
    forgotten_countries = {
        'ivory coast': True,
        'kosovo': True,
        'republic of the congo': True,
        'east timor': True,
        'cape verde': True,
        'palestine': True
    }
    
    if regions is None:
        regions = report[['key', 'country', 'state', 'county', 'lat', 'long']].copy()
        
        regions = regions[regions.country.isna() | (regions.country.str.lower().str.contains('princess') == False)]
        regions = regions[regions.country.isna() | (regions.country.str.lower().str.contains('cruise') == False)]
        regions = regions[regions.country.isna() | (regions.country.str.lower().str.contains('ship') == False)]
        regions = regions[regions.country.isna() | (regions.country.str.lower().str.contains('others') == False)]
        regions = regions[regions.state.isna() | (regions.state.str.lower().str.contains('princess') == False)]
        regions = regions[regions.state.isna() | (regions.state.str.lower().str.contains('recovered') == False)]
        regions = regions[regions.state.isna() | (regions.state.str.lower().str.contains('cruise') == False)]
        regions = regions[regions.state.isna() | (regions.state.str.lower().str.contains('ship') == False)]
        regions = regions[regions.state.isna() | (regions.state.str.lower().str.contains('wuhan') == False)]
        
        for j, row in regions.iterrows():
            country = row['country'].strip().lower() if row['country'] and isinstance(row['country'], str) else None
            state = row['state'].strip().lower() if row['state'] and isinstance(row['state'], str) else None
            county = row['county'].strip().lower() if row['county'] and isinstance(row['county'], str) else None
            
            if country is not None and country not in cases_by_region:
                cases_by_region[country] = {
                    'cases': [0] * num_days,
                    'deaths': [0] * num_days,
                    'recoveries': [0] * num_days
                }
                
            if state is not None and state not in cases_by_region[country]:
                cases_by_region[country][state] = {
                    'cases': [0] * num_days,
                    'deaths': [0] * num_days,
                    'recoveries': [0] * num_days
                }

            if county is not None and county not in cases_by_region[country][state]:
                cases_by_region[country][state][county] = {
                    'cases': [0] * num_days,
                    'deaths': [0] * num_days,
                    'recoveries': [0] * num_days
                }
    
    if num_cols == 12:
        report = report[report.country.isna() | (report.country.str.lower().str.contains('princess') == False)]
        report = report[report.country.isna() | (report.country.str.lower().str.contains('cruise') == False)]
        report = report[report.country.isna() | (report.country.str.lower().str.contains('ship') == False)]
        report = report[report.country.isna() | (report.country.str.lower().str.contains('others') == False)]
        report = report[report.state.isna() | (report.state.str.lower().str.contains('princess') == False)]
        report = report[report.state.isna() | (report.state.str.lower().str.contains('recovered') == False)]
        report = report[report.state.isna() | (report.state.str.lower().str.contains('cruise') == False)]
        report = report[report.state.isna() | (report.state.str.lower().str.contains('ship') == False)]
        report = report[report.state.isna() | (report.state.str.lower().str.contains('wuhan') == False)]
        report = report[(report.long != 0.0) | (report.lat != 0.0)]
        
        report['country'] = report['country'].replace({
            'Taiwan*': 'Taiwan'
        })
        
        report['county'] = report['county'].apply(lambda r: r.replace('County', '').strip() if isinstance(r, str) else None)
        
        report['country'] = report['country'].str.lower()
        report['state'] = report['state'].str.lower()
        report['county'] = report['county'].str.lower()
                            
        for j, row in report.iterrows():
            country = row['country'].strip().lower() if row['country'] and isinstance(row['country'], str) else None
            state = row['state'].strip().lower() if row['state'] and isinstance(row['state'], str) else None
            county = row['county'].strip().lower() if row['county'] and isinstance(row['county'], str) else None
    
            if county is not None and state is not None and country is not None:
                try:
                    cases_by_region[country][state][county]['cases'][i] = nan_to_num(row['cases'])
                    cases_by_region[country][state][county]['deaths'][i] = nan_to_num(row['deaths'])
                    cases_by_region[country][state][county]['recoveries'][i] = nan_to_num(row['recoveries'])
                except KeyError:
                    print(country, state, county)

            if state is not None and country is not None:
                try:
                    cases_by_region[country][state]['cases'][i] += nan_to_num(row['cases'])
                    cases_by_region[country][state]['deaths'][i] += nan_to_num(row['deaths'])
                    cases_by_region[country][state]['recoveries'][i] += nan_to_num(row['recoveries'])
                except KeyError:
                    print(country, state, county)
                
            if country is not None:
                cases_by_region[country]['cases'][i] += nan_to_num(row['cases'])
                cases_by_region[country]['deaths'][i] += nan_to_num(row['deaths'])
                cases_by_region[country]['recoveries'][i] += nan_to_num(row['recoveries'])
        
    else:
        report = report[report.country.isna() | (report.country.str.lower().str.contains('princess') == False)]
        report = report[report.country.isna() | (report.country.str.lower().str.contains('cruise') == False)]
        report = report[report.country.isna() | (report.country.str.lower().str.contains('ship') == False)]
        report = report[report.country.isna() | (report.country.str.lower().str.contains('others') == False)]
        report = report[report.state.isna() | (report.state.str.lower().str.contains('princess') == False)]
        report = report[report.state.isna() | (report.state.str.lower().str.contains('recovered') == False)]
        report = report[report.state.isna() | (report.state.str.lower().str.contains('cruise') == False)]
        report = report[report.state.isna() | (report.state.str.lower().str.contains('ship') == False)]
        report = report[report.state.isna() | (report.state.str.lower().str.contains('wuhan') == False)]
        
        if num_cols == 8:
            report = report[(report.long != 0.0) | (report.lat != 0.0)]
        
        report.loc[report['country'] == 'Hong Kong', 'state'] = 'Hong Kong'
        report.loc[report['country'] == 'Hong Kong', 'country'] = 'China'
        report.loc[report['country'] == 'Macau', 'state'] = 'Macau'
        report.loc[report['country'] == 'Macau', 'country'] = 'China'
        
        report.loc[report['country'] == 'Greenland', 'state'] = 'Greenland'
        report.loc[report['country'] == 'Greenland', 'country'] = 'Denmark'
        report.loc[report['country'] == 'Faroe Islands', 'state'] = 'Faroe Islands'
        report.loc[report['country'] == 'Faroe Islands', 'country'] = 'Denmark'
        
        report.loc[report['country'] == 'Aruba', 'state'] = 'Aruba'
        report.loc[report['country'] == 'Aruba', 'country'] = 'Netherlands'
        report.loc[report['country'] == 'Curacao', 'state'] = 'Curacao'
        report.loc[report['country'] == 'Curacao', 'country'] = 'Netherlands'
        
        report.loc[report['country'] == 'Cayman Islands', 'state'] = 'Cayman Islands'
        report.loc[report['country'] == 'Cayman Islands', 'country'] = 'United Kingdom'
        report.loc[report['country'] == 'Channel Islands', 'state'] = 'Channel Islands'
        report.loc[report['country'] == 'Channel Islands', 'country'] = 'United Kingdom'
        report.loc[report['country'] == 'Gibraltar', 'state'] = 'Gibraltar'
        report.loc[report['country'] == 'Gibraltar', 'country'] = 'United Kingdom'
        
        report.loc[report['country'] == 'French Guiana', 'state'] = 'French Guiana'
        report.loc[report['country'] == 'French Guiana', 'country'] = 'France'
        report.loc[report['country'] == 'French Polynesia', 'state'] = 'French Polynesia'
        report.loc[report['country'] == 'French Polynesia', 'country'] = 'France'
        report.loc[report['country'] == 'Martinique', 'state'] = 'Martinique'
        report.loc[report['country'] == 'Martinique', 'country'] = 'France'
        report.loc[report['country'] == 'Reunion', 'state'] = 'Reunion'
        report.loc[report['country'] == 'Reunion', 'country'] = 'France'
        report.loc[report['country'] == 'Mayotte', 'state'] = 'Mayotte'
        report.loc[report['country'] == 'Mayotte', 'country'] = 'France'
        report.loc[report['country'] == 'Guadeloupe', 'state'] = 'Guadeloupe'
        report.loc[report['country'] == 'Guadeloupe', 'country'] = 'France'
        report.loc[report['country'] == 'Saint Barthelemy', 'state'] = 'Saint Barthelemy'
        report.loc[report['country'] == 'Saint Barthelemy', 'country'] = 'France'
        
        report.loc[report['country'] == 'Puerto Rico', 'state'] = 'Puerto Rico'
        report.loc[report['country'] == 'Puerto Rico', 'country'] = 'US'
        report.loc[report['country'] == 'Guam', 'state'] = 'Guam'
        report.loc[report['country'] == 'Guam', 'country'] = 'US'
        
        report.loc[report['state'] == report['country'], 'state'] = None
        report['state'] = report['state'].replace({
            'United States Virgin Islands': 'Virgin Islands',
            'Virgin Islands, U.S.': 'Virgin Islands',
            'Fench Guiana': 'French Guiana',
            'UK': None,
            'None': None
        })
        
        report['country'] = report['country'].str.lower()
        report['state'] = report['state'].str.lower()        
        report['county'] = None
        
        is_us = report['country'] == 'us'
        is_canada = report['country'] == 'canada'
        has_county = report['state'].str.contains(',')
        report.loc[is_us & has_county, 'county'] = report.loc[is_us & has_county, 'state'].apply(lambda r: to_county(r))
        report.loc[is_us & has_county, 'state'] = report.loc[is_us & has_county, 'state'].apply(lambda r: to_us_state(r))
        report.loc[is_canada & has_county, 'county'] = report.loc[is_canada & has_county, 'state'].apply(lambda r: to_county(r))
        report.loc[is_canada & has_county, 'state'] = report.loc[is_canada & has_county, 'state'].apply(lambda r: to_ca_state(r))
        
        report.loc[report['country'] == 'germany', 'state'] = None
        
        report.loc[report['state'] == report['county'], 'county'] = None
                            
        for j, row in report.iterrows():
            country = row['country'].strip() if row['country'] and isinstance(row['country'], str) else None
            state = row['state'].strip() if row['state'] and isinstance(row['state'], str) else None
            county = row['county'].strip() if row['county'] and isinstance(row['county'], str) else None
    
            if county is not None and state is not None:
                try:
                    cases_by_region[country][state][county]['cases'][i] = nan_to_num(row['cases'])
                    cases_by_region[country][state][county]['deaths'][i] = nan_to_num(row['deaths'])
                    cases_by_region[country][state][county]['recoveries'][i] = nan_to_num(row['recoveries'])
                except KeyError:
                    print(f'  ? Missing County: "{country}", "{state}", "{county}"', date)
    
            if state is not None:
                try:
                    cases_by_region[country][state]['cases'][i] += nan_to_num(row['cases'])
                    cases_by_region[country][state]['deaths'][i] += nan_to_num(row['deaths'])
                    cases_by_region[country][state]['recoveries'][i] += nan_to_num(row['recoveries'])
                except KeyError:
                    print(f' !! Missing State:  "{country}", "{state}"', date)
    
            
            try:
                cases_by_region[country]['cases'][i] += nan_to_num(row['cases'])
                cases_by_region[country]['deaths'][i] += nan_to_num(row['deaths'])
                cases_by_region[country]['recoveries'][i] += nan_to_num(row['recoveries'])
            except KeyError:
                if country in forgotten_countries:
                    cases_by_region[country] = {
                        'cases': [0] * num_days,
                        'deaths': [0] * num_days,
                        'recoveries': [0] * num_days
                    }
                    cases_by_region[country]['cases'][i] += nan_to_num(row['cases'])
                    cases_by_region[country]['deaths'][i] += nan_to_num(row['deaths'])
                    cases_by_region[country]['recoveries'][i] += nan_to_num(row['recoveries'])
                else:
                    print(f'!!! Missing Country: "{country}"', date)

us massachusetts brockton
us tennessee out-of-state
us tennessee unknown
us minnesota leseur
us new hampshire nashua
us alaska soldotna
us alaska sterling
canada northwest territories None
australia external territories None
australia jervis bay territory None
 !! Missing State:  "canada", "northwest territories" 2020-03-21
!!! Missing Country: "guernsey" 2020-03-21
!!! Missing Country: "jersey" 2020-03-21
!!! Missing Country: "guernsey" 2020-03-20
!!! Missing Country: "jersey" 2020-03-20
!!! Missing Country: "guernsey" 2020-03-19
!!! Missing Country: "jersey" 2020-03-19
!!! Missing Country: "guernsey" 2020-03-18
!!! Missing Country: "jersey" 2020-03-18
!!! Missing Country: "guernsey" 2020-03-17
!!! Missing Country: "jersey" 2020-03-17
!!! Missing Country: "occupied palestinian territory" 2020-03-17
!!! Missing Country: "jersey" 2020-03-16
!!! Missing Country: "guernsey" 2020-03-16
!!! Missing Country: "occupied palestinian territory" 2020-03-16
!!! Missing Country: "jersey" 2020-03-15

In [335]:
# Reverse time series

from copy import deepcopy

cases_by_region2 = deepcopy(cases_by_region)

exclude = {
    'cases': True,
    'deaths': True,
    'recoveries': True
}

def add_missing_values(values):
    values = list(reversed(values))
    
    last_v = values[0]
    for i, v in enumerate(values):
        if i == 0:
            continue
        
        compare_v = values[i-1] if values[i-1] is not None else last_v
        if compare_v > v:
            values[i] = None
        else:
            last_v = v
    return values

for c in cases_by_region2.keys():
    country = cases_by_region2[c]
    country['cases'] = add_missing_values(country['cases'])
    country['deaths'] = add_missing_values(country['deaths'])
    country['recoveries'] = add_missing_values(country['recoveries'])
    
    for s in country.keys():
        if s in exclude:
            continue
        
        state = country[s]
        state['cases'] = add_missing_values(state['cases'])
        state['deaths'] = add_missing_values(state['deaths'])
        state['recoveries'] = add_missing_values(state['recoveries'])
    
        for y in state.keys():
            if y in exclude:
                continue

            county = state[y]
            county['cases'] = add_missing_values(county['cases'])
            county['deaths'] = add_missing_values(county['deaths'])
            county['recoveries'] = add_missing_values(county['recoveries'])

In [336]:
import json

data = {
    'startDate': datetime.strptime(daily_reports_paths[0].split('/')[-1].split('.')[0], '%m-%d-%Y').strftime('%Y-%m-%d'),
    'endDate': datetime.strptime(daily_reports_paths[-1].split('/')[-1].split('.')[0], '%m-%d-%Y').strftime('%Y-%m-%d'),
    'data': cases_by_region2
}

with open('data/covid-19.json', 'w') as f:
    json.dump(data, f)

In [341]:
regions_out = regions.copy()

regions_out['key'] = regions_out['key'].str.lower()
regions_out['country'] = regions_out['country'].str.lower()
regions_out['state'] = regions_out['state'].str.lower()
regions_out['county'] = regions_out['county'].str.lower()

regions_out = regions_out.set_index('key')

regions_out.to_json('data/covid-19-regions.json', orient='index')

In [345]:
regions_out.loc['california, us']

KeyError: 'california, us'

In [138]:
combined_cases = pd.concat(list(reversed(cases)), axis=1, sort=False, join='outer')
combined_deaths = pd.concat(list(reversed(deaths)), axis=1, sort=False, join='outer')
combined_recoveries = pd.concat(list(reversed(recoveries)), axis=1, sort=False, join='outer')

combined_cases.head()

ValueError: Shape of passed values is (3479, 63), indices imply (3477, 63)

In [48]:
regions.set_index('key')
regions.head()

Unnamed: 0_level_0,country,state,county,lat,long
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Abbeville, South Carolina, US",US,South Carolina,Abbeville,34.223334,-82.461707
"Acadia, Louisiana, US",US,Louisiana,Acadia,30.295065,-92.414197
"Accomack, Virginia, US",US,Virginia,Accomack,37.767072,-75.632346
"Ada, Idaho, US",US,Idaho,Ada,43.452658,-116.241552
"Adair, Iowa, US",US,Iowa,Adair,41.330756,-94.471059
