In [1]:
import pandas as pd
import geopandas as gpd
import plotly
import shapely

import os
from pathlib import Path

# Raise the pandas display limits so that large frames are shown in full.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 1000)

In [2]:
# Names of the two coordinate columns (the same names are read from the daily
# report further down in this notebook).
GEO_INFO = [
    'Lat',
    'Long_',
]

In [3]:
# Project top directory: the parent of the current working directory.
# NOTE(review): assumes the notebook is run from a folder located directly
# below the project root - confirm.
base_dir = Path.cwd().parent

In [4]:
# Locations of the CSSE data inside the project tree, all relative to base_dir.
csse_data = base_dir.joinpath('data/external/csse_data/csse_covid_19_data')
# folder holding the daily report CSV files
daily_data = csse_data.joinpath('csse_covid_19_daily_reports/')
# folder holding the time series files
ts_data = csse_data.joinpath('csse_covid_19_time_series')

In [5]:
import glob
from datetime import datetime


def _daily_report_sort_key(filename):
    """Return a sort key that orders ``MM-DD-YYYY.csv`` names chronologically.

    Names whose stem is not a ``MM-DD-YYYY`` date are placed before all dated
    names and ordered alphabetically among themselves.
    """
    stem = os.path.splitext(filename)[0]
    try:
        report_date = datetime.strptime(stem, '%m-%d-%Y')
    except ValueError:
        return (0, datetime.min, filename)
    return (1, report_date, filename)


def get_daily_csv(path=None):
    """List the base names of the files matching a glob pattern, oldest first.

    Parameters
    ----------
    path : str, optional
        Glob pattern to search. When omitted, all ``*.csv`` files inside
        ``daily_data`` are matched; the pattern is built at call time so that
        it always reflects the current value of ``daily_data``.

    Returns
    -------
    list of str
        File names without their directory. Names of the form
        ``MM-DD-YYYY.csv`` are ordered by the date they encode, so the last
        element is the most recent report even when the files span several
        years (a plain alphabetical sort would put ``01-01-2021.csv`` before
        ``12-31-2020.csv``). An empty list is returned when nothing matches.
    """
    if path is None:
        path = str(daily_data) + '/*.csv'
    file_list = [os.path.basename(file) for file in glob.glob(path)]
    return sorted(file_list, key=_daily_report_sort_key)

### Prepare geo data from CSSE

In [6]:
def create_geo_column(lat=None, long=None):
    """Build a shapely geometry by parsing a WKT ``POINT`` string.

    The two arguments are written into the WKT text in the order they are
    given: ``lat`` first, ``long`` second. WKT reads a point as
    ``POINT (x y)``, so the first argument becomes the x coordinate regardless
    of the parameter name; the call in this notebook passes the longitude
    first for that reason.

    Parameters
    ----------
    lat : first coordinate written into the WKT string
    long : second coordinate written into the WKT string

    Returns
    -------
    The object returned by ``shapely.wkt.loads`` for that string.
    """
    coordinates = '{} {}'.format(lat, long)
    return shapely.wkt.loads('POINT (' + coordinates + ')')

# Countries that are reported per province, mapped to the one province whose
# location is used to represent the whole country.
geo_mapper = {
    'US': 'New York',
    'Canada': 'Quebec',
    'China': 'Beijing',
    'Australia': 'Australian Capital Territory',
    # 'United Kingdom', 'Netherlands', 'Denmark' and 'France' are left unmapped.
}

# get most recent daily report (the last file name returned by get_daily_csv)
daily_files = get_daily_csv()
if not daily_files:
    # fail with a clear message instead of an IndexError on the [-1] below
    raise FileNotFoundError(f'No daily report CSV files found in {daily_data}')
filepath = daily_data / daily_files[-1]

COLUMN_LIST = ['Country_Region', 'Province_State', 'Lat', 'Long_']
INDEX = ['Country_Region', 'Province_State']
df = pd.read_csv(filepath, usecols=COLUMN_LIST)

# create point wkt (longitude is passed first so it becomes the x coordinate)
# and initialize geodataframe
df['geometry'] = df.apply(lambda row: create_geo_column(row['Long_'], row['Lat']), axis=1)
df = df.drop(columns=['Long_', 'Lat'])
df = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')

# get a list of unique country names and keep the rows without a province
# (every row's country is in country_list by construction, so the former
# isin(country_list) test was always true and only the NaN check is needed)
country_list = df['Country_Region'].unique()
mask = df['Province_State'].isna()
df_clean = df[mask]

# select a specific province for each country with multiple province entries
# the province is specified in the 'geo_mapper' object
province_frames = []
for country, province in geo_mapper.items():
    mask = (df['Country_Region'] == country) & (df['Province_State'] == province)
    # merge all rows of the province into one geometry, then reduce it to a
    # single representative point
    df_temp = df[mask].dissolve('Province_State').reset_index()
    df_temp['geometry'] = df_temp['geometry'].representative_point()
    province_frames.append(df_temp)

# DataFrame.append was removed in pandas 2.0: concatenate everything in one go
# and sort / re-index once instead of on every loop iteration
df_clean = pd.concat([df_clean, *province_frames])
df_clean = df_clean.sort_values('Country_Region').reset_index(drop=True)

### Merge geo data from Natural Earth with CSSE data

In [7]:
import pandas as pd
import geopandas as gpd
import plotly
import shapely

import os
from pathlib import Path

# The pandas display options and base_dir are configured in the first cells of
# the notebook and are not repeated here.
geo_data = base_dir / 'data/external/geo_data/' / 'ne_50m_admin_0_countries.zip'

# load geodataframe; read_file has no 'crs' parameter (extra keywords are
# forwarded to the I/O engine), so the CRS is taken from the file and only set
# explicitly when the file does not declare one
gdf = gpd.read_file('zip://' + str(geo_data))
if gdf.crs is None:
    gdf = gdf.set_crs('EPSG:4326')

# subset dataframe to relevant columns
COLUMNS = ['NAME_EN', 'geometry']
gdf_clean = gdf[COLUMNS]

# perform spatial join on the column subset (left join: every row of the
# countries layer is kept, with the CSSE country name attached where a CSSE
# point matches) and perform cleaning steps
gdf_clean = gpd.sjoin(gdf_clean, df_clean, how='left')
gdf_clean = (
    gdf_clean[['Country_Region', 'NAME_EN', 'geometry']]
    .sort_values('NAME_EN')
    .rename(columns={'NAME_EN': 'name_geo'})
    .reset_index(drop=True)
)

# save cleaned dataset with geographical references to disk
outputfile = base_dir / 'data/processed/' / 'georeference.json'
# make sure the target folder exists before writing
outputfile.parent.mkdir(parents=True, exist_ok=True)
gdf_clean.to_file(outputfile, driver="GeoJSON")