In [1]:
import pandas
import lib.csse_covid_19_dataset.data_processing as data_processing
import data

# Destination of the combined CSV; both parts come from the project's `data` module.
output_dir = data.data_dir

output_filename = data.hopkins_data_filename


# Labels used as the top column level for each case category.
infected_column_name = 'Infected (confirmed)'
deceased_column_name = 'Deceased'
recovered_column_name = 'Recovered'

# Common prefix of the three CSSE time-series CSV files.
# NOTE(review): verify these URLs still resolve; the upstream repository may
# have renamed or archived the files since this was written.
_CSSE_TIME_SERIES_BASE_URL = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_time_series/'
)


def _load_cases(csv_name, top_level_name):
    """Download one CSSE time-series CSV and prepare it for concatenation.

    The raw frame is passed through
    ``data_processing.getCasesPerDayAndCountry``; afterwards every column
    label of the prepared frame is rebuilt as
    ``(top_level_name, label[0], label[1])``, i.e. ``top_level_name`` becomes
    a new outermost column level (assumes the prepared column labels are
    tuples with at least two elements).

    Args:
        csv_name: File name appended to ``_CSSE_TIME_SERIES_BASE_URL``.
        top_level_name: Label placed in the new first column level.

    Returns:
        A ``(raw, prepared)`` tuple of DataFrames.
    """
    # The former `infer_datetime_format=True, parse_dates=True` arguments are
    # omitted on purpose: `parse_dates=True` only targets the index, no
    # `index_col` is given, and `infer_datetime_format` is deprecated in
    # pandas 2.x.
    df_raw = pandas.read_csv(_CSSE_TIME_SERIES_BASE_URL + csv_name)
    df_prepared = data_processing.getCasesPerDayAndCountry(df_raw)
    df_prepared.columns = pandas.MultiIndex.from_tuples(
        [(top_level_name, label[0], label[1]) for label in df_prepared.columns]
    )
    return df_raw, df_prepared


# get confirmed data
df_csse_covid_19_dataset_confirmed, df_confirmed_prep = _load_cases(
    'time_series_19-covid-Confirmed.csv', infected_column_name)

# get deaths data
df_csse_covid_19_dataset_deaths, df_deaths_prep = _load_cases(
    'time_series_19-covid-Deaths.csv', deceased_column_name)

# get recovered data
df_csse_covid_19_dataset_recovered, df_recovered_prep = _load_cases(
    'time_series_19-covid-Recovered.csv', recovered_column_name)

# Put the three per-category frames side by side in one dataframe.
df_prep = pandas.concat([df_confirmed_prep, df_deaths_prep, df_recovered_prep], axis=1)

# Remove province data by summing it up per country.
# First move the category level from the front to the back of the column
# index: (category, country, province) -> (country, province, category),
# e.g. ('US', 'Ulster County, NY', 'Infected (confirmed)')
df_prep.columns = df_prep.columns.swaplevel(0, 1).swaplevel(1, 2)

# Remove provinces and sum up values from provinces.
# NOTE(review): the return value is discarded, so dropProvinceIndex
# presumably modifies df_prep in place -- confirm in data_processing.
data_processing.dropProvinceIndex(df_prep)
# Sum columns that share the same labels in levels 0 and 1.
# `groupby(..., axis=1)` is deprecated since pandas 2.1; grouping the
# transposed frame and transposing back is the documented equivalent and
# works on older pandas versions as well.
df_prep = df_prep.T.groupby(level=[0, 1]).sum().T

print("Loaded covid-19 data:")
print(df_prep.columns.values)
print(df_prep.index.values)

# Path is built by plain concatenation on purpose, so it stays identical to
# how other code may assemble it from the same `data` module values.
df_prep.to_csv(output_dir+output_filename)

Loaded covid-19 data:
[('Afghanistan', 'Deceased') ('Afghanistan', 'Infected (confirmed)')
 ('Afghanistan', 'Recovered') ('Albania', 'Deceased')
 ('Albania', 'Infected (confirmed)') ('Albania', 'Recovered')
 ('Algeria', 'Deceased') ('Algeria', 'Infected (confirmed)')
 ('Algeria', 'Recovered') ('Andorra', 'Deceased')
 ('Andorra', 'Infected (confirmed)') ('Andorra', 'Recovered')
 ('Antigua and Barbuda', 'Deceased')
 ('Antigua and Barbuda', 'Infected (confirmed)')
 ('Antigua and Barbuda', 'Recovered') ('Argentina', 'Deceased')
 ('Argentina', 'Infected (confirmed)') ('Argentina', 'Recovered')
 ('Armenia', 'Deceased') ('Armenia', 'Infected (confirmed)')
 ('Armenia', 'Recovered') ('Australia', 'Deceased')
 ('Australia', 'Infected (confirmed)') ('Australia', 'Recovered')
 ('Austria', 'Deceased') ('Austria', 'Infected (confirmed)')
 ('Austria', 'Recovered') ('Azerbaijan', 'Deceased')
 ('Azerbaijan', 'Infected (confirmed)') ('Azerbaijan', 'Recovered')
 ('Bahamas, The', 'Deceased') ('Bahamas, Th