### Datasets & Source

In [2]:
import pandas as pd

# Data-source switch read by the pd.read_csv cells in this section.
# If True then use local datasets with latest record date 2022-3-11
# (the CSV files under Test_Data/).
# If False then read from the updated source (the GitHub raw URLs).
TEST = True


The datasets come from two of the most trusted and well-maintained public repositories:
1. Johns Hopkins University Center for Systems Science and Engineering Dataset for Covid-19 (hereinafter referred to as the CSSEGIS dataset). GitHub link: https://github.com/CSSEGISandData/COVID-19.
> Ensheng Dong, Hongru Du, Lauren Gardner. 2020. "An interactive web-based dashboard to track COVID-19 in real time". The Lancet Infectious Diseases Correspondence, volume 20, issue 5, p533-534. DOI:10.1016/S1473-3099(20)30120-1.
2. Our World in Data Covid-19 Dataset (hereinafter referred to as the OWID dataset). GitHub link: https://github.com/owid/covid-19-data.

Both of them are updated on a daily basis.

More precisely, we will use three time series from the CSSEGIS dataset: global confirmed, global recovered, and global death cases.

In [3]:
# Confirmed cases dataframe (CSSEGIS global time series).
# TEST picks the local copy; otherwise the file is fetched from GitHub.
if TEST:
    confirmed_source = 'Test_Data/time_series_covid19_confirmed_global.csv'
else:
    confirmed_source = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'

confirmed_df = pd.read_csv(confirmed_source)
confirmed_df.head(5)


Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/2/22,3/3/22,3/4/22,3/5/22,3/6/22,3/7/22,3/8/22,3/9/22,3/10/22,3/11/22
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,174073,174214,174214,174331,174582,175000,175353,175525,175893,175974
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,271825,271825,272030,272030,272210,272250,272337,272412,272479,272552
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,265079,265130,265186,265227,265265,265297,265323,265346,265366,265391
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,38249,38342,38434,38434,38434,38620,38710,38794,38794,38794
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,98746,98746,98796,98796,98806,98806,98829,98855,98855,98855


In [4]:
# Recovered cases dataframe (CSSEGIS global time series).
# TEST picks the local copy; otherwise the file is fetched from GitHub.
if TEST:
    recovered_source = 'Test_Data/time_series_covid19_recovered_global.csv'
else:
    recovered_source = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'

recovered_df = pd.read_csv(recovered_source)
recovered_df.head(5)


Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/2/22,3/3/22,3/4/22,3/5/22,3/6/22,3/7/22,3/8/22,3/9/22,3/10/22,3/11/22
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Death cases dataframe (CSSEGIS global time series).
# TEST picks the local copy; otherwise the file is fetched from GitHub.
if TEST:
    deaths_source = 'Test_Data/time_series_covid19_deaths_global.csv'
else:
    deaths_source = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'

deaths_df = pd.read_csv(deaths_source)
deaths_df.head(5)


Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,3/2/22,3/3/22,3/4/22,3/5/22,3/6/22,3/7/22,3/8/22,3/9/22,3/10/22,3/11/22
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,7617,7619,7619,7622,7623,7626,7630,7636,7639,7640
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,3474,3474,3478,3478,3482,3483,3483,3483,3484,3485
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,6843,6848,6852,6853,6855,6857,6858,6860,6861,6861
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,151,151,151,151,151,151,152,152,152,152
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,1900,1900,1900,1900,1900,1900,1900,1900,1900,1900


We will also use two time series from the OWID dataset: vaccination and hospitalization/ICU.

In [6]:
# Vaccination dataframe (OWID).
# TEST picks the local copy; otherwise the file is fetched from GitHub.
if TEST:
    vaccines_source = 'Test_Data/vaccinations.csv'
else:
    vaccines_source = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv'

vaccines_df = pd.read_csv(vaccines_source)
vaccines_df.head(5)


Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
0,Afghanistan,AFG,2021-02-22,0.0,0.0,,,,,0.0,0.0,,,,,
1,Afghanistan,AFG,2021-02-23,,,,,,1367.0,,,,,34.0,1367.0,0.003
2,Afghanistan,AFG,2021-02-24,,,,,,1367.0,,,,,34.0,1367.0,0.003
3,Afghanistan,AFG,2021-02-25,,,,,,1367.0,,,,,34.0,1367.0,0.003
4,Afghanistan,AFG,2021-02-26,,,,,,1367.0,,,,,34.0,1367.0,0.003


In [7]:
# Hospitalization / ICU dataframe (OWID).
# TEST picks the local copy; otherwise the file is fetched from GitHub.
if TEST:
    hospitals_source = 'Test_Data/covid-hospitalizations.csv'
else:
    hospitals_source = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/hospitalizations/covid-hospitalizations.csv'

hospitals_df = pd.read_csv(hospitals_source)
hospitals_df.head(5)


Unnamed: 0,entity,iso_code,date,indicator,value
0,Algeria,DZA,2020-07-17,Daily ICU occupancy,62.0
1,Algeria,DZA,2020-07-17,Daily ICU occupancy per million,1.39
2,Algeria,DZA,2020-07-18,Daily ICU occupancy,67.0
3,Algeria,DZA,2020-07-18,Daily ICU occupancy per million,1.502
4,Algeria,DZA,2020-07-20,Daily ICU occupancy,64.0


### Preparation & Preprocessing


In [8]:
import datetime

Since vaccination data only becomes available some time after the outbreak, we explore half a year of data, from 2021-9-1 up to (but not including) 2022-3-1. Alignment is required because the two data sources use different record formats:
- Generate desired columns/rows to keep for the two sources respectively. 

In [33]:
# Analysis window. daterange() is half-open, so end_date itself (2022-03-01)
# is not generated; the last day produced is 2022-02-28.
start_date, end_date = datetime.date(2021, 9, 1), datetime.date(2022, 3, 1)

# CSSEGIS_col starts with the four descriptive columns and receives one
# 'M/D/YY' header per day; OWID_date receives one 'YYYY-MM-DD' string per day.
CSSEGIS_col, OWID_date = ['Province/State', 'Country/Region', 'Lat', 'Long'], []


# Single date generator
def daterange(start_date, end_date):
    """Yield each date from ``start_date`` up to, but excluding, ``end_date``.

    Args:
        start_date: first ``datetime.date`` to yield.
        end_date: exclusive upper bound (``datetime.date``).

    Yields:
        Consecutive ``datetime.date`` objects one day apart. Nothing is
        yielded when ``end_date`` is not after ``start_date``.
    """
    for n in range((end_date - start_date).days):
        yield start_date + datetime.timedelta(n)


for single_date in daterange(start_date, end_date):
    # CSSEGIS headers are un-padded, e.g. '9/1/21'. Build them from the date
    # fields rather than strftime('%#m/%#d/%y'): the '#' flag is
    # platform-specific (Windows), and elsewhere the result is zero-padded or
    # invalid, so no column would match.
    CSSEGIS_col.append(f'{single_date.month}/{single_date.day}/{single_date:%y}')
    OWID_date.append(single_date.strftime('%Y-%m-%d'))

CSSEGIS_col[:5], OWID_date[:5]

(['Province/State', 'Country/Region', 'Lat', 'Long', '9/1/21'],
 ['2021-09-01', '2021-09-02', '2021-09-03', '2021-09-04', '2021-09-05'])

- Keep only data within the range of interest for the two sources respectively.

In [34]:
def _keep_window_columns(frame):
    """Return ``frame`` restricted to the columns that also appear in ``CSSEGIS_col``."""
    wanted = frame.columns.intersection(CSSEGIS_col)
    return frame[wanted]


halfyear_confirmed_df = _keep_window_columns(confirmed_df)
halfyear_recovered_df = _keep_window_columns(recovered_df)
halfyear_deaths_df = _keep_window_columns(deaths_df)

# Preview the deaths frame; swap in halfyear_confirmed_df or
# halfyear_recovered_df to inspect the other two.
halfyear_deaths_df.head(5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,9/1/21,9/2/21,9/3/21,9/4/21,9/5/21,9/6/21,...,2/19/22,2/20/22,2/21/22,2/22/22,2/23/22,2/24/22,2/25/22,2/26/22,2/27/22,2/28/22
0,,Afghanistan,33.93911,67.709953,7123,7127,7127,7128,7132,7141,...,7534,7539,7549,7569,7574,7578,7579,7585,7593,7598
1,,Albania,41.1533,20.1683,2501,2505,2508,2512,2515,2519,...,3433,3438,3445,3451,3453,3457,3458,3458,3464,3469
2,,Algeria,28.0339,1.6596,5302,5339,5373,5399,5420,5445,...,6787,6797,6805,6812,6816,6820,6823,6828,6831,6835
3,,Andorra,42.5063,1.5218,130,130,130,130,130,130,...,150,150,150,151,151,151,151,151,151,151
4,,Angola,-11.2027,17.8739,1227,1235,1248,1258,1270,1282,...,1899,1899,1899,1899,1899,1899,1899,1899,1899,1900


In [49]:
# Row masks: True where the record's `date` string is one of the window dates.
vaccines_in_window = vaccines_df['date'].isin(OWID_date)
hospitals_in_window = hospitals_df['date'].isin(OWID_date)

halfyear_vaccines_df = vaccines_df.loc[vaccines_in_window]
halfyear_hospitals_df = hospitals_df.loc[hospitals_in_window]

# Preview the vaccination frame; use halfyear_hospitals_df.head(5) for the other.
halfyear_vaccines_df.head(5)


Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,daily_vaccinations_per_million,daily_people_vaccinated,daily_people_vaccinated_per_hundred
191,Afghanistan,AFG,2021-09-01,,,,,,62509.0,,,,,1569.0,456.0,0.001
192,Afghanistan,AFG,2021-09-02,,,,,,54257.0,,,,,1362.0,689.0,0.002
193,Afghanistan,AFG,2021-09-03,,,,,,46006.0,,,,,1155.0,922.0,0.002
194,Afghanistan,AFG,2021-09-04,,,,,,37754.0,,,,,948.0,1155.0,0.003
195,Afghanistan,AFG,2021-09-05,,,,,,29502.0,,,,,741.0,1388.0,0.003


- Align them with respect to countries or regions. Note that *hospitals_df* has only 47 unique countries/regions and therefore will not be aligned (it is kept for later use). 

In [110]:
# Names that occur both as an OWID `location` and as a CSSEGIS `Country/Region`.
vaccine_locations = set(halfyear_vaccines_df['location'])
cssegis_countries = set(halfyear_confirmed_df['Country/Region'])
align_countries = vaccine_locations & cssegis_countries

# CSSEGIS: keep only rows whose country is in the shared set
confirmed_is_shared = halfyear_confirmed_df['Country/Region'].isin(align_countries)
recovered_is_shared = halfyear_recovered_df['Country/Region'].isin(align_countries)
deaths_is_shared = halfyear_deaths_df['Country/Region'].isin(align_countries)

aligned_halfyear_confirmed_df = halfyear_confirmed_df.loc[confirmed_is_shared]
aligned_halfyear_recovered_df = halfyear_recovered_df.loc[recovered_is_shared]
aligned_halfyear_deaths_df = halfyear_deaths_df.loc[deaths_is_shared]

# OWID: same restriction on the vaccination records
vaccines_is_shared = halfyear_vaccines_df['location'].isin(align_countries)
aligned_halfyear_vaccines_df = halfyear_vaccines_df.loc[vaccines_is_shared]

# Check that both sides now cover exactly the same set of names
set(aligned_halfyear_confirmed_df['Country/Region']) == set(aligned_halfyear_vaccines_df['location'])

True

- Reformat the aligned vaccination data (*aligned_halfyear_vaccines_df*) with respect to date and remove null values. Use Country/Region (with Lat/Long taken from *aligned_halfyear_confirmed_df*) as rows, dates as columns, and *daily_vaccinations* as values. 
> daily_vaccinations: new doses administered per day (7-day smoothed). For countries that don't report data on a daily basis, we assume that doses changed equally on a daily basis over any periods in which no data was reported. This produces a complete series of daily figures, which is then averaged over a rolling 7-day window. --OWID Dataset

In [112]:
# Build a wide table of daily vaccinations: one row per row of
# aligned_halfyear_confirmed_df, columns Country / Lat / Long followed by one
# column per date in OWID_date.

# Number of vaccinations across the whole range of interest, per country.
# Values are placed by their actual `date` instead of being zipped positionally
# against OWID_date, so a country whose records start late or have gaps keeps
# each value under the right date; dates without a record become NaN.
# pivot() raises ValueError if a (location, date) pair occurs more than once.
vaccinations_by_country = (
    aligned_halfyear_vaccines_df
    .pivot(index='location', columns='date', values='daily_vaccinations')
    .reindex(columns=OWID_date)
    .rename_axis(columns=None)
)

# Country, Latitude, Longitude: the first CSSEGIS row of each country supplies
# its Lat/Long.
coords_by_country = (
    aligned_halfyear_confirmed_df
    .drop_duplicates(subset='Country/Region', keep='first')
    .set_index('Country/Region')[['Lat', 'Long']]
)

# NOTE(review): a country listed on several Province/State rows yields one
# identical output row per such row (unchanged from the previous loop) --
# confirm whether a single row per country is wanted downstream.
countries = aligned_halfyear_confirmed_df['Country/Region'].tolist()

# Assemble the frame once rather than growing it with pd.concat per country.
aligned_halfyear_daily_vaccines_df = pd.concat(
    [
        pd.DataFrame({'Country': countries}),
        coords_by_country.reindex(countries).reset_index(drop=True),
        vaccinations_by_country.reindex(countries).reset_index(drop=True),
    ],
    axis=1,
)

aligned_halfyear_daily_vaccines_df.head(5)


Unnamed: 0,Country,Lat,Long,2021-09-01,2021-09-02,2021-09-03,2021-09-04,2021-09-05,2021-09-06,2021-09-07,...,2022-02-19,2022-02-20,2022-02-21,2022-02-22,2022-02-23,2022-02-24,2022-02-25,2022-02-26,2022-02-27,2022-02-28
0,Afghanistan,33.93911,67.709953,62509.0,54257.0,46006.0,37754.0,29502.0,21251.0,12999.0,...,15091.0,15129.0,15509.0,15852.0,16194.0,16536.0,16879.0,17221.0,17564.0,16317.0
1,Albania,41.1533,20.1683,11371.0,11999.0,11679.0,11885.0,12091.0,11598.0,11616.0,...,3280.0,3125.0,2945.0,2775.0,2663.0,2673.0,2683.0,2693.0,2703.0,2741.0
2,Algeria,28.0339,1.6596,256927.0,256927.0,256927.0,256927.0,255753.0,254579.0,253405.0,...,14207.0,14207.0,,,,,,,,
3,Andorra,42.5063,1.5218,267.0,255.0,243.0,231.0,218.0,218.0,218.0,...,,,,,,,,,,
4,Angola,-11.2027,17.8739,34965.0,35442.0,35918.0,36395.0,36871.0,37348.0,37824.0,...,57521.0,55373.0,53225.0,51077.0,50451.0,49825.0,49199.0,48573.0,47947.0,47321.0
