In [1]:
import numpy as np
import pandas as pd
import sqlite3

In [2]:
class Covid19:
    """Download and tidy the JHU CSSE COVID-19 CSV files into pandas DataFrames.

    Three views are offered: the UID/ISO/FIPS lookup table, a single daily
    report, and a per-country time series of cumulative and daily counts.
    Every method fetches its CSV files over HTTP each time it is called.
    """

    # Root of the csse_covid_19_data directory; every CSV used below lives under it.
    _BASE_URL = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data"

    def get_lookup_table(self):
        """Return the lookup table with Admin2 / Province_State derived from Combined_Key.

        Asterisks are stripped from ``Country_Region`` and ``Combined_Key`` and
        ``Population`` is converted to the nullable ``Int64`` dtype.
        ``Combined_Key`` is split on ``', '``:

        * 1 part  -> no province, no county
        * 2 parts -> first part is the province/state
        * 3 parts -> first part is the county, second the province/state

        Keys with any other shape (or missing keys) get NaN in both derived
        columns instead of breaking the column assignment.

        Returns:
            pandas.DataFrame with columns UID, Combined_Key, iso2, iso3,
            Country_Region, Province_State, Admin2, Lat, Long_, Population.
        """
        lookup = pd.read_csv("{}/UID_ISO_FIPS_LookUp_Table.csv".format(self._BASE_URL))
        for column in ('Country_Region', 'Combined_Key'):
            lookup[column] = lookup[column].str.replace('*', '', regex=False)
        lookup['Population'] = lookup['Population'].astype('Int64')

        key_parts = lookup['Combined_Key'].str.split(', ')
        n_parts = key_parts.str.len()
        # The county is only present in three-part keys.
        lookup['Admin2'] = key_parts.str[0].where(n_parts == 3)
        # The province is the second-to-last part of two- and three-part keys.
        lookup['Province_State'] = key_parts.str[-2].where(n_parts.isin([2, 3]))

        return lookup[['UID', 'Combined_Key',
                       'iso2', 'iso3',
                       'Country_Region', 'Province_State', 'Admin2',
                       'Lat', 'Long_', 'Population']]

    def get_daily_report(self, report_date):
        """Return the daily report for ``report_date`` (``mm-dd-yyyy``).

        The date is remembered on the instance (even when loading fails) and is
        used by ``get_time_series`` as its default cutoff.

        Args:
            report_date: Report date string in ``mm-dd-yyyy`` format.

        Returns:
            pandas.DataFrame with columns Combined_Key, Last_Update, Confirmed
            and Deaths, or ``None`` (after printing a message) when the report
            cannot be downloaded or lacks the expected columns.
        """
        self._report_date = report_date
        try:
            report = pd.read_csv(
                "{}/csse_covid_19_daily_reports/{}.csv".format(self._BASE_URL, report_date)
            )
            report['Combined_Key'] = report['Combined_Key'].str.replace('*', '', regex=False)
            return report[['Combined_Key', 'Last_Update', 'Confirmed', 'Deaths']]
        except Exception:
            # Best effort by design: report the problem and hand back None, but
            # let KeyboardInterrupt / SystemExit propagate.
            print("Wrong format or unavailable report date: {}.".format(report_date))
            print("Expecting mm-dd-yyyy format.")
            return None

    def get_time_series(self, report_date=None):
        """Return per-country cumulative and daily confirmed cases and deaths.

        Args:
            report_date: Optional last date to include (anything
                ``pandas.to_datetime`` accepts). Defaults to the date most
                recently passed to ``get_daily_report``; when neither is
                available the whole series is returned.

        Returns:
            pandas.DataFrame sorted by date then country with columns Date
            (``YYYY-MM-DD`` string), Country_Region, Confirmed, Deaths,
            Daily_Cases and Daily_Deaths.
        """
        if report_date is None:
            report_date = getattr(self, '_report_date', None)
        cutoff = None if report_date is None else pd.to_datetime(report_date)

        confirmed = self._load_country_totals('time_series_covid19_confirmed_global.csv', 'Confirmed', cutoff)
        deaths = self._load_country_totals('time_series_covid19_deaths_global.csv', 'Deaths', cutoff)

        # Pair the two measures on their keys rather than on row position, so a
        # difference in row order between the two files cannot mix up countries.
        time_series = confirmed.merge(deaths, on=['Date', 'Country/Region'], how='left')
        time_series['Date'] = time_series['Date'].dt.strftime('%Y-%m-%d')
        time_series = time_series.rename(columns={'Country/Region': 'Country_Region'})

        # Daily figures are the difference to the previous date of the same
        # country; the first date of each country is compared against 0.
        by_country = time_series.groupby('Country_Region')
        previous_confirmed = by_country['Confirmed'].shift(1, fill_value=0)
        previous_deaths = by_country['Deaths'].shift(1, fill_value=0)
        time_series['Daily_Cases'] = time_series['Confirmed'] - previous_confirmed
        time_series['Daily_Deaths'] = time_series['Deaths'] - previous_deaths
        return time_series

    def _load_country_totals(self, file_name, value_name, cutoff):
        """Load one global time-series CSV and sum ``value_name`` per date and country."""
        wide = pd.read_csv("{}/csse_covid_19_time_series/{}".format(self._BASE_URL, file_name))
        wide = wide.drop(labels=['Province/State', 'Lat', 'Long'], axis=1)
        wide['Country/Region'] = wide['Country/Region'].str.replace('*', '', regex=False)
        long = pd.melt(wide, id_vars=['Country/Region'], var_name='Date', value_name=value_name)
        long['Date'] = pd.to_datetime(long['Date'])
        if cutoff is not None:
            long = long[long['Date'] <= cutoff]
        # groupby sorts by its keys, which gives the date-then-country row order.
        return long.groupby(['Date', 'Country/Region'])[[value_name]].sum().reset_index()

In [3]:
# Build the three DataFrames used by the rest of the notebook (each call downloads CSVs).
# get_time_series() cuts the series off at the report date that get_daily_report()
# records on the instance, so keep this call order.
# get_daily_report() prints a message and returns None when the report cannot be loaded.
covid19 = Covid19()
lookup_table = covid19.get_lookup_table()
daily_report = covid19.get_daily_report('01-31-2023')
time_series = covid19.get_time_series()

In [4]:
# Preview the cleaned lookup table (the notebook shows a truncated head/tail view).
lookup_table

Unnamed: 0,UID,Combined_Key,iso2,iso3,Country_Region,Province_State,Admin2,Lat,Long_,Population
0,4,Afghanistan,AF,AFG,Afghanistan,,,33.939110,67.709953,38928341
1,8,Albania,AL,ALB,Albania,,,41.153300,20.168300,2877800
2,10,Antarctica,AQ,ATA,Antarctica,,,-71.949900,23.347000,
3,12,Algeria,DZ,DZA,Algeria,,,28.033900,1.659600,43851043
4,20,Andorra,AD,AND,Andorra,,,42.506300,1.521800,77265
...,...,...,...,...,...,...,...,...,...,...
4316,84056037,"Sweetwater, Wyoming, US",US,USA,US,Wyoming,Sweetwater,41.659439,-108.882788,42343
4317,84056039,"Teton, Wyoming, US",US,USA,US,Wyoming,Teton,43.935225,-110.589080,23464
4318,84056041,"Uinta, Wyoming, US",US,USA,US,Wyoming,Uinta,41.287818,-110.547578,20226
4319,84056043,"Washakie, Wyoming, US",US,USA,US,Wyoming,Washakie,43.904516,-107.680187,7805


In [5]:
# Preview the daily report for the requested date (truncated head/tail view).
daily_report

Unnamed: 0,Combined_Key,Last_Update,Confirmed,Deaths
0,Afghanistan,2023-02-01 04:20:54,208545,7882
1,Albania,2023-02-01 04:20:54,334167,3596
2,Algeria,2023-02-01 04:20:54,271378,6881
3,Andorra,2023-02-01 04:20:54,47839,165
4,Angola,2023-02-01 04:20:54,105184,1931
...,...,...,...,...
4011,West Bank and Gaza,2023-02-01 04:20:54,703228,5708
4012,Winter Olympics 2022,2023-02-01 04:20:54,535,0
4013,Yemen,2023-02-01 04:20:54,11945,2159
4014,Zambia,2023-02-01 04:20:54,340763,4047


In [6]:
# Preview the per-country time series with cumulative and daily counts (truncated view).
time_series

Unnamed: 0,Date,Country_Region,Confirmed,Deaths,Daily_Cases,Daily_Deaths
0,2020-01-22,Afghanistan,0,0,0,0
1,2020-01-22,Albania,0,0,0,0
2,2020-01-22,Algeria,0,0,0,0
3,2020-01-22,Andorra,0,0,0,0
4,2020-01-22,Angola,0,0,0,0
...,...,...,...,...,...,...
222301,2023-01-31,West Bank and Gaza,703228,5708,0,0
222302,2023-01-31,Winter Olympics 2022,535,0,0,0
222303,2023-01-31,Yemen,11945,2159,0,0
222304,2023-01-31,Zambia,340763,4047,181,1


In [7]:
# Write each frame to a CSV file in the working directory, leaving out the index column.
for frame, csv_path in (
    (lookup_table, 'lookup_table.csv'),
    (daily_report, 'daily_report.csv'),
    (time_series, 'time_series.csv'),
):
    frame.to_csv(csv_path, index=False)

In [8]:
# Open (or create) the SQLite database file and load every frame into a table of the
# same name, replacing whatever an earlier run left behind. The cursor is reused by
# the schema cells that follow.
con = sqlite3.connect('covid19.db')
for table_name, frame in (
    ('lookup_table', lookup_table),
    ('daily_report', daily_report),
    ('time_series', time_series),
):
    frame.to_sql(table_name, con, if_exists='replace', index=False)
cur = con.cursor()

In [9]:
# Rebuild lookup_table with an explicit schema and UID as primary key: the table
# loaded from the DataFrame is moved aside under a staging name, the keyed table is
# created, and the rows are copied across by column name.
# A staging table left over from an interrupted run is dropped first so the rename
# cannot collide with it; the staging table itself is removed by a later cell.
create_lookup_table = """
PRAGMA foreign_keys=off;
BEGIN TRANSACTION;
DROP TABLE IF EXISTS lookup_table_no_keys;
ALTER TABLE lookup_table RENAME TO lookup_table_no_keys;
CREATE TABLE lookup_table (
    UID INTEGER,
    Combined_Key TEXT,
    iso2 TEXT,
    iso3 TEXT,
    Country_Region TEXT,
    Province_State TEXT,
    Admin2 TEXT,
    Lat REAL,
    Long_ REAL,
    Population INTEGER,
    PRIMARY KEY (UID)
);
INSERT INTO lookup_table (UID, Combined_Key, iso2, iso3, Country_Region,
                          Province_State, Admin2, Lat, Long_, Population)
SELECT UID, Combined_Key, iso2, iso3, Country_Region,
       Province_State, Admin2, Lat, Long_, Population
FROM lookup_table_no_keys;
COMMIT;
PRAGMA foreign_keys=on;
"""
try:
    cur.executescript(create_lookup_table)
except sqlite3.Error:
    # Undo a half-applied rename/create so the cell can be run again after the
    # cause is fixed, instead of leaving the transaction open.
    con.rollback()
    raise
con.commit()

In [10]:
# Rebuild daily_report with an explicit schema, Combined_Key as primary key and a
# foreign key to lookup_table. The table loaded from the DataFrame is moved aside
# under a staging name, the keyed table is created, and the rows are copied across
# by column name. A stale staging table from an interrupted run is dropped first.
# NOTE(review): SQLite requires a foreign key's parent column to be the PRIMARY KEY
# or to carry a UNIQUE constraint. lookup_table is created in this notebook with
# PRIMARY KEY (UID) only, so with foreign_keys=on later writes to either table may
# fail with "foreign key mismatch" — confirm and add UNIQUE (Combined_Key) to
# lookup_table if the keys are guaranteed unique.
create_daily_report = """
PRAGMA foreign_keys=off;
BEGIN TRANSACTION;
DROP TABLE IF EXISTS daily_report_no_keys;
ALTER TABLE daily_report RENAME TO daily_report_no_keys;
CREATE TABLE daily_report (
    Combined_Key TEXT,
    Last_Update TEXT,
    Confirmed INTEGER,
    Deaths INTEGER,
    PRIMARY KEY (Combined_Key),
    FOREIGN KEY (Combined_Key) REFERENCES lookup_table (Combined_Key)
            ON DELETE CASCADE ON UPDATE NO ACTION
);
INSERT INTO daily_report (Combined_Key, Last_Update, Confirmed, Deaths)
SELECT Combined_Key, Last_Update, Confirmed, Deaths
FROM daily_report_no_keys;
COMMIT;
PRAGMA foreign_keys=on;
"""
try:
    cur.executescript(create_daily_report)
except sqlite3.Error:
    # Undo a half-applied rename/create so the cell can be run again after the
    # cause is fixed, instead of leaving the transaction open.
    con.rollback()
    raise
con.commit()

In [11]:
# Rebuild time_series with an explicit schema, (Date, Country_Region) as composite
# primary key and a foreign key to lookup_table. The table loaded from the DataFrame
# is moved aside under a staging name, the keyed table is created, and the rows are
# copied across by column name. A stale staging table from an interrupted run is
# dropped first.
# NOTE(review): SQLite requires a foreign key's parent column to be the PRIMARY KEY
# or to carry a UNIQUE constraint. lookup_table.Country_Region has neither in the
# schema created by this notebook, so with foreign_keys=on later writes to either
# table may fail with "foreign key mismatch" — confirm whether this constraint is
# wanted as declared.
create_time_series = """
PRAGMA foreign_keys=off;
BEGIN TRANSACTION;
DROP TABLE IF EXISTS time_series_no_keys;
ALTER TABLE time_series RENAME TO time_series_no_keys;
CREATE TABLE time_series (
    Date TEXT,
    Country_Region TEXT,
    Confirmed INTEGER,
    Deaths INTEGER,
    Daily_Cases INTEGER,
    Daily_Deaths INTEGER,
    PRIMARY KEY (Date, Country_Region),
    FOREIGN KEY (Country_Region) REFERENCES lookup_table (Country_Region)
            ON DELETE CASCADE ON UPDATE NO ACTION
);
INSERT INTO time_series (Date, Country_Region, Confirmed, Deaths, Daily_Cases, Daily_Deaths)
SELECT Date, Country_Region, Confirmed, Deaths, Daily_Cases, Daily_Deaths
FROM time_series_no_keys;
COMMIT;
PRAGMA foreign_keys=on;
"""
try:
    cur.executescript(create_time_series)
except sqlite3.Error:
    # Undo a half-applied rename/create so the cell can be run again after the
    # cause is fixed, instead of leaving the transaction open.
    con.rollback()
    raise
con.commit()

In [12]:
# Remove the staging copies once the keyed tables have been populated.
# IF EXISTS keeps the cell re-runnable: running it a second time (or after a
# partially completed run) no longer fails with "no such table".
drop_tables = """
DROP TABLE IF EXISTS lookup_table_no_keys;
DROP TABLE IF EXISTS daily_report_no_keys;
DROP TABLE IF EXISTS time_series_no_keys;
"""
cur.executescript(drop_tables)
con.commit()

In [13]:
# Close the database connection; `con` and `cur` cannot be used after this cell.
con.close()