### Saving permanent links to the JHU cases and deaths data in the `../data` folder

In [5]:
import os
from datetime import datetime
from getpass import getpass

import pandas as pd
from github import Github
from tqdm import tqdm

def get_JHU_data(g, dt):
    """Build commit-pinned links to the JHU time-series CSV files as of `dt`.

    Parameters
    ----------
    g : client object exposing ``get_repo`` (a ``github.Github`` instance in this notebook)
    dt : point in time; the newest commit returned for ``until=dt`` is used

    Returns
    -------
    list of str
        Four raw.githubusercontent.com URLs, in the order
        confirmed/global, deaths/global, confirmed/US, deaths/US.
    """
    repository = g.get_repo("CSSEGISandData/COVID-19")

    # first entry of the commit listing restricted to `until=dt`
    pinned_sha = repository.get_commits(until=dt)[0].sha

    # everything up to the file name is shared by the four links
    series_dir = (
        "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/"
        f"{pinned_sha}/csse_covid_19_data/csse_covid_19_time_series"
    )

    urls = []
    for scope in ('global', 'US'):
        for data_type in ('confirmed', 'deaths'):
            urls.append(f"{series_dir}/time_series_covid19_{data_type}_{scope}.csv")
    return urls

In [9]:
# Authenticate against the GitHub API with a personal access token
# (create one at https://github.com/settings/tokens); 5000 requests/hour.
#
# The token is taken from the GITHUB_TOKEN environment variable, with an
# interactive prompt as a fallback, so that no secret is stored in the notebook.
# SECURITY: a real token used to be hardcoded on this line; since it has been
# committed, it must be considered leaked and revoked by its owner.
MY_GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") or getpass("GitHub personal access token: ")
g = Github(login_or_token=MY_GITHUB_TOKEN)

In [11]:
# Sample the repository state once per day at 6am.
# NOTE(review): the timestamps below are naive; they are assumed to be
# interpreted as UTC by the GitHub client -- TODO confirm.
sampling_time = '06:00:00'

# Daily timestamps from April 1 2020 through March 13 2022 (both ends included)
dt_range = pd.date_range(
    start=f"2020-04-01 {sampling_time}",
    end=f"2022-03-13 {sampling_time}",
    freq='D',
).to_pydatetime()

# Map each sampling date (YYYY-MM-DD) to the list of links returned by get_JHU_data.
# The dict is filled incrementally so that the entries collected so far
# remain available if the loop is interrupted.
links = {}
for dt in tqdm(dt_range):
    links[str(dt.date())] = get_JHU_data(g, dt)


  0%|          | 0/712 [00:00<?, ?it/s][A
  0%|          | 1/712 [00:00<06:02,  1.96it/s][A
  0%|          | 2/712 [00:00<05:41,  2.08it/s][A
  0%|          | 3/712 [00:01<05:29,  2.15it/s][A
  1%|          | 4/712 [00:01<05:26,  2.17it/s][A
  1%|          | 5/712 [00:02<05:18,  2.22it/s][A
  1%|          | 6/712 [00:02<05:18,  2.22it/s][A
  1%|          | 7/712 [00:03<05:08,  2.29it/s][A
  1%|          | 8/712 [00:03<05:06,  2.30it/s][A
  1%|▏         | 9/712 [00:03<05:08,  2.28it/s][A
  1%|▏         | 10/712 [00:04<05:05,  2.30it/s][A
  2%|▏         | 11/712 [00:04<05:04,  2.30it/s][A
  2%|▏         | 12/712 [00:05<05:22,  2.17it/s][A
  2%|▏         | 13/712 [00:05<05:29,  2.12it/s][A
  2%|▏         | 14/712 [00:06<05:16,  2.20it/s][A
  2%|▏         | 15/712 [00:06<05:17,  2.20it/s][A
  2%|▏         | 16/712 [00:07<05:22,  2.16it/s][A
  2%|▏         | 17/712 [00:07<05:21,  2.16it/s][A
  3%|▎         | 18/712 [00:08<05:07,  2.26it/s][A
  3%|▎         | 19/712 [00:0

#### Save the links

In [12]:
# `links` maps date -> list of URLs, so the raw frame has one column per date;
# transpose it to get one row per date.
df_links = pd.DataFrame(links).transpose()

# Labels follow the order in which get_JHU_data emits the URLs
# (scope outer loop, data type inner loop).
df_links.columns = [
    'global-confirmed',
    'global-deaths',
    'US-confirmed',
    'US-deaths',
]

In [13]:
# Persist the table of links; the index (sampling dates) is written as the first column.
df_links.to_csv(path_or_buf='../data/JHU_data_links.csv')

#### Check the results

In [4]:
# Spot check: rebuild the links for a short window only.
# NOTE: this cell overwrites `sampling_time`, `dt_range` and `links`
# from the full run earlier in the notebook.
sampling_time = '06:00:00'

# three consecutive days: January 10 2022 through January 12 2022 (both ends included)
dt_range = pd.date_range(
    start=f"2022-01-10 {sampling_time}",
    end=f"2022-01-12 {sampling_time}",
    freq='D',
).to_pydatetime()

# date (YYYY-MM-DD) -> list of links returned by get_JHU_data for that day
links = {}
for dt in tqdm(dt_range):
    links[str(dt.date())] = get_JHU_data(g, dt)

100%|██████████| 3/3 [00:01<00:00,  2.27it/s]


In [5]:
# Display the date -> links mapping built by the spot check for visual inspection.
links

{'2022-01-10': ['https://raw.githubusercontent.com/CSSEGISandData/COVID-19/f5a36cca9fec296ad6f0c19a206a4be523ad6fe5/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv',
  'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/f5a36cca9fec296ad6f0c19a206a4be523ad6fe5/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv',
  'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/f5a36cca9fec296ad6f0c19a206a4be523ad6fe5/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv',
  'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/f5a36cca9fec296ad6f0c19a206a4be523ad6fe5/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv'],
 '2022-01-11': ['https://raw.githubusercontent.com/CSSEGISandData/COVID-19/b849410bace2cc777227f0f1ac747a74fd8dc4be/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv',
  'https://raw.githubusercontent.co