In [1]:
%matplotlib notebook

In [2]:
! pip install PyGithub



In [3]:
# Import dependencies
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import datetime

# Import GitHub 'personal access key' to be able to pull source coronavirus data from John Hopkins' public GitHub repo
from config import git_key

In [4]:
# Create defined function to pull an individual data set from GitHub
def corona_df(git_key, branch):
    """Download one CSV file from the CSSEGISandData/COVID-19 GitHub repo as a DataFrame.

    Parameters
    ----------
    git_key : str
        GitHub personal access token used to authenticate the API client.
    branch : str
        Path of the CSV file inside the repository. Despite the parameter
        name, this value is passed to ``get_contents`` as a file path; it is
        not a git branch.

    Returns
    -------
    pandas.DataFrame
        The file contents parsed by ``pd.read_csv``.

    Raises
    ------
    ValueError
        If ``branch`` resolves to a directory rather than a single file.
    """
    # Local imports, as in the original cell, so the function only needs
    # `pd` from the notebook's import cell.
    import io
    from github import Github

    # Authenticate with an access token (a username/password pair is the
    # alternative form: Github("user", "password")).
    client = Github(git_key)

    # Repository holding the coronavirus data.
    repo = client.get_repo("CSSEGISandData/COVID-19")

    # Fetch the metadata and content of the requested path.
    contents = repo.get_contents(branch)
    if isinstance(contents, list):
        # A directory path yields a list of entries, which has no decoded_content.
        raise ValueError(
            "Expected a path to a single file, but {!r} is a directory".format(branch)
        )

    # decoded_content is bytes; decode to text and let pandas parse the CSV.
    csv_text = contents.decoded_content.decode('utf-8')
    return pd.read_csv(io.StringIO(csv_text))

In [5]:
# Download the three raw time-series tables (confirmed, deaths, recovered) with corona_df.
# The calls run in the order the paths are listed, one download per file.
confirmed_df, deaths_df, recovered_df = (
    corona_df(git_key, csv_path)
    for csv_path in (
        "/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv",
        "/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv",
        "/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv",
    )
)

In [6]:
# Preview the first five rows of the raw Confirmed cases table
confirmed_df.head(n=5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/6/20,2/7/20,2/8/20,2/9/20,2/10/20,2/11/20,2/12/20,2/13/20,2/14/20,2/15/20
0,Anhui,Mainland China,31.82571,117.2264,1,9,15,39,60,70,...,591,665,733,779,830,860,889,910,934,950
1,Beijing,Mainland China,40.18238,116.4142,14,22,36,41,68,80,...,274,297,315,326,337,342,352,366,372,375
2,Chongqing,Mainland China,30.05718,107.874,6,9,27,57,75,110,...,411,426,428,468,486,505,518,529,537,544
3,Fujian,Mainland China,26.07783,117.9895,1,5,10,18,35,59,...,215,224,239,250,261,267,272,279,281,285
4,Gansu,Mainland China,36.0611,103.8343,0,2,2,4,7,14,...,62,67,79,83,83,86,87,90,90,90


In [7]:
# Preview the first five rows of the raw Deaths table
deaths_df.head(n=5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/6/20,2/7/20,2/8/20,2/9/20,2/10/20,2/11/20,2/12/20,2/13/20,2/14/20,2/15/20
0,Anhui,Mainland China,31.82571,117.2264,0,0,0,0,0,0,...,0,0,0,1,3,4,4,5,6,6
1,Beijing,Mainland China,40.18238,116.4142,0,0,0,0,0,1,...,1,1,2,2,2,3,3,3,3,4
2,Chongqing,Mainland China,30.05718,107.874,0,0,0,0,0,0,...,2,2,2,2,2,3,3,4,5,5
3,Fujian,Mainland China,26.07783,117.9895,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Gansu,Mainland China,36.0611,103.8343,0,0,0,0,0,0,...,0,0,1,2,2,2,2,2,2,2


In [8]:
# Preview the first five rows of the raw Recovered cases table.
# Note: this file labels early-February dates with a zero-padded day ("2/06/20"),
# unlike the Confirmed/Deaths files ("2/6/20"), and stores most counts as floats.
recovered_df.head(n=5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/06/20,2/07/20,2/08/20,2/09/20,2/10/20,2/11/20,2/12/20,2/13/20,2/14/20,2/15/20
0,Anhui,Mainland China,31.82571,117.2264,0.0,0.0,0.0,0.0,0.0,0.0,...,34.0,47.0,59.0,72.0,88.0,105.0,127.0,157.0,193,221
1,Beijing,Mainland China,40.18238,116.4142,0.0,0.0,1.0,2.0,2.0,2.0,...,31.0,33.0,34.0,37.0,44.0,48.0,56.0,69.0,80,98
2,Chongqing,Mainland China,30.05718,107.874,0.0,0.0,0.0,0.0,0.0,0.0,...,24.0,31.0,39.0,51.0,66.0,79.0,102.0,128.0,152,184
3,Fujian,Mainland China,26.07783,117.9895,0.0,0.0,0.0,0.0,0.0,0.0,...,14.0,20.0,24.0,35.0,39.0,45.0,53.0,57.0,63,71
4,Gansu,Mainland China,36.0611,103.8343,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,9.0,12.0,16.0,17.0,24.0,31.0,39.0,39,49


In [9]:
#Define function for cleaning the data
def clean_df(df):
    """Aggregate a raw time-series table to one row of integer totals per country/region.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with "Country/Region", "Lat" and "Long" columns plus one
        numeric column per date. Other non-numeric columns (such as
        "Province/State") are ignored. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by "Country/Region", one integer column per date, sorted in
        descending order of the last (most recent) column. Missing counts
        contribute zero to the totals. Rows whose "Country/Region" is missing
        are left out of the grouping.

    Raises
    ------
    KeyError
        If "Lat", "Long" or "Country/Region" is not a column of ``df``.
    """
    totals = (
        df.drop(columns=["Lat", "Long"])      # coordinates are not counts
        .groupby("Country/Region")            # the group key becomes the index
        # numeric_only=True explicitly leaves out text columns such as
        # "Province/State" instead of relying on pandas silently dropping them
        # (recent pandas versions try to sum the strings and fail).
        # sum() skips NaN, so missing counts are treated as zero.
        .sum(numeric_only=True)
        .astype(int)                          # counts are whole numbers
    )

    # Nothing to sort by when the table has no date columns.
    if totals.columns.empty:
        return totals

    # Largest value on the most recent recorded date first.
    return totals.sort_values(by=totals.columns[-1], ascending=False)

In [10]:
# Run the cleaning function over each raw table (deaths, confirmed, recovered — in that order)
df_deaths_clean, df_confirmed_clean, df_recovered_clean = map(
    clean_df, (deaths_df, confirmed_df, recovered_df)
)

In [11]:
# View the cleaned Confirmed cases data (one row per country/region, sorted by the most recent date)
df_confirmed_clean

# Note: a 'Country/Region' value of "Others" refers to a cruise ship.
# (JB confirmed this in the raw Confirmed cases data:
# "Diamond Princess cruise ship" appears in the 'Province/State' field of that "Others" row.)

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,2/6/20,2/7/20,2/8/20,2/9/20,2/10/20,2/11/20,2/12/20,2/13/20,2/14/20,2/15/20
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mainland China,547,639,916,1399,2062,2863,5494,6070,8124,9783,...,30553,34075,36778,39790,42306,44327,44699,59832,66292,68347
Others,0,0,0,0,0,0,0,0,0,0,...,0,61,61,64,135,135,175,175,218,285
Singapore,0,1,3,3,4,5,7,7,10,13,...,28,30,33,40,45,47,50,58,67,72
Hong Kong,0,2,2,5,8,8,8,10,10,12,...,24,25,26,29,38,49,50,53,56,56
Japan,2,1,2,2,4,4,7,7,11,15,...,45,25,25,26,26,26,28,28,29,43
Thailand,2,3,5,7,8,8,14,14,14,19,...,25,25,32,32,32,33,33,33,33,33
South Korea,1,1,2,2,3,4,4,4,4,11,...,23,24,24,25,27,28,28,28,28,28
Malaysia,0,0,0,3,4,4,4,7,8,8,...,12,12,16,16,18,18,18,19,19,22
Taiwan,1,1,3,3,4,5,8,8,9,10,...,16,16,17,18,18,18,18,18,18,18
Vietnam,0,2,2,2,2,2,2,2,2,2,...,10,10,13,13,14,15,15,16,16,16


In [12]:
# View the cleaned Deaths data (one row per country/region, sorted by the most recent date)
df_deaths_clean

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,2/6/20,2/7/20,2/8/20,2/9/20,2/10/20,2/11/20,2/12/20,2/13/20,2/14/20,2/15/20
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mainland China,17,18,26,42,56,82,131,133,171,213,...,632,717,804,904,1011,1111,1116,1368,1520,1662
Hong Kong,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
France,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Japan,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
Philippines,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
Sri Lanka,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Singapore,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
South Korea,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Spain,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Australia,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# View the cleaned Recovered cases data (one row per country/region, sorted by the most recent date).
# Note: early-February column labels are zero-padded here ("2/06/20"), unlike the Confirmed/Deaths tables ("2/6/20").
df_recovered_clean

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,2/06/20,2/07/20,2/08/20,2/09/20,2/10/20,2/11/20,2/12/20,2/13/20,2/14/20,2/15/20
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mainland China,28,30,36,39,49,58,101,120,135,214,...,1476,1998,2595,3218,3917,4635,5079,6213,7973,9294
Singapore,0,0,0,0,0,0,0,0,0,0,...,0,0,2,2,2,9,15,15,17,18
Japan,0,0,0,0,1,1,1,1,1,1,...,1,1,1,1,4,9,9,9,9,12
Thailand,0,0,0,0,2,2,5,5,5,5,...,5,5,10,10,10,10,10,12,12,12
South Korea,0,0,0,0,0,0,0,0,0,0,...,0,1,1,3,3,3,7,7,7,9
Australia,0,0,0,0,0,0,0,0,2,2,...,2,2,2,2,2,2,2,8,8,8
Malaysia,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,3,3,3,3,7
Vietnam,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,6,6,7,7,7
France,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,2,2,4
United Arab Emirates,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,3


In [14]:
# One x position per country/region row; the same positions double as tick locations
x_axis = np.arange(df_confirmed_clean.shape[0])  # shape[0] is the number of rows in the dataframe
tick_locations = list(x_axis)
x_axis

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

In [16]:
# Bar chart of confirmed cases per country/region on the most recent recorded date
latest_date = df_confirmed_clean.columns[-1]  # label of the last date column

fig, ax = plt.subplots()
# Bar heights must be the values stored in the latest column; passing
# columns[-1] directly would hand matplotlib the column label (a string).
ax.bar(x_axis, df_confirmed_clean[latest_date], color='r', alpha=0.5, align="center")

# Label each bar with its country/region so the figure stands on its own
ax.set_xticks(tick_locations)
ax.set_xticklabels(df_confirmed_clean.index, rotation=90)
ax.set_xlabel("Country/Region")
ax.set_ylabel("Confirmed cases")
ax.set_title("Confirmed cases by country/region as of {}".format(latest_date))
fig.tight_layout();

<IPython.core.display.Javascript object>