In [None]:
%matplotlib inline

In [None]:
! pip install PyGithub

In [None]:
# Import dependencies
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import datetime

# Import the GitHub personal access token used to pull the source coronavirus data from Johns Hopkins University's public GitHub repo
from config import git_key

In [None]:
# Create defined function to pull an individual data set from GitHub
def corona_df(git_key, branch):
    """Download one CSV file from the CSSEGISandData/COVID-19 repo as a DataFrame.

    Parameters
    ----------
    git_key : str
        GitHub access token used to authenticate the API client.
    branch : str
        Despite the name, this is the path of the file inside the repository
        (it is passed straight to ``get_contents``), not a git branch.

    Returns
    -------
    pandas.DataFrame
        The file's contents, UTF-8 decoded and parsed with ``pd.read_csv``.

    Raises
    ------
    ValueError
        If ``branch`` resolves to a directory instead of a single file.
    """
    # Kept as function-local imports so the helper stays self-contained.
    from github import Github
    import io

    # Authenticate with the access token
    g = Github(git_key)

    # Specific GitHub repository name for the 'get_repo' request
    repo = g.get_repo("CSSEGISandData/COVID-19")

    # Fetch the file at the requested path
    contents = repo.get_contents(branch)

    # get_contents returns a list of entries when the path is a directory;
    # fail with a clear message instead of an AttributeError further down.
    if isinstance(contents, list):
        raise ValueError(
            "Expected the path of a single file, but %r is a directory" % (branch,)
        )

    # Decode the raw bytes and parse the CSV text into a Pandas dataframe
    df = pd.read_csv(io.StringIO(contents.decoded_content.decode('utf-8')))
    return df

In [None]:
# Pull "APIs" using defined formula above
# One request per time-series file, fetched in the order Confirmed, Deaths, Recovered.
# NOTE(review): these paths are fixed to the repository layout at the time of writing —
# confirm they still resolve upstream before re-running.
confirmed_df, deaths_df, recovered_df = (
    corona_df(git_key, csv_path)
    for csv_path in (
        "/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Confirmed.csv",
        "/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Deaths.csv",
        "/csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-Recovered.csv",
    )
)

In [None]:
# View Confirmed cases data: preview the first rows of the raw frame (before clean_df is applied)
confirmed_df.head()

In [None]:
# View Deaths data: preview the first rows of the raw frame (before clean_df is applied)
deaths_df.head()

In [None]:
# View Recovered cases data: preview the first rows of the raw frame (before clean_df is applied)
recovered_df.head()

In [None]:
#Define function for cleaning the data
def clean_df(df):
    """Aggregate a raw time-series frame into per-country integer totals.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with a "Country/Region" column, optional "Province/State",
        "Lat" and "Long" columns, and one count column per date.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by "Country/Region", holding only the date columns
        as integers, sorted in descending order of the last (most recent) column.
        The input frame is not modified.
    """
    # Label/coordinate columns must not take part in the sum. "Province/State"
    # in particular holds strings; leaving it in makes the groupby sum (or the
    # integer cast after it) fail on pandas versions that no longer silently
    # drop non-numeric columns. errors="ignore" tolerates frames lacking them.
    non_count_columns = ["Province/State", "Lat", "Long"]

    totals = (
        df.drop(columns=non_count_columns, errors="ignore")
        .fillna(value=0)                 # Missing counts are treated as zero
        .groupby("Country/Region")       # Group key becomes the index
        .sum()                           # Add up all provinces of each country
        .astype(int)                     # Counts are whole people
    )

    # Sort by highest value of the most recent recorded date
    return totals.sort_values(by=totals.columns[-1], ascending=False)

In [None]:
# Initiate new formula for all data sets and assign to variables
# Run every raw frame through clean_df (Deaths, then Confirmed, then Recovered)
df_deaths_clean, df_confirmed_clean, df_recovered_clean = (
    clean_df(raw_frame)
    for raw_frame in (deaths_df, confirmed_df, recovered_df)
)

In [None]:
# View clean, sorted Confirmed cases data (the entire frame is displayed, not just a preview)
df_confirmed_clean

# Note: a 'Country/Region' value of "Others" refers to a cruise ship.
# (JB confirmed this in the raw Confirmed cases data: the "Others" row has
# "Diamond Princess cruise ship" in its 'Province/State' field.)

In [None]:
# View clean, sorted Deaths data (the entire frame is displayed, not just a preview)
df_deaths_clean

In [None]:
# View clean, sorted Recovered cases data (the entire frame is displayed, not just a preview)
df_recovered_clean

In [None]:
# Define fig saving function to an 'images' folder in the same directory
def fig_save(fig, file_name):
    """Save a figure into the 'images' folder under the current working directory.

    Parameters
    ----------
    fig : object
        Anything exposing ``savefig(path)``; the notebook passes matplotlib figures.
    file_name : str
        File name (including extension) to write inside 'images'.

    Returns
    -------
    None
    """
    import os

    out_dir = "images"
    # savefig does not create missing folders, so make sure the target exists;
    # exist_ok keeps repeated calls from failing once it is there.
    os.makedirs(out_dir, exist_ok=True)

    path = os.path.join(out_dir, file_name)
    fig.savefig(path)

In [None]:
# Build the x axis for the Confirmed cases bar chart: the country/region names,
# which form the index of the cleaned Confirmed cases dataframe
x_axis = df_confirmed_clean.index.to_numpy()

# Display that array of country/region names
x_axis

In [None]:
# See the date of the latest/most recent Confirmed cases data
# (i.e., the header of the last column, farthest to the right)
df_confirmed_clean.columns[-1]

In [None]:
# The y axis is the most recent column of the cleaned Confirmed cases data
y_axis_confirmed = df_confirmed_clean[df_confirmed_clean.columns[-1]]

# Display those latest Confirmed counts
y_axis_confirmed

In [None]:
# Plot bar chart of Confirmed cases by country/region
fig, ax = plt.subplots(figsize=(10, 8))                        # Figure size in inches: horizontal, vertical
ax.bar(x_axis, y_axis_confirmed, color='r', alpha=1, align="center")   # Bar chart axes, color, transparency & alignment
# Rotate the existing category labels instead of re-assigning them with
# set_xticklabels, which warns when the tick positions were not fixed first
ax.tick_params(axis="x", labelrotation=90)
ax.set_xlabel("Country/Region")                                # Horizontal x-label
ax.set_ylabel("People")                                        # Vertical y-label
ax.set_title("Number of Confirmed Cases by Country/Region")    # Title
ax.set_ylim(0, y_axis_confirmed.max() + 1000)                  # Leave headroom above the tallest bar
# The old `b=` keyword of grid() was renamed to `visible` and later removed from
# Matplotlib; calling grid() with only `axis` turns the horizontal lines on
ax.grid(axis="y")
fig.tight_layout()                                             # Lay out this figure explicitly, not "the current one"
fig_save(fig, "bar_confirmed_by_country.png")                  # Save fig

In [None]:
# See the date of the latest/most recent Deaths data (i.e., header from the last column farthest to the right)
# `recent` is reused further down to select this same column from the Confirmed and Recovered frames.
recent = df_deaths_clean.columns[-1]
recent

In [None]:
# The y axis holds the latest/most recent Deaths counts, restricted to the
# countries where Deaths occurred (values greater than 0)
y_axis_deaths = [
    count
    for count in df_deaths_clean.loc[:, df_deaths_clean.columns[-1]]
    if count > 0
]

# Display those latest, non-zero Deaths counts
y_axis_deaths

In [None]:
# Create the x axis from the countries whose latest Deaths count is greater than 0.
# Selecting by the same "> 0" condition used for the y axis keeps labels and
# values paired even if the frame were not sorted, instead of relying on the
# non-zero rows being the first len(y_axis) entries of the index.
latest_deaths = df_deaths_clean[df_deaths_clean.columns[-1]]
x_axis_deaths = latest_deaths[latest_deaths > 0].index
x_axis_deaths

In [None]:
# Plot bar chart of Deaths cases by country/region
fig2, ax2 = plt.subplots(figsize=(10, 8))                      # Figure size in inches: horizontal, vertical
ax2.bar(x_axis_deaths, y_axis_deaths, color='b', alpha=1, align="center")   # Bar chart axes, color, transparency & alignment
# Rotate the existing category labels instead of re-assigning them with
# set_xticklabels, which warns when the tick positions were not fixed first
ax2.tick_params(axis="x", labelrotation=90)
ax2.set_xlabel("Country/Region")                               # Horizontal x-label
ax2.set_ylabel("People")                                       # Vertical y-label
ax2.set_title("Number of Deaths by Country/Region")            # Title
# Headroom above the tallest bar; max(..., default=0) also copes with an empty
# list, where indexing the first element would raise IndexError
ax2.set_ylim(0, max(y_axis_deaths, default=0) + 100)
# The old `b=` keyword of grid() was renamed to `visible` and later removed from
# Matplotlib; calling grid() with only `axis` turns the horizontal lines on
ax2.grid(axis="y")
fig2.tight_layout()                                            # Lay out this figure explicitly, not "the current one"

fig_save(fig2, "bar_deaths_by_country.png")                    # Save fig

In [None]:
# For each cleaned frame, skip the first (top-sorted) row and keep the column
# named by `recent`.
# NOTE(review): the chart titles below imply the skipped row is Mainland China
# in all three frames — confirm, since each frame is sorted independently.
df_deaths_row, df_confirmed_row, df_recovered_row = (
    frame.iloc[1:][recent]
    for frame in (df_deaths_clean, df_confirmed_clean, df_recovered_clean)
)

In [None]:
# Overlay Recovered bars on top of Confirmed bars, one pair per country/region
fig3, ax3 = plt.subplots(figsize=(10,10))
bar_confirmed = ax3.bar(df_confirmed_row.index, df_confirmed_row, width=0.5, label='Confirmed')
bar_recovered = ax3.bar(df_recovered_row.index, df_recovered_row, width=0.5, label='Recovered')
# Rotate the category labels in place. Re-assigning them with set_xticklabels
# breaks if the two series do not contain exactly the same countries, because
# the label count then no longer matches the tick count.
ax3.tick_params(axis="x", labelrotation=90)
ax3.legend()                                                   # Uses the labels given to the bar calls
ax3.set_yticks(np.arange(0, df_confirmed_row.max(), 50))       # One y tick every 50 people
ax3.set_xlabel("Country/Region")                               # Horizontal x-label
ax3.set_ylabel("People")                                       # Vertical y-label
ax3.set_title("Number of Confirmed Cases & Recovered Cases by Country/Region Outside of Mainland China")
# The old `b=` keyword of grid() was renamed to `visible` and later removed from
# Matplotlib; calling grid() with only `axis` turns the horizontal lines on
ax3.grid(axis="y")
fig3.tight_layout()                                            # Lay out this figure explicitly, not "the current one"

fig_save(fig3, "stacked_bar_ROW.png")

In [None]:
# Column positions one quarter, one half and three quarters of the way through
# the date columns of the cleaned Confirmed frame
total_dates = df_confirmed_clean.shape[1]
num_list = [1,2,3]
quartiles = list(map(lambda q: int(round(total_dates*(q/4))), num_list))
quartiles

In [None]:
# From the first row of each cleaned frame, take the values at the three
# quartile column positions followed by the value in the last column
confirmed_quartiles, deaths_quartiles, recovered_quartiles = (
    [frame.iat[0, quartiles[q]] for q in range(0, 3)] + [frame.iat[0, -1]]
    for frame in (df_confirmed_clean, df_deaths_clean, df_recovered_clean)
)

In [None]:
# Preview the integer positions 0 .. len(confirmed_quartiles) - 1, one per sampled date
np.arange(len(confirmed_quartiles))

In [None]:
ind = np.arange(len(confirmed_quartiles))  # x position of each sampled date
width = 0.35  # bar width shared by all three series

fig4, ax4 = plt.subplots(figsize=(10,10))

# Draw the three series in the order Confirmed, Deaths, Recovered.
# NOTE(review): with these offsets neighbouring bars overlap rather than sit
# side by side — presumably the intended "layered" look; confirm.
pt1, pt2, pt3 = (
    ax4.bar(ind + offset, counts, width=width, label=series_name)
    for offset, counts, series_name in (
        (-width, confirmed_quartiles, 'Confirmed'),
        (width/2, deaths_quartiles, 'Deaths'),
        (-width/3, recovered_quartiles, 'Recovered'),
    )
)

# Axis label, title, and one tick per sampled date labelled with its column header
ax4.set(ylabel='People', title='Corona Virus in Mainland China Over Time')
ax4.set_xticks(ind)
ax4.set_xticklabels(
    [df_deaths_clean.columns[pos]
     for pos in (quartiles[0], quartiles[1], quartiles[2], -1)]
)
ax4.legend()
ax4.grid(axis="y")
plt.tight_layout()

fig_save(fig4, "layered_bar_MLC.png")

In [None]:
# Bar chart of the latest Confirmed counts for every row after the first one
fig5, ax5 = plt.subplots(figsize=(15,10))
bar_confirmed = ax5.bar(df_confirmed_row.index, df_confirmed_row, width=.75)
# Rotate the existing category labels instead of re-assigning them with
# set_xticklabels, which warns when the tick positions were not fixed first
ax5.tick_params(axis="x", labelrotation=90)
ax5.set_yticks(np.arange(0, df_confirmed_row.max(), 50))       # One y tick every 50 people
ax5.set_xlabel("Country/Region")                               # Horizontal x-label
ax5.set_ylabel("People")                                       # Vertical y-label
ax5.set_title("Number of Confirmed Cases by Country/Region Outside of Mainland China (ROW)")
# The old `b=` keyword of grid() was renamed to `visible` and later removed from
# Matplotlib; calling grid() with only `axis` turns the horizontal lines on
ax5.grid(axis="y")
fig5.tight_layout()                                            # Lay out this figure explicitly, not "the current one"

fig_save(fig5, "bar_ROW.png")