In [1]:
# comparisons
# https://docs.google.com/spreadsheets/u/1/d/e/2PACX-1vTX5x2nxCde90Zwo83cdixZsyd_hU1orGsGYKpDe344wHeFi9MqI71aZYC6GLjOV_P2lp6_lUoacPNa/pubhtml?gid=1467787327&single=true
# https://twitter.com/jkwan_md/status/1312035412274221058?s=21
# https://russell-pollari.github.io/ontario-covid19/

import datetime as dt
import json
import urllib
import urllib.request

import folium
import geopandas as gpd
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from matplotlib.collections import PolyCollection
from mpl_toolkits.mplot3d import Axes3D
# from mpl_toolkits.basemap import Basemap
from plotly.subplots import make_subplots
from shapely.geometry import Point, Polygon

# notebook-wide default size for matplotlib figures, in inches (width, height)
plt.rc('figure', figsize=(30, 15))

In [None]:
%%time
# get the "Confirmed positive cases of COVID19 in Ontario" data
# https://data.ontario.ca/dataset/confirmed-positive-cases-of-covid-19-in-ontario/resource/455fd63b-603d-4608-8216-7d8647f43350
# import data using the ontario open data API
# make sure to check the limit against current cases
url = "https://data.ontario.ca/en/api/3/action/datastore_search?resource_id=455fd63b-603d-4608-8216-7d8647f43350&limit=1500000"
response = urllib.request.urlopen(url)
data = json.loads(response.read().decode('utf-8'))

# use this if you want to see the json
# print (data['result']['records'])

# use this if you want to write it to a pandas dataframe
df = pd.DataFrame(data['result']['records'])

In [None]:
%%time
# get the "Status of COVID-19 Cases in Ontario" data
# https://data.ontario.ca/dataset/status-of-covid-19-cases-in-ontario
# make sure to check the limit against current cases
url = "https://data.ontario.ca/en/api/3/action/datastore_search?resource_id=ed270bb8-340b-41f9-a7c6-e8ef587e6d11&limit=1500000"
response = urllib.request.urlopen(url)
data = json.loads(response.read().decode('utf-8'))

# use this if you want to see the json
# print (data['result']['records'])

status_df = pd.DataFrame(data['result']['records'])

In [None]:
# print the column names, dtypes and non-null counts of the confirmed-cases frame
df.info()
# preview the first rows (last expression in the cell, so the notebook renders it)
df.head()

In [None]:
# print the column names, dtypes and non-null counts of the status frame
status_df.info()
# preview the first rows (last expression in the cell, so the notebook renders it)
status_df.head()

In [None]:
# order the case-level data newest-first and the daily status data oldest-first
# (the day-over-day diff computed later relies on the status rows being chronological)
df = df.sort_values('Accurate_Episode_Date', ascending=False)
status_df = status_df.sort_values('Reported Date')

In [None]:
# see how many new cases are reported today in the "confirmed positive" dataset
# keep in mind that the numbers are always missing the most recent day or two
TARGET_DATE = "2021-01-10"  # change this to today's date (YYYY-MM-DD)

# astype(str) keeps this cell runnable whether or not the date columns have already
# been converted to datetime64; regex=False matches the date text literally and
# na=False treats missing dates as "no match"
episode_dates = df['Accurate_Episode_Date'].astype(str)
today = df[episode_dates.str.contains(TARGET_DATE, regex=False, na=False)]
today.head(10)

In [None]:
# set the dates to datetime64 for later use
# pd.to_datetime replaces .astype("datetime64"): pandas 2.x rejects the unit-less
# "datetime64" dtype string, and to_datetime is a no-op on columns that are already
# datetimes, so the cell can be re-run safely
date_columns = [
    "Accurate_Episode_Date",
    "Case_Reported_Date",
    "Test_Reported_Date",
    "Specimen_Date",
]
df[date_columns] = df[date_columns].apply(pd.to_datetime)

In [None]:
# for accurate daily totals, better to use the "status of covid cases" dataset:
# each new column is the row-to-row change of a running-total column
# (rows with no computable difference, e.g. the very first one, are set to 0)
for new_col, running_col in (('New Cases', 'Total Cases'), ('New Deaths', 'Deaths')):
    status_df[new_col] = status_df[running_col].diff().fillna(0)

In [None]:
# plot cases by public health unit
fig, ax = plt.subplots(figsize=(14, 8))
fig.suptitle('COVID-19 Cases by Public Health Unit', fontsize=25) # Add the text/suptitle to figure

# horizontal bars: the PHU names run along the y axis and the case counts along
# the x axis, so the labels below are assigned accordingly
df['Reporting_PHU'].value_counts().plot(kind='barh', ax=ax)
ax.set_xlabel("Cases Reported by PHU", fontsize=16)
ax.set_ylabel("Public Health Unit", fontsize=16)
ax.tick_params(axis='both', labelsize=14)

# if you want to save your graph
# fig.savefig('PHU.jpg')

In [None]:
# use vanilla matplotlib to plot new cases based on reported change in the "Status of COVID" dataset
fig, ax = plt.subplots()
fig.suptitle('Ontario COVID-19 Reported Cases', fontsize=24) # Add the text/suptitle to figure

y = status_df['New Cases']
x = status_df['Reported Date']

# ax.plot(x, y) # line plot
ax.fill_between(x, y) # area plot
ax.tick_params(axis='x', rotation=90)
# one tick every 50 rows, with the range taken from the data instead of a fixed 300
# so that rows past 300 still get ticks
# (assumes 'Reported Date' is still plain strings, so tick positions are row offsets)
ax.set_xticks(np.arange(0, len(x), 50))

plt.show()

In [None]:
# same daily series drawn with seaborn's barplot
# (the figure size comes from the notebook-wide plt.rcParams default)
bar_ax = sns.barplot(data=status_df, x="Reported Date", y="New Cases")
bar_ax.set_xticks([]);  # hide the x ticks; the trailing ';' suppresses the cell output

In [None]:
# keep only the rows whose Outcome1 value is 'Fatal'
is_fatal = df['Outcome1'].eq('Fatal')
deaths = df.loc[is_fatal]
deaths.head()

In [None]:
# add moving averages
fig, ax = plt.subplots()
fig.suptitle('Ontario COVID-19 Reported Cases', fontsize=24) # Add the text/suptitle to figure

# variables for x and y axes
y = status_df['New Cases']
x = status_df['Reported Date']

# create some rolling averages (simple moving averages over 5 and 7 rows)
rolling_mean5 = y.rolling(window=5).mean()
rolling_mean7 = y.rolling(window=7).mean()

#ax.bar(x, y, color='burlywood')
ax.fill_between(x, y, color='burlywood') # area plot
ax.plot(x, rolling_mean5, label='5 Day SMA', color='red')
ax.plot(x, rolling_mean7, label='7 Day SMA', color='green')
ax.tick_params(axis='x', rotation=90)
# one tick every 50 rows, with the range taken from the data instead of a fixed 300
# so that rows past 300 still get ticks
# (assumes 'Reported Date' is still plain strings, so tick positions are row offsets)
ax.set_xticks(np.arange(0, len(x), 50))
ax.legend(loc='upper left')
plt.show()

In [None]:
# whole thing in plotly: daily new cases and deaths as bars, 7-day average as a line

reported_dates = status_df['Reported Date']
# computed here so this cell does not depend on the matplotlib cell having run first
rolling_mean7 = status_df['New Cases'].rolling(window=7).mean()

fig = go.Figure()

# daily new cases, drawn semi-transparent so the other traces stay visible
fig.add_trace(go.Bar(
    x=reported_dates,
    y=status_df['New Cases'],
    name='New Cases',
    marker_color='rgb(204,164,168)',
    # marker_line_color="black",
    hoverinfo='all',  # a flag string; a list here would be read as per-point values
    opacity=0.5
))

# daily new deaths
fig.add_trace(go.Bar(
    x=reported_dates,
    y=status_df['New Deaths'],
    name='New Deaths',
    marker_color='rgb(59,33,36)',
    # marker_line_color="black",
    hoverinfo='all'
))

# 7-day rolling average of new cases
fig.add_trace(go.Scatter(
    x=reported_dates,
    y=rolling_mean7,
    mode='lines',
    name='7 Day Rolling Average',
    line_color='rgb(12,15,29)',
    hoverinfo='all'
))

# TODO: annotate 2020-10-02 ("Province dumped a whole bunch of new deaths") with
# fig.add_annotation(x=..., y=..., text=..., showarrow=True, arrowhead=1)

# a figure should stand alone when the notebook is skimmed: title and axis labels
# (add autosize=False, width=1800, height=900 here for a fixed-size figure)
fig.update_layout(
    title='Ontario COVID-19 Reported Cases',
    xaxis_title='Reported Date',
    yaxis_title='Daily count'
)

# to export a standalone page: fig.write_html('ontario_covid.html')
fig.show()

In [None]:
# use matplotlib to plot reported cases from the "conpos" dataset
#fig = plt.figure()
#fig.suptitle('Ontario COVID-19 Reported Cases', fontsize=24) # Add the text/suptitle to figure

#counts = df['Case_Reported_Date'].value_counts()
#plt.bar(counts.index,counts)

# extra options
# ax.set_xlabel("Public Health Unit", fontsize=16)
# ax.set_ylabel("Cases Reported by PHU", fontsize=16)
# ax.tick_params(axis='both', labelsize=14)
# plt.show()

In [None]:
# same thing, but using seaborn lets us plot a kernel density estimate (which can visually stand in for a running average)
# gotta watch out for that drop-off, though
# https://seaborn.pydata.org/generated/seaborn.kdeplot.html

#sns.set()
#ax = sns.histplot(df['Case_Reported_Date'], kde=True, bins=300)

In [None]:
# comparisons based on gender
#g = sns.displot(df, x="Accurate_Episode_Date", hue="Client_Gender", bins=300)
#g.set_xticklabels(rotation=90)
#g.fig.set_size_inches(30,5)

In [None]:
# side-by-side comparison based on age group for a recent day
#g = sns.displot(today, x="Outcome1", hue="Outcome1", col="Age_Group")
#g.set_xticklabels(rotation=90)
#sns.set(font_scale=1.3) # seaborn's method for setting fontsize is pretty dumb
#g.fig.set_size_inches(20,10)

In [None]:
# see how many new cases are reported today in the "confirmed positive" dataset (make sure to change value to today's date)
# keep in mind that the numbers are always missing the most recent day or two
# five_days = df[df['Accurate_Episode_Date'].str.contains("2020-10-23", na=False)]
# today.head()

In [None]:
# or how about age group x outcome
#deaths = deaths.replace(['<20'],'10s') # either re-set the order, or re-name <20 to something like 10s
#deaths = deaths.sort_values('Age_Group', ascending=True) 

# deaths only
#deaths = df[df.Outcome1 == 'Fatal']
#deaths.head()
#g = sns.displot(deaths, x="Age_Group", multiple="dodge")
#sns.color_palette("tab10")
#g.set_xticklabels(rotation=90)
#g.fig.set_size_inches(20,10)


# Some things to fix: 

* Re-order x axis (seaborn won't order age groups by default)
* Increase fonts (or change default matplotlib/seaborn options at the top of my notebook)
* Change colour palettes
* Make sure my H x W ratios are appropriate (see here: https://stats.stackexchange.com/questions/185346/what-is-the-best-ratio-between-width-and-height-in-a-bar-chart)
* Get rid of the NaNs in the dataset
* Make some more space between the age categories