In [54]:
import copy
import datetime
from pathlib import Path
from zipfile import ZipFile

import chart_studio.plotly as py
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import requests

# Source archive for the dataset. Despite the variable name, the URL points at
# a .zip file, not at a bare CSV.
csv_url = "https://www150.statcan.gc.ca/n1/pub/13-26-0003/2020001/COVID19-eng.zip"
# Local file name intended as the destination of the downloaded archive.
save_path = 'data.zip'


In [15]:
def download_url(url, save_path, chunk_size=128, timeout=30):
    """Stream the resource at ``url`` into the local file ``save_path``.

    Parameters
    ----------
    url : str
        Address to fetch.
    save_path : str or path-like
        File that receives the response body (opened in binary write mode).
    chunk_size : int, default 128
        Number of bytes requested from the response per write. Callers
        fetching large files should pass a much larger value; tiny chunks
        mean a very large number of small writes.
    timeout : float, default 30
        Seconds ``requests`` waits for the connection and between reads.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, so an HTML error page is
        never silently written to ``save_path``.
    """
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(save_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)


# Only hit the network when the extracted CSV is not on disk yet, so that
# "Restart & Run All" works on a fresh checkout and is cheap on every later run.
# NOTE(review): assumes the archive contains 'COVID19-eng.csv' (the file the
# loading cell reads) at its top level -- confirm against the zip contents.
if not Path('COVID19-eng.csv').exists():
    download_url(csv_url, save_path, chunk_size=1024 * 1024)

    with ZipFile(save_path, 'r') as archive:
        archive.extractall()
        print('Done!')

KeyboardInterrupt: 

In [51]:
def _iso_week_monday(week, year):
    """Turn paired week-number / year columns into one datetime column.

    Parameters
    ----------
    week : pandas.Series
        Week numbers (numeric, may contain NaN).
    year : pandas.Series
        Year values; "20" is prepended to each, so the column is expected to
        hold the last two digits of the year.

    Returns
    -------
    pandas.Series
        Datetimes parsed with ``'%V%G-%u'`` (ISO week, ISO year, ISO weekday)
        and a fixed weekday of 1, i.e. the Monday of the given week. Rows that
        cannot be parsed -- including rows where week or year is missing --
        become ``NaT`` because of ``errors='coerce'``.
    """
    year_text = "20" + year.astype('Int32').astype(str)
    week_text = week.astype('Int32').astype(str)
    return pd.to_datetime(week_text + year_text.add('-1'), format='%V%G-%u', errors='coerce')


covid_raw = pd.read_csv('COVID19-eng.csv')

# Recode numerical data so it is intelligible: per column, numeric code -> label.
# Codes mapped to NaN are treated as missing values.
code_labels = {
    'COV_REG': {1: 'Atlantic', 2: 'Quebec', 3: 'Ontario and Nunavut', 4: 'Prairies and NWT',
                5: 'BC and Yukon'},
    'COV_AGR': {1: '0-19', 2: '20-29', 3: '30-39', 4: '40-49', 5: '50-59', 6: '60-69',
                7: '70-79', 8: '80+', 99: np.nan},
    'COV_ASM': {1: 'Y', 2: 'N', 9: np.nan},
    'COV_GDR': {1: 'M', 2: 'F', 9: np.nan},
    'COV_HSP': {1: 'Y_ICU', 2: 'Y', 3: 'N', 9: np.nan},
    'COV_OCC': {1: 'healthcare', 2: 'school', 3: 'longtermcareresident', 4: 'other',
                9: np.nan},
    'COV_TRM': {1: 'domestic', 2: 'international', 9: np.nan},
    'COV_DTH': {1: 'Y', 2: 'N', 9: np.nan},
    # NOTE(review): every other Y/N column maps 9 to missing, this one maps 3.
    # Confirm against the data dictionary that 3 (not 9) is the intended code.
    'COV_RSV': {1: 'Y', 2: 'N', 3: np.nan},
}
# All week / week-group / year columns share the same "missing" code.
for week_year_col in ('COV_EW', 'COV_EWG', 'COV_EY', 'COV_OW', 'COV_OY', 'COV_RW', 'COV_RY'):
    code_labels[week_year_col] = {99: np.nan}

# One column-wise replace instead of sixteen in-place passes; returns a new
# frame, so covid_raw keeps the untouched file contents.
df = covid_raw.replace(code_labels)

# Rename columns for easier legibility.
# The assignment is positional: it relies on the CSV's column order and count.
df.columns = ['ID', 'region', 'week', 'week_group', 'year', 'gender', 'age', 'occupation',
              'asymptomatic', 'onset_week', 'onset_year', 'hospitalization', 'resolved',
              'resolution_week', 'resolution_year', 'death', 'transmission']

# Collapse each (week, year) pair into a single date column (Monday of the week).
df['onset'] = _iso_week_monday(df.onset_week, df.onset_year)
df['resolution'] = _iso_week_monday(df.resolution_week, df.resolution_year)
df['episode'] = _iso_week_monday(df.week, df.year)

# Combine onset and episode: keep the earlier of the two dates per row
# (min skips NaT, so a row with only one of them still gets a start date).
df['start'] = df[['onset', 'episode']].min(axis=1)

# Drop the columns that are now redundant.
df = df.drop(columns=['onset_year', 'onset_week', 'resolution_year', 'resolution_week',
                      'week', 'year', 'onset', 'episode'])
df.head()

Unnamed: 0,ID,region,week_group,gender,age,occupation,asymptomatic,hospitalization,resolved,death,transmission,resolution,start
0,1,Ontario and Nunavut,0,F,50-59,,,N,Y,N,domestic,NaT,2021-04-19
1,2,Ontario and Nunavut,0,F,30-39,school,N,,Y,N,domestic,NaT,2020-10-19
2,3,Quebec,0,F,40-49,other,N,N,Y,N,domestic,2020-12-14,2020-11-30
3,4,Ontario and Nunavut,0,F,20-29,,N,N,Y,N,domestic,NaT,2020-06-15
4,5,Quebec,0,M,0-19,other,Y,N,Y,N,domestic,2020-12-07,2020-11-30


In [52]:
# Percentage of rows with a non-missing value, per column.
df.count() / len(df) * 100

ID                 100.000000
region             100.000000
week_group         100.000000
gender              99.692831
age                 99.926657
occupation          59.898240
asymptomatic        79.505592
hospitalization     69.193840
resolved           100.000000
death               94.885435
transmission        86.049176
resolution          46.579344
start               98.916153
dtype: float64

In [None]:
# Per-age-group slices of the transmission column.
data = df[['transmission', 'age']]
# NaN never compares equal to anything, so a missing age could only ever
# select zero rows; drop it from the groups being iterated.
for age in data.age.dropna().unique():
    in_group = data['age'] == age
    x_val = data.loc[in_group, 'age'].tolist()
    y_val = data.loc[in_group, 'transmission'].tolist()
    # NOTE(review): x_val / y_val are overwritten on every iteration and not
    # consumed yet -- presumably each pair is meant to feed a plot trace.

In [None]:
# Builds one line trace per region plus the matching layout.
# The loop and layout were indented below a top-level statement, which is an
# IndentationError; everything now sits at cell level.
#
# NOTE(review): this cell cannot run against the notebook state shown above:
#   * `data` as built in the previous cell only has 'transmission' and 'age',
#     so selecting 'prname' / 'YMD' / 'numdeaths' raises KeyError unless
#     `data` is rebound to a different frame first;
#   * `unique_provnames` and `graph_two` are not defined in any visible cell --
#     confirm where they are meant to come from.
data = data[['prname', 'YMD', 'numdeaths']]
for region in unique_provnames():
    x_val = data[data['prname'] == region].YMD.tolist()
    y_val = data[data['prname'] == region].numdeaths.tolist()
    graph_two.append(
        go.Scatter(
            x = x_val,
            y = y_val,
            mode = 'lines',
            name = region
        )
    )

layout_two = dict(title = "Cumulative Deaths by Region",
                  xaxis = dict(title = 'Date'),
                  yaxis = dict(title = 'Total Number of Deaths'),
                  )