In [1]:
import pandas as pd
import datetime as dt

from helper_functions import map_prov_names
from helper_functions import map_voc_names

In [3]:
df = pd.read_csv("sequence.csv")

df['date'] = pd.to_datetime(df['date'], dayfirst=True)
df

Unnamed: 0,region,date,qc_pass,qc_fail,gisaid
0,Alberta,2020-09-18,,,
1,British Columbia,2020-09-18,,,
2,Manitoba,2020-09-18,,,
3,New Brunswick,2020-09-18,,,
4,Newfoundland and Labrador,2020-09-18,,,
...,...,...,...,...,...
435,Ontario,2021-06-18,29154.0,5701,21728.0
436,Prince Edward Island,2021-06-18,114.0,52,0.0
437,Quebec,2021-06-18,29240.0,7150,7595.0
438,Saskatchewan,2021-06-18,7971.0,2187,3406.0


In [9]:
map_prov_names(df, 'region')
df.head()

Unnamed: 0,region,date,qc_pass,qc_fail,gisaid
0,1,2020-09-18,,,
1,2,2020-09-18,,,
2,3,2020-09-18,,,
3,4,2020-09-18,,,
4,5,2020-09-18,,,


In [16]:
df.to_csv('sequence_mapped.csv', index = False)

In [5]:
date = pd.read_csv("date.csv")
date['sequence_date'] = pd.to_datetime(date['sequence_date'], dayfirst = True)
date['voc_date'] = pd.to_datetime(date['voc_date'], dayfirst = True)
date['week_monday_date'] = date['sequence_date'].dt.to_period('W').apply(lambda r: r.start_time)
date.head()

Unnamed: 0,sequence_date,voc_date,week_monday_date
0,2020-09-18,NaT,2020-09-14
1,2020-09-25,NaT,2020-09-21
2,2020-10-02,NaT,2020-09-28
3,2020-10-09,NaT,2020-10-05
4,2020-10-16,NaT,2020-10-12


In [11]:
date.to_csv('date.csv', index = True)

In [4]:
voc = pd.read_csv('voc_case.csv')
voc.head()

Unnamed: 0,region,date,voc,total
0,Alberta,2021-03-15,alpha,902
1,British Columbia,2021-03-15,alpha,667
2,Manitoba,2021-03-15,alpha,12
3,New Brunswick,2021-03-15,alpha,6
4,Newfoundland and Labrador,2021-03-15,alpha,88


In [5]:
map_voc_names(voc, 'voc')
map_prov_names(voc, 'region')
voc.head()

Unnamed: 0,region,date,voc,total
0,1,2021-03-15,1,902
1,2,2021-03-15,1,667
2,3,2021-03-15,1,12
3,4,2021-03-15,1,6
4,5,2021-03-15,1,88


In [None]:
voc.to_csv('voc_case_mapped.csv', index = False)

In [3]:
csv_url = "https://health-infobase.canada.ca/src/data/covidLive/covid19.csv"
df = pd.read_csv(csv_url)

#parse government data so it is usable
df['date'] = pd.to_datetime(df['date'], dayfirst = True)
df = df.drop(['prnameFR', 'pruid'], axis = 1)
#if provinces don't report active cases, assume they have 0, so the data will plot
df['numactive'].fillna(0.0, inplace = True)

provnames = df.prname.unique()
provnames = list(provnames)
provnames.pop()
#function to make province-wise dataset
def make_dataset(pnames):
    
    by_prov = pd.DataFrame()

    # Iterate through all the provinces
    for i in pnames:

        # Subset to the province
        sub = df[df['prname'] == i]

        # Add to the overall dataframe
        by_prov = by_prov.append(sub)
    
    return by_prov


#must pivot datasets for linear plot data so prov names are columns
def format_dataset(df, value):
    df_pivot = pd.pivot_table(df,
    columns = 'prname',
    index = 'date',
    values = value)
    
    df_pivot = df_pivot.rename_axis(None)

    return df_pivot


dataset = format_dataset(make_dataset(provnames), 'numtotal_last7')
dataset = dataset.reset_index()
#print(dataset.head())
#dataset.rename_axis('prname')

mondays = pd.read_csv('mondays.csv')
mondays['week_monday_date'] = pd.to_datetime(mondays['week_monday_date'], dayfirst=True, format = "%Y-%m-%d")
mondays['date'] = mondays['week_monday_date'].dt.date
mondays['d'] = pd.to_datetime(mondays['date'])
mondays = mondays.drop(['week_monday_date', 'date'], axis = 1)
mon_lst = list(mondays['d'])

mon_data = dataset.loc[dataset['index'].isin(mon_lst)]
mon_data.rename(columns={'index':'date'}, inplace = True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [40]:
cases_formatted = pd.melt(mon_data, id_vars = ['date']).sort_values(by = ['date','prname'])
cases_formatted.rename(columns = {'prname':'region', 'value':'week_cases'}, inplace = True)

map_prov_names(cases_formatted, 'region')

cases_formatted.to_csv('cases_formatted.csv', index = False)

This section is for parsing new data files
------------------------------------------

In [4]:
df = pd.read_csv("sequence_jul_09.csv")

df['date'] = pd.to_datetime(df['date'], dayfirst=True)
map_prov_names(df, 'region')
df['date'] = f"'{'2021-07-09'}'"
print(df)
df.to_csv('sequence_mapped_jul_09.csv', index = False)

    region  qc_pass  qc_fail  gisaid          date  data_portal
0        1    22998     2192   14582  '2021-07-09'         9955
1        2    46676     8462   22751  '2021-07-09'            0
2        3     2886      404    1690  '2021-07-09'            0
3        4     1012      161     476  '2021-07-09'            0
4        5      469       71     353  '2021-07-09'            0
5        7     1378      343    1223  '2021-07-09'            0
6        9    32028     5766   25308  '2021-07-09'          111
7       10      114       53       0  '2021-07-09'            0
8       11    32079     7950   10105  '2021-07-09'         6016
9       12     8719     2327    4691  '2021-07-09'            0
10      14   148359    27729   81179  '2021-07-09'        16082


In [6]:
voc = pd.read_csv('voc_jul_19.csv')
map_voc_names(voc, 'voc')
map_prov_names(voc, 'region')
voc['date'] = f"'{'2021-07-19'}'"
voc.head()
voc.to_csv('voc_case_mapped_jul_19.csv', index = False)


In [7]:
csv_url = "https://health-infobase.canada.ca/src/data/covidLive/covid19.csv"
df = pd.read_csv(csv_url)

#parse government data so it is usable
df['date'] = pd.to_datetime(df['date'], dayfirst = True)
df = df.drop(['prnameFR', 'pruid'], axis = 1)
#if provinces don't report active cases, assume they have 0, so the data will plot
df['numactive'].fillna(0.0, inplace = True)

provnames = df.prname.unique()
provnames = list(provnames)
provnames.pop()
#function to make province-wise dataset
def make_dataset(pnames):
    
    by_prov = pd.DataFrame()

    # Iterate through all the provinces
    for i in pnames:

        # Subset to the province
        sub = df[df['prname'] == i]

        # Add to the overall dataframe
        by_prov = by_prov.append(sub)
    
    return by_prov


#must pivot datasets for linear plot data so prov names are columns
def format_dataset(df, value):
    df_pivot = pd.pivot_table(df,
    columns = 'prname',
    index = 'date',
    values = value)
    
    df_pivot = df_pivot.rename_axis(None)

    return df_pivot


dataset = format_dataset(make_dataset(provnames), 'numtotal_last7')
dataset = dataset.reset_index()
#print(dataset.head())
#dataset.rename_axis('prname')

monday = '2021-07-19'
mon_date = pd.to_datetime(monday, dayfirst = True)
mon_lst = [mon_date]

mon_data = dataset.loc[dataset['index'].isin(mon_lst)]
mon_data.rename(columns={'index':'date'}, inplace = True)
print(mon_data)

prname       date  Alberta  British Columbia  Canada  Manitoba  New Brunswick  \
482    2021-07-19    305.0             333.0  2787.0     286.0           10.0   

prname  Newfoundland and Labrador  Northwest Territories  Nova Scotia  \
482                          35.0                    0.0          4.0   

prname  Nunavut  Ontario  Prince Edward Island  Quebec  Saskatchewan  Yukon  
482         0.0   1084.0                   0.0   516.0         173.0   41.0  


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [16]:
cases_formatted = pd.melt(mon_data, id_vars = ['date']).sort_values(by = ['date','prname'])
cases_formatted.rename(columns = {'prname':'region', 'value':'week_cases'}, inplace = True)

map_prov_names(cases_formatted, 'region')
cases_formatted['date'] = f"'{monday}'"
cases_formatted['week_cases'] = cases_formatted['week_cases'].astype(int)

print(cases_formatted)
cases_formatted.to_csv('cases_formatted_jul19.csv', index = False)

                     date  region  week_cases
0   "2021-07-12 00:00:00"       1         265
1   "2021-07-12 00:00:00"       2         364
2   "2021-07-12 00:00:00"      14        3467
3   "2021-07-12 00:00:00"       3         438
4   "2021-07-12 00:00:00"       4           0
5   "2021-07-12 00:00:00"       5           2
6   "2021-07-12 00:00:00"       6           0
7   "2021-07-12 00:00:00"       7          15
8   "2021-07-12 00:00:00"       8           0
9   "2021-07-12 00:00:00"       9        1290
10  "2021-07-12 00:00:00"      10           1
11  "2021-07-12 00:00:00"      11         720
12  "2021-07-12 00:00:00"      12         299
13  "2021-07-12 00:00:00"      13          73
