In [1]:
import pandas as pd


In [2]:
csv_url = "https://health-infobase.canada.ca/src/data/covidLive/covid19.csv"
df = pd.read_csv(csv_url)

# Parse the government data so it is usable downstream.
df['date'] = pd.to_datetime(df['date'], dayfirst = True)
df = df.drop(['prnameFR', 'pruid'], axis = 1)
# If provinces don't report active cases, assume they have 0 so the data will plot.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the inplace form acts on an intermediate object, which warns in
# recent pandas and leaves `df` unchanged once Copy-on-Write is enabled.
df['numactive'] = df['numactive'].fillna(0.0)

# Distinct 'prname' values in order of first appearance, minus the last one.
# NOTE(review): presumably the last entry is a non-province row; this relies on
# the row order of the downloaded file -- confirm.
provnames = list(df['prname'].unique())[:-1]
def make_dataset(pnames, data=None):
    """Stack the rows of each requested province into one DataFrame.

    Parameters
    ----------
    pnames : iterable of str
        Values of the 'prname' column to keep. Rows are grouped in the
        order the names are given.
    data : pandas.DataFrame, optional
        Frame with a 'prname' column to subset. When omitted, the
        notebook-level ``df`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels. An empty
        DataFrame is returned when ``pnames`` is empty.
    """
    source = df if data is None else data

    # Collect every per-province subset first and concatenate once:
    # DataFrame.append no longer exists in pandas 2.x, and growing a frame
    # inside a loop copies the accumulated data on every iteration.
    subsets = [source[source['prname'] == name] for name in pnames]

    # pd.concat raises on an empty list, so keep the old "empty frame" result.
    if not subsets:
        return pd.DataFrame()

    return pd.concat(subsets)


# Line-plot data must be wide: one column per province name, one row per date.
def format_dataset(df, value):
    """Pivot a long frame so each 'prname' becomes a column indexed by 'date'.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame containing 'date', 'prname' and the `value` column.
    value : str
        Name of the column whose values fill the pivoted table.

    Returns
    -------
    pandas.DataFrame
        Wide frame indexed by date with an unnamed index; pivot_table's
        default aggregation is applied to duplicate (date, prname) pairs.
    """
    wide = df.pivot_table(index = 'date', columns = 'prname', values = value)

    # Clear the index name, so a later reset_index() produces a column
    # called 'index' rather than 'date'.
    return wide.rename_axis(None)


dataset = format_dataset(make_dataset(provnames), 'numtotal_last7')
# format_dataset clears the index name, so reset_index() exposes the dates
# as a column called 'index'.
dataset = dataset.reset_index()

# Dates to keep, read from the 'week' column of the local file.
mondays = pd.read_csv('data/mondays.csv')
mondays['week'] = pd.to_datetime(mondays['week'], dayfirst=True)
mon_lst = list(mondays['week'])

# Filter to the listed dates and rename in one expression. Binding the frame
# returned by rename() avoids the SettingWithCopyWarning that an inplace
# rename on a .loc selection triggers.
mon_data = dataset.loc[dataset['index'].isin(mon_lst)].rename(columns={'index':'date'})

print(mon_data.head())
    

prname       date  Alberta  British Columbia   Canada  Manitoba  \
362    2021-03-21   3510.0            4027.0  24738.0     609.0   
369    2021-03-28   4406.0            5478.0  32207.0     570.0   
377    2021-04-05   6309.0            6725.0  42654.0     512.0   
384    2021-04-12   8844.0            7909.0  56641.0     837.0   
391    2021-04-19  10148.0            7211.0  60753.0     935.0   

prname  New Brunswick  Newfoundland and Labrador  Northwest Territories  \
362              20.0                        2.0                    0.0   
369              87.0                        2.0                    0.0   
377              75.0                        4.0                    1.0   
384              70.0                       10.0                    0.0   
391              65.0                       16.0                    0.0   

prname  Nova Scotia  Nunavut  Ontario  Prince Edward Island   Quebec  \
362            18.0     12.0  10768.0                   5.0   4747.0   
36

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [3]:
dataset = format_dataset(make_dataset(provnames), 'numtoday')
# Daily window (2021-08-20, 2021-09-17]: the start date is excluded and the end
# date included. The first included day is computed explicitly because the
# `closed=` argument of pd.date_range was removed in pandas 2.0.
window_start = pd.Timestamp('2021-08-20') + pd.Timedelta(days = 1)
index_list = pd.date_range(start = window_start, end = '2021-09-17')
# Selecting by an explicit list of dates raises KeyError if any day is missing.
month_period = dataset.loc[index_list, :]
# Column-wise sum: one total per region over the window.
month_total = month_period.sum(axis = 0)
print(month_total)
# NOTE(review): the file name below does not match the date window used above -- confirm before re-enabling.
#month_total.to_csv('data/29JAN_19FEB_totals.csv', encoding = 'utf8')

prname
Alberta                       34561.0
British Columbia              19035.0
Canada                       103320.0
Manitoba                       1436.0
New Brunswick                   623.0
Newfoundland and Labrador        97.0
Northwest Territories           420.0
Nova Scotia                     356.0
Nunavut                           2.0
Ontario                       19947.0
Prince Edward Island             57.0
Quebec                        17744.0
Saskatchewan                   9000.0
Yukon                            42.0
dtype: float64


In [7]:
# Turn the per-region totals Series into a frame: the index labels become
# a regular column and the totals keep the default column label.
test_2 = month_total.to_frame().reset_index()
test_2

Unnamed: 0,prname,0
0,Alberta,2557.0
1,British Columbia,2155.0
2,Canada,22924.0
3,Manitoba,3196.0
4,New Brunswick,61.0
5,Newfoundland and Labrador,65.0
6,Northwest Territories,0.0
7,Nova Scotia,144.0
8,Nunavut,3.0
9,Ontario,9213.0


In [31]:
import string

def makeFrequencyData(data):
    """Count occurrences of each uppercase ASCII letter in every string of a Series.

    Parameters
    ----------
    data : pandas.Series
        Series whose values are strings.

    Returns
    -------
    pandas.DataFrame
        One row per element of `data` (same index) and one column per letter
        'A'..'Z'. Counting is case-sensitive: only uppercase letters are counted.

    Raises
    ------
    TypeError
        If a DataFrame is passed; select the text column first.
    """
    # A DataFrame would be processed column-by-column and fail with an
    # unrelated-looking error, so reject it up front with a clear message.
    if isinstance(data, pd.DataFrame):
        raise TypeError(
            "makeFrequencyData expects a Series of strings "
            "(e.g. frame['prname']), not a DataFrame"
        )

    alphabet = list(string.ascii_uppercase)
    counts = [[text.count(letter) for letter in alphabet] for text in data]
    return pd.DataFrame(counts, columns = alphabet, index = data.index)

# makeFrequencyData counts letters element-wise in a Series of strings, so pass
# the region-name column rather than the whole frame (which also holds the
# numeric totals and makes the call fail).
test = makeFrequencyData(test_2['prname'])
test

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,...,Q,R,S,T,U,V,W,X,Y,Z
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Long format: one row per (date, region) pair.
# Name the melted columns directly instead of renaming afterwards. melt only
# calls the variable column 'prname' when mon_data's column axis still carries
# that name; otherwise it is 'variable' and sorting by 'prname' raises KeyError.
cases_formatted = (
    pd.melt(mon_data, id_vars = ['date'], var_name = 'region', value_name = 'week_cases')
    .sort_values(by = ['date', 'region'])
)

print(cases_formatted)

#cases_formatted.to_csv('data/cases.csv', index = False)

          date                region  week_cases
0   2021-03-21               Alberta      3510.0
15  2021-03-21      British Columbia      4027.0
30  2021-03-21                Canada     24738.0
45  2021-03-21              Manitoba       609.0
60  2021-03-21         New Brunswick        20.0
..         ...                   ...         ...
149 2021-06-28               Ontario      1946.0
164 2021-06-28  Prince Edward Island         0.0
179 2021-06-28                Quebec       649.0
194 2021-06-28          Saskatchewan       302.0
209 2021-06-28                 Yukon       121.0

[210 rows x 3 columns]


In [20]:
# Print the summary produced by DataFrame.describe() for cases_formatted.
# NOTE(review): the output saved under this cell is a per-region row count
# ("Name: region"), which is not what describe() prints -- the saved output
# looks stale; confirm which summary is intended.
print(cases_formatted.describe())

Newfoundland and Labrador    15
Nunavut                      15
Prince Edward Island         15
Canada                       15
Saskatchewan                 15
Ontario                      15
British Columbia             15
Yukon                        15
Manitoba                     15
Nova Scotia                  15
Northwest Territories        15
Alberta                      15
Quebec                       15
New Brunswick                15
Name: region, dtype: int64


In [23]:
# Reshape the subset file to long form, keeping 'date' and 'region' as
# identifiers, then give the melted columns their plotting names.
subset = pd.read_csv("data/subset_ag_plot.csv")
subset2 = (
    pd.melt(subset, id_vars = ['date', 'region'])
    .rename(columns = {'variable':'type', 'value':'cases'})
)
print(subset2.columns)

# Stack the rows from canadaplot.csv on top of the long-form subset and
# write the combined table out for plotting.
# NOTE(review): the output saved under this cell shows merge-style columns
# (type_x, cases_x, ...), which a concat does not produce -- it looks stale.
canada = pd.read_csv('data/canadaplot.csv')
frames = [canada, subset2]
merged = pd.concat(frames, ignore_index = True)
print(merged)
merged.to_csv('data/ag_plot.csv', index = False)

Index(['date', 'region', 'type', 'cases'], dtype='object')
           date   region type_x  cases_x  cases_y type_y
0    2021-03-28  Alberta  alpha     1462    29121  other
1    2021-03-28  Alberta  alpha     1462     2918  alpha
2    2021-03-28  Alberta  alpha     1462      168  gamma
3    2021-03-28   Quebec  alpha      163    29121  other
4    2021-03-28   Quebec  alpha      163     2918  alpha
..          ...      ...    ...      ...      ...    ...
121  2021-06-21  Alberta  other      303     5132  alpha
122  2021-06-21  Alberta  other      303     1051  gamma
123  2021-06-21   Quebec  other      847      112  other
124  2021-06-21   Quebec  other      847     5132  alpha
125  2021-06-21   Quebec  other      847     1051  gamma

[126 rows x 6 columns]


KeyError: 'prname'