## Notebook to convert cumulative sequencing and repository upload values to incremental values
### Created: April 1, 2022
### Last modified: April 1, 2022


In [1]:
# import libraries
import pandas as pd

In [3]:
# Read the source CSV into a DataFrame and preview its first rows.
INPUT_CSV = 'data/data_reduced_mar_2022.csv'
df = pd.read_csv(INPUT_CSV)
df.head()

Unnamed: 0,region,sequences,GISAID,Portal,date
0,Alberta,1787,0,0,2020-09-01
1,British Columbia,3296,604,0,2020-09-01
2,Manitoba,395,15,0,2020-09-01
3,New Brunswick,51,51,0,2020-09-01
4,Newfoundland and Labrador,35,19,0,2020-09-01


In [24]:
# Reshape from long to wide: one row per date and one column per
# (measure, region) pair -- every non-index column of df is spread
# across the regions.
df_pivot = df.pivot(
    index='date',
    columns='region',
)
df_pivot.head()


Unnamed: 0_level_0,sequences,sequences,sequences,sequences,sequences,sequences,sequences,sequences,sequences,sequences,...,Portal,Portal,Portal,Portal,Portal,Portal,Portal,Portal,Portal,Portal
region,Alberta,British Columbia,Canada,Manitoba,New Brunswick,Newfoundland and Labrador,Nova Scotia,Ontario,Prince Edward Island,Quebec,...,British Columbia,Canada,Manitoba,New Brunswick,Newfoundland and Labrador,Nova Scotia,Ontario,Prince Edward Island,Quebec,Saskatchewan
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-09-01,1787,3296,11598,395,51,35,308,2927,22,2506,...,0,0,0,0,0,0,0,0,0,0
2020-10-01,2419,4429,15181,513,51,35,514,3930,22,2978,...,0,0,0,0,0,0,0,0,0,0
2020-11-01,2571,5782,18830,686,51,65,514,5044,26,3788,...,0,0,0,0,0,0,0,0,0,0
2020-12-01,3371,7462,22778,1062,51,65,514,6115,36,3788,...,0,0,0,0,0,0,0,0,0,0
2021-01-01,3792,8959,26210,1062,99,86,514,7371,39,3945,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Row-wise first difference turns the cumulative totals into per-month
# increments; the first date has no predecessor, so its row is all NaN.
# 'date' is then moved from the index back to an ordinary column.
increments_by_date = df_pivot.diff()
df_pivot_diff = increments_by_date.reset_index()
df_pivot_diff.head()

Unnamed: 0_level_0,date,sequences,sequences,sequences,sequences,sequences,sequences,sequences,sequences,sequences,...,Portal,Portal,Portal,Portal,Portal,Portal,Portal,Portal,Portal,Portal
region,Unnamed: 1_level_1,Alberta,British Columbia,Canada,Manitoba,New Brunswick,Newfoundland and Labrador,Nova Scotia,Ontario,Prince Edward Island,...,British Columbia,Canada,Manitoba,New Brunswick,Newfoundland and Labrador,Nova Scotia,Ontario,Prince Edward Island,Quebec,Saskatchewan
0,2020-09-01,,,,,,,,,,...,,,,,,,,,,
1,2020-10-01,632.0,1133.0,3583.0,118.0,0.0,0.0,206.0,1003.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-11-01,152.0,1353.0,3649.0,173.0,0.0,30.0,0.0,1114.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-12-01,800.0,1680.0,3948.0,376.0,0.0,0.0,0.0,1071.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2021-01-01,421.0,1497.0,3432.0,0.0,48.0,21.0,0.0,1256.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Unpivot the wide frame back to long format, keeping 'date' as the
# identifier; each (measure, region) column becomes its own set of rows.
df_melt = pd.melt(df_pivot_diff, id_vars=['date'])
df_melt

Unnamed: 0,date,NaN,region,value
0,2020-09-01,sequences,Alberta,
1,2020-10-01,sequences,Alberta,632.0
2,2020-11-01,sequences,Alberta,152.0
3,2020-12-01,sequences,Alberta,800.0
4,2021-01-01,sequences,Alberta,421.0
...,...,...,...,...
622,2021-11-01,Portal,Saskatchewan,0.0
623,2021-12-01,Portal,Saskatchewan,4869.0
624,2022-01-01,Portal,Saskatchewan,0.0
625,2022-02-01,Portal,Saskatchewan,0.0


In [26]:
# The melt step leaves the measure column without a name (it shows up
# as NaN), so assign explicit names to all four columns by position.
long_format_columns = ['date', 'variable', 'region', 'value']
df_melt.columns = long_format_columns
df_melt.head()


Unnamed: 0,date,variable,region,value
0,2020-09-01,sequences,Alberta,
1,2020-10-01,sequences,Alberta,632.0
2,2020-11-01,sequences,Alberta,152.0
3,2020-12-01,sequences,Alberta,800.0
4,2021-01-01,sequences,Alberta,421.0


In [27]:
# Drop the rows for the first month in the series (September 2020):
# .diff() has no earlier month to subtract from, so the incremental
# value for that month is NaN. how='any' removes every row that has a
# missing entry in any column.
# Reassign instead of using inplace=True so the cell produces a fresh
# frame rather than mutating the existing one.
df_melt = df_melt.dropna(how='any')
df_melt.head()

Unnamed: 0,date,variable,region,value
1,2020-10-01,sequences,Alberta,632.0
2,2020-11-01,sequences,Alberta,152.0
3,2020-12-01,sequences,Alberta,800.0
4,2021-01-01,sequences,Alberta,421.0
5,2021-02-01,sequences,Alberta,1211.0


In [28]:
# Write the long-format incremental values to CSV, leaving out the
# DataFrame's row index.
OUTPUT_CSV = 'data/data_reduced_mar_2022_incremental.csv'
df_melt.to_csv(OUTPUT_CSV, index=False)