# Code for wrangling and plotting variant of concern (VOC) proportion data downloaded from the Public Health Agency of Canada (PHAC)

## Notebook created on: October 21, 2021

## Notebook last updated on: October 21, 2021

## Import libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import chart_studio.plotly as py
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default='notebook'
import datetime as dt

## Read in data

In [42]:
def wrangle_data(url, start_date='2020-03-01'):
    """Load a variant-proportion CSV and total the percentages per identifier and week.

    Parameters
    ----------
    url : str or file-like
        Source of the CSV; handed directly to ``pd.read_csv``. The file must
        contain exactly five columns, which are renamed positionally to
        ``grouping``, ``identifier``, ``lineage``, ``percentage`` and ``week``.
    start_date : str or date-like, default '2020-03-01'
        Rows are kept only when their week falls strictly after this
        calendar date.

    Returns
    -------
    df_unique_weeks : pandas.DataFrame
        One row for every (identifier, week) combination, with columns
        ``grouping``, ``identifier``, ``percentage``, ``week`` and ``YMD``.
        ``percentage`` is the sum over all lineages of that identifier in
        that week (0 when the identifier has no rows for the week),
        ``grouping`` is the grouping of the identifier's first row, and
        ``YMD`` is ``week`` as a ``datetime.date``.
    variants : array-like
        Unique identifiers in order of first appearance, collected before
        the date filter is applied.
    """
    # Read from the argument; the previous version read the notebook-global
    # ``csv_url`` and silently ignored ``url``.
    df = pd.read_csv(url)
    df.columns = ['grouping', 'identifier', 'lineage', 'percentage', 'week']
    df['week'] = pd.to_datetime(df['week'])

    unique_weeks = df['week'].unique()
    unique_identifiers = df['identifier'].unique()

    # Sum the lineage percentages per (identifier, week) in one pass, then
    # expand to the full identifier x week grid so that missing combinations
    # appear as explicit zeros (this mirrors the former nested loop).
    totals = df.groupby(['identifier', 'week'], sort=False)['percentage'].sum()
    all_pairs = pd.MultiIndex.from_product(
        [unique_identifiers, unique_weeks], names=['identifier', 'week']
    )
    pairs = totals.reindex(all_pairs, fill_value=0).reset_index()

    # Grouping label taken from the first row seen for each identifier.
    first_grouping = (
        df.drop_duplicates('identifier').set_index('identifier')['grouping']
    )

    df_unique_weeks = pd.DataFrame({
        'grouping': pairs['identifier'].map(first_grouping),
        'identifier': pairs['identifier'],
        'percentage': pairs['percentage'],
        'week': pairs['week'],
    })
    df_unique_weeks['YMD'] = df_unique_weeks['week'].dt.date

    variants = df_unique_weeks['identifier'].unique()

    # Compare date with date: ordering a datetime.date against a pd.Timestamp
    # is deprecated in pandas and no longer supported in newer releases.
    cutoff = pd.to_datetime(start_date).date()
    df_unique_weeks = df_unique_weeks[df_unique_weeks['YMD'] > cutoff]

    return df_unique_weeks, variants

# Location of the detailed variant download on the Health Infobase site.
csv_url = 'https://health-infobase.canada.ca/src/data/covidLive/covid19-epiSummary-variants-detailed-download.csv'

# wrangle_data returns the aggregated frame and the array of variant identifiers.
week_df, var_list = wrangle_data(url=csv_url)

# Preview the first rows of the aggregated frame.
week_df.head()


Comparison of Timestamp with datetime.date is deprecated in order to match the standard library behavior.  In a future version these will be considered non-comparable.Use 'ts == pd.Timestamp(date)' or 'ts.date() == date' instead.



Unnamed: 0,grouping,identifier,percentage,week,YMD
0,VOC,Alpha,0.001,2020-04-05,2020-04-05
1,VOC,Alpha,0.001,2020-04-19,2020-04-19
2,VOC,Alpha,0.004,2020-05-10,2020-05-10
3,VOC,Alpha,0.006,2020-05-17,2020-05-17
4,VOC,Alpha,0.002,2020-05-31,2020-05-31


In [17]:
# Show the variant identifiers returned by wrangle_data.
print(var_list)

['Alpha' 'Beta' 'Delta' 'Gamma' 'B.1.1.318' 'B.1.617.3' 'Eta' 'Iota'
 'Lambda' 'Mu' 'Theta' 'Other']


In [43]:
# Drop the columns that are not used from here on. week_df is rebound to the
# result, so errors='ignore' is needed for this cell to survive being run a
# second time (the columns are already gone by then and drop() would
# otherwise raise KeyError).
week_df = week_df.drop(columns=['week', 'grouping'], errors='ignore')
print(week_df.head())

  identifier  percentage         YMD
0      Alpha       0.001  2020-04-05
1      Alpha       0.001  2020-04-19
2      Alpha       0.004  2020-05-10
3      Alpha       0.006  2020-05-17
4      Alpha       0.002  2020-05-31


In [48]:
def return_variant_graph(df, variants):
    """Build and display a bar chart with one trace per variant.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``identifier``, ``YMD`` and ``percentage``.
    variants : iterable
        Identifier values to plot; each one becomes a separate ``go.Bar``
        trace named after the variant.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()`` and not returned.
    """
    def _bar_for(variant):
        # Select the variant's rows once and reuse them for both axes.
        rows = df[df['identifier'] == variant]
        return go.Bar(
            x=rows.YMD.tolist(),
            y=rows.percentage.tolist(),
            name=variant,
        )

    traces = [_bar_for(variant) for variant in variants]

    figure_layout = {
        'title': "Variant Proportion of Samples Sequenced in Canada by Date",
        'xaxis': {'title': 'Date'},
        'yaxis': {'title': 'Variant Proportion'},
    }

    fig = go.Figure(data=traces, layout=figure_layout)
    # Stacked bars are currently disabled:
    # fig.update_layout(barmode='stack')
    fig.show()

# Render the chart for every variant in var_list.
return_variant_graph(df=week_df, variants=var_list)
