In [1]:
import re
import json

import pandas as pd
from notion.block import CollectionViewBlock

from helpers import (
    get_notion_client,
    get_api_metadata
)

In [2]:
# Notion page whose child blocks are scanned for tag tables by get_tag_tables().
TAG_HIERARCHY_URL = 'https://www.notion.so/Tag-hierarchy-Features-db9799312efa4f88851e8d49393bbb16'

In [3]:
# Module-level Notion client shared by the fetch helpers and by tags_to_api_object().
client = get_notion_client()

In [4]:
# fetch & compile features df from Notion

def get_tag_tables():
    """ Return the collection-view children of the tag hierarchy page.

        Children that are not `CollectionViewBlock`s are ignored, as is the
        table titled 'Unhandled'. """
    page = client.get_block(TAG_HIERARCHY_URL)

    tables = []
    for block in page.children:
        if not isinstance(block, CollectionViewBlock):
            continue
        if block.title == 'Unhandled':
            continue
        tables.append(block)
    return tables

def get_feature_df_from_table(tag_table):
    """ Build the features DataFrame for a single tag table.

        Every row's properties become one DataFrame row, the table title is
        stored in the `category` column, and rows whose `tags` value is empty
        are dropped.

        Args:
            tag_table: object exposing `.title` and `.collection.get_rows()`,
                where each row provides `get_all_properties()`.

        Returns:
            DataFrame with the columns category, feature, tags, description,
            quantities and aggregation. A table without any rows yields an
            empty DataFrame with those same columns. """
    columns = ['category', 'feature', 'tags', 'description', 'quantities', 'aggregation']
    records = [row.get_all_properties() for row in tag_table.collection.get_rows()]

    # A table without rows would give a column-less frame, and the `tags`
    # lookup below would then fail; return a well-formed empty frame instead.
    if not records:
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(records)
    df['category'] = tag_table.title
    has_tags = df['tags'].apply(len) != 0  # leave out rows that have no tags
    return df.loc[has_tags, columns]

def get_features_df():
    """ Fetch every tag table and stack their feature frames into one
        DataFrame with a fresh 0..n-1 index. """
    frames = []
    for table in get_tag_tables():
        frames.append(get_feature_df_from_table(table))

    combined = pd.concat(frames)
    return combined.reset_index(drop=True)

In [5]:
# parse out individual tags from features df

def get_tag_description(tag, description):
    """ Some descriptions are subdivided for each tag.
        Get only that tag's description if so.

        A section looks like `**<label>:** text` (two of `*`/`_`, the label,
        then three of `*`/`_`/`:`), where the text runs to the end of the line.
        The tag's own section is preferred; otherwise an "all others" section
        is used. Labels are matched case-insensitively.

        Args:
            tag: tag name, matched literally.
            description: the feature's full description text.

        Returns:
            The stripped section text for the tag; the whole description when
            it contains no sections at all; None when the description is empty
            or missing, when it has sections but none applies to this tag, or
            when the result is '' or '?'. """
    if not description:
        return None

    parsed_description = None
    # Try the tag's own section first so that an "all others" section listed
    # earlier in the text cannot shadow it. The tag is escaped because it is
    # plain text, not a regex (tags may contain characters such as '(' or '+').
    for label in (re.escape(tag), 'all others'):
        match = re.search(f'[*_]{{2}}{label}[*_:]{{3}}(.+)', description, re.IGNORECASE)
        if match:
            parsed_description = match[1].strip()
            break

    if parsed_description is None:
        # No section for this tag: if the text is subdivided for other tags,
        # none of it applies here; otherwise the whole text is the description.
        has_sections = re.search(r'[*_]{2}[\w ]+[*_:]{3}', description)
        parsed_description = '' if has_sections else description

    return None if parsed_description in ('', '?') else parsed_description

def get_tag_quantity(tag, quantities):
    """ Logic for parsing quantity for each tag.

        Args:
            tag: tag name, matched literally and case-insensitively.
            quantities: the feature's quantities text, e.g. '1', '1 for all'
                or 'tag a: 2, all others: 1'.

        Returns:
            1 when the text is '1' or '1 for all'; the float following
            '<tag>: ' when present, else the float following 'all others: ';
            the raw text when nothing could be parsed; None when the text is
            empty or missing.

        Raises:
            ValueError: if the matched number text is not a valid float
                (e.g. '1 2'). """
    if not quantities:
        return None

    if quantities in ['1', '1 for all']:
        return 1

    # Try the tag's own entry first so that an "all others" entry listed
    # earlier cannot shadow it. The tag is escaped because it is plain text,
    # not a regex (tags may contain characters such as '(' or '+').
    for label in (re.escape(tag), 'all others'):
        match = re.search(f'{label}: ([ \\d\\.]+)', quantities, re.IGNORECASE)
        if match:
            return float(match[1])

    # Unparseable but non-empty: hand back the raw text for manual inspection.
    return quantities

def get_tags_df(features):
    """ Expand the features frame into one row per (feature, tag) pair.

        Each row carries the feature's category, name and aggregation, plus
        the description and quantity parsed for that specific tag. """
    def _tag_record(feature_row, tag_name):
        # One output row; key order determines the column order.
        return {
            'category': feature_row.category,
            'feature': feature_row.feature,
            'tag': tag_name,
            'description': get_tag_description(tag_name, feature_row.description),
            'quantity': get_tag_quantity(tag_name, feature_row.quantities),
            'aggregation': feature_row.aggregation
        }

    records = [_tag_record(feature_row, tag_name)
               for feature_row in features.itertuples()
               for tag_name in feature_row.tags]
    return pd.DataFrame(records)

In [6]:
def tags_to_api_object(tags):
    """ Wrap the tags frame in the API payload structure.

        The `quantity` column is left out and `tag` is exposed as `name`.
        Tags are keyed by name under data -> containment_tags, so when several
        rows share a name the last one wins. The result is merged on top of
        whatever `get_api_metadata(client)` returns. """
    renamed = tags.rename(columns={'tag': 'name'}).drop(columns=['quantity'])

    tags_by_name = {record['name']: dict(record)
                    for _, record in renamed.iterrows()}

    payload = {**get_api_metadata(client)}
    payload['data'] = {'containment_tags': tags_by_name}
    return payload

In [7]:
# Pull every feature row from Notion, then expand it to one row per (feature, tag).
features = get_features_df()
tags = get_tags_df(features)

In [8]:
# Sanity check: show tags that appear in more than one row.
tags.groupby('tag').filter(lambda x: len(x) > 1)

Unnamed: 0,category,feature,tag,description,quantity,aggregation
10,Isolation,Asymptomatic isolation - blanket,total vehicle ban,,3,max
12,Isolation,Domestic travel restriction,total vehicle ban,,2,max


In [9]:
# Sanity check: rows missing a description, aggregation or quantity ('???' marks the gaps).
tags[tags[['description', 'aggregation', 'quantity']].isnull().any(axis=1)].fillna('???')

Unnamed: 0,category,feature,tag,description,quantity,aggregation
1,Isolation,Symptomatic isolation - targeted,cohort isolation - symptoms,???,1,max
10,Isolation,Asymptomatic isolation - blanket,total vehicle ban,???,3,max
12,Isolation,Domestic travel restriction,total vehicle ban,???,2,max
14,Isolation,Domestic travel restriction,domestic travel limitation,???,1,max
24,Isolation Enhancement,Healthcare specialisation,hospital specialisation,???,1,sum unique
25,Isolation Enhancement,Healthcare specialisation,healthcare entry screening,???,1,sum unique
26,Isolation Enhancement,Healthcare specialisation,remote medical treatment,???,1,sum unique
27,Isolation Enhancement,Healthcare specialisation,visiting in hospital banned,???,1,sum unique
39,Public Hygiene,Miscellaneous hygiene measures,cash cleaned,???,1,sum unique
40,Public Hygiene,Miscellaneous hygiene measures,cashless transactions,???,1,sum unique
