In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
from notion.client import NotionClient
from notion.block import CollectionViewBlock

In [3]:
# Authentication: this notebook needs your personal Notion token. While
# logged into Notion, open the browser's web console and look for the cookie
# named "token_v2". Save its contents in a file called .notion-token in this
# directory.

with open('.notion-token', 'r') as token_file:
    raw_token = token_file.read()

# Drop surrounding whitespace (e.g. a trailing newline) before using the token.
NOTION_TOKEN = raw_token.strip()
client = NotionClient(token_v2=NOTION_TOKEN)

In [4]:
# Notion page that holds the tag-hierarchy tables read by the cells below.
TAG_HIERARCHY_URL = 'https://www.notion.so/Tag-hierarchy-Features-db9799312efa4f88851e8d49393bbb16'
tag_hierarchy_page = client.get_block(TAG_HIERARCHY_URL)

In [5]:
# Keep every collection-view child of the page except the one titled
# 'Unhandled'. The type check comes first so `title` is only read on
# collection views.
tag_tables = []
for child in tag_hierarchy_page.children:
    if not isinstance(child, CollectionViewBlock):
        continue
    if child.title == 'Unhandled':
        continue
    tag_tables.append(child)

In [6]:
def get_feature_df(tag_table):
    """Build a DataFrame of the tagged features found in one tag table.

    Every row returned by ``tag_table.collection.get_rows()`` becomes one
    DataFrame row (from ``row.get_all_properties()``), labelled with the
    table's title in a ``category`` column. Rows whose ``tags`` value is
    empty are left out.

    Args:
        tag_table: object exposing ``title`` and ``collection.get_rows()``
            (a CollectionViewBlock in this notebook).

    Returns:
        pandas.DataFrame with the columns category, feature, tags,
        description, quantities and aggregation, in that order. A table
        with no rows yields an empty frame with those same columns.

    Raises:
        KeyError: if a non-empty table lacks one of the expected properties.
    """
    columns = ['category', 'feature', 'tags', 'description', 'quantities', 'aggregation']
    records = [row.get_all_properties() for row in tag_table.collection.get_rows()]

    if not records:
        # A frame built from zero records has no columns at all, so the
        # `tags` lookup below would fail; return an empty, well-shaped frame
        # that still concatenates cleanly with the other tables.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(records)
    df['category'] = tag_table.title
    has_tags = df['tags'].apply(len) != 0  # leave out rows that have no tags
    return df.loc[has_tags, columns]

In [7]:
# One frame per tag table, stacked into a single frame with a fresh 0..n-1 index.
feature_frames = [get_feature_df(tag_table) for tag_table in tag_tables]
features = pd.concat(feature_frames).reset_index(drop=True)

In [8]:
# Tally how many features each tag is attached to.
counts = {}
for feature in features.itertuples():
    for tag in feature.tags:
        counts[tag] = counts.get(tag, 0) + 1

# Display only the tags that occur under more than one feature.
{tag_name: n_features for tag_name, n_features in counts.items() if n_features > 1}

{'total vehicle ban': 2}

In [9]:
def get_tag_description(tag, description):
    """Return the part of ``description`` that belongs to ``tag``.

    Some descriptions are subdivided per tag, each part introduced by the
    tag name wrapped in bold markers, e.g. ``__tag:__ text`` or
    ``**tag**: text``. If such a heading is found for ``tag`` (matched
    case-insensitively), only the rest of that line is returned, stripped
    of surrounding whitespace. Otherwise the description is returned
    unchanged.

    Args:
        tag: tag name to look for; it is matched literally.
        description: full description text of the feature.

    Returns:
        The tag-specific description if a heading matched, else
        ``description`` as given.
    """
    # Escape the tag so characters such as "(", "+" or "." in a tag name are
    # matched literally instead of being interpreted as regex syntax.
    pattern = f'[*_]{{2}}{re.escape(tag)}[*_:]{{3}}(.+)'
    match = re.search(pattern, description, re.IGNORECASE)
    return match[1].strip() if match else description

def get_tag_quantity(tag, quantities):
    """Parse the quantity that applies to ``tag`` from ``quantities``.

    Args:
        tag: tag name to look for; it is matched literally and
            case-insensitively.
        quantities: quantity text of the feature. Either a blanket value
            (``'1'`` or ``'1 for all'``) or text containing
            ``'<tag>: <number>'`` entries.

    Returns:
        ``1`` for the blanket values; the number following ``'<tag>: '`` as
        a float when present and parseable; otherwise ``quantities``
        unchanged, so unparsed entries stay visible for manual review.
    """
    if quantities in ['1', '1 for all']:
        return 1

    # Escape the tag so regex metacharacters in a tag name are matched
    # literally instead of altering (or breaking) the pattern.
    match = re.search(f'{re.escape(tag)}: ([ \\d\\.]+)', quantities, re.IGNORECASE)
    if not match:
        return quantities

    try:
        return float(match[1])
    except ValueError:
        # The character class also accepts text float() rejects (e.g. "1 2",
        # "." or only spaces); treat that like a missing entry.
        return quantities

In [10]:
# Expand the features into one record per (feature, tag) pair, resolving the
# tag-specific description and quantity for each pair.
tag_records = [
    {
        'category': feature.category,
        'feature': feature.feature,
        'tag': tag,
        'description': get_tag_description(tag, feature.description),
        'quantity': get_tag_quantity(tag, feature.quantities),
        'aggregation': feature.aggregation
    }
    for feature in features.itertuples()
    for tag in feature.tags
]

tags = pd.DataFrame(tag_records)

In [11]:
# Persist the per-tag table without the DataFrame index.
TAGS_CSV_PATH = 'data/tags.csv'
tags.to_csv(TAGS_CSV_PATH, index=False)

In [12]:
# Export the tags that still need manual attention: a newline left in the
# description or quantity means the per-tag value could not be extracted.
# `quantity` holds a mix of numbers and unparsed strings (and becomes a purely
# numeric column once everything parses), so compare on its string form;
# `na=False` treats missing descriptions as "no newline".
has_multiline_description = tags.description.str.contains('\n', na=False)
has_multiline_quantity = tags.quantity.astype(str).str.contains('\n')
tags[has_multiline_description | has_multiline_quantity
    ].to_csv('problem_tags.csv', index=False)

In [13]:
# Show the tags whose description or quantity still contains a newline, i.e.
# whose per-tag value could not be extracted. `quantity` is compared on its
# string form because the column mixes numbers with unparsed strings, and the
# `.str` accessor is unavailable on a purely numeric column.
tags[tags.description.str.contains('\n', na=False)
     | tags.quantity.astype(str).str.contains('\n')]

Unnamed: 0,category,feature,tag,description,quantity,aggregation
1,Isolation,Symptomatic isolation - targeted,cohort isolation - symptoms,__confirmed case isolation__: Isolation of con...,1,max
10,Isolation,Asymptomatic isolation - blanket,total vehicle ban,__cluster isolation - no symptoms:__ Entire di...,3,max
12,Isolation,Domestic travel restriction,total vehicle ban,__Domestic traveller quarantine:__ Domestic tr...,2,max
14,Isolation,Domestic travel restriction,domestic travel limitation,__Domestic traveller quarantine:__ Domestic tr...,1,max
23,Isolation Enhancement,Healthcare specialisation,quarantine zone,__Hospital specialisation - partial:__ some ho...,1,sum unique
24,Isolation Enhancement,Healthcare specialisation,hospital specialisation,__Hospital specialisation - partial:__ some ho...,1,sum unique
25,Isolation Enhancement,Healthcare specialisation,healthcare entry screening,__Hospital specialisation - partial:__ some ho...,1,sum unique
26,Isolation Enhancement,Healthcare specialisation,remote medical treatment,__Hospital specialisation - partial:__ some ho...,1,sum unique
27,Isolation Enhancement,Healthcare specialisation,visiting in hospital banned,__Hospital specialisation - partial:__ some ho...,1,sum unique
37,Public Hygiene,Miscellaneous hygiene measures,funeral hygiene,Measures to discourage the use of cash or disi...,1,sum unique
