In [1]:
import dr_util.file_utils as fu

In [2]:
import re
from collections import defaultdict

**Goal:** Programatically extract, surface and annotate useful things in work roam graph data.

## Define Clases & Utils

In [4]:
def print_roam_node(rn):
    node_i = 0
    nodes = [rn]
    while node_i < len(nodes):
        print(nodes[node_i])
        for ch in nodes[node_i].children:
            nodes.append(ch)
        node_i += 1

In [3]:
class RoamNode:
    def __init__(self, json, parent=None, start_depth=0):
        self.refs_uid_blacklist = set([
            'KVGudD7AP', # [[DONE]]
            'e2rS3SVH7', # [[TODO]]
        ])
        if json is None:
            raise Exception("None json page")
        if not isinstance(json, dict):
            raise Exception(
                "RoamNode expects single page json"
            )
            
        self.raw_data = None
        self.depth = start_depth
        self.parent = parent

        self.title = None
        self.string = None
        self.uid = None
        self.create_time = None
        self.edit_time = None
        self.children = []
        self.refs = []
        self.recursive_refs = set()
        self.other_keys = {}
        
        self.import_json(json)

    def __repr__(self):
        indent = "  " * self.depth
        if self.title is not None:
            rep_str = f"{self.title}\n"
            rep_str += f" {indent} {self.uid=} {self.refs=}\n"
        else:
            rep_str = f"{indent} - {self.string}"
            if len(self.refs) > 0:
                rep_str += f"\n{indent} ==> {self.uid=} {self.refs=}\n"
        return rep_str

    def import_json(self, json):
        self.raw_data =  json

        for k, v in json.items():
            if k[0] == ':' or "user" in k:
                continue

            # TODO: cleanup
            if k == 'title':
                self.title = v
            elif k == 'string':
                self.string = v
            elif k == 'uid':
                self.uid = v
            elif k == 'create-time':
                self.create_time = v
            elif k == 'edit-time':
                self.edit_time = v
            elif k == 'children':
                for ch in v:
                    self.children.append(
                        RoamNode(ch, parent=self, start_depth=self.depth+1)
                    )
            elif k == 'refs':
                for vv in v:
                    if 'uid' in vv and vv['uid'] not in self.refs_uid_blacklist:
                        self.refs.append(vv['uid'])
            else:
                self.other_keys[k] = v

        # Add my refs to the refs of my tree
        self.recursive_refs.update(self.refs)
        if self.parent is not None:
            # And if my parent exists, add my tree's refs to their tree's refs
            self.parent.recursive_refs.update(self.recursive_refs)

            

In [12]:
class RoamGraph:
    def __init__(self, input_path, checkpoint_path=None):
        self.input_path = input_path
        self.checkpoint_path = checkpoint_path

        self.roam_data = None
        self.roam_pages = None
        self.uid_to_title = None
        self.title_sets = None

        # Initialize
        self.parse_roam_data()
        
    def parse_roam_data(self):
        self.roam_data = fu.load_file(self.input_path)
        self.roam_pages = {rd['title']: RoamNode(rd) for rd in self.roam_data}
        self.uid_to_title = {v.uid: k for k, v in self.roam_pages.items()}
    
        self.title_sets = {
            'daily_pages': set(),
            'bars': set(),
            'backslashes': set(),
            'with_ref': {},
            'other': set(),
        }
        for title, node in self.roam_pages.items():
            if is_valid_date(node.uid):
                self.title_sets['daily_pages'].add(title)
            elif '|' in title:
                self.title_sets['bars'].add(title)
                if len(node.refs) > 0:
                    first_ref_title = self.uid_to_title[node.refs[0]]
                    if first_ref_title not in self.title_sets['with_ref']:
                        self.title_sets['with_ref'][first_ref_title] = set()
                    self.title_sets['with_ref'][first_ref_title].add(title)
            elif "/" in title:
                self.title_sets['backslashes'].add(title)
            else:
                self.title_sets['other'].add(title)

        if self.checkpoint_path is not None:
            fu.dump_file(self, self.checkpoint_path, force_suffix=True)

In [23]:
tags_to_find = [
    'Project',
]

In [16]:
rg = RoamGraph("/Users/daniellerothermel/Desktop/drotherm_roam_2024_09_19.json")

In [21]:
for k, v in rg.title_sets.items():
    print(k, len(v))
    if k == "with_ref":
        for kk, vv in v.items():
            if len(vv) > 1:
                print("   ", kk, len(vv))

daily_pages 312
bars 885
backslashes 178
with_ref 96
    Project 7
    TDS 3
    Survey Paper 17
    Yoshua Bengio 2
    ICLR 4
other 3484


In [32]:
def map_items_with_input(input_dict):
    """
    Function to surface items from input_dict one at a time, allowing the user to provide keyboard input.
    The input will be used as the key to create a new dictionary where the value is a set containing 
    all corresponding values from input_dict for the same user-provided key.
    
    Args:
        input_dict (dict): The input dictionary to be processed.

    Returns:
        dict: A new dictionary with user-provided keys and sets of values from input_dict.
    """
    output_dict = {}

    for key in input_dict:
        user_input = input(f"-> '{key}': ")

        # If the user input already exists in the dictionary, append the value to the set
        if user_input in output_dict:
            output_dict[user_input].add(key)
        else:
            # Create a new set for the new key
            output_dict[user_input] = {key}
    
    return output_dict

In [33]:
od_bars = map_items_with_input(rg.title_sets['bars'])

-> 'Course| Spinning Up in Deep [[RL]] ([[OpenAI]])':  C
-> '2017| Domain Randomization for Transferring Deep NNs from Simulation to the Real World':  PG
-> '2022|. A Study on the Efficiency and Generalization of Light Hybrid Retrievers':  PG
-> '2023| A Survey on Transformers in RL':  PG
-> '2018- Present| Disentanglement and Causal Modeling':  RT
-> '2011| NLP (almost) from Scratch':  PG
-> '2020| [RAG] Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks':  P
-> '2020| [POPLIN] Exploring Model-based Planning with Policy Networks':  P
-> '2022| Continual-T0: Progressively Instructing 50+ Tasks to Language Models without Forgetting':  P
-> '2022|. [FiD-Light] Efficient and Effective Retrieval-Augmented Text Generation':  P
-> '2022| Quality at a Glance: An Audit of Web-Crawled Multilingual Datasets':  PG
-> '2023| Learning to Solve New Sequential Decision-Making Tasks with In-Context Learning':  PG
-> '2023| Are Language Models Worse than Humans at Following Prompts? It's 

In [40]:
for k, v in od_bars.items():
    print(k, len(v))

skip 8
Blog 1
course 13
good_paper 537
paper_to_fix 254
research_threads 2
textbook 8
project 1
conference_related 6
blog 10
thesis 7
talk 13
misc_pages 5
podcast 1


In [36]:
remap_keys = {
    "C": "course",
    "PG": "good_paper",
    "P": "paper_to_fix",
    "RT": "research_threads",
    "T": "textbook",
    "Project": "project",
    "repo": "repo",
    "conf": "conference_related",
    "B": "blog",
    "Thesis": "thesis",
    "Talk": "talk",
    "Notes": "misc_pages",
    "Podcast": "podcast",
}

In [39]:
for old_k, new_k in remap_keys.items():
    od_bars[new_k] = od_bars[old_k]
    del od_bars[old_k]

In [41]:
fu.dump_file(od_bars, '/Users/daniellerothermel/Desktop/sorted_titles_with_bars.pkl')

True

## Extraction!

In [None]:
p1 = RoamNode(roam_data[1])
p1

In [None]:
rps, u2t, tsts = parse_roam_data(roam_data)

In [None]:
rps['October 8th, 2020']

In [None]:
for k, v in tsts.items():
    print(k, len(v))

In [None]:
for k, v in tsts['with_ref'].items():
    if len(v) > 1:
        print(k, v)
        print()

In [None]:
roam_pages = {rd['title']: RoamNode(rd) for rd in roam_data}

In [None]:
bar_titles = [t for t in roam_pages.keys() if "|" in t]

In [None]:
backslash_titles = [t for t in roam_pages.keys() if "/" in t]

In [None]:
#backslash_titles
bar_titles

---

## Examples

### Example of Daily Page

In [None]:
print_roam_node(RoamNode(roam_data[0]))

### Example of New Paper Page Format

In [None]:
print_roam_node(RoamNode(roam_data[2000]))

### Example of Old Paper Page Format

In [None]:
print_roam_node(RoamNode(roam_data[2]))