In [None]:
%pip install pydantic
%pip install typing_extensions

In [None]:
import json
from pathlib import Path

# Location of the bookmarks JSON export that the rest of the notebook processes.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook anywhere else.
input_path = Path("/home/teo/gdrive_rclone/bookmarks-2024-09-10.json")

# Decode explicitly as UTF-8 instead of relying on the platform's default
# text encoding, which is locale-dependent.
with input_path.open(encoding="utf-8") as f:
    data = json.load(f)

# The top-level JSON value is later validated as a single Bookmark, so it must
# be a JSON object.
assert isinstance(data, dict)

In [None]:
from collections import Counter, defaultdict
from typing_extensions import Self
from typing import Optional, Generator
from pydantic import BaseModel, Field

class Bookmark(BaseModel):
    """One node of the bookmark tree parsed from the JSON export.

    A node either carries a ``uri`` or nests further nodes through
    ``children``; both are optional and default to ``None`` when the key is
    absent from the input.
    """

    # Fields that must be present on every node.
    guid: str
    title: str
    index: int  # used as the sort key when a folder's children are rendered
    id: int  # used to identify nodes when computing depths and deletions
    dateAdded: int
    lastModified: int
    type: str
    typeCode: int

    # Optional fields. A plain ``None`` default is all that is needed to make
    # a field optional; ``Field(required=False)`` is not a supported ``Field``
    # argument in the pydantic API this notebook uses (model_validate /
    # model_copy) and only triggers a deprecation warning.
    uri: Optional[str] = None
    iconUri: Optional[str] = None
    root: Optional[str] = None
    children: Optional[list[Self]] = None


Helper functions

In [None]:

# Module-level tally, keyed by bookmark id, incremented by show() each time it
# visits a node.
IDS = Counter()

def walk(node: Bookmark, path=None) -> Generator[tuple[Bookmark, list[int]], None, None]:
    """Yield ``(node, ancestor_ids)`` for ``node`` and all of its descendants.

    Traversal is depth-first and pre-order: a node is yielded before any of
    its children. ``ancestor_ids`` lists the ids of the nodes above the
    yielded one, outermost first; it is empty for the starting node unless a
    ``path`` is supplied by the caller.
    """
    trail = [] if path is None else path
    yield node, trail

    kids = node.children
    if kids is None:
        return

    for kid in kids:
        # A fresh list per child, so yielded paths never share storage.
        yield from walk(kid, path=trail + [node.id])


def show(b: Bookmark, indent=0, pref=""):
    """Print ``b`` and, recursively, every descendant, one node per line.

    Each line is the ``repr`` of a shallow copy of the node whose ``children``
    has been cleared, so it describes the node alone. Every visited id is
    also counted in the module-level ``IDS`` tally.

    ``indent`` and ``pref`` are accepted and passed down to recursive calls,
    but they do not affect what is printed.
    """
    flat = b.model_copy(deep=False)
    if flat.children is not None:
        flat.children = None
    print(f"{flat!r}")

    kids = b.children
    IDS[b.id] += 1

    if not isinstance(kids, list):
        # The only non-list value expected here is None (a childless node).
        assert kids is None
        return

    for kid in kids:
        show(kid, indent + 2, pref=f"root: {kid.root}")

def depth_by_id(bm: Bookmark) -> dict[int, int]:
    """Map every node id in the tree rooted at ``bm`` to the node's depth.

    Depth is the number of ancestors reported by ``walk``, so ``bm`` itself
    has depth 0. If an id occurs more than once, the depth of the occurrence
    visited last wins.

    Returns:
        A dict of ``id -> depth`` (both integers).
    """
    return {node.id: len(path) for node, path in walk(bm)}


def ids_for_uri(bm: Bookmark) -> defaultdict[str, list[int]]:
    """Group node ids by URI for the tree rooted at ``bm``.

    Nodes without a URI are ignored. Within each group the ids appear in the
    order ``walk`` visits them.
    """
    uri_index = defaultdict(list)
    for entry, _ancestors in walk(bm):
        if entry.uri is None:
            continue
        uri_index[entry.uri].append(entry.id)
    return uri_index


def clean_ids(bm: "Bookmark", ids_to_delete: set[int]) -> "Bookmark":
    """Remove, in place, every descendant of ``bm`` whose id is in ``ids_to_delete``.

    The tree is pruned top-down: each node's ``children`` list is replaced by
    a filtered copy, then the surviving children are pruned recursively.
    Nodes whose ``children`` is ``None`` are left untouched. ``bm`` itself is
    never removed, even if its own id is listed.

    Args:
        bm: Root of the (sub)tree to prune; it is mutated.
        ids_to_delete: Ids of the nodes to drop.

    Returns:
        The same ``bm`` object, for convenience.
    """
    if bm.children is not None:
        bm.children = [c for c in bm.children if c.id not in ids_to_delete]
        # Recurse only into survivors; removed subtrees need no cleaning.
        for child in bm.children:
            clean_ids(child, ids_to_delete)
    return bm

# Build the bookmark tree from the loaded JSON so this cell also runs on a
# fresh kernel, then dump every node for inspection.
bm = Bookmark.model_validate(data)
show(bm)


In [None]:



# Debug listing: each node id followed by the ids of its ancestors.
# The tree is validated from `data` right here so the loop does not depend on
# a variable left over from an earlier interactive session.
for node, path in walk(Bookmark.model_validate(data)):
    print(node.id, path)


def compute_ids_to_delete(bm: Bookmark) -> set[int]:
    """Select the ids of duplicate bookmarks that should be removed.

    Bookmarks are grouped by URI. Inside a group with more than one id, the
    ids are ordered by depth in the tree and every id after the first
    (shallowest) one is selected. The number of selected ids is printed.

    Returns:
        The set of ids to delete.
    """
    depth_of = depth_by_id(bm)
    groups = ids_for_uri(bm)

    doomed = set()
    for group in groups.values():
        if len(group) < 2:
            continue
        # Keep the shallowest occurrence; everything after it goes.
        by_depth = sorted(group, key=depth_of.__getitem__)
        doomed.update(by_depth[1:])

    print(f"# of ids_to_delete: {len(doomed)}")
    return doomed


In [None]:
import json 


In [None]:
from typing import TextIO
import html

# Running count of link entries written by render_html(); printed by main().
cnt_links_out = 0 

def render_header(f: TextIO):
    """Write the Netscape bookmark-file preamble to ``f``.

    The text ends with the opening ``<DL><p>`` of the outermost list and has
    no trailing newline. Returns whatever ``f.write`` returns.
    """
    preamble = (
        '<!DOCTYPE NETSCAPE-Bookmark-file-1>\n'
        '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">\n'
        '    <TITLE>Bookmarks</TITLE>\n'
        '    <H1>Bookmarks</H1>\n'
        '    <DL><p>'
    )
    return f.write(preamble)

def escape_html(s: str) -> str:
    """Return ``s`` HTML-escaped; any value that is not a ``str`` yields ``""``."""
    if not isinstance(s, str):
        return ""
    return html.escape(s)


def render_html(f: TextIO, bm: Bookmark, indent=0):
    """Write ``bm`` (and, for folders, its subtree) to ``f`` as bookmark HTML.

    * A node with a non-empty ``children`` list is written as a folder: an
      ``<H3>`` heading followed by a nested ``<DL>`` holding its children
      sorted by ``index``.
    * A node whose ``children`` is ``None`` is written as a link and counted
      in the module-level ``cnt_links_out``. A missing ``uri`` is written as
      an empty ``HREF``.
    * A node whose ``children`` is an empty list is skipped entirely.

    Titles and URIs are HTML-escaped before being written, so characters such
    as ``<``, ``&`` or ``"`` in the data cannot break the generated markup.

    Args:
        f: Text stream to write to.
        bm: Node to render.
        indent: Nesting level; each level adds two spaces of indentation.
    """
    global cnt_links_out

    pref = "  " * indent
    title = escape_html(bm.title)

    # NOTE(review): dateAdded / lastModified are written through unchanged.
    # Confirm that their unit matches what the importing program expects for
    # ADD_DATE / LAST_MODIFIED.
    if bm.children:  # i.e. it is a folder with at least one child
        f.write(f"""{pref}<DT><H3 ADD_DATE="{bm.dateAdded}" LAST_MODIFIED="{bm.lastModified}">{title}</H3>\n""")
        f.write(f"{pref}<DL><p>\n")
        for child in sorted(bm.children, key=lambda x: x.index):
            render_html(f, child, indent=indent + 1)
        f.write(f"{pref}</DL><p>\n")
    elif bm.children is None:
        cnt_links_out += 1
        f.write(f"""{pref}<DT><A HREF="{escape_html(bm.uri)}" ADD_DATE="{bm.dateAdded}">{title}</A>\n""")

def render_footer(f: TextIO):
    """Write the closing ``</DL><p>`` of the outermost list to ``f``.

    No trailing newline is added. Returns whatever ``f.write`` returns.
    """
    closing = "</DL><p>"
    written = f.write(closing)
    return written



In [None]:

def main():
    """De-duplicate the loaded bookmarks and export them as an HTML file.

    Steps:
      1. Validate the raw ``data`` into a ``Bookmark`` tree and deep-copy it,
         so the pruning below never touches the validated original.
      2. Compute the ids of duplicate bookmarks and remove them from the copy.
      3. Write header, every top-level child, and footer to a file placed
         next to the input, with the suffix replaced by ``.deduped.html``.
      4. Print how many links were written.
    """
    global cnt_links_out
    # Start from zero so re-running this cell reports the count for this run
    # only instead of accumulating across runs.
    cnt_links_out = 0

    root = Bookmark.model_validate(data)
    new_root = root.model_copy(deep=True)
    ids_to_delete = compute_ids_to_delete(new_root)
    clean_ids(new_root, ids_to_delete)

    output_path = input_path.with_suffix('.deduped.html')
    # The header declares charset=UTF-8, so write the file as UTF-8 regardless
    # of the platform's default encoding.
    with output_path.open('wt', encoding='utf-8') as f:
        render_header(f)
        # A root without a children list simply produces an empty document.
        for child in new_root.children or []:
            render_html(f, child)
        # The footer must be written while the file is still open.
        render_footer(f)

    print(f"cnt_links_out: {cnt_links_out}")


main()