In [1]:

%pprint
import sys
import os.path as osp, os as os

# Resolve every folder this notebook may want on the module search path
executable_path = sys.executable
scripts_folder = osp.join(osp.dirname(executable_path), 'Scripts')
py_folder = osp.abspath(osp.join(os.pardir, 'py'))
ffmpeg_folder = r'C:\ffmpeg\bin'
shared_folder = osp.abspath(osp.join(os.pardir, 'share'))

# Insert each folder at index 1 of sys.path, but only when it exists on disk
# and is not already listed; the order below is the order of insertion
for _candidate_folder in (scripts_folder, py_folder, ffmpeg_folder, shared_folder):
    if not osp.exists(_candidate_folder): continue
    if _candidate_folder in sys.path: continue
    sys.path.insert(1, _candidate_folder)
del _candidate_folder
from notebook_utils import NotebookUtilities
# Instantiate the notebook helper, pointing it at the data/ and saves/ folders
# that sit next to the current working directory's parent (os.pardir)
nu = NotebookUtilities(
    data_folder_path=osp.abspath(osp.join(os.pardir, 'data')),
    saves_folder_path=osp.abspath(osp.join(os.pardir, 'saves'))
)

Pretty printing has been turned OFF


In [2]:

import humanize

# Load the bookmarks data frame
bookmarks_df = nu.load_object('bookmarks_df')
# Report the humanized row count and the column names
print(
    f'The bookmarks_df data frame contains about {humanize.intword(bookmarks_df.shape[0])}'
    f' records and these columns: {nu.conjunctify_nouns(bookmarks_df.columns)}.'
)

# Load progress if it exists, otherwise initialize
# NOTE(review): the `or` fallback fires for any falsy result; presumably
# load_object returns something falsy when nothing was saved - confirm
progress = nu.load_object('progress') or {
    'non_working_indices': [], 'processed_indices': set()
}

# Extract non-working indices
non_working_indices = progress['non_working_indices']
# Report the humanized count of indices recorded as non-working
print(
    f'The bookmarks_df data frame contains about {humanize.intword(len(non_working_indices))} non-working URLs.'
)

The bookmarks_df data frame contains about 4.2 thousand records and these columns: folder_add_date, folder_last_modified, folder_personal_toolbar_folder, link_href, link_add_date, and link_icon.
The bookmarks_df data frame contains about 3.4 thousand non-working URLs.


In [3]:

# Remove non-working indices from the bookmarks_df
# (drop returns a new data frame; bookmarks_df itself is left untouched)
filtered_bookmarks_df = bookmarks_df.drop(index=non_working_indices)
# Report the humanized row count and the column names of the filtered frame
print(
    f'The filtered_bookmarks_df data frame contains about {humanize.intword(filtered_bookmarks_df.shape[0])}'
    f' records and these columns: {nu.conjunctify_nouns(filtered_bookmarks_df.columns)}.'
)

The filtered_bookmarks_df data frame contains about 793 records and these columns: folder_add_date, folder_last_modified, folder_personal_toolbar_folder, link_href, link_add_date, and link_icon.


In [15]:

import json
from bs4 import BeautifulSoup
import re

def parse_bookmarks(file_path):
    """
    Parse a bookmarks HTML file into the nested structure built by parse_folder.

    Parameters:
        file_path: Path of the bookmarks HTML file; it is read as UTF-8.

    Returns:
        Whatever parse_folder returns for the first <dl> tag of the document.
    """

    # Read the HTML file, removing <p> tags
    with open(file_path, 'r', encoding='utf-8') as file:
        page_html = file.read().replace('<p>', '')

    # Close the <DT> tags around folder headings and links (presumably so that
    # html.parser does not nest later entries inside an unclosed <DT>)
    h3_regex = re.compile(r'<DT><H3 ADD_DATE="([^"]+)" LAST_MODIFIED="([^"]+)">([^><]+)</H3>')
    page_html = h3_regex.sub(r'<DT><H3 ADD_DATE="\1" LAST_MODIFIED="\2">\3</H3></DT>', page_html)

    # The ICON value must be copied through unchanged: the replacement template
    # reproduces the captured group exactly, with nothing prepended to it
    a_icon_regex = re.compile(r'<DT><A HREF="([^"]+)" ADD_DATE="([^"]+)" ICON="([^"]+)">([^><]+)</A>')
    page_html = a_icon_regex.sub(r'<DT><A HREF="\1" ADD_DATE="\2" ICON="\3">\4</A></DT>', page_html)

    a_no_icon_regex = re.compile(r'<DT><A HREF="([^"]+)" ADD_DATE="([^"]+)">([^><]+)</A>')
    page_html = a_no_icon_regex.sub(r'<DT><A HREF="\1" ADD_DATE="\2">\3</A></DT>', page_html)

    soup = BeautifulSoup(page_html, 'html.parser')

    # Start parsing from the top-level <dl> tag
    top_level_dl = soup.find('dl')
    bookmarks = parse_folder(top_level_dl)

    return bookmarks

def convert_to_json(input_file, output_file):
    """
    Convert a bookmarks HTML file to JSON.

    Parameters:
        input_file: Path of the bookmarks HTML file, handed to parse_bookmarks.
        output_file: Path of the JSON file to write (UTF-8, indented by
            4 spaces, non-ASCII characters written as-is).
    """
    parsed_bookmarks = parse_bookmarks(input_file)

    # Serialize the parsed structure straight into the output file
    with open(output_file, mode='w', encoding='utf-8') as out_handle:
        json.dump(parsed_bookmarks, out_handle, ensure_ascii=False, indent=4)

    print(f"Bookmarks have been converted to JSON and saved to '{output_file}'.")

# Example usage
# Both paths are relative to the notebook's working directory
input_html = '../saves/html/favorites_1_4_25.html'
output_json = '../saves/json/bookmarks.json'
convert_to_json(input_html, output_json)

Bookmarks have been converted to JSON and saved to '../saves/json/bookmarks.json'.


In [30]:

# Open and read the HTML file, removing <p> tags
input_html = '../saves/html/favorites_1_4_25.html'
with open(input_html, 'r', encoding='utf-8') as file:
    page_html = file.read().replace('<p>', '')

# Close the <DT> tags around folder headings (whitespace after <DT> is tolerated)
h3_regex = re.compile(r'<DT>\s*<H3 ADD_DATE="([^"]+)" LAST_MODIFIED="([^"]+)">([^><]+)</H3>')
page_html = h3_regex.sub(r'<DT><H3 ADD_DATE="\1" LAST_MODIFIED="\2">\3</H3></DT>', page_html)

# Close the <DT> tags around links that carry an icon; the ICON value is
# copied through unchanged, with nothing prepended to the captured group
a_icon_regex = re.compile(r'<DT>\s*<A HREF="([^"]+)" ADD_DATE="([^"]+)" ICON="([^"]+)">([^><]+)</A>')
page_html = a_icon_regex.sub(r'<DT><A HREF="\1" ADD_DATE="\2" ICON="\3">\4</A></DT>', page_html)

# Close the <DT> tags around links without an icon
a_no_icon_regex = re.compile(r'<DT>\s*<A HREF="([^"]+)" ADD_DATE="([^"]+)">([^><]+)</A>')
page_html = a_no_icon_regex.sub(r'<DT><A HREF="\1" ADD_DATE="\2">\3</A></DT>', page_html)

# Parse the repaired markup
page_soup = BeautifulSoup(page_html, 'html.parser')

In [49]:

# Walk two levels down the parsed tree, always taking the first direct child,
# and print the leading text lines found at each level
top_level_dl = page_soup.find('dl')
dt_tag1 = top_level_dl.find_all(['dt'], recursive=False)[0]
print(dt_tag1.text.split('\n')[:8])
# First <dl> directly inside that <dt>, then its first direct <dt>
dl_tag1 = dt_tag1.find_all(['dl'], recursive=False)[0]
dt_tag2 = dl_tag1.find_all(['dt'], recursive=False)[0]
print(dt_tag2.text.split('\n')[:6])

['Favorites bar', '', 'Tools', '', 'API Examples', '', 'About the Indicators API Documentation – World Bank Data Help Desk', 'API:Main page - MediaWiki']
['Tools']


In [None]:

# Recursive function to parse the bookmarks
def parse_folder(dl_tag):
    """
    Build a list of folder and bookmark dictionaries from a <dl> tag.

    For every <dt> directly under dl_tag:
      * if it has a direct <h3>, a 'folder' entry is added whose children come
        from a recursive call on the first <dl> directly under that <dt>
        (an empty list when there is none);
      * if it has a direct <dl>, every <a> found anywhere inside the first
        direct <dt> of that <dl> is added as a 'bookmark' entry.

    Note: an <a> sitting directly under one of dl_tag's <dt> children is not
    collected by this function; links are read only from the first <dt> of
    the first nested <dl>.

    Parameters:
        dl_tag: Tag-like object exposing find_all, as used below.

    Returns:
        A list of dictionaries, in document order of the <dt> tags.
    """

    def _as_bookmark(anchor):
        # One dictionary per link; absent attributes come back as None
        return {
            'type': 'bookmark',
            'name': anchor.text,
            'href': anchor.get('href'),
            'add_date': anchor.get('add_date', None),
            'icon': anchor.get('icon', None)
        }

    entries = []
    for item in dl_tag.find_all(['dt'], recursive=False):
        headings = item.find_all(['h3'], recursive=False)
        nested_lists = item.find_all(['dl'], recursive=False)

        # Folder: described by the first heading, children from the first nested list
        if headings:
            heading = headings[0]
            first_list = nested_lists[0] if nested_lists else None
            entries.append({
                'type': 'folder',
                'name': heading.text,
                'add_date': heading.get('add_date', None),
                'last_modified': heading.get('last_modified', None),
                'children': parse_folder(first_list) if first_list else []
            })

        # Bookmarks: only looked for when a nested list with at least one <dt> exists
        if not nested_lists:
            continue
        nested_items = nested_lists[0].find_all(['dt'], recursive=False)
        if not nested_items:
            continue
        anchors = nested_items[0].find_all(['a'], recursive=True)
        entries.extend(_as_bookmark(anchor) for anchor in anchors)

    return entries

In [47]:

from pandas import Series

# Count the direct children of top_level_dl by tag name
Series([c.name for c in top_level_dl.children]).value_counts().to_dict()

{'dt': 1}

In [46]:

# Collect the <dt> tags directly under top_level_dl, print how many there are,
# and show each one's direct children counted by tag name
dt_tags = top_level_dl.find_all(['dt'], recursive=False)
print(len(dt_tags))
for dt_tag in dt_tags:
    display(Series([c.name for c in dt_tag.children]).value_counts().to_dict())

1


{'dt': 36, 'dl': 16, 'h3': 1}

dt    36
dl    16
h3     1
Name: count, dtype: int64

In [48]:

# Collect the <h3> tags directly under the first top-level <dt>, print how many
# there are, and show each one's child-tag counts together with its text
dt_h3_tags = dt_tags[0].find_all(['h3'], recursive=False)
print(len(dt_h3_tags))
for dt_h3_tag in dt_h3_tags:
    display(Series([c.name for c in dt_h3_tag.children]).value_counts().to_dict())
    display(dt_h3_tag.text)

1


{}

'Favorites bar'

In [44]:

# Collect the <dl> tags directly under the first top-level <dt>, print how many
# there are, and show each one's direct children counted by tag name
dt_dl_tags = dt_tags[0].find_all(['dl'], recursive=False)
print(len(dt_dl_tags))
for dt_dl_tag in dt_dl_tags:
    display(Series([c.name for c in dt_dl_tag.children]).value_counts().to_dict())
    # print(dt_dl_tag)

16


{'dt': 33, 'dl': 4}

{'dt': 3024}

{'dt': 25, 'dl': 1}

{'dt': 18}

{'dt': 3}

{'dt': 7, 'dl': 6}

{'dt': 14}

{'dt': 10, 'dl': 10}

{'dt': 31}

{'dt': 21}

{'dt': 9, 'dl': 6}

{'dt': 3}

{'dt': 1}

{'dt': 2}

{'dt': 28, 'dl': 13}

{'dt': 3}

In [43]:

# Collect the <dt> tags directly under the first top-level <dt>, print how many
# there are, and show each one's direct children counted by tag name
dt_dt_tags = dt_tags[0].find_all(['dt'], recursive=False)
print(len(dt_dt_tags))
for dt_dt_tag in dt_dt_tags:
    display(Series([c.name for c in dt_dt_tag.children]).value_counts().to_dict())
    # print(dt_dt_tag)

36


{'h3': 1}

{'h3': 1}

{'h3': 1}

{'h3': 1}

{'h3': 1}

{'h3': 1}

{'h3': 1}

{'h3': 1}

{'h3': 1}

{'h3': 1}

{'h3': 1}

{'h3': 1}

{'h3': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'a': 1}

{'h3': 1}

{'h3': 1}

In [9]:

import json

# Read the saved JSON and flatten each top-level entry into one row dictionary
rows_list = []
output_json = '../saves/json/bookmarks.json'
with open(output_json, 'r', encoding='utf-8') as json_file:
    for row_dict in json.load(json_file):
        # Every flattened key is built with the 'top' prefix passed here
        flattened_dict = nu.get_flattened_dictionary(row_dict, key_prefix='top')
        rows_list.append(flattened_dict)

In [10]:

from pandas import DataFrame

# Build a data frame with one row per flattened top-level entry
df = DataFrame(rows_list)

In [11]:

# Peek at the first 100 column names
df.columns.tolist()[:100]

['top__type', 'top__name', 'top__add_date', 'top__last_modified', 'top__children00_type', 'top__children00_name', 'top__children00_add_date', 'top__children00_last_modified', 'top__children00_children0_type', 'top__children00_children0_name', 'top__children00_children0_add_date', 'top__children00_children0_last_modified', 'top__children00_children1_type', 'top__children00_children1_name', 'top__children00_children1_href', 'top__children00_children1_add_date', 'top__children00_children1_icon', 'top__children00_children2_type', 'top__children00_children2_name', 'top__children00_children2_href', 'top__children00_children2_add_date', 'top__children00_children2_icon', 'top__children00_children3_type', 'top__children00_children3_name', 'top__children00_children3_href', 'top__children00_children3_add_date', 'top__children00_children3_icon', 'top__children01_type', 'top__children01_name', 'top__children01_href', 'top__children01_add_date', 'top__children01_icon', 'top__children02_type', 'top__

In [13]:

# List the distinct names found in the top__name column
df.top__name.unique().tolist()

['Keeper Security GitHub Group']


----

In [10]:

# Header lines written ahead of the bookmark entries, up to and including
# the opening <DL><p>; joined with newlines and with no trailing newline
html_prefix = '\n'.join((
    '<!DOCTYPE NETSCAPE-Bookmark-file-1>',
    '<!-- This is an automatically generated file.',
    '     It will be read and overwritten.',
    '     DO NOT EDIT! -->',
    '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">',
    '<TITLE>Bookmarks</TITLE>',
    '<H1>Bookmarks</H1>',
    '<DL><p>',
))

In [None]:

from datetime import datetime

# Function to generate the Netscape Bookmark File Format
def generate_bookmarks_html(dataframe, output_file, header=None):
    """
    Write the rows of a data frame as a flat bookmarks HTML file.

    One <DT><A ...> line is written per row, in row order. Rows whose
    link_href is missing are skipped. ADD_DATE and ICON attributes are
    written only when the row has a value for them, and the link text falls
    back to the URL when link_title is missing. All values are HTML-escaped.

    Parameters:
        dataframe: Data frame with a 'link_href' column and, optionally,
            'link_add_date', 'link_icon' and 'link_title' columns.
        output_file: Path of the HTML file to write (UTF-8).
        header: Text written before the entries, followed by a newline.
            Defaults to the module-level html_prefix when it is defined,
            otherwise to a built-in bookmark-file header.
    """
    from html import escape
    from pandas import isna

    # Resolve the header lazily so the function also works when the cell
    # defining html_prefix has not been run
    if header is None:
        try:
            header = html_prefix
        except NameError:
            header = (
                '<!DOCTYPE NETSCAPE-Bookmark-file-1>\n<!-- This is an automatically generated file.\n'
                '     It will be read and overwritten.\n     DO NOT EDIT! -->\n'
                '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">\n'
                '<TITLE>Bookmarks</TITLE>\n<H1>Bookmarks</H1>\n<DL><p>'
            )

    def _is_missing(value):
        # Covers None as well as NaN/NaT cell values
        return bool(isna(value))

    # Collect the pieces in a list and join once instead of growing a string
    html_pieces = [header + '\n']

    # Add bookmarks directly, one line per row
    optional_attributes = (('link_add_date', 'ADD_DATE'), ('link_icon', 'ICON'))
    for _, row in dataframe.iterrows():
        link_href = row.get('link_href')
        if _is_missing(link_href):
            continue
        attributes = ['HREF="' + escape(str(link_href), quote=True) + '"']
        for column_name, attribute_name in optional_attributes:
            value = row.get(column_name)
            if not _is_missing(value):
                attributes.append(attribute_name + '="' + escape(str(value), quote=True) + '"')
        link_title = row.get('link_title')
        label = link_href if _is_missing(link_title) else link_title
        attribute_text = ' '.join(attributes)
        html_pieces.append(f'    <DT><A {attribute_text}>{escape(str(label), quote=False)}</A>\n')

    # Close the HTML structure
    html_pieces.append("</DL><p>\n")

    # Write the content to the output file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(''.join(html_pieces))

    print(f"Bookmarks have been exported to '{output_file}'.")

# Filter the bookmarks_df to exclude non-working indices
filtered_bookmarks_df = bookmarks_df.drop(index=non_working_indices)

# Ensure the DataFrame has the necessary columns for the Netscape format
# Required columns: 'link_href', 'link_add_date', 'link_title', and optionally 'link_icon' and 'folder'
# NOTE(review): the column list printed earlier in this notebook does not
# include 'link_title', so this check looks likely to raise - confirm
if not all(col in filtered_bookmarks_df.columns for col in ['link_href', 'link_add_date', 'link_title']):
    raise ValueError("The DataFrame must contain 'link_href', 'link_add_date', and 'link_title' columns.")

# Generate the bookmarks.html file
# (a bare file name, so it is written to the current working directory)
generate_bookmarks_html(filtered_bookmarks_df, 'bookmarks.html')

In [84]:

# Open and parse the HTML file, removing <p> tags
# (no <DT> closing is applied here, unlike the regex-based preprocessing)
with open(input_html, 'r', encoding='utf-8') as file:
    page_html = file.read()
    page_soup = BeautifulSoup(page_html.replace('<p>', ''), 'html.parser')
# Descend four levels, always through the first direct <dt> and its first
# direct <dl>, printing the leading text lines found at each level
top_level_dl = page_soup.find('dl')
dt_tag1 = top_level_dl.find_all(['dt'], recursive=False)[0]
print(dt_tag1.text.split('\n')[:8])
dl_tag1 = dt_tag1.find_all(['dl'], recursive=False)[0]
dt_tag2 = dl_tag1.find_all(['dt'], recursive=False)[0]
print(dt_tag2.text.split('\n')[:6])
dl_tag2 = dt_tag2.find_all(['dl'], recursive=False)[0]
dt_tag3 = dl_tag2.find_all(['dt'], recursive=False)[0]
print(dt_tag3.text.split('\n')[:4])
dl_tag3 = dt_tag3.find_all(['dl'], recursive=False)[0]
dt_tag4 = dl_tag3.find_all(['dt'], recursive=False)[0]
print(dt_tag4.text.split('\n')[:2])

['Favorites bar', '', 'Tools', '', 'API Examples', '', 'About the Indicators API Documentation – World Bank Data Help Desk', 'API:Main page - MediaWiki']
['Tools', '', 'API Examples', '', 'About the Indicators API Documentation – World Bank Data Help Desk', 'API:Main page - MediaWiki']
['API Examples', '', 'About the Indicators API Documentation – World Bank Data Help Desk', 'API:Main page - MediaWiki']
['About the Indicators API Documentation – World Bank Data Help Desk', 'API:Main page - MediaWiki']


In [18]:

# List the attribute names of `results` that do not start with an underscore
# NOTE(review): `results` is not defined in any cell shown here - confirm where it comes from
[f'results.{fn}' for fn in dir(results) if not fn.startswith('_')]

['results.cancel_join_thread', 'results.close', 'results.empty', 'results.full', 'results.get', 'results.get_nowait', 'results.join_thread', 'results.put', 'results.put_nowait', 'results.qsize']

In [3]:

from pandas import DataFrame
import pyperclip

# Open and read the bookmarks.html file
# NOTE(review): hard-coded absolute path on one machine - consider making it configurable
file_path = r"C:\Users\daveb\Downloads\bookmarks_1_3_25.html"
with open(file_path, 'r', encoding='utf-8') as file:
    # Strip the <p> tags, then hand the remaining HTML to pyperclip.copy
    html_content = file.read().replace('<p>', '')
    pyperclip.copy(html_content)

In [62]:

# Take the link_href column and list its attribute names that contain 'item'
srs = bookmarks_df.link_href
print([f'srs.{fn}' for fn in dir(srs) if 'item' in fn])

['srs.__delitem__', 'srs.__getitem__', 'srs.__setitem__', 'srs._check_setitem_copy', 'srs._clear_item_cache', 'srs._getitem_slice', 'srs._gotitem', 'srs._item_cache', 'srs.item', 'srs.items']


In [63]:

srs.items?

[1;31mSignature:[0m [0msrs[0m[1;33m.[0m[0mitems[0m[1;33m([0m[1;33m)[0m [1;33m->[0m [1;34m'Iterable[tuple[Hashable, Any]]'[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Lazily iterate over (index, value) tuples.

This method returns an iterable tuple (index, value). This is
convenient if you want to create a lazy iterator.

Returns
-------
iterable
    Iterable of tuples containing the (index, value) pairs from a
    Series.

See Also
--------
DataFrame.items : Iterate over (column name, Series) pairs.
DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs.

Examples
--------
>>> s = pd.Series(['A', 'B', 'C'])
>>> for index, value in s.items():
...     print(f"Index : {index}, Value : {value}")
Index : 0, Value : A
Index : 1, Value : B
Index : 2, Value : C
[1;31mFile:[0m      c:\programdata\anaconda3\lib\site-packages\pandas\core\series.py
[1;31mType:[0m      method