In [1]:

%pprint
%matplotlib inline
import sys
import os.path as osp
import os

# Resolve the folders this notebook relies on; every one of them must already exist
executable_path = sys.executable
scripts_folder = osp.join(osp.dirname(executable_path), 'Scripts')
assert osp.exists(scripts_folder)
py_folder = osp.abspath(osp.join(os.pardir, 'py'))
assert osp.exists(py_folder), "Create the py folder"
ffmpeg_folder = r'C:\ffmpeg\bin'
assert osp.exists(ffmpeg_folder)
shared_folder = osp.abspath(osp.join(os.pardir, os.pardir, 'share'))
assert osp.exists(shared_folder)

# Slot each folder in at index 1 of sys.path unless it is already listed there
# NOTE(review): sys.path only steers Python imports; listing the Scripts and ffmpeg
# folders here does not make their executables discoverable -- confirm the intent
for folder in (scripts_folder, py_folder, ffmpeg_folder, shared_folder):
    if folder not in sys.path:
        sys.path.insert(1, folder)

# Project helper object, pointed at the sibling data and saves folders
from notebook_utils import NotebookUtilities
nu = NotebookUtilities(
    data_folder_path=osp.abspath(osp.join(os.pardir, 'data')),
    saves_folder_path=osp.abspath(osp.join(os.pardir, 'saves')),
)

# Import needed libraries
from pandas import DataFrame
import ast

Pretty printing has been turned OFF


In [2]:

if not nu.pickle_exists('bookmark_files'):
    # No saved scan yet: walk the drive and keep every file whose name matches
    # one of the well-known browser bookmark file names
    common_bookmark_names = [
        'Bookmarks',              # Chrome, Edge
        'bookmarks.html',         # Firefox (exported)
        'safari_bookmarks.html',  # Safari (exported)
        'Bookmarks.plist',        # Safari (plist file)
        'places.sqlite',          # Firefox (database file)
    ]
    bookmark_files = [
        os.path.join(parent_directory, file_name)
        for start_directory in ['C:\\']
        for parent_directory, _, file_names in os.walk(start_directory)
        for file_name in file_names
        if file_name in common_bookmark_names
    ]

    # Save the result so the next run can skip the walk
    nu.store_objects(bookmark_files=bookmark_files)
else:
    # Reuse the list saved by an earlier run
    bookmark_files = nu.load_object('bookmark_files')


# Look at the Chrome and Edge bookmarks

In [3]:

# Results are accumulated in a mapping that every recursive call shares and mutates in place
def get_bookmark(bookmark_obj, url_dict=None):
    """
    Walk a nested bookmark structure and collect the URL of every 'url' entry.

    Args:
        bookmark_obj (dict or list): The bookmark object to process. Dictionaries
            whose 'type' is 'url' are recorded; every other dictionary and every
            list is searched recursively. Any other value is ignored.
        url_dict (dict, optional): Mapping of bookmark name -> list of URLs that
            receives the results. When omitted, the module-level URL_DICT is
            used, which must already exist.

    Returns:
        None: The results are appended to url_dict in place.
    """
    # Fall back to the notebook-wide accumulator so existing one-argument calls keep working
    if url_dict is None:
        url_dict = URL_DICT

    # Handle dictionary objects
    if isinstance(bookmark_obj, dict):
        # Check if the bookmark object is a URL type
        if bookmark_obj.get('type') == 'url':
            # Substitute placeholders for a missing name or url
            name = bookmark_obj.get('name', 'Unnamed')
            url = bookmark_obj.get('url', 'Missing URL')
            # setdefault works for plain dicts and defaultdicts alike;
            # appending keeps every URL when several bookmarks share a name
            url_dict.setdefault(name, []).append(url)
        else:
            # Recursively process nested values, handing the accumulator down
            for value in bookmark_obj.values():
                get_bookmark(value, url_dict)

    # Handle list objects
    elif isinstance(bookmark_obj, list):
        for item in bookmark_obj:
            get_bookmark(item, url_dict)

In [4]:

import requests
from tqdm import tqdm_notebook as tqdm

def validate_urls(url_dict):
    """
    Probe every URL in the mapping and prune the ones that do not respond.

    Each URL is checked with a HEAD request (5 second timeout). A URL is kept
    when the response status is below 400; it is dropped when the status is
    400 or above or when the request raises a requests.RequestException.

    Parameters:
        url_dict (dict): Dictionary where keys are names and values are
            iterables of URLs. It is modified in place.

    Returns:
        dict: The same dictionary object. Each surviving name maps to a list of
            its reachable URLs; names with no reachable URL are deleted.
    """
    dead_names = []

    progress = tqdm(url_dict.items(), total=len(url_dict), desc='Test the Bookmarks')
    for name, urls in progress:
        reachable = []
        for url in urls:
            try:
                status = requests.head(url, timeout=5).status_code
            except requests.RequestException:
                # Timeouts, connection failures, malformed URLs: treat as unreachable
                continue
            if status < 400:
                reachable.append(url)

        if reachable:
            # Overwriting an existing key is safe while iterating over the dict
            url_dict[name] = reachable
        else:
            # Deleting must wait until the iteration has finished
            dead_names.append(name)

    for name in dead_names:
        del url_dict[name]

    return url_dict

In [5]:

if nu.pickle_exists('bookmarks_dict'): bookmarks_dict = nu.load_object('bookmarks_dict')
else:
    import json  # json.load is needed below and no earlier cell imports json
    from collections import defaultdict

    URL_DICT = defaultdict(list)  # Use a defaultdict to handle duplicate names

    # Collect the URLs from every Chrome/Edge style bookmark file found earlier
    for bookmark_path in tqdm(bookmark_files, total=len(bookmark_files), desc='Load the Bookmarks'):
        if osp.basename(bookmark_path) == 'Bookmarks':
            try:
                with open(bookmark_path, 'r', encoding='utf-8') as file:
                    bookmarks_data = json.load(file)
            except (OSError, ValueError) as error:
                # The scan matched on file name only, so an unreadable, non-UTF-8 or
                # non-JSON file is possible; ValueError covers both decode errors.
                # Skip that file instead of abandoning the whole run.
                print(f'Skipping {bookmark_path}: {error}')
                continue
            get_bookmark(bookmarks_data)

    # Drop duplicate URLs under each name while keeping first-seen order
    for key in tqdm(URL_DICT, total=len(URL_DICT), desc='Convert the Dictionary'):
        URL_DICT[key] = list(dict.fromkeys(URL_DICT[key]))

    # Keep only the URLs that still respond, then bind the name later cells use
    # before saving, so this branch leaves the same variable as the cached one
    updated_dict = validate_urls(URL_DICT)
    bookmarks_dict = updated_dict
    nu.store_objects(bookmarks_dict=updated_dict)

In [14]:

# Spot-check the validated bookmarks by showing a random subset of name -> URL-list entries
display(nu.get_random_subdictionary(bookmarks_dict))

{'Security Concepts': ['http://www.subspacefield.org/security/security_concepts/index.html'], 'Hacker News': ['http://news.ycombinator.com/rss'], 'thewebsbest.net - How to tune your Ubuntu PC for faster performance.': ['http://thewebsbest.net/index.php?option=com_content&task=view&id=121&Itemid=30'], '30 Awesome Anchor Charts to Spice Up Your Classroom': ['https://www.boredteachers.com/post/30-awesome-anchor-charts?fbclid=IwAR3gmxWj0oMthXxUxQ5HYdAJIruDLjWBKGZqiRELhEOJRrvZAHYPklnvzg0'], 'Reliable video hosting': ['http://vidreel.com/video/OTAxNDU0/']}


# Look at the Firefox database files

In [10]:

import sqlite3
import pandas as pd

# For every SQLite file found, record each non-empty table that has a text column
# holding at least one value starting with 'http' (only the first such column is kept)
rows_list = []
for bookmark_path in bookmark_files:
    if not bookmark_path.endswith('.sqlite'):
        continue
    conn = sqlite3.connect(bookmark_path)
    try:
        tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
        for table_name in tables.name:
            # Quote the identifier so table names containing spaces, keywords or
            # quotes cannot break (or alter) the statement
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            table_name_data = pd.read_sql_query(f"SELECT * FROM {quoted_name};", conn)
            if not table_name_data.shape[0]:
                continue

            # SQLite columns are dynamically typed, so an object column may mix
            # strings with numbers, bytes or NULLs; test each value individually
            http_column = None
            for column in table_name_data.select_dtypes(include=['object']).columns:
                if any(
                    isinstance(value, str) and value.startswith('http')
                    for value in table_name_data[column]
                ):
                    http_column = column
                    break

            if http_column is not None:
                row_dict = {
                    'bookmark_path': bookmark_path,
                    'bookmark_name': osp.basename(bookmark_path),
                    'table_name': table_name,
                    'table_column': http_column,
                }
                rows_list.append(row_dict)
    finally:
        # Release the database handle even when a query fails (e.g. a locked file)
        conn.close()
bookmark_columns_df = DataFrame(rows_list)

In [12]:

# Count how many recorded (database file, table) rows there are for each table name
bookmark_columns_df.groupby('table_name').size()

table_name
moz_bookmarks        1
moz_inputhistory     1
moz_origins         19
moz_places          19
dtype: int64

In [14]:

# Show the full contents of the two tables that only a single database file reported
mask_series = bookmark_columns_df.table_name.isin(['moz_bookmarks', 'moz_inputhistory'])
for _, row_series in bookmark_columns_df[mask_series].iterrows():
    bookmark_path = row_series.bookmark_path
    table_name = row_series.table_name
    print(table_name)
    conn = sqlite3.connect(bookmark_path)
    try:
        # table_name can only be one of the two literals in the mask above
        table_name_df = pd.read_sql_query(f"SELECT * FROM {table_name};", conn)
    finally:
        # Close the handle even if the read fails, so the file is not left open
        conn.close()
    display(table_name_df)

moz_inputhistory


Unnamed: 0,place_id,input,use_count
0,15672,im,0.122334
1,15672,imgur,0.147057
2,6468,geni.com,0.184753
3,20058,firs,0.16156
4,1463,turbo,0.188064
5,169,turbo,0.188064
6,4428,old.,0.37588
7,9087,python round float to,0.320043
8,4428,old,2.184488
9,20658,alight,0.422822


moz_bookmarks


Unnamed: 0,id,type,fk,parent,position,title,keyword_id,folder_type,dateAdded,lastModified,guid,syncStatus,syncChangeCounter
0,1,2,,0,0,,,,1645056347488000,1677789729327000,root________,2,1
1,2,2,,1,0,menu,,,1645056347488000,1659903526644000,menu________,2,0
2,3,2,,1,1,toolbar,,,1645056347488000,1677789729327000,toolbar_____,2,0
3,4,2,,1,2,tags,,,1645056347488000,1668086390799000,tags________,2,3
4,5,2,,1,3,unfiled,,,1645056347488000,1648674029536000,unfiled_____,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7664,12034,1,23441.0,12033,0,,,,1667560469120000,1667560469120000,a8504Wy3kcXO,1,2
7665,12035,1,23771.0,102,11,📙 Emojipedia — 😃 Home of Emoji Meanings 💁👌🎍😍,,,1668086378935000,1668086397225000,iVh7gNzf77cL,2,0
7666,12036,2,,4,9,emoji,,,1668086390799000,1668086390809000,g7SbGMWcA1sc,1,2
7667,12037,1,23771.0,12036,0,,,,1668086390809000,1668086390809000,LT8NAQx1q4YU,1,2



----