## Safari

In [None]:
import sqlite3
import os
from datetime import datetime

# Path to a copy of Safari's history database, relative to the working directory.
# To read the copy stored in the Safari profile instead, use
# os.path.expanduser("~/Library/Safari/HistoryCopy.db").
history_db = "HistoryCopy.db"

# sqlite3.connect() silently creates an empty database when the file is missing,
# which would only surface later as a confusing "no such table" error.
if not os.path.exists(history_db):
    raise FileNotFoundError(f"Safari history database not found: {history_db}")

# visit_time is shifted by 978307200 seconds (the offset between 1970-01-01 and
# 2001-01-01 UTC) before being interpreted as a Unix timestamp.
query = """
SELECT datetime(visit_time + 978307200, 'unixepoch') as visit_date, url, title
FROM history_visits
INNER JOIN history_items ON history_items.id = history_visits.history_item
ORDER BY visit_date DESC
"""

conn = sqlite3.connect(history_db)
try:
    cursor = conn.cursor()
    # Each row is a (visit_date, url, title) tuple, newest visit first.
    for row in cursor.execute(query):
        print(row)
finally:
    # Release the database handle even if the query fails.
    conn.close()


## Firefox

In [None]:
import sqlite3
import os
import pandas as pd

# Path to a copy of the Firefox places database, relative to the working directory.
db_path = "places.sqlite"

# sqlite3.connect() silently creates an empty database when the file is missing,
# which would only surface later as a confusing "no such table" error.
if not os.path.exists(db_path):
    raise FileNotFoundError(f"Firefox history database not found: {db_path}")

# SQL query to extract visit time, url, and title.
# visit_date is divided by 1,000,000 before being interpreted as Unix seconds.
query = """
SELECT
    datetime(moz_historyvisits.visit_date/1000000, 'unixepoch') as visit_time,
    moz_places.url,
    moz_places.title
FROM moz_places
JOIN moz_historyvisits
ON moz_places.id = moz_historyvisits.place_id
ORDER BY visit_time DESC
"""

conn = sqlite3.connect(db_path)
try:
    # Load into pandas DataFrame (columns: visit_time, url, title)
    df = pd.read_sql_query(query, conn)
finally:
    # Close the connection even if the query fails
    conn.close()

# Optional: Preview the first few rows
print(df.head())


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import os

# Filenames
output_file = "firefox_history_with_text.csv"


def _save_progress(frame, path):
    """Write `frame` to `path` as CSV via a temp file so an interrupted write cannot corrupt the resume file."""
    tmp_path = f"{path}.tmp"
    frame.to_csv(tmp_path, index=False)
    os.replace(tmp_path, path)


# Load previous progress if it exists; otherwise start from the `df` built by
# the Firefox history cell (that cell must have been run first).
if os.path.exists(output_file):
    df = pd.read_csv(output_file)
    print(f"Resuming from existing file: {output_file}")

# Make sure the column exists (fresh start, or a CSV saved without it).
if 'page_text' not in df.columns:
    df['page_text'] = None

# read_csv infers float64 for a column that holds only NaN; force object dtype
# so that writing strings into it below is always valid.
df['page_text'] = df['page_text'].astype(object)

# Set up headers
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Mac OS X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0 Safari/537.36'
}

# Iterate and resume based on page_text column
for i, row in tqdm(df.iterrows(), total=len(df)):
    # Skip if already processed
    if pd.notna(row['page_text']):
        continue

    # str() turns a missing title (None/NaN) into "None"/"nan", so no None check is needed.
    title = str(row['title'])
    url = row['url']

    # Skip Google Search result pages: the title itself is stored as the text
    if title.strip().endswith("- Google Search"):
        df.at[i, 'page_text'] = title
        continue

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)
        df.at[i, 'page_text'] = text[:5000]  # Truncate long text
    except Exception as e:
        # Best effort: record the failure in the row and keep going.
        df.at[i, 'page_text'] = f"[ERROR] {str(e)}"

    # Save progress after each fetched row
    _save_progress(df, output_file)

# Final save: rows resolved by the Google Search shortcut above skip the
# in-loop save, so trailing ones would otherwise never reach the file.
_save_progress(df, output_file)

# Done
print("Scraping complete.")
