## Data processing
- Goal: reduce the prompt size
- Remove repeated websites visited in quick succession
- Remove websites whose titles share common keywords
- Extract titles and timestamps

In [1]:
# Download the sample browser-history export and save it as History.json
# (-q silences wget's output; -O sets the name of the file written).
! wget -q -O History.json https://raw.githubusercontent.com/EricXu1728/Browser-History-Sample/main/browser_history_limited.json

In [2]:
import json
from datetime import datetime

# Define the input file name and the look-back window size
input_file = './History.json'
x = 3  # Number of previous unique titles to check against

# Read the JSON data from the input file.
# The encoding is pinned because open() otherwise falls back to the platform
# default, which is not UTF-8 everywhere and can fail on non-ASCII titles.
# NOTE(review): assumes the export is UTF-8 encoded -- confirm for other sources.
with open(input_file, 'r', encoding='utf-8') as file:
    data = json.load(file)

def share_word(title1, title2):
    """Return True when the two titles have at least one word in common.

    Both titles are lower-cased and split on whitespace, so the comparison is
    case-insensitive. Punctuation is not stripped: a standalone token such as
    "-" counts as a word, and "foo-bar" is a single word.
    """
    tokens_a = set(title1.lower().split())
    tokens_b = set(title2.lower().split())
    # A non-empty intersection means the titles overlap on some token.
    return bool(tokens_a & tokens_b)

def share_word_with_previous(title, previous_titles):
    """Return True if `title` shares a word with any entry of `previous_titles`.

    An empty `previous_titles` yields False. The word comparison itself is
    delegated to `share_word`.
    """
    return any(share_word(title, earlier) for earlier in previous_titles)


def convert_time_usec_to_readable(time_usec):
    """Format a Unix timestamp given in microseconds as "YYYY-MM-DD HH:MM:SS".

    The value is scaled to seconds and passed to `datetime.fromtimestamp`
    without a timezone argument, so the string is in the machine's local time.
    """
    seconds_since_epoch = time_usec / 1000000
    return datetime.fromtimestamp(seconds_since_epoch).strftime("%Y-%m-%d %H:%M:%S")


In [3]:
# Define the input and output file names
input_file = "./History.json"
output_file = "titles_with_timestamp.txt"

# Read the JSON data from the input file.
# The encoding is pinned because open() otherwise falls back to the platform
# default, which is not UTF-8 everywhere and can fail on non-ASCII titles.
# NOTE(review): assumes the export is UTF-8 encoded -- confirm for other sources.
with open(input_file, "r", encoding="utf-8") as file:
    data = json.load(file)

# Collect (title, time_usec) pairs from the "Browser History" list, keeping an
# entry only when it passes every filter below. `previous_titles` is a sliding
# window holding at most `x` of the most recently kept titles.
entries = []
previous_titles = []

for entry in data.get("Browser History", []):
    title = entry.get("title", "").strip()
    time_usec = entry.get("time_usec", 0)

    # Drop blank titles and "new tab" pages (case-insensitive).
    if not title or title.lower() == "new tab":
        continue

    # Drop an exact repeat of the most recently kept title.
    if previous_titles and title == previous_titles[-1]:
        continue

    # Drop titles that share a word with any title still in the window.
    if share_word_with_previous(title, previous_titles):
        continue

    entries.append((title, time_usec))
    previous_titles.append(title)

    # Evict the oldest title once the window grows beyond x entries.
    if len(previous_titles) > x:
        del previous_titles[0]

# Write one "<timestamp>: <title>" line per kept entry to the output file.
# The encoding is pinned so titles with non-ASCII characters do not raise
# UnicodeEncodeError on platforms whose default encoding is not UTF-8.
with open(output_file, "w", encoding="utf-8") as file:
    for title, time_usec in entries:
        readable_timestamp = convert_time_usec_to_readable(time_usec)
        file.write(f"{readable_timestamp}: {title}\n")

# Report how many entries survived the filtering.
print(f"Extracted {len(entries)} titles and stored them along with timestamp in {output_file}")

Extracted 113 titles and stored them along with timestamp in titles_with_timestamp.txt
