In [1]:
import os
import sys
from datetime import datetime, timedelta, timezone

import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment

# Put the parent directory on the import path before importing the modules below.
sys.path.insert(0, "../")
import utils
from chromadb_tools import get_chroma_collection, run_chroma_ingest

In [20]:
# Load the browsing history through the project helper. Its printed output
# reports per-browser URL counts; later cells read the `url`, `url_hash` and
# `datetime_utc` columns of the returned frame.
history = utils.get_browser_history()

10474 urls from Firefox
73 urls from Chrome
16 urls from Arc


In [21]:
# Drop rows without a `url_hash`: the hash is used below as the cached page's file name.
history = history.dropna(subset=['url_hash'])

In [22]:
# Number of distinct URL hashes left after dropping missing ones.
len(set(history['url_hash']))

7433

In [23]:
# `datetime.utcnow()` is deprecated since Python 3.12. Take a timezone-aware
# UTC "now" and drop the tzinfo so the value stays naive, as it was before.
now_utc = datetime.now(timezone.utc).replace(tzinfo=None)

# NOTE: despite the "yesterday" names, the window starts 300 days back, at
# midnight UTC of that day. The names are kept because later cells use them.
yesterday_start = (now_utc - timedelta(days=300)).replace(hour=0, minute=0, second=0, microsecond=0)
# Last representable microsecond of that same day.
yesterday_end = yesterday_start + timedelta(days=1, microseconds=-1)

In [25]:
# Keep only visits on or after the cutoff date. The comparison is made on
# calendar dates, so the time of day is ignored.
cutoff_date = yesterday_start.date()
visited_since_cutoff = history['datetime_utc'].dt.date >= cutoff_date
history = history.loc[visited_since_cutoff]
len(history)

7433

In [26]:
# Download each history URL once and cache the response body as
# `<url_hash>.html` under `history_pages_dir`. Re-running the cell only
# fetches pages that are not cached yet.
history_pages_dir = "../data/history_pages/"
utils.make_dir(history_pages_dir)

# Seconds to wait for a server before giving up; without a timeout a single
# unresponsive host can stall the whole loop indefinitely.
REQUEST_TIMEOUT_S = 10

for i, (_, row) in enumerate(history.iterrows()):
    # Lightweight progress indicator: print every 10th row index.
    if i % 10 == 0:
        print(i)
    dest_f = os.path.join(history_pages_dir, f"{int(row['url_hash'])}.html")
    if os.path.exists(dest_f):
        continue
    # Local files cannot be fetched over HTTP.
    if row['url'].startswith("file:///"):
        continue
    try:
        response = requests.get(row['url'], timeout=REQUEST_TIMEOUT_S)
    except requests.exceptions.RequestException:
        # Best effort: report the failure and move on. Catching only request
        # errors (not a bare `except:`) keeps Ctrl-C / kernel interrupt working.
        print(f"Failed request for {row['url']}")
        continue
    # Explicit encoding so the cache does not depend on the platform locale.
    with open(dest_f, 'w', encoding='utf-8') as outfile:
        outfile.write(response.text)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
Failed request for https://www.bioinsidernetwork.com/
510
520
530
540
550
560
570
580
590
600
610
620
Failed request for http://vulnerabilityassessments.life/avs/en/dt/mcafee-4.php?c=5vz65053z7qz1&k=2df193bf0d52ae353cf1dc59c1335a60&country_code=US&carrier=Verizon&country_name=United%20States&region=New%20York&city=Buffalo&isp=MCI%20Communications%20Services,%20Inc.%20d/b/a%20Verizon%20Business&lang=en&os=Mac%20OS%20X&osv=10.15&browser=Firefox&browserv=121&brand=Desktop&model=Desktop&marketing_name=Desktop&tablet=4&rheight=768&rwidth=768&e=5
Failed request for http://vulnerabilityassessments.life/avs/en/dt/avg-1.php?c=5vz65053z7qz1&k=2df193bf0d52ae353cf1dc59c1335a60&country_code=US&carrier=Verizon&country_name=United%20States&region=New%20York&city=Buffalo&isp=MCI%20Communications%20Services,%20In

# Parse text from HTML

In [8]:
def tag_visible(element):
    """Tell whether a parsed text node counts as visible page text.

    A node is rejected when its parent's name is one of the non-rendered
    containers listed below, or when the node itself is a ``Comment``.
    Returns ``True`` for every other node.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    inside_hidden_parent = element.parent.name in hidden_parents
    return not (inside_hidden_parent or isinstance(element, Comment))


def text_from_html(body):
    """Return the visible text of an HTML document as one string.

    All text nodes are collected, the ones rejected by ``tag_visible`` are
    dropped, and the remaining nodes are whitespace-stripped and joined with
    a space separator.

    Args:
        body: HTML markup to parse.

    Returns:
        The concatenated visible text.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # `find_all(string=True)` is the current spelling of the deprecated
    # `findAll(text=True)`, which emitted a warning on every call.
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return " ".join(t.strip() for t in visible_texts)

def text_from_url_hash(url_hash, pages_dir=None):
    """Return the visible text of the cached page for ``url_hash``.

    Args:
        url_hash: Identifier used as the cached file's base name
            (``<url_hash>.html``).
        pages_dir: Directory holding the cached pages. Defaults to the
            notebook-level ``history_pages_dir``.

    Returns:
        The extracted text, or the placeholder ``"No html"`` when no cached
        file exists for this hash.
    """
    if pages_dir is None:
        pages_dir = history_pages_dir
    html_path = os.path.join(pages_dir, f"{url_hash}.html")
    if not os.path.exists(html_path):
        return "No html"
    # Decode explicitly as UTF-8 and replace undecodable bytes, so the result
    # does not depend on the platform locale and a single odd file cannot
    # abort a whole `.apply` over the history frame.
    with open(html_path, 'r', encoding='utf-8', errors='replace') as infile:
        html = infile.read()
    return text_from_html(html)

# Add to chromadb

In [9]:
# Obtain the `history_full_text` collection from the project helper.
# NOTE(review): whether the helper creates the collection when it is missing
# is not visible here — confirm in chromadb_tools.
chroma_collection = get_chroma_collection(collection_name="history_full_text")

In [10]:
# Extract the visible text of each row's cached page; rows without a cached
# file get the "No html" placeholder.
history['full_text'] = history['url_hash'].apply(text_from_url_hash)

  texts = soup.findAll(text=True)


In [11]:
# Keep only rows with more than 50 characters of extracted text. This also
# drops the 7-character "No html" placeholder used for uncached pages.
text_lengths = history['full_text'].str.len()
df = history.loc[text_lengths > 50]
len(df)

1043

In [12]:
# Ingest the filtered rows into the collection via the project helper.
# NOTE(review): the run printed "Insert of existing embedding ID" messages,
# presumably for rows whose IDs were already stored — confirm how the helper
# handles duplicates.
run_chroma_ingest(df, chroma_collection)

Insert of existing embedding ID: 1570044905883802425
Insert of existing embedding ID: 2078036887709956766
Insert of existing embedding ID: 465351758596575036
Insert of existing embedding ID: 1147670435578958047
Insert of existing embedding ID: 1795477675041903260
Insert of existing embedding ID: 1845290341525160447
Insert of existing embedding ID: 360064471792638577
Insert of existing embedding ID: 2173751364628448710
Insert of existing embedding ID: 2244093962196857770
Insert of existing embedding ID: 2293464213754761243
Insert of existing embedding ID: 1123155588584180439
Insert of existing embedding ID: 574905316281922921
Insert of existing embedding ID: 2223182224855028112
Insert of existing embedding ID: 2134971482993242665
Insert of existing embedding ID: 718660366117881787
Insert of existing embedding ID: 893915584777742624
Insert of existing embedding ID: 1259759830639824606
Insert of existing embedding ID: 857394777468621085
Insert of existing embedding ID: 1106120542184747709

Successfully added 1043 documents to chromadb


In [13]:
# Fetch embeddings and documents for records whose `timestamp` metadata is at
# or after 1722916800 (2024-08-06 04:00:00 UTC).
timestamp_filter = {"timestamp": {"$gte": 1722916800}}
chroma_res = chroma_collection.get(
    include=['embeddings', 'documents'],
    where=timestamp_filter,
)

In [16]:
# Number of records matched by the timestamp filter.
len(chroma_res['ids'])

309