# Exploring summarization of browser history

In [53]:
import os
import html
import shutil
import requests
import subprocess
import pandas as pd
from datetime import datetime, timedelta

from bs4 import BeautifulSoup
from bs4.element import Comment

import sys
sys.path.insert(0, "../")
import utils
from chromadb_tools import get_chroma_collection, run_chroma_ingest, chroma_search_results_to_df

In [2]:
history_pages_dir = "../data/history_pages/"
found_text = set()

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    texts = list()
    for t in visible_texts:
        t = t.strip()
        if t not in found_text and len(t) > 10:
            texts.append(t)
        found_text.add(t)
    return u" ".join(texts).strip()

def get_html(row):
    html_path = os.path.join(history_pages_dir, f"{row['url_hash']}.html")
    if os.path.exists(html_path):
        with open(html_path, 'r') as infile:
            return infile.read()
        
    try:
        response = requests.get(row['url'])
    except:
        print(f"Failed request for {row['url']}")
        return ""
    with open(html_path, 'w') as outfile:
        outfile.write(response.text)
    return response.text

In [3]:
chroma_collection = get_chroma_collection(collection_name="browser_history")
history = utils.get_browser_history()

10504 urls from Firefox
73 urls from Chrome
16 urls from Arc


In [4]:
text = "agentic rag"
top_n = 100

chroma_search_results = chroma_collection.query(
            query_texts=[text],
            n_results=top_n
    )
results_df = chroma_search_results_to_df(chroma_search_results=chroma_search_results)
results_df = results_df.loc[results_df['distance'] <= 1.2]

results_history = history.loc[history['url'].isin(results_df['url'])]
len(results_history)

23

In [5]:
results_history['html'] = results_history.apply(lambda row: get_html(row), axis=1)
results_history['html_text'] = results_history['html'].apply(lambda x: text_from_html(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_history['html'] = results_history.apply(lambda row: get_html(row), axis=1)
  texts = soup.findAll(text=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_history['html_text'] = results_history['html'].apply(lambda x: text_from_html(x))


In [64]:
results_history = results_history.drop_duplicates(subset=['html_text'])
results_history = results_history.loc[results_history['html_text'].str.len() > 10]
len(results_history)

18

In [65]:
test_dir = '../data/test_dir'
results_history['html_f'] = results_history['url_hash'].apply(lambda x :os.path.join(history_pages_dir, f"{x}.html"))
results_history['html_f_test'] = results_history['url_hash'].apply(lambda x :os.path.join(test_dir, f"{x}.html"))
for i, row in results_history.iterrows():
    if os.path.exists(row['html_f']):
        shutil.copy(row['html_f'], row['html_f_test'])

In [66]:
len(results_history)

18

In [67]:
def get_url_text(row):
    return f"""Access time: {row['datetime_local']}\n
    Web Page text: {row['html_text']}\n
    """

In [68]:
topic = text
pre_prompt = f"""Below are webpages a user has been looking at related to the topic of {topic} 
    along with the timestamp the webpage was accessed. """

In [69]:
results_history = results_history.sort_values(by='datetime_local', ascending=True)

In [70]:
prompt = pre_prompt
for i, row in results_history.iterrows():
    prompt += get_url_text(row)
prompt += "Create a summary of the users's research on the topic."

In [71]:
len(prompt)

208701

In [72]:
prompt



In [15]:
subprocess.run("pbcopy", text=True, input=prompt)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


CompletedProcess(args='pbcopy', returncode=0)

In [17]:
results_history['thumbnail_url'] = results_history.apply(lambda row: utils.get_thumbnail_url(row['url'], row['html']), axis=1)

# Try summarizing using Local LLM

In [19]:
from mlx_lm import load, generate

MLX_LLM_MODEL = "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"

  from .autonotebook import tqdm as notebook_tqdm


In [73]:
model, tokenizer = load(MLX_LLM_MODEL) 

In [74]:
def get_summarize_prompt(html_text):
    # prompt = f"""Below is text from a webpage.\n {html_text}\n 
    # Extract the key points from the webpage in relation to {topic}. Key Points:\n"""
    prompt = f"""Below is text from a webpage.\n {html_text}\n 
        Create a short bullet-point TLDR summary in relation to {topic}. Only use the 
        text provided. Summary:\n"""
    return prompt

In [75]:
results_history['summary'] = results_history['html_text'].apply(lambda x: generate(model, tokenizer, prompt=get_summarize_prompt(x), max_tokens=100))

In [116]:
def get_summary_html(row):
    # Check for thumbnail and adjust HTML accordingly
    thumbnail_html = ""
    if row['thumbnail_url']:
        thumbnail_html = f"""
            <div class="thumbnail-container">
                <a href="{row['url']}" target="_blank">
                    <img src="{row['thumbnail_url']}" alt="Thumbnail for {row['title']}" class="content-thumbnail">
                </a>
            </div>
        """

    # Escape HTML special characters in the summary and convert newlines to HTML breaks
    escaped_summary = html.escape(row['summary']).replace('\n', '<br>')

    # Combine all parts
    html_content = f"""
        <div class="content-container" data-content-id="{row['id']}">
            {thumbnail_html}
            <div class="text-container">
                <a href="{row['url']}" target="_blank" onclick="trackClick({row['id']});" class="content-title">{row['title']}</a>
                <div class="summary">{escaped_summary}</div>
            </div>
        </div>
    """
    return html_content

def generate_full_html(results_history, topic):
    styles = '''
    <style>
        .content-container {
            display: flex; /* Flexbox layout to align image and text side by side */
            border-bottom: 1px solid #ccc; /* Adds a border between entries */
            padding-bottom: 10px; /* Spacing below each item */
            margin-bottom: 10px; /* Spacing between items */
        }
        .thumbnail-container {
            flex: 0 0 auto; /* Flex item does not grow or shrink */
            margin-right: 10px; /* Space between the image and the text */
        }
        .text-container {
            flex: 1; /* Allows the text container to take up remaining space */
        }
        .content-thumbnail {
            width: 100px; /* Sets a fixed width */
            height: 100px; /* Sets a fixed height */
            object-fit: contain; /* Ensures the image fits within dimensions without cropping */
        }
        .summary {
            white-space: pre-wrap; /* Maintains whitespace formatting */
        }
        .date-header {
            font-size: 18px; /* Size of date header */
            font-weight: bold; /* Make date header bold */
            margin-top: 20px; /* Top margin for spacing */
            margin-bottom: 10px; /* Bottom margin before content starts */
        }
        .header {
            font-size: 24px; /* Larger font size for header */
            text-align: center; /* Center-align the header text */
            margin: 20px 0; /* Top and bottom margin for spacing */
        }
    </style>
    '''
    header_html = f'<div class="header">Topic: {topic}</div>'

    summaries_html = styles + header_html
    
    results_history = results_history.sort_values('datetime_local', ascending=False)
    results_history['day_accessed'] = results_history['datetime_local'].dt.date

    for date in results_history.day_accessed.unique():
        date_df = results_history.loc[results_history['day_accessed'] == date]
        summaries_html += f'<div class="date-header">Accessed on: {date}</div>'
        for i, row in date_df.iterrows():
            summaries_html += get_summary_html(row)

    return summaries_html

html_output = generate_full_html(results_history, topic)

2024-08-13
2024-08-06
2024-08-05
2024-08-03
2024-08-02
2024-07-30
2024-07-02
2024-05-01


In [117]:
output_dir = "../data/output_summaries/"
with open(os.path.join(output_dir, "agentic_rag_summary.html"), 'w') as outfile:
    outfile.write(html_output)

# Create summary of summary

In [119]:
def get_url_text(row):
    return f"""Access time: {row['datetime_local']}\n
    Web Page summary: {row['summary']}\n
    """

def get_summaries_summary_prompt(df):
    prompt = f"Below are summaries extracted from different webpages related to the topic {topic}. \n"
    for i, row in df.iterrows():
        prompt += get_url_text(row)
    prompt += "Create a summary of the below summaries focusing on {topic}. Answer: \n"

In [None]:
summary_prompt = get_summaries_summary_prompt(results_history)


In [118]:
results_history

Unnamed: 0,id,url,title,rev_host,visit_count,hidden,typed,frecency,last_visit_date,guid,...,datetime_utc,datetime_local,html,html_text,html_f,html_f_test,thumbnail_url,summary,summary_o,day_accessed
9518,10883.0,https://www.reddit.com/r/LocalLLaMA/comments/1...,What Embedding Models Are You Using For RAG? :...,moc.tidder.www.,2.0,0.0,0.0,127.0,1723579000000000.0,wU57JKVMb3nl,...,2024-08-13 19:53:08.265459061+00:00,2024-08-13 15:53:08.265459061-04:00,"\n <!DOCTYPE html>\n <html lang=""en-US"" ...",Server error We have encountered an error. Ple...,../data/history_pages/472063029123780562.html,../data/test_dir/472063029123780562.html,,‚Ä¢ The server encountered an error.\n ...,The error message indicates that the s...,2024-08-13
6250,7479.0,https://github.com/Marker-Inc-Korea/RAGchain/t...,RAGchain/RAGchain/retrieval at main ¬∑ Marker-I...,moc.buhtig.,2.0,0.0,0.0,107.0,1723573000000000.0,9HFVC4Dobsu0,...,2024-08-13 18:08:45.815433979+00:00,2024-08-13 14:08:45.815433979-04:00,"\n\n\n\n\n\n<!DOCTYPE html>\n<html\n lang=""en...",bm25_retrieval.py vectordb_retrieval.py,../data/history_pages/1563417946622143631.html,../data/test_dir/1563417946622143631.html,https://opengraph.githubassets.com/455d2b4e1e9...,‚Ä¢ The code is for two different retrie...,- The code is used for retrieval-based...,2024-08-13
6233,7462.0,https://github.com/NomaDamas/RAGchain,Marker-Inc-Korea/RAGchain: Extension of Langch...,moc.buhtig.,3.0,0.0,0.0,37.0,1723562000000000.0,vAlqpL-OYHlL,...,2024-08-13 15:20:05.493707895+00:00,2024-08-13 11:20:05.493707895-04:00,"\n\n\n\n\n\n\n<!DOCTYPE html>\n<html\n lang=""...",Extension of Langchain for RAG. Easy benchmark...,../data/history_pages/888551556676856608.html,../data/test_dir/888551556676856608.html,https://opengraph.githubassets.com/455d2b4e1e9...,RAGchain is a framework for developing...,RAGchain is a framework for developing advance...,2024-08-13
11371,12736.0,https://langchain-ai.github.io/langgraph/tutor...,Self-RAG using local LLMs,oi.buhtig.ia-niahcgnal.,1.0,0.0,0.0,94.0,1722966000000000.0,sGf0hx8Etis-,...,2024-08-06 17:46:21.157686949+00:00,2024-08-06 13:46:21.157686949-04:00,"\n<!doctype html>\n<html lang=""en"" class=""no-j...",Self RAG using local LLMs %capture --no-stderr...,../data/history_pages/1905697565505754408.html,../data/test_dir/1905697565505754408.html,https://langchain-ai.github.io/langgraph/stati...,‚Ä¢ Agentic RAG is a framework for build...,* Agentic RAG is a framework for build...,2024-08-06
11266,12631.0,https://langchain-ai.github.io/langgraph/tutor...,Self-RAG,oi.buhtig.ia-niahcgnal.,2.0,0.0,0.0,185.0,1722966000000000.0,KheTWfX378HV,...,2024-08-06 17:46:04.360167980+00:00,2024-08-06 13:46:04.360167980-04:00,"\n<!doctype html>\n<html lang=""en"" class=""no-j...",Self-RAG is a strategy for RAG that incorporat...,../data/history_pages/1303557538656901825.html,../data/test_dir/1303557538656901825.html,https://langchain-ai.github.io/langgraph/stati...,‚Ä¢ Agentic RAG is a strategy that incor...,‚Ä¢ Agentic RAG is a strategy that incorporates ...,2024-08-06
11169,12534.0,https://langchain-ai.github.io/langgraph/tutor...,Agentic RAG,oi.buhtig.ia-niahcgnal.,3.0,0.0,1.0,2039.0,1722966000000000.0,6817PCBNxPht,...,2024-08-06 17:43:45.099234104+00:00,2024-08-06 13:43:45.099234104-04:00,"\n<!doctype html>\n<html lang=""en"" class=""no-j...",Agent state Nodes and Edges Retrieval Agents a...,../data/history_pages/783017007441617444.html,../data/test_dir/783017007441617444.html,https://langchain-ai.github.io/langgraph/stati...,‚Ä¢ Agentic RAG is a type of RAG that us...,‚Ä¢ Agentic RAG is a type of RAG that us...,2024-08-06
11245,12610.0,https://www.youtube.com/watch?v=fkBkNWivq-s,Autonomous RAG | The next evolution of RAG AI ...,moc.ebutuoy.www.,1.0,0.0,0.0,94.0,1722886000000000.0,j60uoBJTDtCJ,...,2024-08-05 19:25:43.749269962+00:00,2024-08-05 15:25:43.749269962-04:00,"<!DOCTYPE html><html style=""font-size: 10px;fo...",Policy & Safety How YouTube works Test new fea...,../data/history_pages/1941920220060162641.html,../data/test_dir/1941920220060162641.html,https://i.ytimg.com/vi/fkBkNWivq-s/hqdefault.jpg,‚Ä¢ YouTube is a platform that works in ...,\n ‚Ä¢ YouTube is a video-sharing platform own...,2024-08-05
11251,12616.0,https://www.perplexity.ai/search/i-want-to-bui...,I want to build an agentic rag system allowing...,ia.ytixelprep.www.,1.0,0.0,0.0,94.0,1722886000000000.0,E9ki-qZGGx0h,...,2024-08-05 19:29:57.355839014+00:00,2024-08-05 15:29:57.355839014-04:00,"<!DOCTYPE html><html lang=""en-US""><head><title...",Enable JavaScript and cookies to continue,../data/history_pages/888927757549760350.html,../data/test_dir/888927757549760350.html,,Enable JavaScript and cookies to conti...,TLDR: Agentic rag is a type of rag tha...,2024-08-05
11264,12629.0,https://langchain-ai.github.io/langgraph/tutor...,Adaptive RAG,oi.buhtig.ia-niahcgnal.,1.0,0.0,0.0,94.0,1722887000000000.0,elF-rJO3D9BK,...,2024-08-05 19:43:58.414294004+00:00,2024-08-05 15:43:58.414294004-04:00,"\n<!doctype html>\n<html lang=""en"" class=""no-j...",Adaptive RAG Initializing search How-to Guides...,../data/history_pages/1584516194633218520.html,../data/test_dir/1584516194633218520.html,https://langchain-ai.github.io/langgraph/stati...,Agentic RAG is a strategy for RAG that...,* Agentic RAG is a strategy for RAG th...,2024-08-05
11265,12630.0,https://langchain-ai.github.io/langgraph/tutor...,Adaptive RAG using local LLMs,oi.buhtig.ia-niahcgnal.,1.0,0.0,0.0,94.0,1722887000000000.0,54FlEPfqKrTg,...,2024-08-05 19:44:02.906191111+00:00,2024-08-05 15:44:02.906191111-04:00,"\n<!doctype html>\n<html lang=""en"" class=""no-j...",%capture --no-stderr\n%pip install -U langchai...,../data/history_pages/1397372920769757798.html,../data/test_dir/1397372920769757798.html,https://langchain-ai.github.io/langgraph/stati...,‚Ä¢ AlphaCodium is a new approach for co...,The AlphaCodium paper introduces a new...,2024-08-05
