# Exploring summarization of browser history

In [53]:
import os
import html
import shutil
import requests
import subprocess
import pandas as pd
from datetime import datetime, timedelta

from bs4 import BeautifulSoup
from bs4.element import Comment

import sys
sys.path.insert(0, "../")
import utils
from chromadb_tools import get_chroma_collection, run_chroma_ingest, chroma_search_results_to_df

In [2]:
history_pages_dir = "../data/history_pages/"
found_text = set()

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    texts = list()
    for t in visible_texts:
        t = t.strip()
        if t not in found_text and len(t) > 10:
            texts.append(t)
        found_text.add(t)
    return u" ".join(texts).strip()

def get_html(row):
    html_path = os.path.join(history_pages_dir, f"{row['url_hash']}.html")
    if os.path.exists(html_path):
        with open(html_path, 'r') as infile:
            return infile.read()
        
    try:
        response = requests.get(row['url'])
    except:
        print(f"Failed request for {row['url']}")
        return ""
    with open(html_path, 'w') as outfile:
        outfile.write(response.text)
    return response.text

In [3]:
chroma_collection = get_chroma_collection(collection_name="browser_history")
history = utils.get_browser_history()

10504 urls from Firefox
73 urls from Chrome
16 urls from Arc


In [4]:
text = "agentic rag"
top_n = 100

chroma_search_results = chroma_collection.query(
            query_texts=[text],
            n_results=top_n
    )
results_df = chroma_search_results_to_df(chroma_search_results=chroma_search_results)
results_df = results_df.loc[results_df['distance'] <= 1.2]

results_history = history.loc[history['url'].isin(results_df['url'])]
len(results_history)

23

In [5]:
results_history['html'] = results_history.apply(lambda row: get_html(row), axis=1)
results_history['html_text'] = results_history['html'].apply(lambda x: text_from_html(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_history['html'] = results_history.apply(lambda row: get_html(row), axis=1)
  texts = soup.findAll(text=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_history['html_text'] = results_history['html'].apply(lambda x: text_from_html(x))


In [64]:
results_history = results_history.drop_duplicates(subset=['html_text'])
results_history = results_history.loc[results_history['html_text'].str.len() > 10]
len(results_history)

18

In [65]:
test_dir = '../data/test_dir'
results_history['html_f'] = results_history['url_hash'].apply(lambda x :os.path.join(history_pages_dir, f"{x}.html"))
results_history['html_f_test'] = results_history['url_hash'].apply(lambda x :os.path.join(test_dir, f"{x}.html"))
for i, row in results_history.iterrows():
    if os.path.exists(row['html_f']):
        shutil.copy(row['html_f'], row['html_f_test'])

In [66]:
len(results_history)

18

In [67]:
def get_url_text(row):
    return f"""Access time: {row['datetime_local']}\n
    Web Page text: {row['html_text']}\n
    """

In [68]:
topic = text
pre_prompt = f"""Below are webpages a user has been looking at related to the topic of {topic} 
    along with the timestamp the webpage was accessed. """

In [69]:
results_history = results_history.sort_values(by='datetime_local', ascending=True)

In [17]:
results_history['thumbnail_url'] = results_history.apply(lambda row: utils.get_thumbnail_url(row['url'], row['html']), axis=1)

# Try summarizing using Local LLM

In [19]:
from mlx_lm import load, generate

MLX_LLM_MODEL = "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"

  from .autonotebook import tqdm as notebook_tqdm


In [73]:
model, tokenizer = load(MLX_LLM_MODEL) 

In [74]:
def get_summarize_prompt(html_text):
    # prompt = f"""Below is text from a webpage.\n {html_text}\n 
    # Extract the key points from the webpage in relation to {topic}. Key Points:\n"""
    prompt = f"""Below is text from a webpage.\n {html_text}\n 
        Create a short bullet-point TLDR summary in relation to {topic}. Only use the 
        text provided. Summary:\n"""
    return prompt

In [75]:
results_history['summary'] = results_history['html_text'].apply(lambda x: generate(model, tokenizer, prompt=get_summarize_prompt(x), max_tokens=100))

In [116]:
def get_summary_html(row):
    # Check for thumbnail and adjust HTML accordingly
    thumbnail_html = ""
    if row['thumbnail_url']:
        thumbnail_html = f"""
            <div class="thumbnail-container">
                <a href="{row['url']}" target="_blank">
                    <img src="{row['thumbnail_url']}" alt="Thumbnail for {row['title']}" class="content-thumbnail">
                </a>
            </div>
        """

    # Escape HTML special characters in the summary and convert newlines to HTML breaks
    escaped_summary = html.escape(row['summary']).replace('\n', '<br>')

    # Combine all parts
    html_content = f"""
        <div class="content-container" data-content-id="{row['id']}">
            {thumbnail_html}
            <div class="text-container">
                <a href="{row['url']}" target="_blank" onclick="trackClick({row['id']});" class="content-title">{row['title']}</a>
                <div class="summary">{escaped_summary}</div>
            </div>
        </div>
    """
    return html_content

def generate_full_html(results_history, topic):
    styles = '''
    <style>
        .content-container {
            display: flex; /* Flexbox layout to align image and text side by side */
            border-bottom: 1px solid #ccc; /* Adds a border between entries */
            padding-bottom: 10px; /* Spacing below each item */
            margin-bottom: 10px; /* Spacing between items */
        }
        .thumbnail-container {
            flex: 0 0 auto; /* Flex item does not grow or shrink */
            margin-right: 10px; /* Space between the image and the text */
        }
        .text-container {
            flex: 1; /* Allows the text container to take up remaining space */
        }
        .content-thumbnail {
            width: 100px; /* Sets a fixed width */
            height: 100px; /* Sets a fixed height */
            object-fit: contain; /* Ensures the image fits within dimensions without cropping */
        }
        .summary {
            white-space: pre-wrap; /* Maintains whitespace formatting */
        }
        .date-header {
            font-size: 18px; /* Size of date header */
            font-weight: bold; /* Make date header bold */
            margin-top: 20px; /* Top margin for spacing */
            margin-bottom: 10px; /* Bottom margin before content starts */
        }
        .header {
            font-size: 24px; /* Larger font size for header */
            text-align: center; /* Center-align the header text */
            margin: 20px 0; /* Top and bottom margin for spacing */
        }
    </style>
    '''
    header_html = f'<div class="header">Topic: {topic}</div>'

    summaries_html = styles + header_html
    
    results_history = results_history.sort_values('datetime_local', ascending=False)
    results_history['day_accessed'] = results_history['datetime_local'].dt.date

    for date in results_history.day_accessed.unique():
        date_df = results_history.loc[results_history['day_accessed'] == date]
        summaries_html += f'<div class="date-header">Accessed on: {date}</div>'
        for i, row in date_df.iterrows():
            summaries_html += get_summary_html(row)

    return summaries_html

html_output = generate_full_html(results_history, topic)

2024-08-13
2024-08-06
2024-08-05
2024-08-03
2024-08-02
2024-07-30
2024-07-02
2024-05-01


In [117]:
output_dir = "../data/output_summaries/"
with open(os.path.join(output_dir, "agentic_rag_summary.html"), 'w') as outfile:
    outfile.write(html_output)

# Create summary of summary

In [123]:
def get_url_text(row):
    return f"""Access time: {row['datetime_local']}\n
    Web Page summary: {row['summary']}\n
    """

def get_summaries_summary_prompt(df):
    prompt = f"Below are summaries extracted from different webpages related to the topic {topic}. \n"
    for i, row in df.iterrows():
        prompt += get_url_text(row)
    prompt += "Create a summary of the below summaries focusing on {topic}. Answer: \n"
    return prompt

In [124]:
summary_prompt = get_summaries_summary_prompt(results_history)

In [125]:
summary_summary = generate(model, tokenizer, prompt=summary_prompt, max_tokens=200)

In [126]:
print(summary_summary)

Agentic RAG is a transformative approach to Retrieval-Augmented Generation (RAG) technology, integrating agentic capabilities to create intelligent systems that reason over retrieved information, execute multi-step actions, and synthesize insights from diverse sources. This adaptive approach empowers users to conduct comprehensive research and achieve unparalleled efficiency. Agentic RAG has the potential to revolutionize information retrieval and analysis, blurring the boundaries between human and machine intelligence. The technology holds profound promise for the future of information retrieval and analysis, with applications in various fields, including research, education, and business. Agentic RAG is a game-changer in the field of information retrieval, offering a more efficient and effective way to access and analyze information. The technology has the potential to transform the way we interact with information, making it more accessible and user-friendly. Agentic RAG is a powerf