In [1]:
import asyncio
import io
import logging
import os
import urllib

import numpy as np
import polars as pl
import requests
from pdfminer.high_level import extract_text
from sentence_transformers import SentenceTransformer, util

from data_quality_utils import Crawler
logging.basicConfig(level=logging.ERROR)

In [2]:
# suppress tokenizers parallelism warnings; set before the model is created so the
# setting is already in place the first time the tokenizer is used
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# embedding model (used for both the prompt and the extracted PDF text)
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [3]:
# get data from datasette
datasette_base_url = "https://datasette.planning.data.gov.uk/digital-land.csv"

# the string literal uses single quotes: in SQL, double quotes denote identifiers
query = """
select * 
from source as s
left join organisation as o
on s.organisation=o.organisation 
where s.collection = 'conservation-area'
"""

# credentials are read from the environment instead of being hard-coded in the notebook;
# no auth is sent when they are not set
datasette_user = os.environ.get("DATASETTE_USER")
datasette_password = os.environ.get("DATASETTE_PASSWORD")
auth = (datasette_user, datasette_password) if datasette_user and datasette_password else None

# let requests URL-encode the query, and never wait forever on a stalled connection
r = requests.get(datasette_base_url, params={"sql": query}, auth=auth, timeout=60)
# fail here rather than saving an error page to disk and parsing it as CSV in the next cell
r.raise_for_status()

filename = "datasette_data.csv"
with open(filename, "wb") as f_out:
    f_out.write(r.content)

In [4]:
# Collapse the source rows to one row per organisation name (we want one page per council):
# the first website is kept and every documentation URL is gathered into a list.
source_rows = pl.read_csv(filename)
data = source_rows.group_by("name").agg(
    pl.col("website").first(),
    pl.col("documentation_url"),
)
data

name,website,documentation_url
str,str,list[str]
"""Malvern Hills District Council""","""https://www.malvernhills.gov.u…",[null]
"""London Borough of Barnet""","""https://www.barnet.gov.uk""","[null, ""https://open.barnet.gov.uk/dataset/20yo8/conservation-areas"", ""https://open.barnet.gov.uk/dataset/2nx73""]"
"""Southampton City Council""","""https://www.southampton.gov.uk…","[""https://data.gov.uk/dataset/de1d30d6-24d2-40e4-9b1d-60f6e503ebaa/conservation-areas""]"
"""Ministry of Housing, Communiti…","""https://www.gov.uk/government/…","[""https://dluhc-datasets.planning-data.dev/dataset/conservation-area-document-type"", ""https://github.com/digital-land/conservation-area-data/tree/main/data/Output/Missing"", … ""https://dataset-editor.development.planning.data.gov.uk/dataset/conservation-area-document-type""]"
"""Newark and Sherwood District C…","""https://www.newark-sherwooddc.…",[null]
…,…,…
"""Wealden District Council""","""https://www.wealden.gov.uk""",[null]
"""Redcar and Cleveland Borough C…","""https://www.redcar-cleveland.g…",[null]
"""Lancaster City Council""","""https://www.lancaster.gov.uk""",[null]
"""Warrington Borough Council""","""https://www.warrington.gov.uk""",[null]


In [5]:
def get_pdf_text_from_url(urls, timeout=60):
    """
    Downloads each PDF, extracts its text and embeds the text with `embedding_model`.

    Note that, despite its name, this function returns embeddings rather than text.

    Parameters:
    - urls (iterable of str): URLs of the PDF files to download.
    - timeout (float, optional): Seconds to wait for each download before giving up.

    Returns:
    - numpy.ndarray: One embedding per URL, in the same order as `urls`.

    Raises:
    - requests.HTTPError: If a download responds with an error status code.
    """
    pdf_embeddings = []
    for url in urls:
        response = requests.get(url, timeout=timeout)
        # an error page is not a PDF: stop here instead of handing it to the PDF parser
        response.raise_for_status()

        # parse straight from memory, so no temporary file is left in the working directory
        pdf_text = extract_text(io.BytesIO(response.content))
        pdf_embeddings.append(embedding_model.encode(pdf_text))
    return np.array(pdf_embeddings)

In [17]:
def get_similar_pdfs(crawl_data, prompt, num_results=None):
    """
    Embeds the text of crawled PDFs, scores each one against a prompt and returns the most similar.

    Parameters:
    - crawl_data (list of str): URLs of the PDFs to download and embed.
    - prompt (str): The text prompt to compare against the PDF embeddings.
    - num_results (int, optional): The number of top similar PDFs to return. If None (or 0), returns all.

    Returns:
    - polars.DataFrame: Rows sorted by descending similarity, containing:
        - "url": The PDF URL.
        - "embedding": The computed embedding for the PDF text.
        - "similarity": The cosine similarity score with the prompt.
    """
    if len(crawl_data) == 0:
        # nothing was crawled: return an empty frame with the usual column names
        # instead of failing inside the similarity computation
        return pl.DataFrame(
            schema={"url": pl.Utf8, "embedding": pl.List(pl.Float32), "similarity": pl.Float32}
        )

    pdf_embeddings = get_pdf_text_from_url(crawl_data).astype(np.float32)
    prompt_embedding = embedding_model.encode(prompt, convert_to_numpy=True).astype(np.float32)

    # cos_sim returns a tensor with one row for the prompt; flatten it to one score per PDF
    scores = util.cos_sim(prompt_embedding, pdf_embeddings).cpu().numpy().flatten()

    df = pl.DataFrame({"url": crawl_data, "embedding": pdf_embeddings, "similarity": scores})

    if not num_results:
        num_results = len(df)
    # rank inside polars so urls, embeddings and scores can never get out of step
    return df.sort("similarity", descending=True).head(num_results)

In [7]:
def pretty_print_results(sorted_df, num_results):
    """
    Prints a numbered list of URLs with their similarity scores.

    Parameters:
    - sorted_df: Table with "url" and "similarity" columns, already in the order to print.
    - num_results (int): Maximum number of rows to print.
    """
    def _format_row(position):
        # one output line for the row at `position` (ranks are shown 1-based)
        url = sorted_df.get_column("url")[position]
        score = sorted_df.get_column("similarity")[position]
        return f"{position + 1}. {url.ljust(60)} | Similarity: {score:.4f}"

    header = "\nTop Similar PDFs:\n" + "=" * 40
    print(header)

    # never try to print more rows than the table holds
    rows_to_show = min(num_results, len(sorted_df))
    for position in range(rows_to_show):
        print(_format_row(position))

In [14]:
async def process_council(
    council_names,
    max_depth=6,
    keyword_scorer=None,
    filters=None,
    prompt="A page about conservation areas.",
    cache_enabled=False,
    num_results=10,
    crawl_type="html",
):
    """
    Crawls each council's website and prints the crawled PDFs most similar to the prompt.

    Parameters:
    - council_names (list of str): Names (or parts of names) to look up in the "name" column of `data`.
      When several organisations match, the first match is used.
    - max_depth, keyword_scorer, filters, cache_enabled, crawl_type: Passed through to `Crawler`.
    - prompt (str): Text the PDF embeddings are compared against.
    - num_results (int): Number of top matches to compute and print.
    """
    crawler = Crawler(
        max_depth=max_depth,
        keyword_scorer=keyword_scorer,
        filters=filters,
        cache_enabled=cache_enabled,
        crawl_type=crawl_type,
    )
    
    for council_name in council_names:
        # literal=True: council names are plain text, not regular expressions
        council_data = data.filter(pl.col("name").str.contains(council_name, literal=True))
        if council_data.height == 0:
            # report the unknown name and carry on with the remaining councils
            print("="*40 + f"\nNo organisation matching '{council_name}' found, skipping.\n")
            continue
        full_name = council_data.get_column("name")[0]
        homepage = council_data.get_column("website")[0]
        print("="*40 + f"\nProcessing {full_name}...\n")
        
        # crawl url
        crawl_data = await crawler.deep_crawl(homepage)
        
        # get pdf embeddings
        sorted_df = get_similar_pdfs(crawl_data, prompt, num_results=num_results)
        
        pretty_print_results(sorted_df, num_results)

In [19]:
async def get_council_pdfs(
    council_names,
    max_depth=6,
    keyword_scorer=None,
    filters=None,
    cache_enabled=False,
    crawl_type="html",
):
    """
    Crawls each council's website and prints every URL the crawler returns (no similarity search).

    Parameters:
    - council_names (list of str): Names (or parts of names) to look up in the "name" column of `data`.
      When several organisations match, the first match is used.
    - max_depth, keyword_scorer, filters, cache_enabled, crawl_type: Passed through to `Crawler`.
    """
    crawler = Crawler(
        max_depth=max_depth,
        keyword_scorer=keyword_scorer,
        filters=filters,
        cache_enabled=cache_enabled,
        crawl_type=crawl_type,
    )
    
    for council_name in council_names:
        # literal=True: council names are plain text, not regular expressions
        council_data = data.filter(pl.col("name").str.contains(council_name, literal=True))
        if council_data.height == 0:
            # report the unknown name and carry on with the remaining councils
            print("="*40 + f"\nNo organisation matching '{council_name}' found, skipping.\n")
            continue
        full_name = council_data.get_column("name")[0]
        homepage = council_data.get_column("website")[0]
        print("="*40 + f"\nCrawling {full_name}...\n")
        
        # crawl url
        crawl_data = await crawler.deep_crawl(homepage)
        
        for url in crawl_data:
            print(url)

## Our approach

Our approach involves 2 main steps: a web crawler and an embedding similarity search. Below is a description of these steps.

### Web crawler

The web crawler takes a homepage URL of an organisation (council website) and crawls it to look for pages talking about conservation areas.

The crawler will look for links on a single page, put them in a queue and then iteratively check them until it finds what it was looking for or it reaches a stopping criterion, such as maximum depth (how many clicks away from home page). 

In order to save time, we can define some scorers or filters which tell the crawler which pages to prioritise or ignore. In this case, some common patterns of what a user needs to click to get to the page of interest are _"planning"_, _"building"_, _"heritage"_ or _"conservation"_.

The crawler uses a *"best first strategy"*, which utilises the scorers or filters to visit the most relevant pages first, rather than a depth-first or breadth-first search.

The crawler extracts the HTML from the pages and turns them into markdown. This is because it's more readable and easier to work with in the next steps. The crawler returns a list of pairs of (_url_, _markdown_).

### Embedding search

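Each PDF returned by the crawler is downloaded and its text is extracted with `pdfminer`. The text is then embedded with the `all-MiniLM-L6-v2` sentence-transformer model, and the same model embeds a prompt describing what we are looking for. The PDFs are ranked by the cosine similarity between their embedding and the prompt embedding, and the top results are printed.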

### The next few cells show how to use the tools to find conservation area pages.

You can define your own parameters, such as maximum depth, how many results you want to see and any scorers or filters. Below is a template showing how to define each scorer/filter type correctly - all you need to do is change the keywords or patterns.

You can also define a prompt - this is what will be used to compute the embedding similarity score for each document. The more similar the prompt is to what a conservation area document usually looks like, the more accurate the results.

Lastly, you can await the `process_council` function, which will run the functionality described above and print the results. You can use it for one council only or for a list of councils.

## Template for how to define filters or scorers
##### Pick the types you need, adjust their parameters (keywords, threshold, ...) and pass them to the crawler function.
    keyword_scorer = {
        "keywords": ["conservation", "conservation area", "planning", "building", "urban", "heritage", "resident"],
        "weight": 0.8,
    }
        
    filters=[
        {"type": "SEOFilter", "threshold": 0.6, "keywords": ["conservation", "area", "planning", "heritage", "resident"]},
        {"type": "ContentRelevanceFilter", "query": "conservation area or planning data", "threshold": 0.2},
        {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
        {"type": "URLPatternFilter", "patterns": ["*conservation*", "*planning*", "*building*"]},
    ]

### Gedling - full example retrieving similar PDFs

In [18]:
# finds all PDFs at the URL patterns, extracts text and uses embedding similarity to search for best matches
max_depth = 6
num_results = 10
filters=[
    {"type": "ContentTypeFilter", "allowed_types": ["text/html", "application/pdf"]},
    {"type": "URLPatternFilter", "patterns": ["*conservation*", "*planning*", "*building*", "*heritage*"]},
]

prompt = """
The text describes a conservation area. It includes information about
where the conservation area is, its history, archeology, boundaries
and any additional planning data. Usually it includes images or maps
showing the boundary of the conservation area.
"""

await process_council(
    council_names=["Gedling"], 
    max_depth=max_depth, 
    filters=filters, 
    prompt=prompt, 
    num_results=num_results,
    crawl_type="pdf"
)

Processing Gedling Borough Council...

[INIT].... → Crawl4AI 0.5.0.post4
Crawled 285 pages in total

Top Similar PDFs:
1. https://www.gedling.gov.uk/media/gedlingboroughcouncil/documents/planningpolicy/conservationareas/Papplewick CAAMP - 2018 Adopted.pdf | Similarity: 0.5857
2. https://www.gedling.gov.uk/media/gedlingboroughcouncil/documents/planningpolicy/conservationareas/Calverton CACA _FINAL DRAFT jan07 with maps-1.pdf | Similarity: 0.5658
3. https://www.gedling.gov.uk/media/gedlingboroughcouncil/documents/planningpolicy/conservationareas/Woodborough Conservation Area - as designated 2017.pdf | Similarity: 0.5036
4. https://www.gedling.gov.uk/media/gedlingboroughcouncil/documents/planningbuildingcontrol/planningpolicy/PoliciesMapSE.pdf | Similarity: 0.4822
5. https://www.gedling.gov.uk/media/gedlingboroughcouncil/documents/planningpolicy/conservationareas/Final version WOODBOROUGH CONSERVATION AREA 11102017.pdf | Similarity: 0.4686
6. https://www.gedling.gov.uk/media/gedlingboroug

## Examples retrieving PDFs only

### South Gloucestershire

In [21]:
# only searches for all PDFs at the URL patterns
max_depth = 6
num_results = 10
filters=[
    {"type": "ContentTypeFilter", "allowed_types": ["text/html", "application/pdf"]},
    {"type": "URLPatternFilter", "patterns": ["*conservation*", "*planning*", "*building*", "*heritage*"]},
]

await get_council_pdfs(
    council_names=["South Gloucestershire"], 
    max_depth=max_depth, 
    filters=filters,
    crawl_type="pdf"
)

Crawling South Gloucestershire Council...

[INIT].... → Crawl4AI 0.5.0.post4
Crawled 357 pages in total
https://beta.southglos.gov.uk/static/2c6d8bb6f19040eab5d82d2c2416dccf/Window-repairs-in-historic-buildings-2024.pdf
https://beta.southglos.gov.uk/static/5aaf9f4e282db13dfb7a3c97dbc5b351/Our-approach-to-determining-planning-applications.pdf
https://beta.southglos.gov.uk/static/392e16066eda232bfc7f52d9bb109b18/Pointing-of-historic-buildings-2024.pdf
https://beta.southglos.gov.uk/static/8f1945edd3a5825665750e04a7105150/Community-infrastructure-levy-and-section-106-planning-obligations-guide-spd.pdf
https://beta.southglos.gov.uk/static/c9fa3b3572a5ccf9d4056e6832121ae6/Listed-building-guidance-2024.pdf
https://beta.southglos.gov.uk/static/f3d1bf2b9ebc3ffdbd087421e4c3f3e2/Lime-render-on-historic-buildings-2024.pdf
https://beta.southglos.gov.uk/static/9a58b8212b02cb1cd3dabcac5fca6fd5/Understanding-heritage-assets-2024.pdf
https://beta.southglos.gov.uk/static/322138571f0a1e7da7d05a695418099e

### Bournemouth, Christchurch and Poole

In [22]:
# only searches for all PDFs at the URL patterns
max_depth = 6
num_results = 10
filters=[
    {"type": "ContentTypeFilter", "allowed_types": ["text/html", "application/pdf"]},
    {"type": "URLPatternFilter", "patterns": ["*conservation*", "*planning*", "*building*", "*heritage*"]},
]

await get_council_pdfs(
    council_names=["Bournemouth, Christchurch and Poole"], 
    max_depth=max_depth, 
    filters=filters,
    crawl_type="pdf"
)

Crawling Bournemouth, Christchurch and Poole Council...

[INIT].... → Crawl4AI 0.5.0.post4
Crawled 2990 pages in total
https://www.bcpcouncil.gov.uk/documents/planning-and-building-control/planning-enforcement-plan.pdf
https://www.bcpcouncil.gov.uk/Assets/Leisure-culture-and-local-heritage/Sport-Poole-Constitution-since-AGM-June-2020.pdf
https://www.bcpcouncil.gov.uk/Assets/Planning-and-building-control/Local-plan-and-cil/CIL-DCS-Submission-Statement.pdf
https://www.bcpcouncil.gov.uk/Assets/Planning-and-building-control/Local-plan-and-cil/Local-Plan-Submission-statement.pdf
https://www.bcpcouncil.gov.uk/Assets/Planning-and-building-control/Local-plan-and-cil/EXAM-28-Post-hearing-letter-March-2025-Final.pdf
https://www.bcpcouncil.gov.uk/Assets/Planning-and-building-control/Infrastructure-Funding-Statement-2023-24.pdf
https://www.bcpcouncil.gov.uk/Assets/Planning-and-building-control/Land-Supply-Position-Statement.pdf
https://www.bcpcouncil.gov.uk/Assets/Planning-and-building-control/Loc

### Warrington

In [23]:
# only searches for all PDFs at the URL patterns
max_depth = 6
num_results = 10
filters=[
    {"type": "ContentTypeFilter", "allowed_types": ["text/html", "application/pdf"]},
    {"type": "URLPatternFilter", "patterns": ["*conservation*", "*planning*", "*building*", "*heritage*"]},
]

await get_council_pdfs(
    council_names=["Warrington"], 
    max_depth=max_depth, 
    filters=filters,
    crawl_type="pdf"
)

Crawling Warrington Borough Council...

[INIT].... → Crawl4AI 0.5.0.post4
Crawled 92 pages in total
https://www.warrington.gov.uk/sites/default/files/2019-08/planning_enforcement_process.pdf
https://www.warrington.gov.uk/sites/default/files/2021-06/house_extensions_supplementary_planning_document_-_june_2021.pdf
https://www.warrington.gov.uk/sites/default/files/2019-08/buttermarket_street_conservation_area_appraisal.pdf
https://www.warrington.gov.uk/sites/default/files/2019-08/bridge_street_conservation_area_appraisal.pdf
https://www.warrington.gov.uk/sites/default/files/2019-08/bewsey_street_conservation_area_appraisal.pdf
https://www.warrington.gov.uk/sites/default/files/2019-08/church_street_conservation_area_appraisal.pdf
https://www.warrington.gov.uk/sites/default/files/2020-09/Environmental%20protection%20-%20supplementary%20planning%20guidance.pdf
https://www.warrington.gov.uk/sites/default/files/2019-08/town_hall_conservation_area_appraisal.pdf
https://www.warrington.gov.uk/sit

### Stoke on Trent

In [24]:
# only searches for all PDFs at the URL patterns
max_depth = 6
num_results = 10
filters=[
    {"type": "ContentTypeFilter", "allowed_types": ["text/html", "application/pdf"]},
    {"type": "URLPatternFilter", "patterns": ["*conservation*", "*planning*", "*building*", "*heritage*"]},
]

await get_council_pdfs(
    council_names=["Stoke"], 
    max_depth=max_depth, 
    filters=filters,
    crawl_type="pdf"
)

Crawling Stoke-on-Trent City Council...

[INIT].... → Crawl4AI 0.5.0.post4
Crawled 142 pages in total
