In [1]:
import os
import requests
import urllib
import numpy as np
import polars as pl
import asyncio
from crawler import Crawler
from sentence_transformers import SentenceTransformer, util

In [2]:
# embedding model
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# suppress warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# get data from datasette
datasette_base_url = "https://datasette.planning.data.gov.uk/digital-land.csv"

query = """
select * 
from source as s
left join organisation as o
on s.organisation=o.organisation 
where s.collection = "conservation-area"
"""
encoded_query = urllib.parse.urlencode({"sql": query})

r = requests.get(f"{datasette_base_url}?{encoded_query}", auth=('user', 'pass'))

filename = "datasette_data.csv"
with open(filename, "wb") as f_out:
    f_out.write(r.content)

In [4]:
# group by organisation as we're looking for one page per council
data = (
    pl.read_csv(filename)
    .group_by("name")
    .agg(pl.col("website").first(), pl.col("documentation_url"))
)
data

name,website,documentation_url
str,str,list[str]
"""Corby Borough Council""","""https://www.corby.gov.uk""",[null]
"""Chorley Borough Council""","""https://www.chorley.gov.uk""",[null]
"""Stockton-on-Tees Borough Counc…","""https://www.stockton.gov.uk""",[null]
"""Tonbridge and Malling Borough …","""https://www.tmbc.gov.uk""",[null]
"""Harlow District Council""","""https://www.harlow.gov.uk""",[null]
…,…,…
"""Royal Borough of Windsor and M…","""https://www.rbwm.gov.uk/""","[""https://data.gov.uk/dataset/739ffcb2-54ba-49a0-b6ec-b273fe46f20d/rbwm-conservation-areas""]"
"""London Borough of Brent""","""https://www.brent.gov.uk""",[null]
"""Aylesbury Vale District Counci…","""https://www.aylesburyvaledc.go…",[null]
"""Torridge District Council""","""https://www.torridge.gov.uk""",[null]


In [5]:
def get_similar_urls(crawl_data, prompt, num_results=None):
    """
    Embeds crawled webpage data, computes similarity to a given prompt, and returns the top N most similar pages.

    Parameters:
    - crawl_data (list of tuples): A list of tuples containing (url, markdown) for each crawled page.
    - prompt (str): The text prompt to compare against the crawled page embeddings.
    - num_results (int, optional): The number of top similar pages to return. If None, returns all pages.

    Returns:
    - polars.DataFrame: A DataFrame containing:
        - "url": The webpage URL.
        - "markdown": The extracted markdown content.
        - "embedding": The computed embedding for the content.
        - "similarity": The cosine similarity score with the prompt.
    """
    res = []
    for url, markdown in crawl_data:
        embedding = embedding_model.encode(markdown, convert_to_numpy=True).tolist()
        res.append((url, markdown, embedding))
                
    crawl_df = pl.DataFrame(res, schema=["url", "markdown", "embedding"], orient="row")
    embeddings = np.stack(crawl_df["embedding"].to_list())
    
    prompt_embedding = np.array(embedding_model.encode(prompt, convert_to_numpy=True), dtype='float64')
    
    # get similarity scores
    sim=util.cos_sim(
        prompt_embedding.astype(np.float32), 
        embeddings.astype(np.float32)
    )
    # get indices of top n most similar urls
    if not num_results:
        num_results = len(crawl_df)
    indices = np.argsort(sim).numpy().flatten()[:-num_results-1:-1]
    sorted_df = (
        crawl_df[indices]
        .with_columns(similarity=np.sort(sim).flatten()[:-num_results-1:-1])
    )
    return sorted_df

In [6]:
def pretty_print_results(sorted_df, num_results):
    # print top n urls with similarity scores
    print("\nTop Similar Pages:\n" + "="*40)
    for i in range(min(num_results, len(sorted_df))):
        url = sorted_df.get_column("url")[i]
        score = sorted_df.get_column("similarity")[i]
        print(f"{i+1}. {url.ljust(60)} | Similarity: {score:.4f}")

In [7]:
async def process_council(
    council_names,
    max_depth=6,
    keyword_scorer=None,
    filters=None,
    prompt="A page about conservation areas.",
    cache_enabled=False,
    num_results=10,
):
    crawler = Crawler(
            max_depth=max_depth,
            keyword_scorer=keyword_scorer,
            filters=filters,
            cache_enabled=cache_enabled,
        )
    
    for council_name in council_names:
        council_data = data.filter(pl.col("name").str.contains(council_name))
        full_name = council_data.get_column("name")[0]
        homepage = council_data.get_column("website")[0]
        prompt = prompt.format((full_name).replace('\n', ''))
        print("="*40 + f"\nProcessing {full_name}...\n")
        
        # crawl url
        crawl_data = await crawler.deep_crawl(homepage)
        
        # get markdown embeddings
        sorted_df = get_similar_urls(crawl_data, prompt)
        
        pretty_print_results(sorted_df, num_results)

## Template for how to define filters or scorers
##### Pick the types you need and adjust their parameter (keywords, threshold, ...) and pass them to the crawler function.
    keyword_scorer = {
        "keywords": ["conservation", "conservation area", "planning", "building", "urban", "heritage", "resident"],
        "weight": 0.8,
    }
        
    filters=[
        {"type": "SEOFilter", "threshold": 0.6, "keywords": ["conservation", "area", "planning", "heritage", "resident"]},
        {"type": "ContentRelevanceFilter", "query": "conservation area or planning data", "threshold": 0.2},
        {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
        {"type": "URLPatternFilter", "patterns": ["*conservation*", "*planning*", "*building*"]},
    ]

### Gedling

In [14]:
max_depth = 6
num_results = 10
filters=[
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {"type": "URLPatternFilter", "patterns": ["*conservation*", "*planning*", "*building*"]},
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

df = await process_council(
    council_names=["Gedling"], 
    max_depth=max_depth, 
    filters=filters, 
    prompt=prompt, 
    num_results=num_results
)

Processing Gedling Borough Council...

[INIT].... → Crawl4AI 0.5.0.post4
Crawled 137 pages in total

Top Similar Pages:
1. https://www.gedling.gov.uk/conservation/                     | Similarity: 0.6600
2. https://www.gedling.gov.uk/conservation/#page                | Similarity: 0.6600
3. http://www.gedling.gov.uk/resident/planningandbuildingcontrol/ | Similarity: 0.6419
4. http://www.gedling.gov.uk/resident/planningandbuildingcontrol/#page | Similarity: 0.6273
5. https://www.gedling.gov.uk/resident/planningandbuildingcontrol/ | Similarity: 0.6152
6. https://www.gedling.gov.uk/resident/planningandbuildingcontrol/#page | Similarity: 0.6152
7. http://www.gedling.gov.uk/resident/planningandbuildingcontrol/planningapplications/ | Similarity: 0.5828
8. https://www.gedling.gov.uk/resident/planningandbuildingcontrol/planningpolicy/consultations/ | Similarity: 0.5787
9. https://www.gedling.gov.uk/resident/planningandbuildingcontrol/planningpolicy/consultations/#page | Similarity: 0.5787
10.

### South Gloucestershire

In [8]:
max_depth = 6
num_results = 10
filters=[
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {"type": "URLPatternFilter", "patterns": ["*conservation*", "*planning*", "*building*"]},
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

await process_council(
    council_names=["South Gloucestershire"], 
    max_depth=max_depth, 
    filters=filters, 
    prompt=prompt, 
    num_results=num_results
)

Processing South Gloucestershire Council...

[INIT].... → Crawl4AI 0.5.0.post4
Crawled 332 pages in total

Top Similar Pages:
1. https://beta.southglos.gov.uk/conservation-area              | Similarity: 0.6868
2. https://beta.southglos.gov.uk/conservation-area/             | Similarity: 0.6852
3. https://beta.southglos.gov.uk/planning-and-development/conservation-and-regeneration/ | Similarity: 0.6736
4. https://beta.southglos.gov.uk/planning-and-development/conservation-and-regeneration/regeneration/ | Similarity: 0.6475
5. https://beta.southglos.gov.uk/planning-and-development/conservation-and-regeneration/historic-environment-and-listed-buildings/ | Similarity: 0.6379
6. https://beta.southglos.gov.uk/planning-and-development/conservation-and-regeneration/trees-and-hedges/ | Similarity: 0.6340
7. http://www.southglos.gov.uk/environment-and-planning/search-planning-applications/ | Similarity: 0.6165
8. http://www.southglos.gov.uk/planning-and-development/planning-applications/ | Simi

### Bournemouth, Christchurch and Poole

In [13]:
max_depth = 6
num_results = 10
filters=[
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {"type": "URLPatternFilter", "patterns": ["*conservation*", "*planning*", "*building*"]},
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

await process_council(
    council_names=["Bournemouth, Christchurch and Poole"], 
    max_depth=max_depth, 
    filters=filters, 
    prompt=prompt, 
    num_results=num_results
)

Processing Bournemouth, Christchurch and Poole Council...

[INIT].... → Crawl4AI 0.5.0.post4
Crawled 504 pages in total

Top Similar Pages:
1. https://www.bcpcouncil.gov.uk/planning-and-building-control/urban-design-trees-and-conservation/conservation-areas | Similarity: 0.6143
2. https://www.bcpcouncil.gov.uk/planning-and-building-control/urban-design-trees-and-conservation/conservation-areas#guide-contents | Similarity: 0.6143
3. https://www.bcpcouncil.gov.uk/planning-and-building-control/urban-design-trees-and-conservation/conservation-areas#main | Similarity: 0.6143
4. https://bcpcouncil.gov.uk/planning-and-building-control/local-land-charges | Similarity: 0.6074
5. https://bcpcouncil.gov.uk/planning-and-building-control/local-land-charges#guide-contents | Similarity: 0.6074
6. https://bcpcouncil.gov.uk/planning-and-building-control/local-land-charges#main | Similarity: 0.6074
7. https://bcpcouncil.gov.uk/planning-and-building-control/urban-design-trees-and-conservation/listed-buil

### Warrington

In [9]:
max_depth = 6
num_results = 10
filters=[
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {"type": "URLPatternFilter", "patterns": ["*conservation*", "*planning*", "*building*"]},
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

await process_council(
    council_names=["Warrington"], 
    max_depth=max_depth, 
    filters=filters, 
    prompt=prompt, 
    num_results=num_results
)

Processing Warrington Borough Council...

[INIT].... → Crawl4AI 0.5.0.post4
Crawled 67 pages in total

Top Similar Pages:
1. https://www.warrington.gov.uk/nature-conservation#main-content | Similarity: 0.6056
2. https://www.warrington.gov.uk/nature-conservation            | Similarity: 0.6056
3. https://online.warrington.gov.uk/planning/index.html         | Similarity: 0.5629
4. https://online.warrington.gov.uk/planning/                   | Similarity: 0.5513
5. https://www.warrington.gov.uk/supplementary-planning-documents-consultation | Similarity: 0.5456
6. https://www.warrington.gov.uk/supplementary-planning-documents-consultation#main-content | Similarity: 0.5456
7. https://www.warrington.gov.uk/supplementary-planning-documents | Similarity: 0.5359
8. https://www.warrington.gov.uk/supplementary-planning-documents#main-content | Similarity: 0.5359
9. https://www.warrington.gov.uk/planning-policy-archives#main-content | Similarity: 0.5312
10. https://www.warrington.gov.uk/planning-p

### Stoke on Trent

In [10]:
max_depth = 6
num_results = 10
filters=[
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {"type": "URLPatternFilter", "patterns": ["*conservation*", "*planning*", "*building*"]},
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

await process_council(
    council_names=["Stoke"], 
    max_depth=max_depth, 
    filters=filters, 
    prompt=prompt, 
    num_results=num_results
)

Processing Stoke-on-Trent City Council...

[INIT].... → Crawl4AI 0.5.0.post4
Crawled 129 pages in total

Top Similar Pages:
1. https://www.stoke.gov.uk/info/20005/planning_and_building/547/conservation_areas#content | Similarity: 0.8232
2. https://www.stoke.gov.uk/info/20005/planning_and_building/547/conservation_areas | Similarity: 0.8232
3. https://www.stoke.gov.uk/news/article/1509/council_announces_new_conservation_areas#content | Similarity: 0.8073
4. https://www.stoke.gov.uk/news/article/1509/council_announces_new_conservation_areas | Similarity: 0.8073
5. https://www.stoke.gov.uk/news/article/1285/changes_to_conservation_areas_in_stoke-on-trent | Similarity: 0.7924
6. https://www.stoke.gov.uk/news/article/1285/changes_to_conservation_areas_in_stoke-on-trent#content | Similarity: 0.7924
7. https://www.stoke.gov.uk/conservationareas#content           | Similarity: 0.7902
8. http://www.stoke.gov.uk/conservationareas                    | Similarity: 0.7902
9. https://www.stoke.gov.u

### Redbridge

In [14]:
max_depth = 6
num_results = 10
filters=[
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {"type": "URLPatternFilter", "patterns": ["*conservation*", "*planning*", "*building*"]},
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

await process_council(
    council_names=["Redbridge"], 
    max_depth=max_depth, 
    filters=filters, 
    prompt=prompt, 
    num_results=num_results
)

Processing London Borough of Redbridge...

[INIT].... → Crawl4AI 0.5.0.post4
Crawled 762 pages in total

Top Similar Pages:
1. https://www.redbridge.gov.uk/planning-and-building/protected-buildings-and-conservation-areas/article-4-direction-c4-hmos/ | Similarity: 0.6607
2. https://www.redbridge.gov.uk/planning-and-building/protected-buildings-and-conservation-areas/article-4-direction-c4-hmos/#accessibilitySkip | Similarity: 0.6607
3. https://www.redbridge.gov.uk/planning-and-building/protected-buildings-and-conservation-areas/ | Similarity: 0.6513
4. https://www.redbridge.gov.uk/planning-and-building/protected-buildings-and-conservation-areas/#accessibilitySkip | Similarity: 0.6513
5. https://www.redbridge.gov.uk/Account/Register?returnurl=%2Fplanning-and-building%2Fprotected-buildings-and-conservation-areas%2Farticle-4-direction-c4-hmos%2F&clientid=RedbridgeCMSLive#accessibilitySkip | Similarity: 0.6512
6. https://www.redbridge.gov.uk/Account/Register?returnurl=%2Fplanning-and-buildi

### York

In [25]:
max_depth = 6
num_results = 10
filters=[
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {"type": "URLPatternFilter", "patterns": [r"*[Cc]onservation*", r"*[Pp]lanning*", r"*[Bb]uilding*"]},
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

await process_council(
    council_names=["York"], 
    max_depth=max_depth, 
    filters=filters, 
    prompt=prompt, 
    num_results=num_results
)

Processing York...

[INIT].... → Crawl4AI 0.5.0.post4
Crawled 3260 pages in total

Top Similar Pages:
1. https://planningaccess.york.gov.uk/online-applications/#top  | Similarity: 0.4857
2. https://planningaccess.york.gov.uk/online-applications/#pageheading | Similarity: 0.4857
3. https://planningaccess.york.gov.uk/online-applications/      | Similarity: 0.4857
4. https://www.york.gov.uk/conservation-listed-buildings/york-central-historic-core-conservation-area-appraisal-hccaa#content | Similarity: 0.4832
5. https://www.york.gov.uk/conservation-listed-buildings/york-central-historic-core-conservation-area-appraisal-hccaa | Similarity: 0.4832
6. https://www.york.gov.uk/conservation-listed-buildings/york-central-historic-core-conservation-area-appraisal-hccaa#js-menu | Similarity: 0.4832
7. https://www.york.gov.uk/conservation-listed-buildings/york-central-historic-core-conservation-area-appraisal-hccaa#top | Similarity: 0.4832
8. https://www.york.gov.uk/conservation-listed-buildings/yor

### Malvern Hills

In [None]:
max_depth = 6
num_results = 10
filters=[
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {"type": "URLPatternFilter", "patterns": [r"*[Cc]onservation*", r"*[Pp]lanning*", r"*[Bb]uilding*"]},
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

await process_council(
    council_names=["Malvern Hills"], 
    max_depth=max_depth, 
    filters=filters, 
    prompt=prompt, 
    num_results=num_results
)

Processing Malvern Hills...

[INIT].... → Crawl4AI 0.5.0.post4


## Multiple councils

You can define any list of councils and their processing will be executed sequentially

In [29]:
np.random.seed(4321)
num_examples = 10
example_idx = np.random.randint(0, len(data), num_examples)
examples = data[example_idx]
examples

name,website,documentation_url
str,str,list[str]
"""Canterbury City Council""","""https://www.canterbury.gov.uk""","[""https://mapping.canterbury.gov.uk/arcgis/rest/services/External/Planning_Constraints_New/MapServer""]"
"""Leicester City Council""","""https://www.leicester.gov.uk""","[""https://data.gov.uk/dataset/e702ebff-27b4-4e86-acbb-74d066f3c4e7/conservation-areas"", ""https://data.leicester.gov.uk/explore/?q=conservation+area&sort=modified"", ""https://data.leicester.gov.uk/explore/?q=conservation+area&sort=modified""]"
"""Gateshead Metropolitan Borough…","""https://www.gateshead.gov.uk""","[null, ""https://gateshead-council-open-data-gateshead.hub.arcgis.com/datasets/Gateshead::spe-hx-conservation-areas/about""]"
"""Runnymede Borough Council""","""https://www.runnymede.gov.uk""",[null]
"""Birmingham City Council""","""https://www.birmingham.gov.uk""","[null, null, … ""https://openplanningdata-sbham.hub.arcgis.com/datasets/ee06653284154ccfb4fec1e9428760e1/about""]"
"""Thurrock Thames Gateway Develo…","""https://www.gov.uk/government/…",[null]
"""Basildon Borough Council""","""https://www.basildon.gov.uk""",[null]
"""London Borough of Richmond upo…","""https://www.richmond.gov.uk""",[null]
"""Ministry of Housing, Communiti…","""https://www.gov.uk/government/…","[""https://dluhc-datasets.planning-data.dev/dataset/conservation-area-document-type"", ""https://github.com/digital-land/conservation-area-data/tree/main/data/Output/Missing"", … ""https://dataset-editor.development.planning.data.gov.uk/dataset/conservation-area-document-type""]"
"""Newcastle-under-Lyme Borough C…","""https://www.newcastle-staffs.g…",[null]


In [None]:
max_depth = 6
num_results = 10
filters=[
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {"type": "URLPatternFilter", "patterns": ["*conservation*", "*planning*", "*building*"]},
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

await process_council(
    council_names=examples.get_column("name"), 
    max_depth=max_depth, 
    filters=filters, 
    prompt=prompt, 
    num_results=num_results
)