In [None]:
import os
import urllib

import numpy as np
import polars as pl
import requests
from sentence_transformers import SentenceTransformer, util

from data_quality_utils import Crawler

In [None]:
# Set this before the model (and its tokenizer) is created so the setting is
# already in place for the first tokenizer call; it suppresses the Hugging Face
# tokenizers parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Sentence-embedding model used for both the crawled pages and the prompt.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Download the conservation-area sources (joined with their organisation
# details) from Datasette as CSV and store them in a local file.
datasette_base_url = "https://datasette.planning.data.gov.uk/digital-land.csv"

# The string literal uses single quotes: in SQL, double quotes denote
# identifiers, and SQLite only falls back to treating them as strings as a
# legacy quirk.
query = """
select *
from source as s
left join organisation as o
on s.organisation=o.organisation
where s.collection = 'conservation-area'
"""
encoded_query = urllib.parse.urlencode({"sql": query})

# NOTE(review): these basic-auth credentials are hard-coded placeholders. If the
# endpoint really requires authentication, load them from the environment
# instead of committing them to the notebook.
r = requests.get(
    f"{datasette_base_url}?{encoded_query}",
    auth=("user", "pass"),
    timeout=60,  # do not hang the notebook forever if the server stalls
)
# Fail loudly on an HTTP error rather than saving an error page as the "CSV".
r.raise_for_status()

filename = "datasette_data.csv"
with open(filename, "wb") as f_out:
    f_out.write(r.content)

In [None]:
# Group by organisation as we're looking for one page per council: keep the
# first website per name and collect all documentation_url values into a list.
# maintain_order=True keeps the groups in first-appearance order. Without it
# the group order is not guaranteed to be stable between runs, which would make
# any later index-based selection from `data` non-reproducible.
data = (
    pl.read_csv(filename)
    .group_by("name", maintain_order=True)
    .agg(pl.col("website").first(), pl.col("documentation_url"))
)
data

In [None]:
def get_similar_urls(crawl_data, prompt, num_results=None):
    """
    Embeds crawled webpage data, computes similarity to a given prompt, and returns the top N
    most similar pages.

    Parameters:
    - crawl_data (iterable of tuples): (url, markdown) pairs, one per crawled page.
    - prompt (str): The text prompt to compare against the crawled page embeddings.
    - num_results (int, optional): The number of top similar pages to return. If None (or 0),
      returns all pages.

    Returns:
    - polars.DataFrame: Rows sorted by descending similarity, containing:
        - "url": The webpage URL.
        - "markdown": The extracted markdown content.
        - "embedding": The computed embedding for the content.
        - "similarity": The cosine similarity score with the prompt.
      If crawl_data is empty, an empty DataFrame with these four columns is returned.
    """
    pages = list(crawl_data)
    if not pages:
        # Nothing was crawled: return an empty, correctly shaped frame instead of
        # failing when trying to stack zero embeddings.
        return pl.DataFrame(
            schema={
                "url": pl.Utf8,
                "markdown": pl.Utf8,
                "embedding": pl.List(pl.Float64),
                "similarity": pl.Float32,
            }
        )

    urls = [url for url, _ in pages]
    markdowns = [markdown for _, markdown in pages]

    # Encode all pages in a single batched call instead of one call per page.
    embeddings = np.asarray(
        embedding_model.encode(markdowns, convert_to_numpy=True), dtype=np.float32
    )
    prompt_embedding = np.asarray(
        embedding_model.encode(prompt, convert_to_numpy=True), dtype=np.float32
    )

    crawl_df = pl.DataFrame(
        {"url": urls, "markdown": markdowns, "embedding": embeddings.tolist()}
    )

    # cos_sim returns a (1, n_pages) torch tensor; convert it to a flat NumPy
    # array once so all sorting below is plain NumPy.
    scores = util.cos_sim(prompt_embedding, embeddings).cpu().numpy().ravel()

    # get indices of top n most similar urls (highest similarity first)
    if not num_results:
        num_results = len(crawl_df)
    order = np.argsort(scores)[::-1][:num_results].copy()

    return crawl_df[order].with_columns(similarity=pl.Series(scores[order]))

In [None]:
def pretty_print_results(sorted_df, num_results):
    """
    Print a numbered list of the leading rows of `sorted_df`.

    Parameters:
    - sorted_df: Table exposing `len()` and `get_column(name)` with "url" and
      "similarity" columns; rows are printed in the order they appear.
    - num_results (int): Maximum number of rows to print.

    Returns:
    - None. Output is written to stdout.
    """
    header = "\nTop Similar Pages:\n" + "=" * 40
    print(header)

    # Never try to print more rows than the table actually holds.
    shown = min(num_results, len(sorted_df))
    for rank in range(1, shown + 1):
        url = sorted_df.get_column("url")[rank - 1]
        score = sorted_df.get_column("similarity")[rank - 1]
        print(f"{rank}. {url.ljust(60)} | Similarity: {score:.4f}")

In [None]:
async def process_council(
    council_names,
    max_depth=6,
    keyword_scorer=None,
    filters=None,
    prompt="A page about conservation areas.",
    cache_enabled=False,
    num_results=10,
):
    """
    Crawl each council's website and print the pages most similar to a prompt.

    For every entry in `council_names`, the first organisation in the global
    `data` frame whose "name" contains that text is selected. Its website is
    crawled, the crawled pages are ranked with `get_similar_urls`, and the top
    results are printed with `pretty_print_results`.

    Parameters:
    - council_names (iterable of str): Full or partial organisation names.
    - max_depth (int): Passed to `Crawler` as `max_depth`.
    - keyword_scorer (dict, optional): Passed to `Crawler` as `keyword_scorer`.
    - filters (list of dict, optional): Passed to `Crawler` as `filters`.
    - prompt (str): Prompt template. A "{}" placeholder, if present, is
      replaced with the matched organisation's full name for each council.
    - cache_enabled (bool): Passed to `Crawler` as `cache_enabled`.
    - num_results (int): Maximum number of ranked pages to print per council.

    Returns:
    - None. Results are printed to stdout.
    """
    crawler = Crawler(
        max_depth=max_depth,
        keyword_scorer=keyword_scorer,
        filters=filters,
        cache_enabled=cache_enabled,
    )

    for council_name in council_names:
        # literal=True matches the name as plain text, so names containing
        # regex metacharacters such as "(" or "." cannot break or skew the match.
        council_data = data.filter(
            pl.col("name").str.contains(council_name, literal=True)
        )
        if len(council_data) == 0:
            # Report and move on rather than failing with an IndexError below.
            print(
                "=" * 40
                + f"\nNo organisation found matching '{council_name}', skipping.\n"
            )
            continue

        full_name = council_data.get_column("name")[0]
        homepage = council_data.get_column("website")[0]
        # Format into a per-council variable: overwriting `prompt` would drop
        # the "{}" placeholder after the first council, so every later council
        # would be scored against the first council's name.
        council_prompt = prompt.format(full_name.replace("\n", ""))
        print("=" * 40 + f"\nProcessing {full_name}...\n")

        # crawl url
        crawl_data = await crawler.deep_crawl(homepage)

        # rank all crawled pages against this council's prompt
        sorted_df = get_similar_urls(crawl_data, council_prompt)

        pretty_print_results(sorted_df, num_results)

## Our approach

Our approach involves 2 main steps: a web crawler and an embedding similarity search. Below is a description of these steps.

### Web crawler

The web crawler takes a homepage URL of an organisation (council website) and crawls it to look for pages talking about conservation areas.

The crawler will look for links on a single page, put them in a queue and then iteratively check them until it finds what it was looking for or it reaches a stopping criterion, such as maximum depth (how many clicks away from home page). 

In order to save time, we can define some scorers or filters which tell the crawler which pages to prioritise or ignore. In this case, some common patterns of what a user needs to click to get to the page of interest are _"planning"_, _"building"_, _"heritage"_ or _"conservation"_.

The crawler uses a *"best first strategy"*, which utilises the scorers or filters to visit the most relevant pages first, rather than a depth-first or breadth-first search.

The crawler extracts the HTML from the pages and turns them into markdown. This is because it's more readable and easier to work with in the next steps. The crawler returns a list of pairs of (_url_, _markdown_).

### Embedding search

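The markdown of every crawled page is converted into an embedding vector using a sentence-transformer model (`all-MiniLM-L6-v2`). The prompt is embedded with the same model, and the pages are ranked by the cosine similarity between their embedding and the prompt embedding. The pages with the highest similarity scores are printed as the most likely conservation area pages.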

### The next few cells show how to use the tools to find conservation area pages.

You can define your own parameters, such as maximum depth, how many results you want to see and any scorers or filters. Below is a template showing how to define each scorer/filter type correctly - all you need to do is change the keywords or patterns.

You can also define a prompt - this is the text each webpage's embedding is compared against to compute its similarity score. The more similar the prompt is to what a conservation area page usually looks like, the more accurate the results.

Lastly, you can await the `process_council` function, which will run the functionality described above and print the results. You can use it for one council only or for a list of councils.

#### Template for how to define filters or scorers
##### Pick the types you need, adjust their parameters (keywords, threshold, ...) and pass them to the crawler function.
    keyword_scorer = {
        "keywords": ["conservation", "conservation area", "planning", "building", "urban", "heritage", "resident"],
        "weight": 0.8,
    }
        
    filters=[
        {"type": "SEOFilter", "threshold": 0.6, "keywords": ["conservation", "area", "planning", "heritage", "resident"]},
        {"type": "ContentRelevanceFilter", "query": "conservation area or planning data", "threshold": 0.2},
        {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
        {"type": "URLPatternFilter", "patterns": ["*conservation*", "*planning*", "*building*"]},
    ]

### Gedling

In [None]:
max_depth = 6
num_results = 10
filters = [
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {
        "type": "URLPatternFilter",
        "patterns": ["*conservation*", "*planning*", "*building*"],
    },
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

df = await process_council(
    council_names=["Gedling"],
    max_depth=max_depth,
    filters=filters,
    prompt=prompt,
    num_results=num_results,
)

### South Gloucestershire

In [None]:
max_depth = 6
num_results = 10
filters = [
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {
        "type": "URLPatternFilter",
        "patterns": ["*conservation*", "*planning*", "*building*"],
    },
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

await process_council(
    council_names=["South Gloucestershire"],
    max_depth=max_depth,
    filters=filters,
    prompt=prompt,
    num_results=num_results,
)

### Bournemouth, Christchurch and Poole

In [None]:
max_depth = 6
num_results = 10
filters = [
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {
        "type": "URLPatternFilter",
        "patterns": ["*conservation*", "*planning*", "*building*"],
    },
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

await process_council(
    council_names=["Bournemouth, Christchurch and Poole"],
    max_depth=max_depth,
    filters=filters,
    prompt=prompt,
    num_results=num_results,
)

### Warrington

In [None]:
max_depth = 6
num_results = 10
filters = [
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {
        "type": "URLPatternFilter",
        "patterns": ["*conservation*", "*planning*", "*building*"],
    },
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

await process_council(
    council_names=["Warrington"],
    max_depth=max_depth,
    filters=filters,
    prompt=prompt,
    num_results=num_results,
)

### Stoke on Trent

In [None]:
max_depth = 6
num_results = 10
filters = [
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {
        "type": "URLPatternFilter",
        "patterns": ["*conservation*", "*planning*", "*building*"],
    },
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

await process_council(
    council_names=["Stoke"],
    max_depth=max_depth,
    filters=filters,
    prompt=prompt,
    num_results=num_results,
)

### Redbridge

In [None]:
max_depth = 6
num_results = 10
filters = [
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {
        "type": "URLPatternFilter",
        "patterns": ["*conservation*", "*planning*", "*building*"],
    },
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

await process_council(
    council_names=["Redbridge"],
    max_depth=max_depth,
    filters=filters,
    prompt=prompt,
    num_results=num_results,
)

### York

In [None]:
max_depth = 6
num_results = 10
filters = [
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {
        "type": "URLPatternFilter",
        "patterns": [r"*[Cc]onservation*", r"*[Pp]lanning*", r"*[Bb]uilding*"],
    },
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

await process_council(
    council_names=["York"],
    max_depth=max_depth,
    filters=filters,
    prompt=prompt,
    num_results=num_results,
)

### Malvern Hills

In [None]:
max_depth = 6
num_results = 10
filters = [
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {
        "type": "URLPatternFilter",
        "patterns": [r"*[Cc]onservation*", r"*[Pp]lanning*", r"*[Bb]uilding*"],
    },
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

await process_council(
    council_names=["Malvern Hills"],
    max_depth=max_depth,
    filters=filters,
    prompt=prompt,
    num_results=num_results,
)

## Multiple councils

You can pass any list of councils; they will be processed sequentially, one after another.

In [None]:
# Pick a seeded random sample of councils for the multi-council demo.
np.random.seed(4321)
num_examples = 10
# Sample row indices WITHOUT replacement so the same council is never selected
# (and crawled) twice; randint could return duplicate indices. The sample size
# is capped at the number of available rows.
example_idx = np.random.choice(
    len(data), size=min(num_examples, len(data)), replace=False
)
examples = data[example_idx]
examples

In [None]:
max_depth = 6
num_results = 10
filters = [
    {"type": "ContentTypeFilter", "allowed_types": ["text/html"]},
    {
        "type": "URLPatternFilter",
        "patterns": ["*conservation*", "*planning*", "*building*"],
    },
]

# please write the prompt such that there is a curly bracket where the council
# name will be inserted
prompt = """
The text discusses conservation areas from the {} and includes data on 
planning data, areas, interactive maps, appraisals, notices, boundaries, 
links and similar.
"""

await process_council(
    council_names=examples.get_column("name"),
    max_depth=max_depth,
    filters=filters,
    prompt=prompt,
    num_results=num_results,
)