# Document URLs

Every entity should have an associated document: the *legal instrument* that defines the entity. Its URL should be stored in the entity's `document_url` field. This notebook uses the web crawler and search functionality included in the repository to find the correct document URL for a sample of conservation areas.

In [None]:
import logging
import urllib

import numpy as np
import polars as pl
import requests
from pdfminer.high_level import extract_text

from data_quality_utils import Crawler

logging.basicConfig(level=logging.ERROR)

# 1. Load Sample

In [None]:
# QUERY_DATA gates the datasette download further down: when it is True the
# response is written to DATA_FILE; either way DATA_FILE is then read back.
QUERY_DATA = False
DATA_FILE = "datasette_data.csv"

# Sample of entities used throughout the notebook.
test_cases_df = pl.read_csv("data/test_data.csv")

In [None]:
if QUERY_DATA:
    # `import urllib` on its own does not guarantee that the `parse` submodule
    # is loaded, so import it explicitly before calling `urllib.parse.urlencode`.
    import urllib.parse

    # get data from datasette
    datasette_base_url = "https://datasette.planning.data.gov.uk/digital-land.csv"

    # Render the IN list by hand. Formatting a Python tuple gives "(1,)" for a
    # single entity, and that trailing comma is not valid SQL.
    entity_list = ", ".join(
        repr(entity) for entity in test_cases_df["entity"].to_list()
    )

    # NOTE(review): a later cell filters `data` on a "name" column, which this
    # query does not select — confirm the query and the cached CSV agree.
    query = """
    select 
    l.entity,
    o.website,
    o.organisation 
    from lookup as l
    left join organisation as o
    on l.organisation=o.organisation 
    where l.entity in ({})
    and o.website != 'https://historicengland.org.uk'
    """.format(
        entity_list
    )
    encoded_query = urllib.parse.urlencode({"sql": query})

    # NOTE(review): the credentials are hard-coded; consider reading them from
    # the environment instead.
    r = requests.get(
        f"{datasette_base_url}?{encoded_query}",
        auth=("user", "pass"),
        timeout=60,
    )
    # Fail loudly instead of saving an HTTP error page as if it were CSV data.
    r.raise_for_status()

    with open(DATA_FILE, "wb") as f_out:
        f_out.write(r.content)

# Whether or not a fresh download happened, read the data back from DATA_FILE
# and attach it to the sample by entity.
data = pl.read_csv(DATA_FILE)
test_cases_df = test_cases_df.join(data, on="entity")

In [None]:
# Display the sample after it has been joined with `data` on "entity".
test_cases_df

In [None]:
def get_pdf_text_from_url(urls):
    """Download each PDF in ``urls`` and return the embeddings of its text.

    Despite the name, the return value is not the raw text: the text extracted
    from every PDF is passed through ``embedding_model.encode`` and the
    resulting embeddings are returned.

    NOTE(review): ``embedding_model`` is read from module scope and is not
    defined in the visible cells — confirm it exists before calling this.

    Args:
        urls: Iterable of URLs pointing at PDF documents.

    Returns:
        ``np.ndarray`` built from the per-PDF embeddings, in the order of
        ``urls``.

    Raises:
        requests.HTTPError: If a download answers with an HTTP error status.
        requests.Timeout: If a download does not respond in time.
    """
    import io

    pdf_embeddings = []
    for url in urls:
        response = requests.get(url, timeout=60)
        # Do not hand an HTML error page to the PDF parser.
        response.raise_for_status()

        # Parse straight from memory: this avoids leaving a shared
        # "pdf_temp_file.pdf" behind on disk after every call.
        pdf_text = extract_text(io.BytesIO(response.content))
        pdf_embeddings.append(embedding_model.encode(pdf_text))
    return np.array(pdf_embeddings)

In [None]:
def pretty_print_results(sorted_df, num_results):
    # print top n urls with similarity scores
    print("\nTop Similar PDFs:\n" + "=" * 40)
    for i in range(min(num_results, len(sorted_df))):
        url = sorted_df.get_column("url")[i]
        score = sorted_df.get_column("similarity")[i]
        print(f"{i+1}. {url.ljust(60)} | Similarity: {score:.4f}")

## Our approach

Our approach involves 2 main steps: a web crawler and an embedding similarity search. Below is a description of these steps.

### Web crawler

The web crawler takes a homepage URL of an organisation (council website) and crawls it to look for pages talking about conservation areas.

The crawler collects the links on a page, puts them in a queue and then visits them one by one until it finds what it is looking for or reaches a stopping criterion, such as a maximum depth (how many clicks away from the home page).

In order to save time, we can define some scorers or filters which tell the crawler which pages to prioritise or ignore. In this case, some common patterns of what a user needs to click to get to the page of interest are _"planning"_, _"building"_, _"heritage"_ or _"conservation"_.

The crawler uses a *"best-first strategy"*, which uses the scorers or filters to visit the most relevant pages first, rather than a depth-first or breadth-first search.

The crawler extracts the HTML from the pages and turns them into markdown. This is because it's more readable and easier to work with in the next steps. The crawler returns a list of pairs of (_url_, _markdown_).

### Embedding search

To be filled

### Gedling - full example retrieving similar PDFs

In [None]:
# finds all PDFs at the URL patterns, extracts text and uses embedding similarity to search for best matches
max_depth = 6
num_results = 10
filters = [
    {"type": "ContentTypeFilter", "allowed_types": ["text/html", "application/pdf"]},
    {
        "type": "URLPatternFilter",
        "patterns": ["*conservation*", "*planning*", "*building*", "*heritage*"],
    },
]

prompt = """
The text describes a conservation area. It includes information about
where the conservation area is, its history, archeology, boundaries
and any additional planning data. Usually it includes images or maps
showing the boundary of the conservation area.
"""

await process_council(
    council_names=["Gedling"],
    max_depth=max_depth,
    filters=filters,
    prompt=prompt,
    num_results=num_results,
    crawl_type="pdf",
)

### Retrieving All Test PDFs

In [None]:
async def get_council_pdfs(
    council_names,
    max_depth=6,
    keyword_scorer=None,
    filters=None,
    cache_enabled=False,
    crawl_type="html",
):
    """Crawl each named council's website and print what the crawler returns.

    Every entry of ``council_names`` is matched as a literal substring against
    the "name" column of the module-level ``data`` frame. The first matching
    row supplies the display name and the homepage ("website" column) to crawl.
    Councils without a match, or without a homepage, are reported and skipped
    so that one bad name does not abort the whole batch.

    NOTE(review): this relies on ``data`` having a "name" column — confirm the
    CSV loaded into ``data`` provides it.

    Args:
        council_names: Iterable of (partial) council names to look up.
        max_depth: Forwarded to ``Crawler``.
        keyword_scorer: Forwarded to ``Crawler``.
        filters: Forwarded to ``Crawler``.
        cache_enabled: Forwarded to ``Crawler``.
        crawl_type: Forwarded to ``Crawler``.

    Returns:
        None. Each item yielded by the crawl result is printed.
    """
    crawler = Crawler(
        max_depth=max_depth,
        keyword_scorer=keyword_scorer,
        filters=filters,
        cache_enabled=cache_enabled,
        crawl_type=crawl_type,
    )

    for council_name in council_names:
        # literal=True matches the name as plain text rather than as a regex,
        # so names containing characters such as "(" or "." behave as written.
        council_data = data.filter(
            pl.col("name").str.contains(council_name, literal=True)
        )
        if council_data.height == 0:
            print(f"No organisation found matching '{council_name}', skipping.")
            continue

        full_name = council_data.get_column("name")[0]
        homepage = council_data.get_column("website")[0]
        if not homepage:
            print(f"No website recorded for {full_name}, skipping.")
            continue
        print("=" * 40 + f"\nCrawling {full_name}...\n")

        # crawl url
        crawl_data = await crawler.deep_crawl(homepage)

        for url in crawl_data:
            print(url)

In [None]:
# only searches for all PDFs at the URL patterns
max_depth = 6
num_results = 10
filters = [
    {"type": "ContentTypeFilter", "allowed_types": ["text/html", "application/pdf"]},
    {
        "type": "URLPatternFilter",
        "patterns": ["*conservation*", "*planning*", "*building*", "*heritage*"],
    },
]

await get_council_pdfs(
    council_names=["South Gloucestershire"],
    max_depth=max_depth,
    filters=filters,
    crawl_type="pdf",
)

### Bournemouth, Christchurch and Poole

In [None]:
# only searches for all PDFs at the URL patterns
max_depth = 6
num_results = 10
filters = [
    {"type": "ContentTypeFilter", "allowed_types": ["text/html", "application/pdf"]},
    {
        "type": "URLPatternFilter",
        "patterns": ["*conservation*", "*planning*", "*building*", "*heritage*"],
    },
]

await get_council_pdfs(
    council_names=["Bournemouth, Christchurch and Poole"],
    max_depth=max_depth,
    filters=filters,
    crawl_type="pdf",
)

### Warrington

In [None]:
# only searches for all PDFs at the URL patterns
max_depth = 6
num_results = 10
filters = [
    {"type": "ContentTypeFilter", "allowed_types": ["text/html", "application/pdf"]},
    {
        "type": "URLPatternFilter",
        "patterns": ["*conservation*", "*planning*", "*building*", "*heritage*"],
    },
]

await get_council_pdfs(
    council_names=["Warrington"], max_depth=max_depth, filters=filters, crawl_type="pdf"
)

### Stoke on Trent

In [None]:
# only searches for all PDFs at the URL patterns
max_depth = 6
num_results = 10
filters = [
    {"type": "ContentTypeFilter", "allowed_types": ["text/html", "application/pdf"]},
    {
        "type": "URLPatternFilter",
        "patterns": ["*conservation*", "*planning*", "*building*", "*heritage*"],
    },
]

await get_council_pdfs(
    council_names=["Stoke"], max_depth=max_depth, filters=filters, crawl_type="pdf"
)