In [None]:
import requests
import json
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
import os

## Getting all relevant URLs

In [None]:
# Listing pages that are crawled for links, one per content type.
BASE_CLAUSE_URL, BASE_GLOSSARY_URL, BASE_GUIDE_URL = (
    "https://chancerylaneproject.org/clauses/",
    "https://chancerylaneproject.org/glossary/",
    "https://chancerylaneproject.org/guides/",
)

In [None]:
def get_all_urls(base_url, filter_function, total_pages, timeout=30):
    """
    Collect links from the paginated listing at ``base_url``.

    Pages are requested as ``{base_url}?pagenumber=N`` for N = 1, 2, ... up to
    ``total_pages``. Crawling stops early at the first page that does not
    answer with HTTP 200 or that contains no link accepted by
    ``filter_function``.

    Args:
        base_url: Listing URL; also the base against which relative hrefs are
            resolved.
        filter_function: Callable that receives the raw ``href`` string of an
            anchor and returns a truthy value for links to keep.
        total_pages: Maximum number of listing pages to request. Values below
            1 request nothing and yield an empty list.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the crawl indefinitely.

    Returns:
        list[str]: Absolute URLs, de-duplicated, in order of first appearance.

    Raises:
        requests.exceptions.RequestException: Propagated from ``requests.get``
            (connection errors, timeouts).
    """
    urls = []
    seen = set()
    page = 1

    while page <= total_pages:
        url = f"{base_url}?pagenumber={page}"
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.content, "html.parser")
        page_urls = [
            urljoin(base_url, link["href"])
            for link in soup.find_all("a", href=True)
            if filter_function(link["href"])
        ]

        if not page_urls:
            break

        # Track seen URLs explicitly instead of using set arithmetic: the
        # result then keeps document order and is identical on every run.
        for page_url in page_urls:
            if page_url not in seen:
                seen.add(page_url)
                urls.append(page_url)
        print(f"Valid URLs on page {page}: {page_urls}")

        page += 1

    return urls

### Clause URLs

In [None]:
def clause_filter(href):
    """
    Decide whether ``href`` points at an individual clause page.

    A link is kept when it contains ``/clauses/``, carries no ``?`` (which
    rules out pagination links such as ``?pagenumber=2``), and is not the
    clause listing page itself. The listing page is recognised in both its
    relative (``/clauses/``) and absolute (``https://.../clauses/``) form,
    since anchors may use either.

    Args:
        href: Raw ``href`` attribute value of an anchor.

    Returns:
        bool: True if the link should be collected.
    """
    if "/clauses/" not in href or "?" in href:
        return False
    # Drop the trailing slash so relative and absolute listing URLs compare alike.
    return not href.rstrip("/").endswith("/clauses")

In [None]:
# Crawl at most 15 listing pages of the clause index.
clause_urls = get_all_urls(
    base_url=BASE_CLAUSE_URL,
    filter_function=clause_filter,
    total_pages=15,
)

In [None]:
# Number of clause URLs collected by the crawl above.
len(clause_urls)

### Glossary URLs

In [None]:
def glossary_filter(href):
    """
    Decide whether ``href`` points at an individual glossary page.

    A link is kept when it contains ``/glossary/``, does not end with a bare
    ``?``, and is not the glossary listing page itself. The listing page is
    recognised in both its relative (``/glossary/``) and absolute
    (``https://.../glossary/``) form, since anchors may use either.

    NOTE(review): only a trailing ``?`` is rejected here, whereas
    ``clause_filter`` rejects any href containing ``?`` -- confirm whether
    links with a full query string are meant to be kept.

    Args:
        href: Raw ``href`` attribute value of an anchor.

    Returns:
        bool: True if the link should be collected.
    """
    if "/glossary/" not in href or href.endswith("?"):
        return False
    # Drop the trailing slash so relative and absolute listing URLs compare alike.
    return not href.rstrip("/").endswith("/glossary")

In [None]:
# The glossary index is crawled as a single listing page.
glossary_urls = get_all_urls(
    base_url=BASE_GLOSSARY_URL,
    filter_function=glossary_filter,
    total_pages=1,
)

In [None]:
# Number of glossary URLs collected by the crawl above.
len(glossary_urls)

### Guide URLs

In [None]:
def guide_filter(href):
    """
    Decide whether ``href`` points at an individual guide page.

    A link is kept when it contains ``/guides/``, does not end with a bare
    ``?``, and is not the guide listing page itself. The listing page is
    recognised in both its relative (``/guides/``) and absolute
    (``https://.../guides/``) form, since anchors may use either.

    NOTE(review): only a trailing ``?`` is rejected here, whereas
    ``clause_filter`` rejects any href containing ``?`` -- confirm whether
    links with a full query string are meant to be kept.

    Args:
        href: Raw ``href`` attribute value of an anchor.

    Returns:
        bool: True if the link should be collected.
    """
    if "/guides/" not in href or href.endswith("?"):
        return False
    # Drop the trailing slash so relative and absolute listing URLs compare alike.
    return not href.rstrip("/").endswith("/guides")

In [None]:
# The guide index is crawled as a single listing page.
guide_urls = get_all_urls(
    base_url=BASE_GUIDE_URL,
    filter_function=guide_filter,
    total_pages=1,
)

In [None]:
# Number of guide URLs collected by the crawl above.
len(guide_urls)

### Combining All URLs

In [None]:
# Single work list for the scraper: clauses first, then glossary terms, then guides.
all_urls = [*clause_urls, *glossary_urls, *guide_urls]

## Scraping

In [None]:
# Directory that receives the JSON files written by save_as_json.
output_dir = "output_jsons"
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
os.makedirs(output_dir, exist_ok=True)

In [None]:
def _body_text(soup):
    """Return the stripped text of ``<body>`` (text nodes joined by newlines), or "" if there is no body."""
    body = soup.find("body")
    return body.get_text(separator="\n").strip() if body is not None else ""


def scrape_url_content(url, timeout=30):
    """
    Fetch ``url`` and extract its title and text content.

    The page type is derived from the URL path: ``/glossary/`` gives
    ``"glossary-term"``, ``/clauses/`` gives ``"clause"``, and anything else
    (guide pages included) gives ``"unknown"``.

    Args:
        url: Absolute URL of the page to scrape.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the scrape indefinitely.

    Returns:
        dict | None: ``{"url", "post_type", "title", "content"}`` on success,
        or ``None`` when the request fails or returns an HTTP error status
        (the failure is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch URL {url}: {e}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    # Determine the type of page from its URL path.
    if "/glossary/" in url:
        post_type = "glossary-term"
    elif "/clauses/" in url:
        post_type = "clause"
    else:
        post_type = "unknown"

    title_tag = soup.find("title")
    title = title_tag.get_text(strip=True) if title_tag is not None else "No Title"

    if post_type == "glossary-term":
        # TODO: ".definition-section" is a placeholder selector; replace it
        # with the actual one. Content is empty when nothing matches.
        definitions = [
            section.get_text(separator="\n").strip()
            for section in soup.select(".definition-section")
        ]
        cleaned_content = "\n\n".join(definitions)

    elif post_type == "clause":
        # TODO: the whole <body> is used; replace with the actual content selector.
        # Strip bracketed "[Drafting note: ...]" passages. The match is
        # non-greedy, so it ends at the first closing bracket.
        cleaned_content = re.sub(
            r"\[\s*Drafting note:.*?\]",
            "",
            _body_text(soup),
            flags=re.DOTALL | re.IGNORECASE,
        )

    else:
        # Fallback for unknown post types: plain body text.
        cleaned_content = _body_text(soup)

    return {
        "url": url,
        "post_type": post_type,
        "title": title,
        "content": cleaned_content,
    }

In [None]:
def save_as_json(data, output_dir):
    """
    Write ``data`` to ``<output_dir>/<name>.json`` and print the path.

    The file name is built from the first 50 characters of ``data["title"]``.
    Whitespace and the characters ``\\ / : * ? " < > |`` are each replaced by
    an underscore so the name is valid on common file systems; an empty title
    falls back to ``untitled``.

    NOTE: titles that share the same first 50 characters map to the same file,
    so the later write overwrites the earlier one.

    Args:
        data: JSON-serialisable dict containing at least a ``"title"`` string.
        output_dir: Existing directory that receives the file.

    Raises:
        KeyError: If ``data`` has no ``"title"`` entry.
    """
    stem = re.sub(r'[\\/:*?"<>|\s]', "_", data["title"][:50]) or "untitled"
    filepath = os.path.join(output_dir, f"{stem}.json")

    with open(filepath, "w", encoding="utf-8") as file:
        # The file is UTF-8, so write non-ASCII text as-is rather than as
        # \uXXXX escapes; the parsed JSON is identical.
        json.dump(data, file, indent=4, ensure_ascii=False)

    print(f"Saved JSON: {filepath}")

In [None]:
# Scrape every collected page and persist each successful result as JSON.
for url in all_urls:
    scraped_data = scrape_url_content(url)
    if not scraped_data:
        # Nothing usable came back for this URL; move on to the next one.
        continue
    save_as_json(scraped_data, output_dir)

In [None]:
def _stripped_text(node):
    """Return the whitespace-stripped text of a parsed node, or None when the node is missing."""
    return node.get_text(strip=True) if node is not None else None


def scrape_relevant(soup):
    """
    Extract the structured fields of a clause page from its parsed HTML.

    Each field is looked up independently. A field whose element is not
    present on the page is returned as ``None`` instead of raising, so pages
    that lack some sections can still be processed.

    Args:
        soup: Parsed page (``BeautifulSoup`` object).

    Returns:
        dict: Keys ``"jurisdiction"``, ``"name"``, ``"title"``, ``"subhead"``,
        ``"clause summary"``, ``"clause text"`` and ``"definitions text"``;
        each value is a string or ``None``.
    """
    jurisdiction = _stripped_text(soup.find("span", class_="cfc-taxonomy"))
    name = _stripped_text(soup.find("p", class_="cfc-page-header__kicker"))
    title = _stripped_text(soup.find("h1", class_="cfc-page-header__title"))
    subhead = _stripped_text(soup.find("div", class_="cfc-page-header__text"))

    # Summary: the first paragraph after the "What this clause does" heading.
    summary_heading = soup.find("h2", id="h-what-this-clause-does")
    clause_does = (
        _stripped_text(summary_heading.find_next("p"))
        if summary_heading is not None
        else None
    )

    # Clause body: text of the callout box, up to the download link.
    clauses_div = soup.find(
        "div", class_="cfc-pattern-clause-callout has-navy-4-background-color"
    )
    clauses_text = None
    if clauses_div is not None:
        clauses_text = clauses_div.get_text(separator="\n").split(
            "Download this clause"
        )[0]

    # Definitions: paragraphs and list items after the "Definitions" heading,
    # cut off where the feedback section starts. "ul" is deliberately not
    # collected: its text already contains every "li", so including both
    # would repeat each list item.
    definitions_heading = soup.find("h2", id="h-definitions")
    definitions_text = None
    if definitions_heading is not None:
        definitions_section = definitions_heading.find_all_next(
            ["p", "li"], limit=50
        )  # Increase limit as necessary
        definitions_text = "\n".join(
            element.get_text(strip=True) for element in definitions_section
        ).split("We'd like to hear")[0]

    return {
        "jurisdiction": jurisdiction,
        "name": name,
        "title": title,
        "subhead": subhead,
        "clause summary": clause_does,
        "clause text": clauses_text,
        "definitions text": definitions_text,
    }

In [None]:
# Fetch every collected page again and print its structured fields.
for url in all_urls:
    try:
        # The timeout keeps one stalled connection from hanging the loop.
        response = requests.get(url, timeout=30)
        # Treat HTTP error pages as failures instead of parsing them.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Same handling as scrape_url_content: report the failure and continue.
        print(f"Failed to fetch URL {url}: {e}")
        continue
    soup = BeautifulSoup(response.content, "html.parser")
    data = scrape_relevant(soup)
    print(json.dumps(data, indent=4))