# document_url Analysis

**Author:** Adam Miriam\
**Date:** 23rd Feb 2024\
**Data Scope:** One of the selected primary datasets\
**Report Type:** Exploratory

This Jupyter notebook collects the document-url values from the selected dataset and tests whether they are live, accessible links.

In [None]:
import urllib
import urllib.parse
import urllib.request

import ipywidgets as widgets
import pandas as pd
import urllib3

This cell defines the selectable datasets and a function that loads the entries of the selected dataset along with their document-url values.

In [None]:
# Selectable datasets: the key is the human-readable label, the value is the
# dataset slug that get_document_urls places in the CSV download URL.
dataset_options = {
    "Article 4 Direction": "article-4-direction",
    "Conservation Area Document": "conservation-area-document",
    "Tree Preservation Order": "tree-preservation-order",
}

def get_document_urls(dataset_options):
    """Load one dataset's CSV and keep the identifying and document-url columns.

    Args:
        dataset_options: Dataset slug (for example ``"article-4-direction"``)
            that is inserted into the CSV download URL.

    Returns:
        A DataFrame restricted to the ``prefix``, ``reference``,
        ``organisation-entity`` and ``document-url`` columns.
    """
    wanted_columns = ["prefix", "reference", "organisation-entity", "document-url"]
    csv_url = f"https://files.planning.data.gov.uk/dataset/{dataset_options}.csv"
    return pd.read_csv(csv_url)[wanted_columns]

# Dropdown whose options come from the dataset_options mapping defined above
# (label -> dataset slug).
collection_dropdown = widgets.Dropdown(
    options=dataset_options,
    description="Select dataset combination:",
)

This cell contains a function that returns the entries whose document-url is null (missing).

In [None]:
def get_entries_with_null_document_urls(dataset_options):
    """Return the entries of a dataset that have no document-url.

    The result is also stored in the module-level
    ``null_dataset_document_urls`` so that a later cell can export it.

    Args:
        dataset_options: Dataset slug passed through to ``get_document_urls``.

    Returns:
        A DataFrame of the rows whose ``document-url`` is missing, re-indexed
        from zero.
    """
    global null_dataset_document_urls
    all_entries = get_document_urls(dataset_options)
    url_is_missing = all_entries["document-url"].isna()
    null_dataset_document_urls = all_entries[url_is_missing].reset_index(drop=True)
    return null_dataset_document_urls
    
# Hand the label -> slug mapping to interact so the null-URL query is driven
# by an interactive dataset selector.
widgets.interact(get_entries_with_null_document_urls, dataset_options=dataset_options)
# NOTE(review): collection_dropdown is not the argument passed to interact
# above, and initial_organisation is not read in any visible cell — confirm
# whether this assignment is still needed.
initial_organisation = collection_dropdown.value

In [None]:
# Ask whether the table of entries with a missing document-url should be
# written to a CSV file in the current working directory.
download = input("Do you want to download the table? (yes/no): ")

# Ignore surrounding whitespace and accept the short form "y", so that an
# answer such as "yes " or "Y" is not silently treated as "no".
if download.strip().lower() in ("yes", "y"):
    null_dataset_document_urls.to_csv("null_dataset_document_urls.csv", index=False)
    print("Query result downloaded as 'null_dataset_document_urls.csv'")

The following cell contains a function which checks every provided document-url of the chosen dataset and returns all entries whose request did not return HTTP status 200, including those that raised exceptions.

In [None]:
def check_url(url, timeout=30):
    """Open *url* and report either its HTTP status or the failure.

    Args:
        url: The URL to request.
        timeout: Seconds to wait on a blocking network operation before
            giving up, so a single unresponsive host cannot stall the run.

    Returns:
        The response status (``200`` for a healthy link) when the request
        succeeds, otherwise the exception instance that was raised.
    """
    try:
        # The context manager closes the connection once the status is read,
        # instead of leaving one open socket per checked URL.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return response.status
    except Exception as error:
        # Deliberately broad: malformed URLs, HTTP errors, DNS failures and
        # timeouts all mark the link as a problem, and the caller records the
        # exception itself as the outcome rather than aborting the check.
        return error
    
def check_document_urls(dataset_options):
    """Check every document-url of a dataset and return the ones that fail.

    The result is also stored in the module-level
    ``problem_dataset_document_urls`` so that a later cell can export it.

    Args:
        dataset_options: Dataset slug passed through to ``get_document_urls``.

    Returns:
        A DataFrame of the rows whose ``response`` column (the value returned
        by ``check_url``) is anything other than ``200``.
    """
    global problem_dataset_document_urls
    # Drop only the rows that lack a document-url. A bare dropna() would also
    # discard rows with a URL but a missing prefix, reference or
    # organisation-entity, leaving those links unchecked.
    checked = (
        get_document_urls(dataset_options)
        .dropna(subset=["document-url"])
        .reset_index(drop=True)
    )
    checked["response"] = checked["document-url"].apply(check_url)
    problem_dataset_document_urls = checked[checked["response"] != 200]
    return problem_dataset_document_urls

# Hand the label -> slug mapping to interact so the URL check is driven by an
# interactive dataset selector.
widgets.interact(check_document_urls, dataset_options=dataset_options)
# NOTE(review): collection_dropdown is not the argument passed to interact
# above, and initial_organisation is not read in any visible cell — confirm
# whether this assignment is still needed.
initial_organisation = collection_dropdown.value

In [None]:
# Ask whether the table of entries whose document-url check failed should be
# written to a CSV file in the current working directory.
download = input("Do you want to download the table? (yes/no): ")

# Ignore surrounding whitespace and accept the short form "y", so that an
# answer such as "yes " or "Y" is not silently treated as "no".
if download.strip().lower() in ("yes", "y"):
    problem_dataset_document_urls.to_csv("problem_dataset_document_urls.csv", index=False)
    print("Query result downloaded as 'problem_dataset_document_urls.csv'")