In [None]:
import urllib
import urllib.parse
from collections import Counter
from datetime import datetime as dt

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display

In [None]:
# --- Notebook inputs ---------------------------------------------------------
# Datasets whose most recent resources are inspected for issues.
dataset_input = [
    'article-4-direction',
    'article-4-direction-area',
    'brownfield-land',
    'conservation-area',
    'listed-building-outline',
    'tree',
    'tree-preservation-order',
    'tree-preservation-zone',
]

# Severities to keep in the output tables, e.g. ["error", "warning", "info", "notice"].
# An empty list keeps every severity.
severity_input = []

# Issue types to query, entered as a comma-separated list; a blank answer means
# "all issue types" (the full list is filled in later from the issue_type table).
raw_issue_types = input("Please enter a single issue type or list of issue types to search for, or leave blank for all issue types. \n List should seperated by commas with no spaces (e.g: unknown entity,invalid geometry - fixed): ")
# Trim whitespace around each entry and drop empty entries, so that "a, b" or a
# trailing comma does not yield issue types that can never match anything.
issue_type_input = [part.strip() for part in raw_issue_types.split(",") if part.strip()]
if issue_type_input:
    print("Issue types chosen: ", issue_type_input)

# Optional line number to restrict the issue queries to; '' disables the filter.
line_number_input = ''

# Base URL of the Datasette instance that every query in this notebook goes through.
datasette_url = "https://datasette.planning.data.gov.uk/"

# Collect list of organisations
# (plain string: the SQL has no placeholders, so no f-string is needed)
params = urllib.parse.urlencode({
        "sql": """
        select organisation, name, entity as organisation_entity
        from organisation
        """,
        "_size": "max"
        })
# Build the request from datasette_url so the host is configured in one place only.
url = f"{datasette_url}digital-land.csv?{params}"
organisations_df = pd.read_csv(url)

def _query_digital_land(sql):
    """Run `sql` against the digital-land Datasette database and return the CSV result as a DataFrame."""
    query_string = urllib.parse.urlencode({"sql": sql, "_size": "max"})
    return pd.read_csv(f"{datasette_url}digital-land.csv?{query_string}")


dataset_dfs = []           # one DataFrame per dataset that returned at least one endpoint
output_dataset_names = []  # dataset names, aligned index-for-index with dataset_dfs
for dataset in dataset_input:
    dataset_organisations_dfs = []
    for organisation in organisations_df.itertuples():
        # Most recent (by source entry_date) source row without an end date for
        # this organisation and dataset: row_number() = 1 keeps a single row.
        endpoint_df = _query_digital_land(f"""
            select x.resource, x.endpoint, x.organisation, x.name, x.entry_date, x.endpoint_url, x.collection
            from (
                select re.resource, re.endpoint, s.organisation, o.name, s.collection, s.entry_date, e.endpoint_url,
                    row_number() over (partition by s.organisation order by s.entry_date desc) as row_number
                from resource_endpoint re
                inner join endpoint e
                on re.endpoint = e.endpoint
                inner join source s
                on e.endpoint = s.endpoint
                inner join resource_organisation ro
                on re.resource = ro.resource
                inner join organisation o
                on ro.organisation = o.organisation
                inner join source_pipeline sp
                on s.source = sp.source
                inner join resource r
                on re.resource = r.resource
                where sp.pipeline = '{dataset}'
                and s.organisation = '{organisation.organisation}'
                and s.end_date = ''
            ) x
            where x.row_number = 1
            """)
        if endpoint_df.empty:
            continue

        # Double any single quote so a URL containing ' cannot break the SQL string literal.
        endpoint_url = str(endpoint_df.iloc[0]['endpoint_url']).replace("'", "''")
        # Newest resource (by resource entry_date) collected from that endpoint URL.
        resource_df = _query_digital_land(f"""
            select
                r.resource
            from
                endpoint e
                inner join resource_endpoint re on e.endpoint = re.endpoint
                inner join resource r on re.resource = r.resource
            where
                e.endpoint_url='{endpoint_url}'
            order by
                r.entry_date desc
            limit 1
            """)
        if not resource_df.empty:
            # Write through a single .loc call: the previous chained form
            # `endpoint_df.iloc[0]['resource'] = ...` assigns into a temporary
            # row object and is not guaranteed to update endpoint_df itself.
            endpoint_df.loc[endpoint_df.index[0], 'resource'] = resource_df.iloc[0]['resource']
        # When no newer resource is found the resource from the first query is kept.
        dataset_organisations_dfs.append(endpoint_df)

    # Only datasets with at least one endpoint are recorded, so dataset_dfs can be
    # shorter than dataset_input; output_dataset_names tracks which ones made it.
    if not dataset_organisations_dfs:
        print("\033[1m No results found for ", dataset)
    else:
        dataset_organisations_dfs = pd.concat(dataset_organisations_dfs)
        dataset_dfs.append(dataset_organisations_dfs)
        output_dataset_names.append(dataset)

In [None]:
# Look-up table of issue types with their description, severity and responsibility;
# it is merged onto the per-resource issues in a later cell.
issue_type_sql = """
    select description, issue_type, severity, responsibility
    from issue_type
    """
params = urllib.parse.urlencode({"sql": issue_type_sql, "_size": "max"})

url = f"{datasette_url}digital-land.csv?{params}"
issue_type_df = pd.read_csv(url)

In [None]:
def get_issues_by_issue_type(resource, issue_type, query, dataset=None):
    """Fetch the issues of one type recorded against one resource.

    Parameters
    ----------
    resource : str
        Resource identifier matched against `issue.resource`.
    issue_type : str
        Issue type matched exactly against `issue.issue_type`.
    query : str
        Extra SQL appended verbatim to the where-clause (for example a
        line-number filter), or "" for no extra condition.
    dataset : str, optional
        Name of the dataset whose Datasette database (`<dataset>.csv`) is queried.
        When omitted, the notebook-level loop state `output_dataset_names[idx]`
        is used so that existing three-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns: field, issue_type, dataset, resource, value, line_number.
    """
    if dataset is None:
        # `idx` counts positions in dataset_dfs, which lines up with
        # output_dataset_names -- NOT with dataset_input, because datasets
        # without any results are skipped when dataset_dfs is built.
        dataset = output_dataset_names[idx]
    params = urllib.parse.urlencode({
        "sql": f"""
        select field,issue_type,dataset,resource,value, line_number
        from issue
        where resource = '{resource}'
        and issue_type = '{issue_type}'
        {query}
        """,
        "_size": "max"
        })
    url = f"{datasette_url}{dataset}.csv?{params}"
    issues_by_type_df = pd.read_csv(url)
    return issues_by_type_df


# A blank input means "every known issue type"; resolve that once, up front,
# instead of re-checking it for every resource.
if issue_type_input == []:
    issue_type_input = issue_type_df['issue_type'].tolist()

# Optional extra where-clause shared by every issue query.
query = ""
if line_number_input:
    query = f" and line_number = '{line_number_input}'"

issues_dfs = []  # one issues DataFrame per entry of dataset_dfs, in the same order
for idx, dataset_df in enumerate(dataset_dfs):
    # Name of the dataset this frame belongs to (see the alignment note above).
    dataset_name = output_dataset_names[idx]

    # One request per (resource, issue type) pair.
    issues = [
        get_issues_by_issue_type(resource, issue_type, query, dataset=dataset_name)
        for resource in dataset_df['resource'].tolist()
        for issue_type in issue_type_input
    ]
    df1 = pd.concat(issues, ignore_index=True)
    # Attach description / severity / responsibility to every issue row.
    issues_with_type = df1.merge(issue_type_df, left_on='issue_type', right_on='issue_type')
    issues_dfs.append(issues_with_type)

In [None]:
# Column order of the final per-dataset issue tables.
_OUTPUT_COLUMNS = [
    'resource', 'organisation', 'name', 'dataset', 'entry_date', 'field',
    'line_number', 'issue_type', 'value', 'severity', 'responsibility',
    'description', 'endpoint', 'endpoint_url',
]

# Join each dataset's endpoint rows to its issues (matched on resource), put the
# columns in display order, and apply the optional notebook-level severity filter.
output_dfs = []
for position in range(len(dataset_dfs)):
    joined = dataset_dfs[position].merge(issues_dfs[position], on='resource')
    joined = joined.reindex(columns=_OUTPUT_COLUMNS).reset_index(drop=True)
    if severity_input:
        wanted = joined['severity'].isin(severity_input)
        joined = joined.loc[wanted]
    output_dfs.append(joined)

In [None]:
# Let pandas render up to 1000 rows and every column when a table is displayed.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = None

def filter_column_by_value(df, column, value):
    """Return the rows of `df` whose `column` entry is one of the entries in `value`.

    `value` is a collection of accepted entries (it is passed to `Series.isin`).
    """
    keep = df[column].isin(value)
    return df[keep]

def compute_output_df(dataset, error, warning, info, notice, search):
    """Return the issues table for `dataset`, narrowed by severity and search text.

    Parameters
    ----------
    dataset : str
        Dataset name; must be present in `output_dataset_names`
        (a ValueError from `list.index` is raised otherwise).
    error, warning, info, notice : bool
        Whether rows of that severity are selected. If none is set, rows of
        every severity are returned.
    search : str
        Text to look for. A row is kept when the text occurs in the string form
        of any of its cells. An empty string disables the search.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `output_dfs[...]` for the dataset.
    """
    output_df = output_dfs[output_dataset_names.index(dataset)]

    # Use the flags handed to the function rather than reading the checkbox
    # widgets again, so the result depends only on the arguments.
    severity_flags = {"error": error, "warning": warning, "info": info, "notice": notice}
    selected_data = [severity for severity, chosen in severity_flags.items() if chosen]
    if selected_data:
        output_df = output_df.loc[output_df['severity'].isin(selected_data)]

    if search:
        # regex=False: the text is matched literally. Typed input such as "(" or "*"
        # is not a valid regular expression and would raise, and characters like
        # "?" and "." (common in URLs) would otherwise change what is matched.
        mask = np.column_stack([
            output_df[col].astype('str').str.contains(search, regex=False, na=False)
            for col in output_df
        ])
        output_df = output_df.loc[mask.any(axis=1)]
    return output_df

def display_output_df(dataset, error, warning, info, notice, search):
    """Render the filtered issues table for `dataset`, limited to its first 1000 rows."""
    filtered = compute_output_df(dataset, error, warning, info, notice, search)
    preview = filtered.head(1000)
    display(preview)

def download_df(dataset, error, warning, info, notice, search):
    """Save the filtered issues table for `dataset` as `<dataset>-issues.csv`.

    The file name is relative, so the CSV is written to the current working directory.
    """
    filtered = compute_output_df(dataset, error, warning, info, notice, search)
    csv_name = dataset + "-issues.csv"
    filtered.to_csv(csv_name)

# One unchecked checkbox per severity; the checkbox description is the severity name.
severity_options = ["error", "warning", "info", "notice"]
severity_checkboxes = [
    widgets.Checkbox(value=False, description=severity)
    for severity in severity_options
]
severity_filter = widgets.VBox(severity_checkboxes, layout=widgets.Layout(flex_flow='row wrap'))

# Free-text search over the displayed table.
search_box = widgets.Text(placeholder="Search table", layout=widgets.Layout(width='200px'))

# Only datasets that produced results are offered for selection.
dataset_selector = widgets.RadioButtons(
    options=output_dataset_names,
    description='Select dataset to display:',
    disabled=False,
)


def _handle_download_click(_button):
    """Export the table exactly as it is currently filtered in the UI."""
    download_df(
        dataset_selector.value,
        severity_checkboxes[0].value,
        severity_checkboxes[1].value,
        severity_checkboxes[2].value,
        severity_checkboxes[3].value,
        search_box.value,
    )


download_button = widgets.Button(
    description="Download output table",
    layout=widgets.Layout(width='200px'),
)
download_button.on_click(_handle_download_click)

# Map each display_output_df argument to the widget that supplies its value.
table_controls = {
    'dataset': dataset_selector,
    "error": severity_checkboxes[0],
    "warning": severity_checkboxes[1],
    "info": severity_checkboxes[2],
    "notice": severity_checkboxes[3],
    "search": search_box,
}
ui = widgets.VBox([dataset_selector, search_box, download_button, severity_filter])
out = widgets.interactive_output(display_output_df, table_controls)
display(ui, out)

In [None]:
def display_output_df(dataset, error, warning, info, notice, search):
    """Draw a bar chart of the number of issues per field for `dataset`.

    The arguments are forwarded unchanged to `compute_output_df`, so the chart
    reflects the same dataset, severity and search filters as the table.
    Note that this definition replaces the table-rendering function of the same
    name defined in the previous cell.
    """
    output_df = compute_output_df(dataset, error, warning, info, notice, search)

    # Missing field values are NaN, which is not a string and so cannot serve as a
    # categorical bar label next to the real field names; give them one shared label.
    fields = output_df['field'].fillna('(no field)')
    # Counter preserves first-seen order, which is the left-to-right bar order.
    field_count = Counter(fields)
    x_values = list(field_count.keys())
    y_values = list(field_count.values())

    print("Issue count for",dataset,"is",sum(y_values))
    # The wider canvas is used for brownfield-land only.
    figsize = (20, 5) if dataset == "brownfield-land" else (5, 5)

    fig,ax1 = plt.subplots(1,1,figsize=figsize)

    # Write each count just above the position of its bar.
    for i, value in enumerate(y_values):
        ax1.text(i, value, str(value),ha='center', va='bottom')

    ax1.bar(x_values, y_values)
    ax1.set_xlabel('issue field')
    ax1.set_ylabel('count')
    ax1.set_title('Issues Type')
    ax1.set_xticks(x_values)
    ax1.tick_params(axis='x', rotation=35)
    
# The chart reuses the selector, severity checkboxes and search box created for
# the table; only the selector and the severity filter are shown next to it.
chart_controls = {
    'dataset': dataset_selector,
    "error": severity_checkboxes[0],
    "warning": severity_checkboxes[1],
    "info": severity_checkboxes[2],
    "notice": severity_checkboxes[3],
    "search": search_box,
}
ui = widgets.VBox([dataset_selector, severity_filter])
out = widgets.interactive_output(display_output_df, chart_controls)

display(ui, out)