# Overview of Datasets and Frequency report
**Author**:  Kena Vyas <br>
**Date**:  December 2024 <br>
**Dataset Scope**: <br>
The Overview of Datasets table includes all datasets. <br>
The Frequency report covers only non-ODP datasets whose typology is not `category`. <br>
**Report Type**: Ad-hoc <br>

**Purpose**: <br>
Overview of Datasets:
Retrieves the overview of datasets from the Config Manager and improves the report by including additional fields such as the organization of the most recent resource and the entity count.

Frequency Report:
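Identifies datasets that are overdue for an update, i.e. whose days since the last update exceed their assigned refresh frequency. Only datasets with a Fortnightly, Monthly, Quarterly or Annually frequency are assessed; datasets with any other frequency (for example Ad hoc or Unknown) are excluded.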


In [1]:
# pip install pandas beautifulsoup4 lxml requests

In [None]:
import io
import json
import urllib
import urllib.parse
import urllib.request

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Scrape the "overview of datasets" HTML table from the Config Manager report page.
response = requests.get(
    "https://config-manager-prototype.herokuapp.com/reporting/overview-of-datasets",
    timeout=60,  # fail instead of hanging the notebook if the service does not answer
)
# Stop here on an HTTP error rather than trying to parse an error page as the report.
response.raise_for_status()
bs = BeautifulSoup(response.content, 'html.parser')
table = bs.find('table')
if table is None:
    raise ValueError("No <table> element found on the overview-of-datasets page")
# read_html expects a file-like object; passing the raw HTML string is deprecated
# and emits a FutureWarning, so wrap the markup in StringIO.
overview_of_datasets = pd.read_html(io.StringIO(str(table)))[0]

  overview_of_datasets = pd.read_html(str(table))[0]


In [4]:
# Fetch the dataset/name pairs from the digital-land datasette database and
# use the name to attach each dataset identifier to the overview table.
datasette_url = "https://datasette.planning.data.gov.uk/"

dataset_name_sql = """
    select dataset, name from dataset where collection != "" group by dataset
    """
params = urllib.parse.urlencode({"sql": dataset_name_sql, "_size": "max"})
url = f"{datasette_url}digital-land.csv?{params}"
get_dataset_name = pd.read_csv(url)

# Lower-case the scraped column headers so the overview table can be joined on 'name'.
overview_of_datasets.columns = overview_of_datasets.columns.str.lower()
# Inner join: only rows whose name appears in both tables are kept.
overview_of_datasets = get_dataset_name.merge(overview_of_datasets, on='name', how='inner')

In [5]:
# Get the organisation for each dataset: rank the rows of
# reporting_historic_endpoints per dataset by resource_start_date (newest first)
# and keep the top-ranked row.
datasette_url = "https://datasette.planning.data.gov.uk/"

latest_org_sql = """
    SELECT 
    organisation,
    organisation_name,
    dataset,
    rn
FROM (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY dataset ORDER BY resource_start_date DESC) AS rn
    FROM reporting_historic_endpoints
) AS ranked
WHERE rn = 1 group by dataset;
    """
params = urllib.parse.urlencode({"sql": latest_org_sql, "_size": "max"})
url = f"{datasette_url}performance.csv?{params}"
get_org = pd.read_csv(url)

# Left join so datasets without a matching organisation row are kept (with empty organisation fields).
organisation_columns = get_org[['dataset', 'organisation', 'organisation_name']]
overview_of_datasets = overview_of_datasets.merge(organisation_columns, on='dataset', how='left')

### Overview of Datasets table

In [6]:
def fetch_entity_count(pipeline, base_url="https://www.planning.data.gov.uk/entity.json", timeout=30):
    """Return the entity count reported by the entity endpoint for a dataset.

    Requests ``base_url`` (filtered with ``?prefix=<pipeline>`` when a pipeline
    is given), parses the JSON response and returns its ``count`` field.

    Parameters
    ----------
    pipeline : str
        Value sent as the ``prefix`` query parameter. A falsy value
        (e.g. ``""`` or ``None``) queries the endpoint without a filter.
    base_url : str, optional
        Endpoint to query; defaults to the planning.data.gov.uk entity API.
    timeout : float, optional
        Seconds to wait for the request before giving up.

    Returns
    -------
    The ``count`` value from the response, or 0 when the field is absent or
    when the request or JSON parsing fails. A failure is therefore
    indistinguishable from a genuine zero in the return value; the error is
    printed so it can be spotted in the cell output.
    """
    # urlencode escapes characters that would otherwise corrupt the query string.
    query = f"?{urllib.parse.urlencode({'prefix': pipeline})}" if pipeline else ""
    url = f"{base_url}{query}"
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            data = json.load(response)
            return data.get("count", 0)
    except Exception as e:
        # Deliberately best-effort: one failing dataset must not abort the whole report.
        print(f"Error fetching data for {pipeline!r}:", e)
        return 0
        
# Look up the entity count once per dataset and store it as a new column,
# then display the enriched overview table.
entity_counts = overview_of_datasets['dataset'].apply(fetch_entity_count)
overview_of_datasets['entity_count'] = entity_counts
overview_of_datasets

Unnamed: 0,dataset,name,typology,organisations (current/expected),started,endpoints (active/total),most recent endpoint,active resources,days since update,frequency of updates,organisation,organisation_name,entity_count
0,agricultural-land-classification,Agricultural land classification,geography,1 / 1,2024-07-15,1 / 1,2024-07-09,1.0,141.0,Unknown,government-organisation:PB202,Natural England,585
1,ancient-woodland,Ancient woodland,geography,1 / 1,2021-05-26,1 / 1,2024-09-26,1.0,67.0,Monthly,government-organisation:PB202,Natural England,44373
2,ancient-woodland-status,Ancient woodland status,category,1 / 1,2021-12-03,1 / 1,2021-12-03,1.0,1096.0,Monthly,government-organisation:PB202,Natural England,2
3,area-of-outstanding-natural-beauty,Area of outstanding natural beauty,geography,1 / 1,2024-10-03,2 / 2,2024-10-01,2.0,57.0,Ad hoc,government-organisation:PB202,Natural England,34
4,article-4-direction,Article 4 direction,legal-instrument,20 / 307,2022-04-25,18 / 19,2024-12-02,23.0,0.0,Ad hoc,local-authority:BIR,Birmingham City Council,263
...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,tree-preservation-zone-type,Tree preservation zone type,category,0 / 1,2023-09-11,1 / 1,2023-09-11,1.0,449.0,Ad hoc,government-organisation:D1342,"Ministry of Housing, Communities & Local Gover...",3
99,ward,Ward,geography,1 / 1,2024-07-08,1 / 1,2024-07-08,1.0,148.0,Quarterly,government-organisation:D303,Office for National Statistics,0
100,waste-authority,Waste disposal authority,organisation,1 / 2,2022-06-06,1 / 1,2022-05-30,1.0,160.0,,government-organisation:D1342,"Ministry of Housing, Communities & Local Gover...",0
101,world-heritage-site,World heritage site,geography,1 / 1,2021-05-27,2 / 2,2024-09-18,2.0,3.0,Ad hoc,government-organisation:PB1164,Historic England,20


### Download the Overview table

In [7]:
# Write the overview table to an Excel workbook in the working directory, omitting the row index.
overview_of_datasets.to_excel(excel_writer='overview_of_datasets.xlsx', index=False)

### Frequency report

In [10]:
# Datasets left out of the frequency report (presumably the ODP datasets the
# notebook header refers to - confirm before editing this list).
remove_dataset = [
    'article-4-direction',
    'article-4-direction-area',
    'conservation-area',
    'conservation-area-document',
    'listed-building',
    'listed-building-outline',
    'tree-preservation-order',
    'tree',
    'tree-preservation-zone',
]

# Allowed gap between updates, in days, for each frequency label that is assessed.
frequency_to_days = {
    'Fortnightly': 14,
    'Monthly': 30,
    'Quarterly': 90,
    'Annually': 365,
}

# Build the candidate table as a new frame instead of overwriting
# overview_of_datasets: the overview stays intact for re-export, and the new
# column is added via assign() rather than by writing into a filtered slice
# (which raises SettingWithCopyWarning).
frequency_candidates = (
    overview_of_datasets
    .loc[lambda df: df['typology'] != 'category']
    .loc[lambda df: ~df['dataset'].isin(remove_dataset)]
    .assign(frequency_in_days=lambda df: df['frequency of updates'].map(frequency_to_days))
    # Frequencies missing from frequency_to_days map to NaN and are dropped here.
    .dropna(subset=['frequency_in_days'])
)

# Overdue = more days since the last update than the frequency allows.
frequency_df = frequency_candidates[
    frequency_candidates['days since update'] > frequency_candidates['frequency_in_days']
]
frequency_df

Unnamed: 0,dataset,name,typology,organisations (current/expected),started,endpoints (active/total),most recent endpoint,active resources,days since update,frequency of updates,organisation,organisation_name,entity_count,frequency_in_days
1,ancient-woodland,Ancient woodland,geography,1 / 1,2021-05-26,1 / 1,2024-09-26,1.0,67.0,Monthly,government-organisation:PB202,Natural England,44373,30.0
10,building-preservation-notice,Building preservation notice,geography,1 / 1,2021-05-28,1 / 1,2024-10-02,1.0,32.0,Fortnightly,government-organisation:PB1164,Historic England,6,14.0
35,flood-risk-zone,Flood risk zone,geography,1 / 1,2023-08-24,/,,0.0,467.0,Quarterly,government-organisation:EA199,Environment Agency,550621,90.0
42,infrastructure-funding-statement,Infrastructure funding statement,document,1 / 1,2022-01-05,/,,0.0,1063.0,Annually,,,197,365.0
99,ward,Ward,geography,1 / 1,2024-07-08,1 / 1,2024-07-08,1.0,148.0,Quarterly,government-organisation:D303,Office for National Statistics,0,90.0


### Download the Frequency table

In [11]:
# Write the overdue-datasets table to an Excel workbook in the working directory, omitting the row index.
frequency_df.to_excel(excel_writer='frequency_of_datasets.xlsx', index=False)