In [9]:
import requests
import pandas as pd
import os


In [10]:
# Base CKAN API endpoint for data.gov.uk.
# The helpers below append the action name (package_list, package_show)
# directly to this string, so the trailing slash must be kept.
BASE_URL = "https://data.gov.uk/api/3/action/"

def list_dataset_ids(limit=50, timeout=30):
    """List dataset IDs from CKAN.

    Args:
        limit: Maximum number of IDs to return. Applied as a list slice on the
            response, so ``None`` returns everything the API sent back.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list: The first ``limit`` entries of the ``result`` list returned by
        the ``package_list`` action.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = f"{BASE_URL}package_list"

    # Ask the server to truncate the listing instead of downloading the whole
    # catalogue. Only a positive integer is forwarded; other values keep the
    # original "fetch everything, then slice" behaviour.
    params = None
    if isinstance(limit, int) and limit > 0:
        params = {"limit": limit}

    # requests waits indefinitely unless a timeout is given.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    dataset_ids = response.json()["result"]

    # The slice stays as a safety net in case the server ignores `limit`.
    return dataset_ids[:limit]

def fetch_dataset_metadata(dataset_id, timeout=30):
    """Fetch full metadata for a given dataset ID.

    Args:
        dataset_id: Identifier passed as the ``id`` query parameter of the
            ``package_show`` action.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``result`` member of the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = f"{BASE_URL}package_show"

    # Passing the id through `params` lets requests URL-encode it; building the
    # query string by hand breaks on ids containing '&', '#', spaces, etc.
    response = requests.get(url, params={"id": dataset_id}, timeout=timeout)
    response.raise_for_status()
    return response.json()["result"]

def flatten_metadata(record):
    """Flatten one CKAN dataset record into a single-level dict for a DataFrame row.

    Dataset-level fields are read from the top level of ``record``. A field that
    is missing there is looked up in the record's ``extras`` list before falling
    back to ``""``. Tag names are joined into one comma-separated string, and
    only the first resource (if any) is represented.

    Args:
        record: Dataset metadata dict (the value returned by
            ``fetch_dataset_metadata``).

    Returns:
        dict: Column name -> value. The same keys, in the same order, are
        produced for every record.
    """
    # `dict.get(key, default)` only applies the default when the key is absent.
    # A key that is present with a null value would otherwise reach `.get()` /
    # iteration as None and raise, so normalise with `or`.
    org = record.get("organization") or {}
    resources = record.get("resources") or []
    first_resource = resources[0] if resources else {}
    tags = record.get("tags") or []

    # Index `extras` by key. Assumes the CKAN convention of a list of
    # {"key": ..., "value": ...} dicts; entries of any other shape are skipped.
    # The first occurrence of a key wins.
    extras = {}
    for entry in record.get("extras") or []:
        if isinstance(entry, dict) and "key" in entry:
            extras.setdefault(entry["key"], entry.get("value", ""))

    def dataset_field(key):
        # A top-level value always takes precedence, so records that already
        # carry the field at the top level flatten exactly as before.
        if key in record:
            return record[key]
        return extras.get(key, "")

    dataset_keys = (
        "id",
        "title",
        "name",
        "notes",
        "type",
        "theme-primary",
        "metadata_created",
        "metadata_modified",
        "isopen",
        "private",
        "state",
        "license_title",
        "license_id",
        "license_url",
        "contact-name",
        "contact-email",
        "contact-phone",
        "foi-name",
        "foi-email",
        "foi-phone",
        "foi-web",
        "creator_user_id",
        "owner_org",
        "schema-vocabulary",
        "codelist",
    )
    flat = {key: dataset_field(key) for key in dataset_keys}

    # Tags
    flat["tags"] = ", ".join(tag.get("name") or "" for tag in tags)

    # Organization info
    for key in ("name", "title", "description", "created", "state"):
        flat[f"org_{key}"] = org.get(key, "")

    # Resource info (first only, or blank if none)
    for key in ("name", "description", "format", "url", "mimetype",
                "created", "metadata_modified"):
        flat[f"resource_{key}"] = first_resource.get(key, "")
    # The source key is hyphenated; the output column uses an underscore.
    flat["resource_datafile_date"] = first_resource.get("datafile-date", "")

    return flat



In [11]:
# Harvest metadata for up to 100 datasets, one API call per dataset.
dataset_ids = list_dataset_ids(limit=100)

records = []
for ds_id in dataset_ids:
    try:
        data = fetch_dataset_metadata(ds_id)
        flat = flatten_metadata(data)
    except Exception as err:
        # Best effort: report the failing dataset and carry on with the rest.
        print(f"Error with {ds_id}: {err}")
        continue
    records.append(flat)

# One row per successfully harvested dataset.
df = pd.DataFrame(records)


In [12]:
# Write the harvested metadata to harvested_data/ckan_data_gov_uk_metadata.csv,
# creating the folder if it does not exist yet.
output_dir = "harvested_data"
output_path = os.path.join(output_dir, "ckan_data_gov_uk_metadata.csv")

os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_path, index=False)


In [13]:
import requests
import pandas as pd

# NOTE(review): this rebinds the notebook-wide BASE_URL from the API root to the
# package_search endpoint. list_dataset_ids / fetch_dataset_metadata read
# BASE_URL at call time, so re-running the harvesting cell after this one builds
# wrong URLs until the cell that defines the API-root BASE_URL is re-run. The
# name is kept here in case later cells rely on it; consider a distinct name.
BASE_URL = "https://data.gov.uk/api/3/action/package_search"

# Ask for a single search result, just to inspect the record structure
params = {"start": 0, "rows": 1}
# requests waits indefinitely unless a timeout is given.
response = requests.get(BASE_URL, params=params, timeout=30)
response.raise_for_status()
sample_record = response.json()["result"]["results"][0]

# See all top-level fields
list(sample_record.keys())

['creator_user_id',
 'id',
 'isopen',
 'license_id',
 'license_title',
 'metadata_created',
 'metadata_modified',
 'name',
 'notes',
 'num_resources',
 'num_tags',
 'organization',
 'owner_org',
 'private',
 'state',
 'title',
 'type',
 'url',
 'version',
 'extras',
 'resources',
 'tags',
 'groups',
 'relationships_as_subject',
 'relationships_as_object']