In [1]:
# CKAN metadata harvester using package_search (data.gov.uk)

import requests
import pandas as pd
import os
import time

# --- Settings ---
# CKAN action endpoint that every page request is sent to.
BASE_URL = "https://data.gov.uk/api/3/action/package_search"

# Paging: rows requested per call (original note: max = 1000, but start small)
# and the upper bound on the `start` offset (estimated size of full catalog).
BATCH_SIZE = 100
MAX_RECORDS = 55000

# Where the harvested CSV is written.
OUTPUT_DIR = "harvested_data"
OUTPUT_FILE = "ckan_data_gov_uk_package_search.csv"
output_path = os.path.join(OUTPUT_DIR, OUTPUT_FILE)

# exist_ok=True makes re-running this cell harmless when the directory exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Helper Function ---
def flatten_search_result(record):
    """Flatten one ``package_search`` result into a single-level dict.

    Scalar fields are copied straight from ``record`` (missing keys become
    ``""``, or ``0`` for the two ``num_*`` counters). Nested structures are
    reduced as follows:

    * ``organization`` -> the organization's ``title``
    * ``tags``         -> tag names joined with ``", "``
    * ``resources``    -> only the FIRST resource's ``name``/``format``/``url``

    Parameters
    ----------
    record : dict
        One element of the ``result.results`` list in the API response.

    Returns
    -------
    dict
        A flat mapping with 23 keys, suitable as one row of a DataFrame.
    """
    # `.get(key, default)` only applies the default when the key is ABSENT.
    # A JSON null decodes to None and would slip through, so `or` is used to
    # cover both the missing and the explicit-null case.
    org = record.get("organization") or {}
    resources = record.get("resources") or []
    tags = record.get("tags") or []
    resource = resources[0] if resources else {}

    # A tag without a usable name contributes an empty string rather than
    # making str.join fail on None.
    tag_names = ", ".join([(tag.get("name") or "") for tag in tags])

    return {
        "id": record.get("id", ""),
        "name": record.get("name", ""),
        "title": record.get("title", ""),
        "notes": record.get("notes", ""),
        "url": record.get("url", ""),
        "type": record.get("type", ""),
        "version": record.get("version", ""),
        "state": record.get("state", ""),
        "private": record.get("private", ""),
        "isopen": record.get("isopen", ""),
        "metadata_created": record.get("metadata_created", ""),
        "metadata_modified": record.get("metadata_modified", ""),
        "license_id": record.get("license_id", ""),
        "license_title": record.get("license_title", ""),
        "creator_user_id": record.get("creator_user_id", ""),
        "owner_org": record.get("owner_org", ""),
        "organization": org.get("title", ""),
        "num_resources": record.get("num_resources", 0),
        "num_tags": record.get("num_tags", 0),
        "tags": tag_names,
        "resource_name": resource.get("name", ""),
        "resource_format": resource.get("format", ""),
        "resource_url": resource.get("url", ""),
    }






In [2]:
# --- Harvesting Loop ---
# Pages through package_search in BATCH_SIZE steps and collects one flat dict
# per dataset. The loop ends when the API returns an empty page, when the
# MAX_RECORDS offset bound is reached, or on the first error (best-effort:
# whatever was collected so far is still written to the CSV below).
REQUEST_TIMEOUT_S = 30  # seconds; without it a stalled connection blocks forever

all_records = []

for start in range(0, MAX_RECORDS, BATCH_SIZE):
    print(f"Fetching records {start} to {start + BATCH_SIZE - 1}...")
    params = {"start": start, "rows": BATCH_SIZE}
    try:
        response = requests.get(BASE_URL, params=params, timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()
        results = response.json()["result"]["results"]

        # An empty page means the offset is past the end of the catalogue;
        # further requests would only return more empty pages.
        if not results:
            print(f"No records returned at offset {start}; stopping.")
            break

        for record in results:
            flat = flatten_search_result(record)
            all_records.append(flat)

        time.sleep(0.5)  # avoid rate limiting

    except Exception as e:
        # Deliberately broad: any failure ends the harvest but keeps the
        # records gathered so far so they can still be saved.
        print(f"Error fetching batch starting at {start}: {e}")
        break

# --- Save to CSV ---
df = pd.DataFrame(all_records)
df.to_csv(output_path, index=False)
print(f"Saved {len(df)} records to {output_path}")

Fetching records 0 to 99...
Fetching records 100 to 199...
Fetching records 200 to 299...
Fetching records 300 to 399...
Fetching records 400 to 499...
Fetching records 500 to 599...
Fetching records 600 to 699...
Fetching records 700 to 799...
Fetching records 800 to 899...
Fetching records 900 to 999...
Fetching records 1000 to 1099...
Fetching records 1100 to 1199...
Fetching records 1200 to 1299...
Fetching records 1300 to 1399...
Fetching records 1400 to 1499...
Fetching records 1500 to 1599...
Fetching records 1600 to 1699...
Fetching records 1700 to 1799...
Fetching records 1800 to 1899...
Fetching records 1900 to 1999...
Fetching records 2000 to 2099...
Fetching records 2100 to 2199...
Fetching records 2200 to 2299...
Fetching records 2300 to 2399...
Fetching records 2400 to 2499...
Fetching records 2500 to 2599...
Fetching records 2600 to 2699...
Fetching records 2700 to 2799...
Fetching records 2800 to 2899...
Fetching records 2900 to 2999...
Fetching records 3000 to 3099...


Fetching records 24100 to 24199...
Fetching records 24200 to 24299...
Fetching records 24300 to 24399...
Fetching records 24400 to 24499...
Fetching records 24500 to 24599...
Fetching records 24600 to 24699...
Fetching records 24700 to 24799...
Fetching records 24800 to 24899...
Fetching records 24900 to 24999...
Fetching records 25000 to 25099...
Fetching records 25100 to 25199...
Fetching records 25200 to 25299...
Fetching records 25300 to 25399...
Fetching records 25400 to 25499...
Fetching records 25500 to 25599...
Fetching records 25600 to 25699...
Fetching records 25700 to 25799...
Fetching records 25800 to 25899...
Fetching records 25900 to 25999...
Fetching records 26000 to 26099...
Fetching records 26100 to 26199...
Fetching records 26200 to 26299...
Fetching records 26300 to 26399...
Fetching records 26400 to 26499...
Fetching records 26500 to 26599...
Fetching records 26600 to 26699...
Fetching records 26700 to 26799...
Fetching records 26800 to 26899...
Fetching records 269

Fetching records 47600 to 47699...
Fetching records 47700 to 47799...
Fetching records 47800 to 47899...
Fetching records 47900 to 47999...
Fetching records 48000 to 48099...
Fetching records 48100 to 48199...
Fetching records 48200 to 48299...
Fetching records 48300 to 48399...
Fetching records 48400 to 48499...
Fetching records 48500 to 48599...
Fetching records 48600 to 48699...
Fetching records 48700 to 48799...
Fetching records 48800 to 48899...
Fetching records 48900 to 48999...
Fetching records 49000 to 49099...
Fetching records 49100 to 49199...
Fetching records 49200 to 49299...
Fetching records 49300 to 49399...
Fetching records 49400 to 49499...
Fetching records 49500 to 49599...
Fetching records 49600 to 49699...
Fetching records 49700 to 49799...
Fetching records 49800 to 49899...
Fetching records 49900 to 49999...
Fetching records 50000 to 50099...
Fetching records 50100 to 50199...
Fetching records 50200 to 50299...
Fetching records 50300 to 50399...
Fetching records 504