In [13]:
import json
import logging
import requests
import jsonschema

In [14]:
# Logger used by the cells below to report progress for each schema.
logger = logging.getLogger(__name__)

In [15]:
# Base URL of the data.gouv.fr API; datasets_for_schema() queries its /datasets endpoint.
DATA_GOUV_API = "https://www.data.gouv.fr/api/1"
# Score attached to every recommendation produced by build_recommendation().
RECOMMENDATION_SCORE = 50
# Schema catalog downloaded by consolidated_schemas().
CATALOG_SCHEMAS = 'https://schema.data.gouv.fr/schemas/schemas.json'
# JSON schema that validate_recommendations() checks the final list against.
JSONSCHEMA_URL = "https://raw.githubusercontent.com/opendatateam/udata-recommendations/master/udata_recommendations/schema.json"

In [None]:
def get_all_from_api_query(
    base_query,
    next_page='next_page',
    ignore_errors=False,
    mask=None,
    timeout=30,
):
    """Yield every item of a paginated API endpoint, following next-page links.

    Only meant for paginated endpoints: each response body must be a JSON
    object holding the items under ``"data"`` and the URL of the following
    page under ``next_page``.

    Args:
        base_query: URL of the first page.
        next_page: key of the next-page link in the response; a dotted path
            (e.g. ``"links.next"``) is followed through nested objects.
        ignore_errors: when True, the HTTP status is not checked; the body
            is still parsed, so a response without ``"data"`` still raises.
        mask: optional value for the ``X-fields`` header; ``next_page`` is
            appended to it (presumably so the pagination link is not masked
            out of the response).
        timeout: seconds passed to ``requests.get`` for each page, so that a
            stalled connection cannot hang the iteration forever.

    Yields:
        The elements of ``"data"`` of every page, in order.

    Raises:
        requests.HTTPError: on an error status, unless ``ignore_errors``.
        requests.Timeout: if a page does not answer within ``timeout``.
    """
    def get_link_next_page(elem, separated_keys):
        # Walk the dotted key path down into the nested payload.
        result = elem
        for k in separated_keys.split('.'):
            result = result[k]
        return result

    # /!\ only for paginated endpoints
    headers = {'X-fields': mask + f',{next_page}'} if mask else None
    url = base_query
    while True:
        r = requests.get(url, headers=headers, timeout=timeout)
        if not ignore_errors:
            r.raise_for_status()
        # Decode the body once per page and reuse it for both the items and
        # the pagination link.
        payload = r.json()
        yield from payload['data']
        url = get_link_next_page(payload, next_page)
        if not url:
            break

In [16]:
def consolidated_schemas():
    """Find TableSchema schemas that are consolidated.

    Downloads the catalog at ``CATALOG_SCHEMAS`` and keeps the entries whose
    ``schema_type`` is ``'tableschema'`` and whose
    ``consolidation_dataset_id`` is truthy.

    Returns:
        dict: schema name -> consolidation dataset id.

    Raises:
        requests.HTTPError: if the catalog request returns an error status.
        requests.Timeout: if the catalog does not answer within 10 seconds.
    """
    # Bound the request and fail loudly on an HTTP error (as
    # validate_recommendations does) instead of parsing an error page.
    r = requests.get(CATALOG_SCHEMAS, timeout=10)
    r.raise_for_status()
    schemas = r.json()['schemas']
    return {
        s['name']: s['consolidation_dataset_id']
        for s in schemas
        # .get(): an entry lacking either key is skipped rather than raising.
        if s.get('consolidation_dataset_id') and s.get('schema_type') == 'tableschema'
    }

In [17]:
def datasets_for_schema(schema):
    """Return the ids of the data.gouv.fr datasets whose schema is `schema`.

    Queries the paginated ``/datasets`` endpoint filtered on ``schema`` and
    asks the API (through the ``data{id}`` mask) for the ``id`` field only.

    Args:
        schema: schema name used as the ``schema`` query parameter.

    Returns:
        list: the ``id`` of every dataset returned, in API order.
    """
    matching = get_all_from_api_query(
        base_query=f"{DATA_GOUV_API}/datasets?schema={schema}",
        mask='data{id}',
    )
    dataset_ids = []
    for dataset in matching:
        dataset_ids.append(dataset['id'])
    return dataset_ids

In [18]:
def build_recommendation(consolidated_dataset_id, dataset_id):
    """Build the recommendation entry pointing a dataset to a consolidated one.

    Args:
        consolidated_dataset_id: id of the dataset being recommended.
        dataset_id: id of the dataset the recommendation is attached to.

    Returns:
        dict: ``{"id": dataset_id, "recommendations": [...]}`` with a single
        recommendation scored ``RECOMMENDATION_SCORE``.
    """
    target = {"id": consolidated_dataset_id, "score": RECOMMENDATION_SCORE}
    return {"id": dataset_id, "recommendations": [target]}

In [19]:
def validate_recommendations(recommendations):
    """Validate recommendations according to the JSON schema.

    The schema is downloaded from ``JSONSCHEMA_URL`` on every call (10 second
    timeout) and the HTTP status is checked before the body is used.

    Args:
        recommendations: the object to validate against the schema.

    Returns:
        None. Errors are reported by whatever ``raise_for_status`` and
        ``jsonschema.validate`` raise.
    """
    response = requests.get(JSONSCHEMA_URL, timeout=10)
    response.raise_for_status()
    jsonschema.validate(recommendations, schema=response.json())

In [20]:
# Build one recommendation per dataset that declares a consolidated schema,
# each pointing at that schema's consolidation dataset.
recommendations = []
for schema_id, schema_details in consolidated_schemas().items():
    # The value returned by consolidated_schemas() is the dataset id itself.
    consolidated_dataset_id = schema_details
    logger.info(
        f"Working on schema {schema_id}, consolidated on {consolidated_dataset_id}"
    )

    dataset_ids = datasets_for_schema(schema_id)
    logger.info(f"Found {len(dataset_ids)} associated with schema {schema_id}")

    recommendations.extend(
        build_recommendation(consolidated_dataset_id, d) for d in dataset_ids
    )

# Keep a single entry per dataset id before validating.
# NOTE(review): a dataset returned for several schemas keeps only the
# recommendation of the first schema processed.
ids = []  # ids in first-seen order, kept for any later cell that reads it
seen_ids = set()  # constant-time membership test instead of scanning `ids`
recommendations_clean = []
for r in recommendations:
    if r["id"] not in seen_ids:
        seen_ids.add(r["id"])
        ids.append(r["id"])
        recommendations_clean.append(r)
validate_recommendations(recommendations_clean)

etalab/schema-irve
etalab/schema-lieux-covoiturage
etalab/schema-stationnement


In [None]:
# Write the de-duplicated recommendations to disk as 2-space indented JSON.
# NOTE(review): TMP_FOLDER is not defined in the cells shown here; confirm it
# is set before this cell runs.
with open(TMP_FOLDER + '/recommendations.json', 'w') as fp:
    fp.write(json.dumps(recommendations_clean, indent=2))