In [11]:
!pip install pysolr pyeuropeana python-dotenv pyyaml

Collecting python-frontmatter
  Downloading python_frontmatter-1.0.0-py3-none-any.whl (9.0 kB)
Installing collected packages: python-frontmatter
Successfully installed python-frontmatter-1.0.0


In [27]:
import requests
import json
import os
import yaml
import re

%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [6]:
# Fetch the 10 most recently modified dataset records from data.gov (CKAN API).
CKAN_ENDPOINT = 'https://catalog.data.gov/api/3/action/package_search'
params = {
    'q': '*:*',                        # match every dataset
    'rows': 10,                        # page size
    'start': 0,                        # offset of the first result
    'sort': 'metadata_modified desc'   # newest modifications first
}

# Get the data.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting an error page
# be parsed (or fail to parse) as if it were a valid API response.
response = requests.get(CKAN_ENDPOINT, params=params, timeout=30)
response.raise_for_status()
data = response.json()

{'help': 'https://catalog.data.gov/api/3/action/help_show?name=package_search',
 'success': True,
 'result': {'count': 250686,
  'facets': {},
  'results': [{'author': None,
    'author_email': None,
    'creator_user_id': '2b785922-9f13-491b-a3c2-2a40acbd80c2',
    'id': '31354610-c274-400b-bb64-31aaf8086b17',
    'isopen': False,
    'license_id': 'other-license-specified',
    'license_title': 'other-license-specified',
    'maintainer': 'HealthData.gov Team',
    'maintainer_email': 'HealthData@hhs.gov',
    'metadata_created': '2022-01-25T02:04:26.214564',
    'metadata_modified': '2023-04-02T18:35:42.496083',
    'name': 'covid-19-public-therapeutic-locator',
    'notes': "Locations of publicly available COVID-19 Therapeutics. Dataset only includes locations for Evusheld (monoclonal antibody), Molnupiravir (antiviral), and Paxlovid (Antiviral). COVID-19 therapeutics require a prescription to obtain. Limitations: public contact information.\n\n<b> To filter, click 'View Data' belo

In [28]:
import re

def slugify(s):
    """Turn an arbitrary string into a lowercase, hyphen-separated slug.

    Parameters
    ----------
    s : str
        The text to convert.

    Returns
    -------
    str
        The lowercased text with characters other than word characters,
        whitespace and hyphens removed, every run of whitespace, underscores
        and hyphens collapsed into a single hyphen, and leading/trailing
        hyphens stripped. May be empty if nothing survives.
    """
    # Substitutions are applied in order: drop unwanted characters, collapse
    # separators into one hyphen, then trim hyphens from both ends.
    substitutions = (
        (r'[^\w\s-]', ''),
        (r'[\s_-]+', '-'),
        (r'^-+|-+$', ''),
    )
    slug = s.lower().strip()
    for pattern, replacement in substitutions:
        slug = re.sub(pattern, replacement, slug)
    return slug

In [26]:
# Write one record file per data.gov dataset into ../_records/.
# Each file contains only a YAML front-matter block describing the dataset.
for record in data['result']['results']:
    # A dataset may come back with an empty resource list or a null
    # organization; fall back to None for the derived fields instead of
    # raising IndexError / TypeError and aborting the whole export.
    resources = record.get('resources') or []
    organization = record.get('organization') or {}

    doc = {
        'id': record['id'],
        'layout': 'record',
        'title': record['title'],
        'licence': record['license_title'],
        'type': record['type'],
        'description': record['notes'],
        'resource_count': record['num_resources'],
        'record_url': 'https://catalog.data.gov/' + record['type'] + '/' + record['name'],
        'author': record['author'],
        'data_provider': organization.get('title'),
        'date': record['metadata_created'],
        # Format of the first listed resource, when there is one.
        'format': resources[0].get('format') if resources else None
    }

    frontmatter = yaml.dump(doc, sort_keys=False)

    # If the title slugifies to an empty string the file would be named just
    # '.md'; use the dataset's URL name in that case.
    file_stem = slugify(record['title']) or record['name']
    filepath = "../_records/" + file_stem + '.md'

    # Explicit encoding so the output does not depend on the platform default.
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write('---\n')
        f.write(frontmatter)
        f.write('---\n\n')


In [29]:
# Fetch one page (10 items) of Europeana records.
#
# Client usage: https://github.com/europeana/rd-europeana-python-api#usage
import pyeuropeana.apis as apis
import pyeuropeana.utils as utils

# The response contains the full search metadata along with the cultural
# heritage object metadata for each item.
results = apis.search(
    # What to search for
    query='*',
    theme='photography',
    colourpalette='#0000FF',
    reusability='open AND permission',
    # Only items that have media, a thumbnail and a landing page
    media=True,
    thumbnail=True,
    landingpage=True,
    # Ordering and page size
    sort='europeana_id',
    rows=10,
    # Optional arguments, currently disabled:
    # qf='(skos_concept:"http://data.europeana.eu/concept/base/48" AND TYPE:IMAGE)',
    # profile='rich',
)


In [47]:
# Write one record file per Europeana item into ../_records/.
# Each file contains only a YAML front-matter block describing the item.
for item in results['items']:
    # The title field is joined as a list of strings; treat it as optional
    # (like description below) so an item without one does not abort the loop.
    title_string = ' '.join(item.get('title') or [])
    doc = {
        'id': item['id'],
        'layout': 'record',
        'title': title_string,
        'licence': item['rights'],
        'description': item.get('description'),
        # Optional, same as description: None when the item has no preview.
        'thumbnail': item.get('edmPreview'),
        'record_url': item['guid'],
        'data_provider': item['dataProvider'],
        # 'languages': item['dcLanguage'],
        # 'latitude': item['edmPlaceLatitude'],
        # 'longitude': item['edmPlaceLongitude'],
        'date': item['timestamp_created']
    }

    frontmatter = yaml.dump(doc, sort_keys=False)

    # If the title slugifies to an empty string the file would be named just
    # '.md'; derive the name from the item id in that case.
    file_stem = slugify(title_string) or slugify(item['id'])
    filepath = "../_records/" + file_stem + '.md'

    # Explicit encoding so the output does not depend on the platform default.
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write('---\n')
        f.write(frontmatter)
        f.write('---\n\n')