In [50]:
import requests
import json
import csv
import pandas as pd

In [97]:
import time

In [51]:
query_terms = pd.read_csv("/Users/anesterov/reps/OutdatedObjectDescriptions/query_terms.csv")

In [55]:
query_list = query_terms['query'].to_list()

In [62]:
query_terms_list = [t for t in query_list if type(t) != float] # nan is float 

In [64]:
len(query_terms_list)

40

In [94]:
query_terms_list[0]

'sodomi*'

In [95]:
def get_hits_per_term(term:str) -> tuple:
    '''
    Getting N hits per query terms returned in the collectienederland API search
    term: str;
    Returns dict {str:int/str} str in case of a request error
    '''

    # API query params
    url = "https://data.collectienederland.nl/api/search/v2/"
    params = {"q":term,
              "format":"json"}
    headers = {}

    # sending a request / getting json
    r = requests.get(url,params=params,headers=headers)
    
    if r.ok:
            
        hits = r.json()
        n = hits['result']['query']['numfound']
        
    else:
        n = "error"

    return (term,n)

In [98]:
hits_dict = {}
for term in query_terms_list:
    t_n = get_hits_per_term(term)
    time.sleep(3)
    hits_dict[term] = t_n[1]

In [99]:
hits_dict

{'sodomi*': 28,
 'urani*': 795,
 'onani*': 4,
 'tegennatuurlijk': 0,
 'travesti*': 225,
 '248 bis': 9,
 'COC': 151,
 'homofi*': 35,
 'lesbi*': 66,
 'flikker': 7,
 'potten OR poten': 13675,
 'AIDS': 202,
 'eonis*': 0,
 '"seksuele inversie"': 0,
 'homoseksu*': 177,
 'geïnverteerd*': 1,
 'faggot*': 38,
 '"sea queens" OR "sea queen" ': 0,
 'tribad*': 7,
 'fricatrice*': 0,
 'pederas*': 0,
 'MtF*': 65,
 'FtM*': 67,
 'banjee*': 0,
 'dandy*': 93,
 'clones': 0,
 'GRID': 121,
 'hermafrodi*': 24,
 'vermomming': 38,
 'fairy OR fairies': 188,
 'gay': 424,
 'homo': 945,
 'queer': 17,
 'trans': 322,
 'transgenderisme': 0,
 'transseksualisme': 0,
 '"intersekse variaties"': 0,
 'genderdysforie': 0,
 'genderidentiteitsstoornis': 0,
 'stoornissen AND seksuele AND ontwikkeling': 0}

In [102]:
# export hits csv
# name of csv file

with open("/Users/anesterov/reps/OutdatedObjectDescriptions/query_hits.csv", 'w') as csvfile:

    csvwriter = csv.writer(csvfile)

    header =["term","n_hits"]
    csvwriter.writerow(header)

    rows = [[t,h] for t,h in hits_dict.items()]
    csvwriter.writerows(rows)

In [None]:
def get_objects(term:str) -> dict:
    '''
    Retrieving object's info containing a query term:

    {term:[{"aggregatedCHO":"URL","found_in":field}]}
    '''

    objects_info = []

    # request
    url = "https://data.collectienederland.nl/api/search/v2/"
    params = {"q":term,
              "page":"1", # !NB check if str or int
              "format":"json"}
    headers = {}

    r = requests.get(url,params=params,headers=headers)

    # _parse_objects(hits)
    # check output (if r is ok)
    # return 
    # objects_info.extend(return)
    
    # pagination
    last_page = hits['resul']['pagination']['lastPage']

    for p in range(2,last_page+1):
        params['page'] = p
        r = requests.get(url,params=params,headers=headers)
        # _parse_objects(hits)
        # objects_info.extend(return)
    


In [None]:
# the function below might not be necessary

In [None]:
def _parse_object(request_return, term:str) -> list:
    '''
    Parsing object's info from the request results
    A subfunction of get_objects
    Returns a list of objects with info per page
    request_return
    '''

    list_per_page = []

    if request_return.ok:
        hits = r.json()
        # parsing

        for item in hits['result']['items']:
            
            dict_per_item = {}

            # getting object's link
            object_url = item['item']['fields']['edm_aggregatedCHO'][0]['raw']
            dict_per_item['edm_aggregatedCHO'] = object_url

            # check where the query term was found
             # fields, in which a query term will be searched (after the results are retrieved)
            search_fields = ['dc_coverage','dc_description','dc_subject','dc_title','skos_prefLabel']
            found_in = [] # storing the field names, in which the term was found
            
            for field in search_fields:
                # if field in item['item']['fields'].keys() ?
                for field_value in item['item']['fields'][field]:
                    if field_value['language'] == 'nl':
                        field_value['raw'] # search the term here
                        # if found:
                        # record the field name
                        # found_in.append(field)
                    else:
                        continue

            dict_per_item['found_in'] = found_in
            list_per_page.append(dict_per_item)
                
    return list_per_page

In [None]:
# searching over object terms
# &qf[]=dc_subject_facet:term

In [77]:
type(hits['result']['pagination']['lastPage'])

int

In [47]:
# 25 items per page
len(hits['result']['items'])

25

In [68]:
r.ok

True

In [None]:
['dc_coverage','dc_description','dc_subject','dc_title','skos_prefLabel']

In [82]:
hits['result']['items'][0]['item'].keys()

dict_keys(['doc_id', 'doc_type', 'fields'])

In [91]:
hits['result']['items'][0]['item']['fields']['dc_coverage']

[{'language': 'nl',
  '@type': 'Literal',
  'value': 'tweede kwart 19e eeuw',
  'raw': 'tweede kwart 19e eeuw'},
 {'language': 'en',
  '@type': 'Literal',
  'value': 'second quarter 19th century',
  'raw': 'second quarter 19th century'}]

In [33]:
for item in hits['result']['items']:
    print(item['item']['fields'])
    break

{'dc_coverage': [{'language': 'nl', '@type': 'Literal', 'value': 'derde kwart 18e eeuw', 'raw': 'derde kwart 18e eeuw'}, {'language': 'en', '@type': 'Literal', 'value': 'third quarter 18th century', 'raw': 'third quarter 18th century'}], 'dc_creator': [{'language': 'en', '@type': 'Literal', 'value': 'anonymous', 'raw': 'anonymous'}, {'language': 'nl', '@type': 'Literal', 'value': 'anoniem', 'raw': 'anoniem'}, {'language': 'nl', '@type': 'Literal', 'value': 'kopie naar\xa0', 'raw': 'kopie naar\xa0'}], 'dc_description': [{'language': 'nl', '@type': 'Literal', 'value': 'Portret van Anna van Hannover (1709-59), zuster van Frederick Louis, de Prins van Wales, en echtgenote van Prins Willem IV. Buste naar links, aanziend. Naar een origineel uit 1736 door Bernardus Accama (I) in Rijksmuseum Paleis Het Loo, Apeldoorn. Vroeger geïnterpreteerd als een portret van Charles Edward Stuart (1720-88). Bonnie Prince Charlie in travestie in de japon van het dienstmeisje Betty Burke bij zijn vlucht van B

### a single record metadata (json-ld)

In [35]:
with open("/Users/anesterov/Downloads/M013571.json-ld.json",'r') as jf:
    record = json.load(jf)

In [51]:
record

[{'@id': 'http://data.collectienederland.nl/resource/document/joods-historisch/M013571',
  '@type': ['http://www.europeana.eu/schemas/edm/ProvidedCHO'],
  'http://purl.org/dc/elements/1.1/coverage': [{'@value': 'Nederland'}],
  'http://purl.org/dc/elements/1.1/creator': [{'@value': 'Wesly, Jenny E. (1948-2016)'}],
  'http://purl.org/dc/elements/1.1/description': [{'@value': 'Een groep demonstrerende mannen en vrouwen op een zomerse dag. Ze dragen spandoeken met zich mee.'}],
  'http://purl.org/dc/elements/1.1/identifier': [{'@value': 'M013571'}],
  'http://purl.org/dc/elements/1.1/source': [{'@value': 'Joods Museum'}],
  'http://purl.org/dc/elements/1.1/subject': [{'@value': 'Utrecht (stad)'},
   {'@value': 'demonstratie'},
   {'@value': 'homoseksualiteit'},
   {'@value': 'belangenorganisatie'},
   {'@value': 'Wesly, Jenny E. (1948-2016)'},
   {'@value': 'Sjalhomo'},
   {'@value': 'spandoek'},
   {'@value': 'portret'}],
  'http://purl.org/dc/elements/1.1/title': [{'@value': 'Sjalhomo o

### Parsing institutions and N objects listed on CN

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException,WebDriverException,TimeoutException
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import re
import requests
import json

In [19]:
import pandas as pd

In [20]:
cn_providers = pd.DataFrame(columns=["institution","n_objects"])

In [21]:
cn_providers

Unnamed: 0,institution,n_objects


In [10]:
path_to_driver = "/Users/anesterov/Downloads/chromedriver-mac-x64/chromedriver"

In [11]:
browser = webdriver.Chrome(executable_path=path_to_driver)

  browser = webdriver.Chrome(executable_path=path_to_driver)


In [12]:
browser.get("https://data.collectienederland.nl/search/?")

In [13]:
soup = BeautifulSoup(browser.page_source, 'html.parser')

In [22]:
for li in soup.find("li", {"id": "edm_dataProvider"}).find("ul").find_all("li"):

    provider = [li["data-value"],int(li["data-count"])]
    cn_providers.loc[len(cn_providers)] = provider

<li data-count="747" data-value="Airborne Museum 'Hartenstein'" role="presentation">
<a class="facet-link" href="?q=&amp;qf[]=edm_dataProvider%3AAirborne Museum 'Hartenstein'" rel="nofollow" role="menuitem" tabindex="-1">
<i class="fa fa-fw fa-square-o"></i>
                                        Airborne Museum 'Hartenstein' (747)
                                    </a>
</li> Airborne Museum 'Hartenstein' 747
<li data-count="198938" data-value="Allard Pierson" role="presentation">
<a class="facet-link" href="?q=&amp;qf[]=edm_dataProvider%3AAllard Pierson" rel="nofollow" role="menuitem" tabindex="-1">
<i class="fa fa-fw fa-square-o"></i>
                                        Allard Pierson (198938)
                                    </a>
</li> Allard Pierson 198938
<li data-count="107901" data-value="Amsterdam Museum" role="presentation">
<a class="facet-link" href="?q=&amp;qf[]=edm_dataProvider%3AAmsterdam Museum" rel="nofollow" role="menuitem" tabindex="-1">
<i class="fa fa-fw f

In [24]:
cn_providers.to_csv("/Users/anesterov/cn_providers.csv")

In [25]:
browser.close()