In [2]:
import pyeuropeana.apis as apis
import pyeuropeana.utils as utils
import os
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Goal

This notebook is to extract suitable Europeana records to test the platforms for the evaluation campaigns within the project Europeana translate. 
The datasets chosen by the aggregators are shown here https://docs.google.com/spreadsheets/d/1M7dms5rEVKbijxZPNBhZh3JxnwA_T0KWei4zdR74Lb0/edit#gid=0 and the fields to be tested are signaled here https://docs.google.com/spreadsheets/d/1iBS1EvK37Jm8ZF3TVilvryw8bXXiM05FDxuvmldseQ8/edit#gid=0

#  Record selection strategy

The general idea is to go through the datasets identified by the aggregators and select meaningful records with relevant fields, according to the list of fields above, that are populated. 

In [3]:
#setting enviroment variable
os.environ['EUROPEANA_API_KEY'] = 'api2demo'

In [4]:
 def search_api(qf, n_rows):
    ''' This function extracts the europeana ids of records
    given a quiery'''
    response = apis.search(
    query = '*:*',
    qf=qf,
    rows = n_rows, 
    profile='rich'
    )
    df_search=utils.search2df(response).europeana_id
    return df_search

In [29]:
def record_api(items_id):
    ''' This function provides the record, given the europeana id '''
    df_list=[]
    for item in items_id:
        data=apis.record(f'{item}')
        data_jnorm=pd.json_normalize(data)
        df_list.append(data_jnorm)
    df_jnorm_tot = pd.concat(df_list, ignore_index=True, axis=0)
    return df_jnorm_tot

In [7]:
def record_api_proxy(items_id, proxy_nr):
    ''' This function provides the record, given the europeana id but selects the information on the provider proxy '''
    df_list=[]
    for item in items_id:
        data=apis.record(f'{item}')
        df_0=pd.json_normalize(data,['object','proxies'])
        df_proxy_provider=df_0.iloc[proxy_nr] #selcting provider proxy - there are the info I am interested in
        df_proxy_provider=pd.DataFrame(df_proxy_provider)
        df_proxy_provider=df_proxy_provider.transpose()
        df_proxy_provider['item_id']=item
        df_list.append(df_proxy_provider)
    df_proxy_tot = pd.concat(df_list, ignore_index=True, axis=0)
    return df_proxy_tot

In [8]:
def retrieve_norm_data(query, n_rows,proxy_nr):
    ''' This function returns records given a query. Also the proxy can be chosen. 
    query:'''
    search_results_list=search_api(query, n_rows)
    record_data=record_api_proxy(search_results_list,proxy_nr)
    return record_data

In [9]:
def fin_records(query, n_records, proxy):
    query=query
    df=retrieve_norm_data(query, n_records, proxy)
    list_of_item_ids_json=[]
    list_of_item_web=[]
    for item_id in list(df.item_id):
        europeana_json=f'https://search-api.europeana.eu/record{item_id}.json?wskey=api2demo'
        item_web=f'https://www.europeana.eu/item/{item_id}'
        list_of_item_ids_json.append(europeana_json)
        list_of_item_web.append(item_web)
    return df, list_of_item_ids_json,list_of_item_web             

In [16]:
query='(edm_datasetName:2048205*)'

df, lst_json, lst_web=fin_records(query,2,2)

In [17]:
df

Unnamed: 0,about,proxyIn,proxyFor,lineage,edmType,europeanaProxy,dcCreator.def,dcIdentifier.def,dcType.def,dctermsSpatial.def,dcDescription.en,dcCreator.it,dcDescription.it,dcType.en,dctermsIsPartOf.en,dctermsProvenance.it,item_id
0,/proxy/provider/2048205/europeana_fashion_SC02...,[/aggregation/provider/2048205/europeana_fashi...,/item/2048205/europeana_fashion_SC0215701,,IMAGE,False,,[SC0215701],[http://thesaurus.europeanafashion.eu/thesauru...,,,[Emilio Pucci (Designer)],[Schizzo SC02157: Mod. 1: kimono corto in tess...,[Object Type: sketch],[Europeana XX: Century of Change],[Archivio Emilio Pucci],/2048205/europeana_fashion_SC0215701
1,/proxy/provider/2048205/europeana_fashion_SC02157,[/aggregation/provider/2048205/europeana_fashi...,/item/2048205/europeana_fashion_SC02157,,IMAGE,False,,[SC02157],[http://thesaurus.europeanafashion.eu/thesauru...,,,[Emilio Pucci (Designer)],[Schizzo SC02157: Mod. 1: kimono corto in tess...,[Object Type: sketch],[Europeana XX: Century of Change],[Archivio Emilio Pucci],/2048205/europeana_fashion_SC02157


In [18]:
lst_json

['https://search-api.europeana.eu/record/2048205/europeana_fashion_SC0215701.json?wskey=api2demo',
 'https://search-api.europeana.eu/record/2048205/europeana_fashion_SC02157.json?wskey=api2demo']

In [19]:
lst_web

['https://www.europeana.eu/item//2048205/europeana_fashion_SC0215701',
 'https://www.europeana.eu/item//2048205/europeana_fashion_SC02157']

In [124]:
df.item_id

0    /739/europeana_fashion_SK_C_1216
1    /739/europeana_fashion_SK_A_4457
Name: item_id, dtype: object

# Record Selection

In [128]:
query='(edm_datasetName:2051933*)'

df, lst_json, lst_web=fin_records(query,2,1)
df

Unnamed: 0,about,proxyIn,proxyFor,lineage,edmType,europeanaProxy,dcIdentifier.def,dcLanguage.def,dcType.def,dctermsSpatial.def,dcDescription.def,dcDescription.en,dcDescription.it,dcFormat.en,dcLanguage.en,dcPublisher.def,dcRights.en,dcSubject.def,dcTitle.en,dcTitle.it,dcType.en,dctermsAlternative.en,dctermsAlternative.it,dctermsExtent.def,dctermsIsPartOf.def,dctermsIsPartOf.en,dctermsIssued.def,item_id
0,/proxy/provider/2051933/data_euscreenXL_EUS_FF...,[/aggregation/provider/2051933/data_euscreenXL...,/item/2051933/data_euscreenXL_EUS_FFE4654D917A...,,VIDEO,False,[OC015006],,[http://www.wikidata.org/entity/Q38926],[Japan],[Extended description: Giappone: veduta estern...,[Japan: big concert of sacred court music Gaga...,[Original language summary: Giappone: grande ...,"[ASPECT RATIO: 4:3, Black & White, Mono, VIDEO]",[Italian],[INCOM],[All rights reserved Istituto Luce Cinecittà],"[National holidays, festivals, anniversaries a...",[Variety],[Varietà],[CLIP],[SERIES TITLE: Horizon film magazine],[Orizzonte cinematografico],[00:01:00],[http://mint-projects.image.ntua.gr/data/euscr...,[Europeana Subtitled],[01/05/1959],/2051933/data_euscreenXL_EUS_FFE4654D917AF1BD8...
1,/proxy/provider/2051933/data_euscreenXL_EUS_FF...,[/aggregation/provider/2051933/data_euscreenXL...,/item/2051933/data_euscreenXL_EUS_FFDF4E85015E...,,VIDEO,False,[OC030601],,[http://www.wikidata.org/entity/Q38926],"[Rome, Milan, New York]",[Extended description: Roma|immagini del saggi...,[Italy: the Academy of the finance police cele...,[Original language summary: Italia: L'Accadem...,"[ASPECT RATIO: 4:3, Black & White, Mono, VIDEO]",[Italian],[INCOM],[All rights reserved Istituto Luce Cinecittà],"[Education, http://thesaurus.euscreen.eu/EUscr...",[News through objective lens],[Cronaca con l'obiettivo],[CLIP],[SERIES TITLE: Horizon film magazine],[Orizzonte cinematografico],[00:03:30],[http://mint-projects.image.ntua.gr/data/euscr...,[Europeana Subtitled],[01/05/1962],/2051933/data_euscreenXL_EUS_FFDF4E85015E29F74...


Based on the search example above I have manually selected the following records from the aggregators, from the mentioned datasets [here](https://docs.google.com/spreadsheets/d/1M7dms5rEVKbijxZPNBhZh3JxnwA_T0KWei4zdR74Lb0/edit#gid=0). I have looked for meaningful values in the relevant fields on the provider/aggregator proxies.
The records are subdivided per aggregator.

## Euscreen

In [20]:
#EuSCREEN
# query='(edm_datasetName:2051914* AND proxy_dc_terms_temporal:*)' gives no result suggesting there are no items with dc_terms_temporal in the dataset, same ofr dc_terms_spatial in contrast with notes aggregator
# checked in both italian and english

# dc description in both en en nl, dc subject in both nl and en, dc title in both nl and en, dc terms alternative in both, dc Spatial in both en and nl
data_1='https://search-api.europeana.eu/record/2051906/data_euscreenXL_https___www_openbeelden_nl_media_9972.json?wskey=api2demo'
#dc format dc type in english
data_2='https://search-api.europeana.eu/record/2051918/data_euscreenXL_EUS_FFDCE2AB00B2936EC29312E656622105.json?wskey=api2demo',
#dc type and dc format in english only, dc terms alternative in both en and it
data_3='https://search-api.europeana.eu/record/2051933/data_euscreenXL_EUS_FFE4654D917AF1BD85344825D9B62172.json?wskey=api2demo'
#dctype only in french
data_4 ='https://search-api.europeana.eu/record/2051908/data_euscreenXL_ina_VDX14007632.json?wskey=api2demo',
#interesting for dc description field mixed french/italian
data_='https://search-api.europeana.eu/record/2051935/data_euscreenXL_EUS_FF97A8E4FCADBAE5A90F9650F923CAF3.json?wskey=api2demo'

# Museu

In [21]:
#dc_termsmedium and dc_format in french only
data_5= 'https://search-api.europeana.eu/record/2048001/AP_10450606.json?wskey=api2demo',


#dc_termsmedium  dc_format in dutch only
data_6='https://search-api.europeana.eu/record/2048001/AP_10407814.json?wskey=api2demo'

#dc_description and title in italian
data_7='https://search-api.europeana.eu/record/226/96_eikonprojekt_RM_2010.json?wskey=api2demo',

#dcterms_medium and type in french only
data_8='https://search-api.europeana.eu/record/2048001/AP_10450615.json?wskey=api2demo'

#dctype and dcspatial in french and english, format in eng nl, and fr
data_9='https://search-api.europeana.eu/record/322/Museu_ProvidedCHO_Mus_es_Royaux_d_Art_et_d_Histoire_94355.json?wskey=api2demo',

# Fashion

In [22]:
#dctempral in dutch
data_10='https://search-api.europeana.eu/record/2048230/europeana_fashion_9927.json?wskey=api2demo'


#dcterms spatial in dutch
data_11='https://search-api.europeana.eu/record/2048230/europeana_fashion_920.json?wskey=api2demo',

# dctitle in french
data_12='https://search-api.europeana.eu/record/2048218/europeana_fashion_mad_638.json?wskey=api2demo'

data_= 'https://search-api.europeana.eu/record/2048227/europeana_fashion_S_2_002.json?wskey=api2demo'

data___= 'https://search-api.europeana.eu/record/2048205/europeana_fashion_SC0215701.json?wskey=api2demo'