In [21]:
import pandas as pd
import requests
import os

def create_dir(path):
  """Create directory `path` if nothing exists at that location yet.

  Missing parent directories are created as well. If `path` already
  exists (as a directory or as a file) the call is a no-op.

  Args:
    path: directory path to create.
  """
  if not os.path.exists(path):
    # exist_ok=True covers the case where the directory appears between the
    # existence check above and this call.
    os.makedirs(path, exist_ok=True)
    
def query_single_category(skos_concept, N, category=None, wskey='api2demo'):
  """Collect up to N open-licence image records for one SKOS concept.

  Pages through the Europeana search endpoint with cursor pagination and
  keeps only the items that expose an `edmIsShownBy` media URL.

  Args:
    skos_concept: concept URI used in the `skos_concept:"..."` query filter.
    N: maximum number of records to return. With N <= 0 no request is made.
    category: label stored in the `category` column of every row. When
      omitted, the module-level variable named `category` is used (None if
      it is not defined), which is how the calling loop in this notebook
      supplies it.
    wskey: API key sent as the `wskey` request parameter.

  Returns:
    pandas.DataFrame with one row per record and the columns
    category, skos_concept, URI, ID, URL (at most N rows).

  Raises:
    requests.HTTPError: if the API answers with an HTTP error status.
  """
  if category is None:
    # Backward compatibility: existing callers pass only (skos_concept, N)
    # and rely on a global `category` loop variable.
    category = globals().get('category')

  CHO_list = []
  cursor = '*'
  # Stop as soon as N records are collected; the next page is not fetched
  # once the quota is already met.
  while cursor is not None and len(CHO_list) < N:
    params = {
        'reusability':'open',
        'media':True,
        'cursor':cursor,
        'qf':f'(skos_concept:"{skos_concept}" AND TYPE:IMAGE )',
        'query':'*',
        'wskey':wskey
    }
    http_response = requests.get(
        'https://www.europeana.eu/api/v2/search.json',
        params=params,
        timeout=30,  # never hang indefinitely on a stalled connection
    )
    http_response.raise_for_status()
    response = http_response.json()

    # A page without an 'items' key is treated as an empty page.
    for item in response.get('items', []):
      media_urls = item.get('edmIsShownBy')
      if not media_urls:
        # No direct media link for this record: skip it.
        continue
      ID = item['id']
      URI = 'http://data.europeana.eu/item'+ID
      CHO_list.append({
          'category':category,
          'skos_concept':skos_concept,
          'URI':URI,
          'ID':ID,
          'URL':media_urls[0],
      })

    # The loop ends when the response carries no 'nextCursor'.
    cursor = response.get('nextCursor')

  return pd.DataFrame(CHO_list[:N])
    
 

# Category label -> Europeana concept URI.
ec_vocab = {
    'building': 'http://data.europeana.eu/concept/base/29',
    'ceramics': 'http://data.europeana.eu/concept/base/31',
    'drawing': 'http://data.europeana.eu/concept/base/35',
    'furniture': 'http://data.europeana.eu/concept/base/37',
    'jewellery': 'http://data.europeana.eu/concept/base/41',
    'map': 'http://data.europeana.eu/concept/base/43',
    'painting': 'http://data.europeana.eu/concept/base/47',
    'photograph': 'http://data.europeana.eu/concept/base/48',
    'postcard': 'http://data.europeana.eu/concept/base/50',
    'sculpture': 'http://data.europeana.eu/concept/base/51',
    'specimen': 'http://data.europeana.eu/concept/base/167',
    'tapestry': 'http://data.europeana.eu/concept/base/54',
    'textile': 'http://data.europeana.eu/concept/base/55',
    'toy': 'http://data.europeana.eu/concept/base/56',
    'woodwork': 'http://data.europeana.eu/concept/base/59',
}


# Category label -> Getty AAT concept URI.
getty_vocab = {
    'print': 'http://vocab.getty.edu/aat/300041379',
    'building': 'http://vocab.getty.edu/aat/300004792',
    'archaeological_site': 'http://vocab.getty.edu/aat/300266151',
    'cartoon': 'http://vocab.getty.edu/aat/300123430',
    'ceramics': 'http://vocab.getty.edu/aat/300151343',
    'clothing': 'http://vocab.getty.edu/aat/300266639',
    'costume_accessories': 'http://vocab.getty.edu/aat/300209273',
    'drawing': 'http://vocab.getty.edu/aat/300033973',
    'map': 'http://vocab.getty.edu/aat/300028094',
    'furniture': 'http://vocab.getty.edu/aat/300037680',
    'textile': 'http://vocab.getty.edu/aat/300231565',
    'food': 'http://vocab.getty.edu/aat/300254496',
    'glassware': 'http://vocab.getty.edu/aat/300010898',
    'inscription': 'http://vocab.getty.edu/aat/300028702',
    'jewellery': 'http://vocab.getty.edu/aat/300209286',
    'metalwork': 'http://vocab.getty.edu/aat/300015336',
    'machinery': 'http://vocab.getty.edu/aat/300024839',
    'medal': 'http://vocab.getty.edu/aat/300046025',
    'memorabilia': 'http://vocab.getty.edu/aat/300028884',
    'mineral': 'http://vocab.getty.edu/aat/300011068',
    'musical_instrument': 'http://vocab.getty.edu/aat/300041620',
    'painting': 'http://vocab.getty.edu/aat/300033618',
    'photograph': 'http://vocab.getty.edu/aat/300046300',
    'postcard': 'http://vocab.getty.edu/aat/300026816',
    'poster': 'http://vocab.getty.edu/aat/300027221',
    'sculpture': 'http://vocab.getty.edu/aat/300047090',
    'specimen': 'http://vocab.getty.edu/aat/300235576',
    'tableware': 'http://vocab.getty.edu/aat/300043196',
    'tool': 'http://vocab.getty.edu/aat/300024841',
    'tapestry': 'http://vocab.getty.edu/aat/300205002',
    'toy': 'http://vocab.getty.edu/aat/300211037',
    'weaponry': 'http://vocab.getty.edu/aat/300036926',
    'woodwork': 'http://vocab.getty.edu/aat/300015348',
    'stamp': 'http://vocab.getty.edu/aat/300037321',
}

if __name__ == '__main__':

    # Select some categories from the Getty vocabulary ...
    getty_categories = ['archaeological_site','clothing','costume_accessories','inscription','weaponry']
    vocab_dict = {k:getty_vocab[k] for k in getty_categories}

    # ... and merge in the Europeana vocabulary (its entries win on key clashes).
    vocab_dict.update(ec_vocab)

    data_path = '../new_data'
    create_dir(data_path)
    N = 10  # maximum number of records requested per category

    frames = []
    # NOTE: the loop variable must stay named `category`;
    # query_single_category reads it as a global to label its rows.
    for category, skos_concept in vocab_dict.items():
        df_category = query_single_category(skos_concept, N)
        if not df_category.empty:
            frames.append(df_category)

        # Save the cumulative dataset after each category, so an interrupted
        # run keeps everything collected so far instead of only the last
        # category.
        df = pd.concat(frames, ignore_index=True) if frames else df_category
        df.to_csv(os.path.join(data_path,'dataset.csv'),index=False)


In [57]:
# Reload the harvested dataset from disk and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer='../new_data/dataset.csv')
print(df.shape)

(18775, 5)


In [52]:
#download images
import os
import requests
from PIL import Image
import pandas as pd
from io import BytesIO

def create_dir(path):
    """Create directory `path` if nothing exists at that location yet.

    Missing parent directories are created as well. If `path` already
    exists (as a directory or as a file) the call is a no-op.

    Args:
        path: directory path to create.
    """
    if not os.path.exists(path):
        # exist_ok=True covers the case where the directory appears between
        # the existence check above and this call.
        os.makedirs(path, exist_ok=True)

def url2img(url):
    """Download `url` and decode it as an RGB PIL image.

    This is a best-effort helper: any ordinary failure (network error, HTTP
    error status, undecodable payload) is reported on stdout and turned into
    a `None` result. KeyboardInterrupt and SystemExit are NOT swallowed, so a
    long-running download loop can still be interrupted.

    Args:
        url: location of the image to fetch.

    Returns:
        A PIL image converted to RGB mode, or None if the download or the
        decoding failed.
    """
    try:
        # The timeout prevents one stalled server from blocking the whole run.
        response = requests.get(url, timeout=30)
        # Fail early on 4xx/5xx instead of trying to decode an error page.
        response.raise_for_status()
        return Image.open(BytesIO(response.content)).convert('RGB')
    except Exception:
        print('Failed to get media image')
        return None


if __name__ == "__main__":

    # Root folder that receives one sub-folder of images per category.
    dest_path = '../new_training'
    create_dir(dest_path)

    # NOTE(review): this reads small_dataset.csv, while the collection cell
    # visible in this notebook writes dataset.csv; confirm which file is
    # intended.
    df = pd.read_csv('../new_data/small_dataset.csv')

    for cat in df.category.unique():
        # Rows belonging to the current category only.
        df_category = df.loc[df['category'] == cat]

        cat_path = os.path.join(dest_path,cat)
        create_dir(cat_path)

        for ID, URL in zip(df_category['ID'], df_category['URL']):
            img = url2img(URL)
            if img is None:
                # Download or decoding failed; url2img already reported it.
                continue

            # Record IDs contain '/', which cannot appear in a file name.
            file_name = f'{ID}.jpg'.replace("/","[ph]")
            try:
                img.save(os.path.join(cat_path,file_name))
            except (OSError, ValueError) as exc:
                # Keep going, but do not hide which image could not be written.
                print(f'Failed to save image {file_name}: {exc}')

    
    

archaeological_site
(1000, 5)
clothing
(1000, 5)
