# Reverse-Image Search for Graphische Sammlung

image size options   
150x150 default  
250x250 resolution=mediumImageResolution  
350x350 resolution=highImageResolution  
max resolution=superImageResolution  




In [1]:
example_url = "https://www.e-gs.ethz.ch/eMP/eMuseumPlus?service=ImageAsset&module=collection&objectId=2562&resolution=mediumImageResolution"

# Image Downloading



In [2]:
import requests
import urllib
import random
import re
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from PIL import Image
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

In [3]:
path_to_chromedriver="C:\lib\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(path_to_chromedriver)

def document_initialised(driver):
    return driver.execute_script("return initialised")


In [4]:
def find_next_url(soup):
    """
    find link to next page    
    """

#     soup_pagin = soup.find(name="ul", attrs={"class":"pagination"})
    soup_next = soup.find(name="a", attrs={"class":"nextBtn"})
    if soup_next:
        next_url = soup_next['href']
#         print(next_url)
        return next_url
    else:
        next_url = None
    
    return next_url
    

In [5]:
def find_list_of_elements(soup):

    #find main body col-lg-12 col-md-12 list-row
    soup_body = soup.find(name='div', attrs={"class":"col-lg-12 col-md-12 list-row"})
    if soup_body:
        soup_el_lst = soup_body.find_all(name='div', attrs={"class":"ssy_galleryElement"})        
    else:
        soup_el_lst = []
    
    return soup_el_lst

In [6]:
def find_img_url(soup_el):
    """
    find link to image
    accepts the html for a single gallery element
    """
    soup_fig = soup_el.find(name='figure')
    if soup_fig:
        img_url = soup_fig.find(name='a').find(name='img')['src']
#         print(img_url)
    
    else: 
        img_url = None
        
    return img_url

In [7]:
def find_object_id_from_image_url(img_url):
    
    ptrn = "(objectId=[0-9]+){1}"    
    match = re.search(ptrn, img_url)
    
    if match:
        object_id = match[0].split('=')[-1]
    else:
        object_id = None
        
    return object_id

In [8]:
def find_element_title(soup_el):
    """
    find title
    accepts the html for a single gallery element
    """

    soup_el_title = soup_el.find(name='span', attrs={"class":"galHeadline"})
    
    if soup_el_title:
        title = soup_el_title.text
#         print(title)
    
    else:
        title = None
        
    return title

In [9]:
def find_element_description(soup_el):
    """
    find title
    accepts the html for a single gallery element
    """
    soup_el_detail = soup_el.find(name='p', attrs = {"class":"galleryElementDetail"})

    if soup_el_detail:
        detail = soup_el_detail.text.strip()
        if len(detail) > 0:
            return detail
    else:
        return None

In [10]:
def find_element_detail_link(soup_el):
    """
    find link to detail page
    accepts the html for a single gallery element
    """
    # find link to detail page
    soup_el_links = soup_el.find_all(name='a')
    if len(soup_el_links)>0:    
        soup_el_link = soup_el_links[0]
        detail_url = soup_el_link['href']

    else:
        detail_url = None
    
    return detail_url

In [11]:
def find_one_element_details(soup_el):
    """
    return a dictionary with details of one element
    """
    el_dict = {}
    
    el_dict['title'] = find_element_title(soup_el)
    el_dict['img_url'] = find_img_url(soup_el)
    el_dict['detail_url'] = find_element_detail_link(soup_el)
    el_dict['detail_description'] = find_element_description(soup_el)

    if el_dict['img_url']:
        el_dict['object_id'] = find_object_id_from_image_url(el_dict['img_url'])

    return el_dict

In [12]:
def find_one_page_elements(soup):

    page_results = []
    soup_el_lst = find_list_of_elements(soup)
    
    if len(soup_el_lst)>0:
        print('    found elements')
    
    for soup_el in soup_el_lst:
        el_dict = find_one_element_details(soup_el)
        page_results.append(el_dict)

    return pd.DataFrame(page_results)

In [13]:
def download_image(filename, url):

    """ requests image from given url and saves it in original quality as jpeg in RGB format
    
    filename: local filepath to save the image to
    url: url to request image from"""
    
    if os.path.exists(filename):
        print('Image %s already exists. Skipping download.' % filename)
        return

    try:
        response = urllib.request.urlopen(url)
    except:
        logging.warning('Warning: Could not download image %s from %s' % (filename, url))
        return

    try:
        pil_image = Image.open(BytesIO(response.read()))
    except:
        print('Warning: Failed to parse image %s' % filename)
        return

    try:
        pil_image_rgb = pil_image.convert('RGB')
    except:
        print('Warning: Failed to convert image %s to RGB' % filename)
        return

    try:
        pil_image_rgb.save(filename, format='JPEG')  # , quality=95
    except:
        print('Warning: Failed to save image %s' % filename)
        return

      

In [14]:
def process_one_page_html(raw_html, csv_path, request_counter):

    soup = BeautifulSoup(raw_html)
    df_page = find_one_page_elements(soup)
    df_page['results_page'] = request_counter

    include_header=False

    if request_counter <= 1:
        include_header=True
    
    # write page
    df_page.to_csv(fpath, mode='a', header=include_header)
        
    return

In [15]:
def navigate_to_next_page(driver):
    
    next_button_lst = driver.find_elements_by_class_name("nextBtn")
    if next_button_lst:
        next_button = next_button_lst[0]
        if next_button.is_enabled():
            next_button.click()
            return True
    else:
        return False

In [16]:
def set_folder(string):

#     divide into folders of 999 pictures max

    if len(string)>3:

        fldr = string[0:-3]
    else:
        fldr = "0"

    return fldr

In [17]:
def save_only_image_selenium(img_url):
    
    return

from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys


In [18]:
def save_image(driver, filename, url):

    """save a screenshot as png
    
    filename: local filepath to save the image to
    url: url to request image from"""
    
    if os.path.exists(filename):
        print('    Warning: Image %s already exists. Skipping download.' % filename)
        return

    try:
        driver.get(url)
        
        sleep_time = random.randint(3,5) + (3*random.random())
        sleep(sleep_time)
        
        img_el = driver.find_element_by_xpath('//img')


    except:
        print('    Warning: Could not find image element %s' % (url))
        return

    # save screenshot of image
    try:
        driver.get_screenshot_as_file(filename)
#         img = Image.open(filename)
#         img = pil_image.convert('RGB')
    except:
        print('    Warning: Failed to save png of image: %s' % filename)
        return
    
#     try:
#         img = center_crop_image_to_dims(img, h=h, w=w)
#     except:
#         print('Warning: Failed to crop image %s' % filename)
#         return

#     try:
#         filename = "".join(filename.split('.')[:-1]) + ".jpg"
#         img.save(filename, format='JPEG')  # , quality=95
#     except:
#         print('Warning: Failed to save image as jpg %s' % filename)
        
    return


In [19]:
def save_images(driver, file_dict, sleep_time_range=(5,7)):
 
    """loops through a dictionary of files to download. includes logging
    file_dict: should be in the format {file_fullpath:url }
    """
    
#     logging_funcs.start_logger(destination_folder, logger_fname='image_downloader.log')
 
    total_num_images = len(file_dict)   
    print('started download of {} images'.format(total_num_images))
    
    #loop to download from each url
    for i, (filepath, url) in enumerate(file_dict.items()):

        #print intermittinent milestones to console & log
        if i % 100 == 0:
            percent_complete = i/total_num_images
#             logging.info('Info: {:0.0%} complete'.format(percent_complete))
            print('currently processing image {} of {} ({:0.1%} complete)'.format(i, total_num_images, percent_complete))
#             logging.info('Info: processing {} ({} of {})'.format(file_subpath, i+1, total_num_images))
        
        #make subfolders in file path if doesn't exist

        subfolder = os.path.dirname(filepath)
        
        if not os.path.exists(subfolder):
            
            os.makedirs(r'{}'.format(subfolder))
            print('    Info: created subfolder directory {}'.format(subfolder))
        
        # download image
        
        if not os.path.exists(filepath):
            save_image(driver, filepath, url)
        else:
            print('    Warning: image already exists {}'.format(filepath))
            
        sleep_time = random.randint(*sleep_time_range) + (random.random()*4)
        sleep(sleep_time)
        

    print('finished download')

    return

In [20]:
def download_search_results(driver):

    # place to save results
    data_dir = os.path.normpath(os.path.join(os.getcwd(), '..','data','raw','scraped'))
    fname = 'graphik_portal_results.csv'
    fpath = os.path.join(data_dir,fname)

    request_counter = 0
    next_clickable = True
    first_url = "https://www.graphikportal.org/gallery/encoded/eJzjYBKS5GJLzMmJT0kVYk4tyZBidvRzUWIuycnWYhCSgUuxVZUWZSajyqpxcWfm5JQWlxQllqSmCCFzkNUBANijGqs*/5901"
    # get first page
    driver.get(first_url)

    while next_clickable:

        request_counter +=1
        print('current request {}'.format(request_counter))

        sleep(random.randint(3,5))

        # save results to_csv
        page_html = driver.page_source
        process_one_page_html(page_html, fpath, request_counter)

        # wait to not exceed throttle limits
        sleep_time = random.randint(3,5) + (3*random.random())
        sleep(sleep_time)

        next_clickable = navigate_to_next_page(driver)

# download images


In [21]:
data_dir = os.path.normpath(os.path.join(os.getcwd(), '..','data','raw','scraped'))
fname = 'graphik_portal_results.csv'
fpath = os.path.join(data_dir,fname)
fpath

'C:\\Users\\Barry\\projects\\graph-samm\\data\\raw\\scraped\\graphik_portal_results.csv'

In [22]:
# read in csv file

col_names = ['title','img_url','detail_url','detail_description','object_id', 'request_num']
df = pd.read_csv(fpath, index_col=0, header=None)
df.columns = col_names 

In [23]:
df.head()

Unnamed: 0_level_0,title,img_url,detail_url,detail_description,object_id,request_num
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,Intérieur mit zwei Frauen - Zwei Frauen mit Lampe,https://www.e-gs.ethz.ch/eMP/eMuseumPlus?servi...,https://www.graphikportal.org/document/gpo0021...,"Albert Müller (1897 - 1926), Künstler, 1924, V...",6190,4
1.0,Porträt Anna III - Junge Frau,https://www.e-gs.ethz.ch/eMP/eMuseumPlus?servi...,https://www.graphikportal.org/document/gpo0021...,"Albert Müller (1897 - 1926), Künstler, 1924, V...",16773,4
2.0,Knabenakt,https://www.e-gs.ethz.ch/eMP/eMuseumPlus?servi...,https://www.graphikportal.org/document/gpo0022...,"Otto Meyer-Amden (1885 - 1933), Künstler, 1922...",33515,4
3.0,Die schöne Försterin,https://www.e-gs.ethz.ch/eMP/eMuseumPlus?servi...,https://www.graphikportal.org/document/gpo0021...,"Henry Wyatt (1794 - 1840), nach, 1835, Francis...",33,4
4.0,Drei Putti mit einer Rüstung,https://www.e-gs.ethz.ch/eMP/eMuseumPlus?servi...,https://www.graphikportal.org/document/gpo0021...,"Anonym, Kupferstecher, 16. Jahrhundert [?]",313,4


In [24]:

df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 10001 entries, 0.0 to 49.0
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   title               10001 non-null  object
 1   img_url             10001 non-null  object
 2   detail_url          10001 non-null  object
 3   detail_description  10001 non-null  object
 4   object_id           9996 non-null   object
 5   request_num         10001 non-null  object
dtypes: object(6)
memory usage: 546.9+ KB


In [25]:
# drop any nans
df = df.dropna()

In [26]:
# make filepath column
fldr_path = os.path.join(data_dir, 'images')

df['filepath'] = df['object_id'].apply(set_folder)
df['filepath']  = fldr_path + '\\' + df['filepath'] + '\\' + df['object_id'] + ".png"

In [27]:
# drop already downloaded records

# get flist of existing files   
existing_flist = []
    
for dirpath, dirnames, filenames in os.walk(fldr_path):
    for fname in filenames:
        cur_fpath = os.path.join(dirpath,fname)
        existing_flist.append(cur_fpath)

orig_len = df.shape[0]

# filter out already existing
fltr = ~df['filepath'].isin(existing_flist)
df = df.loc[fltr, :]

num_dropped = orig_len - df.shape[0]
print('removed {} records; already saved'.format(num_dropped))

removed 6838 records; already saved


In [28]:
# change image url to lower res
df['img_url'] = df['img_url'].str.replace("superImageResolution","highImageResolution")

In [29]:
df.filepath.iloc[0]

'C:\\Users\\Barry\\projects\\graph-samm\\data\\raw\\scraped\\images\\object\\object_id.png'

In [30]:
df.head()

Unnamed: 0_level_0,title,img_url,detail_url,detail_description,object_id,request_num,filepath
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,title,img_url,detail_url,detail_description,object_id,results_page,C:\Users\Barry\projects\graph-samm\data\raw\sc...
41.0,Jungfrau mit Kind und Papagei,https://www.e-gs.ethz.ch/eMP/eMuseumPlus?servi...,https://www.graphikportal.org/document/gpo0022...,"Martin Schongauer (Um 1450 - 1491), Kopie nach...",23397,19,C:\Users\Barry\projects\graph-samm\data\raw\sc...
43.0,Die kleinen Wasserfälle bei Tivoli,https://www.e-gs.ethz.ch/eMP/eMuseumPlus?servi...,https://www.graphikportal.org/document/gpo0022...,Christian Wilhelm Ernst Dietrich (1712 - 1774)...,23762,19,C:\Users\Barry\projects\graph-samm\data\raw\sc...
44.0,Porträt von Johann Bernoulli,https://www.e-gs.ethz.ch/eMP/eMuseumPlus?servi...,https://www.graphikportal.org/document/gpo0022...,Johann Rudolf Huber (der Ältere) (1668 - 1748)...,23395,19,C:\Users\Barry\projects\graph-samm\data\raw\sc...
45.0,Maria und Joseph mit dem schlafenden Christuskind,https://www.e-gs.ethz.ch/eMP/eMuseumPlus?servi...,https://www.graphikportal.org/document/gpo0022...,Christian Wilhelm Ernst Dietrich (1712 - 1774)...,23770,19,C:\Users\Barry\projects\graph-samm\data\raw\sc...


In [31]:
# make dict of filename:url
img_url_lst = df['img_url'].to_list()
fpath_lst = df['filepath'].to_list()
img_dict = dict(zip( fpath_lst, img_url_lst))

In [None]:
save_images(driver, img_dict, sleep_time_range=(3,7))

started download of 3158 images
currently processing image 0 of 3158 (0.0% complete)
currently processing image 100 of 3158 (3.2% complete)
currently processing image 200 of 3158 (6.3% complete)
currently processing image 300 of 3158 (9.5% complete)
currently processing image 400 of 3158 (12.7% complete)
currently processing image 500 of 3158 (15.8% complete)
currently processing image 600 of 3158 (19.0% complete)
currently processing image 700 of 3158 (22.2% complete)
currently processing image 800 of 3158 (25.3% complete)
currently processing image 900 of 3158 (28.5% complete)
currently processing image 1000 of 3158 (31.7% complete)
currently processing image 1100 of 3158 (34.8% complete)
currently processing image 1200 of 3158 (38.0% complete)
currently processing image 1300 of 3158 (41.2% complete)
currently processing image 1400 of 3158 (44.3% complete)
currently processing image 1500 of 3158 (47.5% complete)
