# Reverse-Image Search for Graphische Sammlung

image size options   
150x150 default  
250x250 resolution=mediumImageResolution  
350x350 resolution=highImageResolution  
max resolution=superImageResolution  




In [None]:
# Example image-asset URL; the ``resolution`` query parameter selects the image size.
example_url = "https://www.e-gs.ethz.ch/eMP/eMuseumPlus?service=ImageAsset&module=collection&objectId=2562&resolution=mediumImageResolution"

# Image Downloading



In [54]:
import logging
import os
import random
import re
import urllib
import urllib.request
from io import BytesIO
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

In [56]:
# Raw string: the Windows path contains backslashes ("\l", "\c") that are
# invalid escape sequences in a normal string literal.
path_to_chromedriver = r"C:\lib\chromedriver_win32\chromedriver.exe"
# Starts a Chrome session controlled through the chromedriver executable above.
driver = webdriver.Chrome(path_to_chromedriver)

def document_initialised(driver):
    """
    evaluate the JavaScript snippet ``return initialised`` in the page held
    by ``driver`` and hand back whatever the driver returns for it
    """
    script = "return initialised"
    return driver.execute_script(script)


In [57]:
def find_next_url(soup):
    """
    return the href of the "next page" link, or None when the page has none

    the link is the first <a> element carrying the class "nextBtn"
    """
    next_anchor = soup.find(name="a", attrs={"class":"nextBtn"})
    if not next_anchor:
        return None

    return next_anchor['href']
    

In [58]:
def find_list_of_elements(soup):
    """
    return the gallery elements of a results page

    looks for the main body <div class="col-lg-12 col-md-12 list-row"> and
    returns every <div class="ssy_galleryElement"> inside it; returns an
    empty list when the main body is not found
    """
    main_body = soup.find(name='div', attrs={"class":"col-lg-12 col-md-12 list-row"})
    if not main_body:
        return []

    return main_body.find_all(name='div', attrs={"class":"ssy_galleryElement"})

In [59]:
def find_img_url(soup_el):
    """
    find link to image
    accepts the html for a single gallery element

    returns the ``src`` of the <img> nested as <figure> -> <a> -> <img>,
    or None when any of those tags (or the ``src`` attribute) is missing,
    so that one malformed gallery element does not abort the whole scrape
    """
    soup_fig = soup_el.find(name='figure')
    if soup_fig is None:
        return None

    soup_link = soup_fig.find(name='a')
    if soup_link is None:
        return None

    soup_img = soup_link.find(name='img')
    if soup_img is None:
        return None

    # .get() instead of ['src'] so an <img> without src yields None
    return soup_img.get('src')

In [60]:
def find_object_id_from_image_url(img_url):
    """
    extract the object id from an image url

    returns the digits following the first "objectId=" in the url as a
    string, or None when the url contains no such parameter
    """
    found = re.search("(objectId=[0-9]+){1}", img_url)
    if not found:
        return None

    # the matched text is "objectId=<digits>"; keep what follows the "="
    return found[0].rpartition('=')[2]

In [61]:
def find_element_title(soup_el):
    """
    return the title text of a single gallery element

    the title is the text of the first <span class="galHeadline">;
    returns None when the element has no such span
    """
    headline = soup_el.find(name='span', attrs={"class":"galHeadline"})
    return headline.text if headline else None

In [62]:
def find_element_description(soup_el):
    """
    return the description text of a single gallery element

    the description is the whitespace-stripped text of the first
    <p class="galleryElementDetail">; returns None when that paragraph is
    missing or contains only whitespace
    """
    detail_tag = soup_el.find(name='p', attrs = {"class":"galleryElementDetail"})
    if not detail_tag:
        return None

    stripped = detail_tag.text.strip()
    return stripped if stripped else None

In [63]:
def find_element_detail_link(soup_el):
    """
    return the link to the detail page of a single gallery element

    the link is the href of the first <a> inside the element;
    returns None when the element contains no <a> at all
    """
    anchors = soup_el.find_all(name='a')
    if not anchors:
        return None

    first_anchor = anchors[0]
    return first_anchor['href']

In [64]:
def find_one_element_details(soup_el):
    """
    return a dictionary with details of one element

    keys: 'title', 'img_url', 'detail_url', 'detail_description', 'object_id'.
    every key is always present (value None when the information is missing),
    so all rows share the same columns when pages are appended to one csv
    without repeating the header.
    """
    el_dict = {}

    el_dict['title'] = find_element_title(soup_el)
    el_dict['img_url'] = find_img_url(soup_el)
    el_dict['detail_url'] = find_element_detail_link(soup_el)
    el_dict['detail_description'] = find_element_description(soup_el)

    # the object id can only be derived from the image url; keep the key
    # (as None) when there is no image so the row layout stays stable
    img_url = el_dict['img_url']
    el_dict['object_id'] = find_object_id_from_image_url(img_url) if img_url else None

    return el_dict

In [65]:
def find_one_page_elements(soup):
    """
    collect the details of every gallery element on one results page

    returns a DataFrame with one row per gallery element; prints a short
    notice when the page contains at least one element
    """
    gallery_elements = find_list_of_elements(soup)

    if len(gallery_elements) > 0:
        print('    found elements')

    rows = [find_one_element_details(element) for element in gallery_elements]
    return pd.DataFrame(rows)

def download_image(img_url, file_name = None, rep_tup = ("superImageResolution","highImageResolution")):
    """
    return ``img_url`` with the resolution keyword swapped

    img_url: image url to rewrite
    file_name: unused; kept so existing calls keep working
    rep_tup: (old, new) pair passed to str.replace; pass a falsy value to
        return the url unchanged

    NOTE(review): a second function named download_image is defined further
    down in this file and replaces this one once that definition runs.
    """
    # str.replace returns a new string, so the result has to be assigned
    if rep_tup:
        img_url = img_url.replace(*rep_tup)

    return img_url

def download_image(filename, url):

    """ requests image from given url and saves it in original quality as jpeg in RGB format

    filename: local filepath to save the image to
    url: url to request image from

    Nothing is downloaded when ``filename`` already exists. Every failure
    (download, parsing, RGB conversion, saving) is logged as a warning and the
    function returns None instead of raising, so a batch download keeps going.
    """

    if os.path.exists(filename):
        logging.info('Image %s already exists. Skipping download.', filename)
        return

    # ``except Exception`` (not a bare except) so Ctrl-C / SystemExit still
    # stop a long-running batch; the ``with`` closes the HTTP response.
    try:
        with urllib.request.urlopen(url) as response:
            raw_bytes = response.read()
    except Exception:
        logging.warning('Warning: Could not download image %s from %s', filename, url)
        return

    try:
        pil_image = Image.open(BytesIO(raw_bytes))
    except Exception:
        logging.warning('Warning: Failed to parse image %s', filename)
        return

    try:
        pil_image_rgb = pil_image.convert('RGB')
    except Exception:
        logging.warning('Warning: Failed to convert image %s to RGB', filename)
        return

    try:
        pil_image_rgb.save(filename, format='JPEG')  # , quality=95
    except Exception:
        logging.warning('Warning: Failed to save image %s', filename)
        # the file did not exist when we started; remove a partially written
        # one so a later run does not skip it as "already exists"
        if os.path.exists(filename):
            try:
                os.remove(filename)
            except OSError:
                pass
        return

        
def download_images(file_dict, destination_folder = ''):

    """loops through a dictionary of files to download. includes logging

    file_dict: should be in the format {url: file_subpath}
    destination folder: local path to directory to save all of the images to

    Progress is printed and logged every 100 images; each image is handed to
    download_image(filepath, url).
    """
    # make the destination path if necessary; os.makedirs('') raises, so the
    # default empty folder (current directory) is skipped
    if destination_folder:
        os.makedirs(destination_folder, exist_ok=True)

    # NOTE(review): logging_funcs is not imported anywhere in the visible
    # file; it has to be made available before this function is called.
    logging_funcs.start_logger(destination_folder, logger_fname='image_downloader.log')

    total_num_images = len(file_dict)
    print('started download of {} images to {}'.format(total_num_images,
                                                       destination_folder))

    #loop to download from each url
    for i, (url, file_subpath) in enumerate(file_dict.items()):

        #print intermittent milestones to console & log
        if i % 100 == 0:
            percent_complete = i/total_num_images
            logging.info('Info: {:0.0%} complete'.format(percent_complete))
            print('currently processing image {} of {} ({:0.1%} complete)'.format(i, total_num_images, percent_complete))
            logging.info('Info: processing {} ({} of {})'.format(file_subpath, i+1, total_num_images))

        #make subfolders in file path if doesn't exist
        # NOTE(review): when file_subpath has no directory part a 'misc'
        # folder is created, but the image itself is still saved directly
        # under destination_folder -- confirm this is intended.
        subfolders = os.path.dirname(file_subpath)
        if len(subfolders) == 0:
            subfolders = 'misc'
        subfolders_path = os.path.join(destination_folder, subfolders)

        if not os.path.exists(subfolders_path):
            # exist_ok guards against the folder appearing between the check
            # above and the creation
            os.makedirs(subfolders_path, exist_ok=True)
            logging.info('Info: created subfolder directory %s ' % subfolders_path)

        # download image
            # create full filepath
        filepath = os.path.join(destination_folder, file_subpath)
        download_image(filepath, url)

    logging.shutdown()
    print('finished download')

    return

In [73]:
def process_one_page_html(raw_html, csv_path, request_counter):
    """
    parse one results page and append its gallery elements to a csv file

    raw_html: html source of the results page
    csv_path: path of the csv file the rows are appended to
    request_counter: number of the page being processed; stored in the
        'results_page' column, and the header row is written only when it
        is <= 1
    """
    soup = BeautifulSoup(raw_html)
    df_page = find_one_page_elements(soup)
    df_page['results_page'] = request_counter

    # only the first page writes the column names; later pages just add rows
    include_header = request_counter <= 1

    # write page to the file handed in by the caller (the csv_path argument
    # used to be ignored in favour of the module-level ``fpath``)
    df_page.to_csv(csv_path, mode='a', header=include_header)

    return

In [74]:
def navigate_to_next_page(driver):
    """
    click the "next page" button if there is one that can be clicked

    looks up the elements with class "nextBtn" through ``driver`` and clicks
    the first one when it is enabled.

    returns True when the button was clicked, False otherwise (no button
    found, or the button is disabled).
    """
    next_button_lst = driver.find_elements_by_class_name("nextBtn")
    if not next_button_lst:
        return False

    next_button = next_button_lst[0]
    if not next_button.is_enabled():
        # explicit False instead of falling off the end and returning None
        return False

    next_button.click()
    return True

In [76]:
# place to save results
data_dir = os.path.normpath(os.path.join(os.getcwd(), '..','data','raw','scraped'))
fname = 'graphik_portal_results.csv'
fpath = os.path.join(data_dir,fname)
# the csv is written with to_csv(), which does not create missing folders
os.makedirs(data_dir, exist_ok=True)

# state of the scraping loop: number of pages processed so far, and whether
# a further page can be reached through the "next" button
request_counter = 0
next_clickable = True
# NOTE(review): the second assignment overrides the first; it is the same
# gallery url with "/5901" appended -- presumably an offset used to resume an
# earlier run, confirm before reusing.
first_url = "https://www.graphikportal.org/gallery/encoded/eJzjYBKS5GJLzMmJT0kVYk4tyZBidvRzUWIuycnWYhCSgUuxVZUWZSajyqpxcWfm5JQWlxQllqSmCCFzkNUBANijGqs*"
first_url = "https://www.graphikportal.org/gallery/encoded/eJzjYBKS5GJLzMmJT0kVYk4tyZBidvRzUWIuycnWYhCSgUuxVZUWZSajyqpxcWfm5JQWlxQllqSmCCFzkNUBANijGqs*/5901"
# get first page
driver.get(first_url)

In [None]:
# start loop
# Scrape the page currently shown in the browser, then click "next";
# repeat until navigate_to_next_page() returns a falsy value.

while next_clickable:
    
    request_counter +=1
    print('current request {}'.format(request_counter))
    
    # pause 3-5 seconds before reading the page source
    # (presumably to let the page finish loading -- confirm)
    sleep(random.randint(3,5))

    # parse the current page and append its rows to the csv at fpath
    page_html = driver.page_source
    process_one_page_html(page_html, fpath, request_counter)

    # wait to not exceed throttle limits
    # (random delay of at least 3 and below 8 seconds)
    sleep_time = random.randint(3,5) + (3*random.random())
    sleep(sleep_time)
    
    # click the "next" button; the result decides whether the loop continues
    next_clickable = navigate_to_next_page(driver)

current request 1
    found elements
current request 2
    found elements
current request 3
    found elements
current request 4
    found elements
current request 5
    found elements
current request 6
    found elements
current request 7
    found elements
current request 8
    found elements
current request 9
    found elements
current request 10
    found elements
current request 11
    found elements
current request 12
    found elements
current request 13
    found elements
current request 14
    found elements
current request 15
    found elements
current request 16
    found elements
current request 17
    found elements
current request 18
    found elements
current request 19
    found elements
current request 20
    found elements
current request 21
    found elements
current request 22
    found elements
current request 23
    found elements
current request 24
    found elements
current request 25
    found elements
current request 26
    found elements
current request 27
  