# Reverse Google Image Scrap

With the goal to find even more metadata about some of our illustrations, we will use Reverse Google Image to gather more data.

In [183]:
# Some imports
from selenium.webdriver.common.by import By
from selenium import webdriver
import os
import time
import glob
import pandas as pd
from bs4 import BeautifulSoup
import pyautogui
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import re 
import random
from tqdm import tqdm
import requests

To run the notebook, you will need a Webdriver

In [None]:
WEB_DRIVER_PATH = './chromedriver.exe'

We uploaded our image on the web, so that they are accessible from a url. They are in the `illus_urls.txt` file.

In [124]:
# Open the urls document
with open("data/illus_urls.txt", "r") as f:
    urls_file = f.readlines()

In [125]:
# Parsing the urls
urls = re.sub("[\(\[].*?[\)\]]", "", urls_file[0]).split(' ')

When performing the reverse search, we will particularly look if there is a link to one of the following source : artnet.fr, musee-orsay.fr or commons.wikimedia.org. They have nice formatted data about paintings, and the information are reliable and easy to extract.

In [126]:
trusted_sources = ['artnet.fr', 'musee-orsay.fr','commons.wikimedia.org']

Let's find the links we are looking for.

In [129]:
# Webdriver
driver = webdriver.Chrome(executable_path=WEB_DRIVER_PATH)
driver.get("https://images.google.com/")
time.sleep(3)
# Here you need to manually accept cookies

data = []
for im_url in tqdm(urls): # For all the illustrations
    upload = driver.find_element(By.CLASS_NAME, "ZaFQO") # Upload button
    upload.click()

    # Actually upload the image
    actions = ActionChains(driver)
    actions.send_keys(im_url)
    actions.perform()
    actions.send_keys(Keys.RETURN)
    actions.perform()

    time.sleep(1)
    
    # Checl for all returned linked
    links = driver.find_elements(By.CLASS_NAME,"yuRUbf")
    for l in links:
        href = l.find_element(by=By.XPATH, value="a").get_attribute("href")
        for ts in trusted_sources: # Add trusted sources only
            if ts in href:
                data.append({'ID' : im_url.split('/')[-1].split('.')[0].replace('-', '_'),
                'link' : href})
                
pd.DataFrame(data).to_csv('data/additional_links.csv', index = False) # Save dataframe
print(len(data))

  driver = webdriver.Chrome(executable_path='./chromedriver.exe')
100%|██████████| 8939/8939 [8:56:17<00:00,  3.60s/it]   

296





In [131]:
gg_links = pd.DataFrame(data)

Now we look at how many of the different sources we found in the links

In [133]:
# Function to determine the source
def identify_source(link):
    for ts in trusted_sources:
        if ts in link:
            return ts
    return ''

In [135]:
# Find source for each image
gg_links['source'] = gg_links.apply(lambda row: identify_source(row['link']), axis=1)

In [139]:
gg_links.groupby('source').count()

Unnamed: 0_level_0,ID,link
source,Unnamed: 1_level_1,Unnamed: 2_level_1
artnet.fr,118,118
commons.wikimedia.org,145,145
musee-orsay.fr,33,33


# Wikimedia

Now, for each of the source we find back the information about the painting that are available. We start with Wikimedia commons data. The quality and quantity of information available here is very different from one illustration to another.

In [368]:
# Function to get data from wikimedia
def get_painting_infos_wm(illu_id, fields):
    painting_infos = dict()
    painting_infos['ID'] = illu_id
    try:
        # Find artist
        c = driver.find_element(By.ID,"creator")
        artist = c.find_element(by=By.XPATH, value="bdi/a/span").get_attribute("innerHTML")
        painting_infos['Author'] = artist
    except:
        pass

    for field in fields:
        # Find category
        if "fileinfotpl_art_object_type" in str(field):
            painting_infos['Category'] = field.find_all("a", {"class": "extiw"})[0].text
        # Find date
        if "fileinfotpl_date" in str(field):
            painting_infos['Date'] = field.find_all("td")[1].text
        # Find Technique
        if "fileinfotpl_art_medium" in str(field):
            if len(field.find_all("a", {"class": "extiw"})) > 0:
                painting_infos['Technique'] = field.find_all("a", {"class": "extiw"})[0].text
            else:
                painting_infos['Technique'] = field.find_all("td")[1].text
        # Find dimensions
        if "fileinfotpl_art_dimensions" in str(field):
            painting_infos['Dimensions'] = field.find_all("td")[1].text
    return painting_infos

In [369]:
# Gather information from wikimedia
data = []
driver = webdriver.Chrome(executable_path='./chromedriver.exe')
# Illustrations that have metadata from wikimedia
for illu in tqdm(gg_links[gg_links['source']=="commons.wikimedia.org"].iterrows()):
    im_file = illu[1]['link']
    im_ID = illu[1]['ID']
    if im_file.split('.')[-1] == "jpg":
        # Open wikimedia page link
        driver.get(im_file)
        time.sleep(1)
        table = driver.find_element(By.CSS_SELECTOR, ".hproduct.commons-file-information-table")
        soup = BeautifulSoup(table.find_element(by=By.XPATH, value="table/tbody").get_attribute("innerHTML"))
        # Get information fields
        fields = [s for s in soup.find_all("tr") if "fileinfo-paramfield" in str(s)]
        data.append(get_painting_infos_wm_ID, fields)

  driver = webdriver.Chrome(executable_path='./chromedriver.exe')
145it [02:09,  1.12it/s]


In [375]:
# And save dataframe
pd.DataFrame(data).to_csv('data/commons_data.csv', index = False)

# Artnet

Then with artnet we can find the author, title, date technique and dimensions of the piantings, if available

In [423]:
# Handy function to gather paintings information from their Artnet html page
def get_painting_infos_an(illu_id):
    p_infos = dict()
    p_infos['ID'] = illu_id
    controller = "ucBasicDetailsControl"
    try:
        p_infos['Author'] = driver.find_element(By.ID, "ctl00_mainContentPlaceHolder_ucArtworkArea_repArtworkDetails_ctl00_" + controller + "_artistName").find_element(By.XPATH, "a").get_attribute("innerHTML")
    except: 
        controller = "ucArtworkDetailsControl"
        p_infos['Author'] = driver.find_element(By.ID, "ctl00_mainContentPlaceHolder_ucArtworkArea_repArtworkDetails_ctl00_" + controller + "_artistName").find_element(By.XPATH, "a").get_attribute("innerHTML")
    p_infos['Title'] = driver.find_element(By.ID, "ctl00_mainContentPlaceHolder_ucArtworkArea_repArtworkDetails_ctl00_" + controller + "_artworkTitle").find_element(By.XPATH, "i").get_attribute("innerHTML")
    p_infos['Date'] = driver.find_element(By.ID, "ctl00_mainContentPlaceHolder_ucArtworkArea_repArtworkDetails_ctl00_" + controller + "_artworkYear").get_attribute("innerHTML")
    p_infos['Technique'] = driver.find_element(By.ID, "ctl00_mainContentPlaceHolder_ucArtworkArea_repArtworkDetails_ctl00_" + controller + "_sMedium").get_attribute("innerHTML")
    try:
        p_infos['Dimensions'] = driver.find_element(By.ID, "ctl00_mainContentPlaceHolder_ucArtworkArea_repArtworkDetails_ctl00_" + controller + "_sSize").get_attribute("innerHTML")
    except:
        p_infos['Dimensions'] = ""
    return p_infos

In [414]:
driver = webdriver.Chrome(executable_path='./chromedriver.exe')

  driver = webdriver.Chrome(executable_path='./chromedriver.exe')


In [424]:
data = []
# Illustrations that have metadata from Artnet
for illu in tqdm(gg_links[gg_links['source']=="artnet.fr"].iterrows()):
    im_link = illu[1]['link']
    if im_link.split('/')[-1] != '' and len(im_link.split('/')[-1]) > 2:
        driver.get(im_link)
        time.sleep(1)
        data.append(get_painting_infos_an(illu[1]['ID']))

118it [01:10,  1.67it/s]


In [429]:
df_an = pd.DataFrame(data)
df_an['Date'] = df_an['Date'].str.replace(',', '').str.strip()
df_an.to_csv('data/artnet_data.csv', index=False)

# Orsay

And finally from the Orsay data we collect metadata for even more illustrations. For them, as there are not too many, we manually add the data from the following list of urls.

In [471]:
data = []

In [461]:
# Print list of paintings
orsay = gg_links[gg_links['source']=="musee-orsay.fr"]
list(orsay[orsay['link'].str.contains('oeuvres')]['link'])

['https://www.musee-orsay.fr/fr/oeuvres/le-champ-de-courses-jockeys-amateurs-pres-dune-voiture-1148',
 'https://www.musee-orsay.fr/fr/oeuvres/cheval-blesse-66953',
 'https://www.musee-orsay.fr/fr/oeuvres/la-maison-du-pendu-auvers-sur-oise-1476',
 'https://www.musee-orsay.fr/fr/oeuvres/le-dejeuner-sur-lherbe-25651',
 'https://www.musee-orsay.fr/fr/oeuvres/le-salon-aux-trois-lampes-rue-saint-florentin-109909',
 'https://www.musee-orsay.fr/fr/oeuvres/le-grand-jardin-775',
 'https://www.musee-orsay.fr/fr/oeuvres/portrait-de-madame-auguste-perret-75969',
 'https://www.musee-orsay.fr/fr/oeuvres/le-golfe-de-marseille-vu-de-lestaque-1309',
 'https://www.musee-orsay.fr/fr/oeuvres/restaurant-de-la-machine-bougival-8048',
 'https://www.musee-orsay.fr/fr/oeuvres/emile-zola-713',
 'https://www.musee-orsay.fr/fr/oeuvres/la-table-de-toilette-9074',
 'https://www.musee-orsay.fr/fr/oeuvres/premier-projet-pour-la-facade-du-theatre-des-champs-elysees-58021',
 'https://www.musee-orsay.fr/fr/oeuvres/fleurs

In [538]:
# Append manually the informations
data.append({'ID': "ILLU_15687_37_0",
            'Author': "Gustave Courbet",
            'Title': "Branche de pommier en fleurs",
            'Date': "1872",
            'Technique': "Huile sur toile",
            'Dimensions': "H. 32,2 ; L. 41,0 cm",
            'Category' : "Reproduction"})

In [546]:
pd.DataFrame(data).to_csv('data/orsay_data.csv', index=False)