# Painting reconciliation

From all our illustrations, we want to find metadata about them. What is the name of the author ? The title of the artwork ? Its dimensions ? 

For that, we will query [Smartify](https://smartify.org/fr), an API which recognizes artworks.

In [2]:
# Some imports
from selenium.webdriver.common.by import By
import pyautogui
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
import os
import time
import splitfolders
import glob
import random
import shutil
from tqdm import tqdm
import pandas as pd

To run this notebook, you will just need to specify where all the illustrations are, as well a a Webdriver for the scraping.

In [1]:
ILLU_FOLDER = "illustrations/"
CHROME_DRIVER_PATH = './chromedriver.exe'

We will divide our illustrations in 10 batches, just so that it is easier to process.

In [3]:
N_BATCHES = 10

def divide_in_batches():
    all_illus = glob.glob(ILLU_FOLDER + "*.jpg")
    batch_len = int(len(all_illus)/N_BATCHES)
    random.seed(0)
    random.shuffle(all_illus)
    
    for i in tqdm(range(N_BATCHES)):
        for illu in all_illus[i*batch_len:(i+1)*batch_len]:
            dest = './batch' + str(i) + '/'
            shutil.copy(illu, dest)

divide_in_batches()

Now a handy function for scraping, that retrieve the painting information from the painting page in Smartify

In [16]:
# a function to retrieve - from the page - the painting information
def get_painting_info(driver, illu_id):
    title = driver.find_element(by=By.XPATH, value="/html/body/div/div/div/div/div/div/div[4]/div/div/h1").get_attribute("innerHTML")
    specs = driver.find_element(by=By.XPATH, value="/html/body/div/div/div/div/div/div/div[4]/div/div/p").get_attribute("innerHTML")
    author = driver.find_element(by=By.XPATH, value="/html/body/div/div/div/div/div/div/div[4]/a/div/h3").get_attribute("innerHTML")
    try:
        link = driver.find_element(by=By.XPATH, value="/html/body/div/div/div/div/div/div/div[4]/div[3]/a").get_attribute("href")
    except:
        link = ''
    return dict({'id': illu_id, 'title': title, 'specs': specs, 'author': author, 'link': link})

And here we query the API for all our illustrations.

In [145]:
# All the work of querying and scraping the website is done here
def retrieve_for_batch(batch):
    # The webdriver
    driver = webdriver.Chrome(CHROME_DRIVER_PATH)
    # We go to the website
    driver.get("https://smartify.org/")
    time.sleep(1)
    
    # List of all our illustrations
    all_illus = glob.glob(batch + "/*.jpg")
    # The consecutive steps to find in the file explorer where the illustrations are
    paths = ['documents', 'DFKV', 'DFKV-illustrations', '5_illustration_enrichment', batch, all_illus[0].split('\\')[-1].split('.')[0]]
    data = []
    # First illustration
    previous_url = driver.current_url
    # Find file selector element
    button = driver.find_element(by=By.XPATH, value="/html/body/div/div/div/div/div/header/div/section/div[2]/div[1]/div/div/button")
    button.click() # This opens the windows file selector
    time.sleep(1)
    
    # Look for our illustration
    for step in paths:
        time.sleep(1)
        pyautogui.write(step)
        pyautogui.press('enter')  # Send it to the API
    time.sleep(1)
    
    # If the API found something, then the page for the artwork is loaded
    if previous_url != driver.current_url:
        data.append(get_painting_info(illu_name))
    
    # Do it again for all the illustrations
    for illu in tqdm(all_illus[1:]):
        # Save current url to compare it later
        previous_url = driver.current_url
        # Name of the illustration
        illu_name = illu.split('\\')[-1].split('.')[0]
        # Find element for file selector
        button = driver.find_element(by=By.XPATH, value="/html/body/div/div/div/div/div/header/div/section/div[2]/div[1]/div/div/button")
        button.click() # This opens the windows file selector
        time.sleep(1)
        # Send our illustration to the API
        pyautogui.write(illu_name) 
        pyautogui.press('enter')
        time.sleep(2)
        # Check if API found something
        if previous_url != driver.current_url:
            # Get illustration's info
            data.append(get_painting_info(driver, illu_name))
            df = pd.DataFrame(data)
            df.to_csv(batch + '.csv') # intermediate saves
    # Save data for batch
    df = pd.DataFrame(data)
    df.to_csv(batch + '.csv')
    return df

In [146]:
# We do that for all the batches
df = retrieve_for_batch('batch9')

  driver = webdriver.Chrome('./chromedriver.exe')
100%|██████████| 454/454 [24:56<00:00,  3.30s/it]
