In [1]:
import re
import pandas as pd

In [2]:
# WikiArt "most viewed paintings" JSON endpoint. The query string pins
# randomSeed=123 and json=2 (their exact server-side meaning is not visible
# here -- presumably a fixed ordering and the JSON response format).
TOP_IMAGES_ENDPOINT = (
    'https://www.wikiart.org/en/App/Painting/MostViewedPaintings'
    '?randomSeed=123&json=2'
)

In [3]:
# Download the endpoint's JSON payload into a DataFrame, then preview the
# first three rows as the cell output.
images = pd.read_json(path_or_buf=TOP_IMAGES_ENDPOINT)
images.head(n=3)

Unnamed: 0,artistContentId,artistName,completitionYear,contentId,height,image,title,width,yearAsString
0,225091,Leonardo da Vinci,1504.0,225189,4289,https://uploads7.wikiart.org/images/leonardo-d...,Mona Lisa,2835,1504.0
1,204915,Vincent van Gogh,1889.0,207190,599,https://uploads2.wikiart.org/00125/images/vinc...,The Starry Night,757,1889.0
2,220740,Salvador Dali,1931.0,221654,400,https://uploads.wikiart.org/Content/images/FRA...,The Persistence of Memory,480,1931.0


In [4]:
print('Total images:', len(images.index))

# Drop rows whose image URL contains the literal text 'FRAME' (reported below
# as "empty frames"). `~mask` replaces the `mask == False` comparison;
# na=True makes rows with a missing URL count as matches so they are dropped
# too, which is what the original comparison did implicitly.
images = images[~images.image.str.contains('FRAME', regex=False, na=True)]
print('After filtering empty frames:', len(images.index))

# Keep only images that are at least as wide as they are tall.
# .copy() detaches the result from the frame it was sliced from, so the
# column assignments in later cells do not raise SettingWithCopyWarning.
images = images[images.width >= images.height].copy()
print('After filtering vertical images:', len(images.index))

images.head(3)

Total images: 600
After filtering empty frames: 423
After filtering vertical images: 199


Unnamed: 0,artistContentId,artistName,completitionYear,contentId,height,image,title,width,yearAsString
1,204915,Vincent van Gogh,1889.0,207190,599,https://uploads2.wikiart.org/00125/images/vinc...,The Starry Night,757,1889.0
3,230332,Henri de Toulouse-Lautrec,1892.0,230453,815,https://uploads8.wikiart.org/images/henri-de-t...,In Bed The Kiss,1094,1892.0
4,188828,Sandro Botticelli,1485.0,189114,1067,https://uploads6.wikiart.org/images/sandro-bot...,The Birth of Venus,1600,1485.0


In [5]:
# Captures (as group 'image_id') the text that follows '/images/' in an image
# URL, stopping before an optional '(N)' counter and the first '.jpg'.
# The dot is escaped so that only a literal '.jpg' terminates the capture;
# unescaped, any character followed by 'jpg' (e.g. '-jpg') cut the id short.
image_id_re = re.compile(r'(\/images\/)(?P<image_id>.*?)((\(\d+\))?\.jpg)')

# TODO, HACK: Contact wikiarts team to get an API method for this...
# Overrides for paintings whose derived details URL differs from the URL that
# should be used: key = URL as built by get_details_url, value = replacement.
special_cases = {
    'https://wikiart.org/en/vincent-van-gogh/the-starry-night': 'https://www.wikiart.org/en/vincent-van-gogh/the-starry-night-1889',
    'https://wikiart.org/en/gustav-klimt/portrait-of-adele-bloch-bauer-i': 'https://www.wikiart.org/en/gustav-klimt/portrait-of-adele-bloch-bauer-i-1907-1',
    'https://wikiart.org/en/pieter-bruegel-the-elder/fight-between-carnival-and-lent-1559': 'https://www.wikiart.org/en/pieter-bruegel-the-elder/the-fight-between-carnival-and-lent-1559-1',
    'https://wikiart.org/en/pieter-bruegel-the-elder/the-triumph-of-death': 'https://www.wikiart.org/en/pieter-bruegel-the-elder/the-triumph-of-death-1562-1',
    'https://wikiart.org/en/edouard-manet/music-in-the-tuileries-gardens-1862': 'https://www.wikiart.org/en/edouard-manet/music-in-the-tuileries-garden-1862-1',
    'https://wikiart.org/en/gustav-klimt/the-beethoven-frieze-the-hostile-powers-far-wall': 'https://www.wikiart.org/en/gustav-klimt/the-beethoven-frieze-the-hostile-powers-far-wall-1902-1',
    'https://wikiart.org/en/gustav-klimt/the-beethoven-frieze-the-longing-for-happiness-finds-repose-in-poetry-right-wall-1': 'https://www.wikiart.org/en/gustav-klimt/the-beethoven-frieze-the-longing-for-happiness-finds-repose-in-poetry-right-wall-1902-1',
    'https://wikiart.org/en/honore-daumier/gargantua': 'https://www.wikiart.org/en/honore-daumier/gargantua-1831',
    'https://wikiart.org/en/francois-boucher/triumph-of-venus-1740': 'https://www.wikiart.org/en/francois-boucher/the-birth-and-triumph-of-venus-1740-1',
    'https://wikiart.org/en/francisco-goya/execution-of-the-defenders-of-madrid-3rd-may-1808-1814': 'https://www.wikiart.org/en/francisco-goya/the-third-of-may-1808-execution-of-the-defenders-of-madrid-1814-1',
    'https://wikiart.org/en/giotto/lamentation-the-mourning-of-christ': 'https://www.wikiart.org/en/giotto/lamentation-the-mourning-of-christ-1306-1',
}


def get_details_url(image_url):
    """Build the details-page URL for a painting from its image URL.

    The image id is extracted with ``image_id_re`` and appended to
    ``https://wikiart.org/en/``. If the resulting URL is a key of
    ``special_cases``, the mapped replacement is returned instead.

    Args:
        image_url: Image URL containing ``/images/<id>.jpg`` (an optional
            ``(N)`` counter before ``.jpg`` is ignored).

    Returns:
        The details URL as a string.

    Raises:
        ValueError: If ``image_url`` does not match the expected pattern
            (previously this surfaced as an AttributeError on ``None``).
    """
    match = image_id_re.search(image_url)
    if match is None:
        raise ValueError(
            'Cannot extract image id from URL: {!r}'.format(image_url))
    details_url = 'https://wikiart.org/en/{}'.format(match.group('image_id'))
    # Fall back to the derived URL when no manual override exists.
    return special_cases.get(details_url, details_url)

# Derive one details-page URL per row from that row's image URL.
images['details'] = images['image'].apply(get_details_url)

In [6]:
# Drop unneeded columns
# `columns=` is used instead of passing the axis positionally: a positional
# axis was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on.
# Rebinding the result (no inplace=True) also avoids mutating a filtered slice.
images = images.drop(columns=['yearAsString', 'height', 'width'])
# completitionYear gets handled as float with NaNs; missing years become -1
# so the column can be stored as plain integers.
images['completitionYear'] = images['completitionYear'].fillna(-1).astype(int)

images.head(3)

Unnamed: 0,artistContentId,artistName,completitionYear,contentId,image,title,details
1,204915,Vincent van Gogh,1889,207190,https://uploads2.wikiart.org/00125/images/vinc...,The Starry Night,https://www.wikiart.org/en/vincent-van-gogh/th...
3,230332,Henri de Toulouse-Lautrec,1892,230453,https://uploads8.wikiart.org/images/henri-de-t...,In Bed The Kiss,https://wikiart.org/en/henri-de-toulouse-lautr...
4,188828,Sandro Botticelli,1485,189114,https://uploads6.wikiart.org/images/sandro-bot...,The Birth of Venus,https://wikiart.org/en/sandro-botticelli/the-b...


In [7]:
OUT_FILE = '../server/data/images.csv'
# Remove the literal text '!Large.jpg' from every image URL. regex=False is
# explicit because the default differs between pandas versions; as a regex
# the '.' would match any character.
images['image'] = images['image'].str.replace('!Large.jpg', '', regex=False)
# Write all remaining columns as comma-separated UTF-8 without the index.
images.to_csv(OUT_FILE, sep=',', encoding='utf-8', index=False)
print('Exportet {} rows'.format(len(images.index)))

Exportet 199 rows


In [8]:
# Check if details urls are valid
# NOTE: the check is wrapped in a bare triple-quoted string, so as written it
# is an inert string expression and sends no requests when this cell runs.
# With the string delimiters removed it would open every URL in
# images.details with urlopen, print the running index every 20 URLs as a
# progress marker, and print the exception together with the URL for every
# request that raises.
# NOTE(review): the response returned by urlopen is never closed -- consider
# a `with` block if this check is re-enabled.
"""
from urllib.request import urlopen

for idx, details_url in enumerate(images.details):
    if idx % 20 == 0:
        print(str(idx))
    try:
        page = urlopen(details_url)
    except Exception as e:
        print(e, details_url)
"""

0
20
40
60
80
100
120
140
160
180
