In [1]:
from bs4 import BeautifulSoup
import os
import requests

In [2]:
## Configuration: output directory and URL templates for both artist sites.
DATA_DIR = 'data/'

## Henri Matisse: listing pages are addressed by section name; image paths
## scraped from those pages are relative to the site root.
HENRI_ARTIST_URL = 'http://www.henri-matisse.net/paintingssection{section_num}.html'
HENRI_PAINTING_URL = 'http://www.henri-matisse.net/{painting_source}'

## Pablo Picasso: listing pages are addressed by page name (without ".php");
## image paths scraped from those pages are relative to the site root.
PABLO_URL = 'https://www.pablo-ruiz-picasso.net/{where_to_scrape}.php'
PABLO_PAINTING_URL = 'https://www.pablo-ruiz-picasso.net/{painting_source}'

## Make sure the data directory exists. exist_ok avoids the check-then-create
## race and keeps this cell safe to re-run.
os.makedirs(DATA_DIR, exist_ok=True)

In [3]:
# getting all of the image urls
def scrape_arts_henri(sec_num, timeout=30):
    """Collect thumbnail image paths for Matisse paintings dated 1900-1942.

    Args:
        sec_num: Section name substituted into HENRI_ARTIST_URL
            (the notebook calls this with 'one', 'two' and 'three').
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        A list with the ``src`` attribute of the ``<img>`` inside every
        thumbnail div whose caption ends in a four-digit year between 1900
        and 1942 (inclusive). Entries whose year cannot be parsed, or that
        have no image source, are skipped.

    Raises:
        requests.exceptions.HTTPError: if the section page answers with an
            error status.
    """
    url_query = HENRI_ARTIST_URL.format(section_num=sec_num)
    artist_page = requests.get(url_query, timeout=timeout)

    # check for request error
    try:
        artist_page.raise_for_status()
    except requests.exceptions.HTTPError:
        print("Error trying to retrieve {}".format(artist_page.url))
        raise

    soup = BeautifulSoup(artist_page.text, 'lxml')

    painting_paths = []
    ## filtering part, getting year between 1900-1942
    for thumb in soup.find_all('div', {'class': 'thmbnlspaintingselectbis'}):
        # The year is on the last line of the caption; fall back to the text
        # of the whole div when there is no <p> element.
        caption = thumb.find('p')
        caption_text = (caption if caption is not None else thumb).getText()
        last_line = caption_text.split('\n')[-1].strip()

        # For a range such as "1905-1906" keep the final year.
        year_text = last_line.split('-')[-1]
        if len(year_text) != 4:
            continue
        try:
            year = int(year_text)
        except ValueError:
            # Not a number: skip it. Comparing a str placeholder against ints
            # would raise TypeError on Python 3.
            continue

        if 1900 <= year <= 1942:
            img = thumb.find('img')
            src = img.get('src') if img is not None else None
            if src:
                painting_paths.append(src)
    return painting_paths

In [4]:
# Scrape the three section pages in order and flatten the per-section
# results into a single list of image paths.
HENRI_ARTS = [
    painting_path
    for section_name in ('one', 'two', 'three')
    for painting_path in scrape_arts_henri(section_name)
]
HENRI_ARTS

['paintings/thmbnls150/bb.jpg',
 'paintings/thmbnls150/ar.jpg',
 'paintings/thmbnls150/ba.jpg',
 'paintings/orient_99_thmb.jpg',
 'paintings/thmbnls150/glimpse.jpg',
 'paintings/thmbnls150/bd.jpg',
 'paintings/thmbnls150/bc.jpg',
 'paintings/thmbnls150/bh.jpg',
 'paintings/thmbnls150/cs.jpg',
 'paintings/orient_107_thmb.jpg',
 'paintings/thmbnls150/bj.jpg',
 'paintings/thmbnls150/dr.jpg',
 'paintings/thmbnls150/bg.jpg',
 'paintings/thmbnls150/bf.jpg',
 'paintings/thmbnls150/zd.jpg',
 'paintings/thmbnls150/bi.jpg',
 'paintings/thmbnls150/hma308.jpg',
 'paintings/thmbnls150/zq.jpg',
 'paintings/thmbnls150/by.jpg',
 'paintings/orient_109_thmb.jpg',
 'paintings/orient_108_thmb.jpg',
 'paintings/thmbnls150/gk.jpg',
 'paintings/thmbnls150/zc.jpg',
 'paintings/thmbnls150/zb.jpg',
 'paintings/thmbnls150/ds.jpg',
 'paintings/orient_113_thmb.jpg',
 'paintings/thmbnls150/self.jpg',
 'paintings/orient_111_thmb.jpg',
 'paintings/thmbnls150/bu.jpg',
 'paintings/thmbnls150/du.jpg',
 'paintings/thmbnl

In [5]:
# getting all of the image urls
def scrape_arts_pablo(here_to_scrape, timeout=30):
    """Collect thumbnail image paths for Picasso works dated 1900-1942.

    Args:
        here_to_scrape: Page name substituted into PABLO_URL (the notebook
            uses 'topviews', 'topshared' and 'topexpensive').
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        A list with the ``src`` attribute of the ``<img>`` inside every
        matching div whose text ends (after the last comma) in a year between
        1900 and 1942 inclusive. Entries without a parseable year or without
        an image source are skipped.

    Raises:
        requests.exceptions.HTTPError: if the listing page answers with an
            error status.
    """
    PABLO_WEB = PABLO_URL.format(where_to_scrape=here_to_scrape)
    print(PABLO_WEB)
    artist_page = requests.get(PABLO_WEB, timeout=timeout)
    try:
        artist_page.raise_for_status()
    except requests.exceptions.HTTPError:
        print("Error trying to retrieve {}".format(artist_page.url))
        raise

    soup = BeautifulSoup(artist_page.text, 'lxml')

    # Attribute filter functions receive None for tags that lack the
    # attribute, so guard before the substring test.
    pics = soup.find('div', {'id': 'main'}).find_all(
        'div', style=lambda value: value is not None and 'width' in value)

    painting_paths = []
    ## filtering part, getting year between 1900-1942
    for pic in pics:
        # The year follows the last comma; for a range keep the final year.
        year_text = pic.text.split(',')[-1].strip().split('-')[-1]
        try:
            year = int(year_text)
        except ValueError:
            # Caption does not end in a year: skip this entry rather than
            # abort the whole scrape.
            continue

        if 1900 <= year <= 1942:
            img = pic.find('img')
            src = img.get('src') if img is not None else None
            if src:
                painting_paths.append(src)
    return painting_paths

In [6]:
# Scrape the three ranking pages in order and concatenate their image paths.
PABLO_ARTS = [
    painting_path
    for page_name in ('topviews', 'topshared', 'topexpensive')
    for painting_path in scrape_arts_pablo(page_name)
]
PABLO_ARTS

https://www.pablo-ruiz-picasso.net/topviews.php
https://www.pablo-ruiz-picasso.net/topshared.php
https://www.pablo-ruiz-picasso.net/topexpensive.php


['images/works/57_s.jpg',
 'images/works/2_s.jpg',
 'images/works/117_s.jpg',
 'images/works/3437_s.jpg',
 'images/works/56_s.jpg',
 'images/works/122_s.jpg',
 'images/works/40_s.jpg',
 'images/works/18_s.jpg',
 'images/works/1513_s.jpg',
 'images/works/3570_s.jpg',
 'images/works/145_s.jpg',
 'images/works/170_s.jpg',
 'images/works/4000_s.jpg',
 'images/works/1459_s.jpg',
 'images/works/151_s.jpg',
 'images/works/85_s.jpg',
 'images/works/1885_s.jpg',
 'images/works/3040_s.jpg',
 'images/works/17_s.jpg',
 'images/works/97_s.jpg',
 'images/works/102_s.jpg',
 'images/works/44_s.jpg',
 'images/works/128_s.jpg',
 'images/works/3501_s.jpg',
 'images/works/1958_s.jpg',
 'images/works/88_s.jpg',
 'images/works/31_s.jpg',
 'images/works/3960_s.jpg',
 'images/works/61_s.jpg',
 'images/works/263_s.jpg',
 'images/works/3545_s.jpg',
 'images/works/2033_s.jpg',
 'images/works/158_s.jpg',
 'images/works/1651_s.jpg',
 'images/works/2025_s.jpg',
 'images/works/35_s.jpg',
 'images/works/3941_s.jpg',


In [7]:
def download_and_save(artist_name, timeout=30):
    """Download the scraped images for one artist into DATA_DIR/<artist_name>.

    Args:
        artist_name: 'henri-matisse' selects HENRI_ARTS / HENRI_PAINTING_URL;
            any other value selects PABLO_ARTS / PABLO_PAINTING_URL. The value
            is also used as the sub-directory name.
        timeout: Seconds to wait on each image request before giving up.

    Files that already exist on disk are not downloaded again. A response
    with an HTTP error status is reported and skipped, so that an error page
    is never stored under an image file name.
    """
    if artist_name == 'henri-matisse':
        painting_urls = HENRI_ARTS
        PAINTING_URL = HENRI_PAINTING_URL
    else:
        painting_urls = PABLO_ARTS
        PAINTING_URL = PABLO_PAINTING_URL

    IMAGE_DIR = os.path.join(os.getcwd(), DATA_DIR, artist_name)
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(IMAGE_DIR, exist_ok=True)
    display(IMAGE_DIR)

    for url in painting_urls:
        download_url = PAINTING_URL.format(painting_source=url)
        # Flatten the relative path into a single file name.
        outfile = os.path.join(IMAGE_DIR, url.replace('/', '-'))
        if os.path.exists(outfile):
            continue

        print("downloading: {}".format(url))
        r_painting_page = requests.get(download_url, timeout=timeout)
        try:
            r_painting_page.raise_for_status()
        except requests.exceptions.HTTPError:
            # Do not write the error body to disk: the exists-check above
            # would then skip this image on every later run.
            print("Error trying to retrieve {}".format(r_painting_page.url))
            continue

        with open(outfile, 'wb') as f:
            f.write(r_painting_page.content)

In [8]:
# Download every path in HENRI_ARTS into <cwd>/data/henri-matisse (existing files are skipped).
download_and_save('henri-matisse')

'/Users/michelle/Documents/dsc160-midterm-404-not-found/data/henri-matisse'

downloading: paintings/thmbnls150/bb.jpg
downloading: paintings/thmbnls150/ar.jpg
downloading: paintings/thmbnls150/ba.jpg
downloading: paintings/orient_99_thmb.jpg
downloading: paintings/thmbnls150/glimpse.jpg
downloading: paintings/thmbnls150/bd.jpg
downloading: paintings/thmbnls150/bc.jpg
downloading: paintings/thmbnls150/bh.jpg
downloading: paintings/thmbnls150/cs.jpg
downloading: paintings/orient_107_thmb.jpg
downloading: paintings/thmbnls150/bj.jpg
downloading: paintings/thmbnls150/dr.jpg
downloading: paintings/thmbnls150/bg.jpg
downloading: paintings/thmbnls150/bf.jpg
downloading: paintings/thmbnls150/zd.jpg
downloading: paintings/thmbnls150/bi.jpg
downloading: paintings/thmbnls150/hma308.jpg
downloading: paintings/thmbnls150/zq.jpg
downloading: paintings/thmbnls150/by.jpg
downloading: paintings/orient_109_thmb.jpg
downloading: paintings/orient_108_thmb.jpg
downloading: paintings/thmbnls150/gk.jpg
downloading: paintings/thmbnls150/zc.jpg
downloading: paintings/thmbnls150/zb.jpg


In [9]:
# Any name other than 'henri-matisse' selects the Picasso list: download PABLO_ARTS into <cwd>/data/picasso.
download_and_save('picasso')

'/Users/michelle/Documents/dsc160-midterm-404-not-found/data/picasso'

downloading: images/works/57_s.jpg
downloading: images/works/2_s.jpg
downloading: images/works/117_s.jpg
downloading: images/works/3437_s.jpg
downloading: images/works/56_s.jpg
downloading: images/works/122_s.jpg
downloading: images/works/40_s.jpg
downloading: images/works/18_s.jpg
downloading: images/works/1513_s.jpg
downloading: images/works/3570_s.jpg
downloading: images/works/145_s.jpg
downloading: images/works/170_s.jpg
downloading: images/works/4000_s.jpg
downloading: images/works/1459_s.jpg
downloading: images/works/151_s.jpg
downloading: images/works/85_s.jpg
downloading: images/works/1885_s.jpg
downloading: images/works/3040_s.jpg
downloading: images/works/17_s.jpg
downloading: images/works/97_s.jpg
downloading: images/works/102_s.jpg
downloading: images/works/44_s.jpg
downloading: images/works/128_s.jpg
downloading: images/works/3501_s.jpg
downloading: images/works/1958_s.jpg
downloading: images/works/88_s.jpg
downloading: images/works/31_s.jpg
downloading: images/works/3960_

downloading: images/works/1702_s.jpg
downloading: images/works/2230_s.jpg
downloading: images/works/2628_s.jpg
downloading: images/works/1845_s.jpg
