### Download high-resolution images from wikiArt

In [17]:
import requests
import os
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin, urlparse

In [18]:
def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

In [19]:
def get_all_images(url):
    """
    Returns all image URLs on a single `url`
    """
    soup = bs(requests.get(url).content, "html.parser")
    urls = []
    for img in tqdm(soup.find_all("img"), "Extracting images"):
        img_url = img.attrs.get("src")
        if not img_url:
            # if img does not contain src attribute, just skip
            continue
        # make the URL absolute by joining domain with the URL that is just extracted
        img_url = urljoin(url, img_url)
        # remove URLs like '/hsts-pixel.gif?c=3.2.5'
        try:
            pos = img_url.index("?")
            img_url = img_url[:pos]
        except ValueError:
            pass
        # finally, if the url is valid
        if is_valid(img_url):
            img_url = img_url.split('!', 1)[0]
            urls.append(img_url)
    return urls

In [20]:
def get_all_addresses(addresses):
    """
    Returns all image URLs on a single `url`
    """
    soup = bs(requests.get(addresses).content, "html.parser")
    urls = []
    
    for img in tqdm(soup.findAll("li", {"class": "painting-list-text-row"}), "Extracting images"):
        img_url = img.a.get('href')
        img_url = img_url.split('/', 2)[2]
        img_url = "https://uploads0.wikiart.org/images/" + img_url + ".jpg"
        if not img_url:
            # if img does not contain src attribute, just skip
            continue
        # make the URL absolute by joining domain with the URL that is just extracted
        img_url = urljoin(addresses, img_url)
        # remove URLs like '/hsts-pixel.gif?c=3.2.5'
        try:
            pos = img_url.index("?")
            img_url = img_url[:pos]
        except ValueError:
            pass
        # finally, if the url is valid
        if is_valid(img_url):
            img_url = img_url.split('!', 1)[0]
            urls.append(img_url)
    return urls

In [21]:
def download(url, pathname):
    """
    Downloads a file given an URL and puts it in the folder `pathname`
    """
    # if path doesn't exist, make that path dir
    if not os.path.isdir(pathname):
        os.makedirs(pathname)
    # download the body of response by chunk, not immediately
    response = requests.get(url, stream=True)

    # get the total file size
    file_size = int(response.headers.get("Content-Length", 0))

    # get the file name
    prefix = url.split('/', 5)[4]
    prefix = prefix.split('.', 1)[0]
    filename = os.path.join(pathname, prefix + "_" + url.split("/")[-1])

    # progress bar, changing the unit to bytes instead of iteration (default by tqdm)
    progress = tqdm(response.iter_content(1024), "Downloading " + filename, total=file_size, unit="B", unit_scale=True, unit_divisor=1024)
    with open(filename, "wb") as f:
        for data in progress:
            # write data read to the file
            f.write(data)
            # update the progress bar manually
            progress.update(len(data))


In [22]:
def main(url, path):
    # get all images
    imgs = get_all_addresses(url)
    for img in imgs:
        # for each img, download it
        download(img, path)

### Download the files

In [23]:
artists_list = open("wikiart_artists.txt", "r")
lines = artists_list.readlines()

for url_addr in range(len(lines)):
    print(lines[url_addr].rstrip("\n")  + "/all-works/text-list")
    main(lines[url_addr].rstrip("\n")  + "/all-works/text-list", "wikiart_dataset")
    
#main("https://www.wikiart.org/en/tsuruko-yamazaki/", "wikiart_dataset")

https://www.wikiart.org/en/ad-reinhardt/all-works/text-list


Extracting images: 100%|████████████████████| 52/52 [00:00<00:00, 10393.81it/s]
Downloading wikiart_dataset\ad-reinhardt_untitled-1937.jpg:   0%| | 813/813k [0
Downloading wikiart_dataset\ad-reinhardt_untitled-1938.jpg:   0%| | 73.0/72.8k 
Downloading wikiart_dataset\ad-reinhardt_study-for-a-painting-1938.jpg:   0%| |
Downloading wikiart_dataset\ad-reinhardt_study-for-a-painting-1938-1.jpg:   0%|
Downloading wikiart_dataset\ad-reinhardt_study-for-a-painting-1938-2.jpg:   0%|
Downloading wikiart_dataset\ad-reinhardt_number-30-1938.jpg:   0%| | 44.0/43.3k
Downloading wikiart_dataset\ad-reinhardt_collage-1938.jpg:   0%| | 51.0/50.0k [
Downloading wikiart_dataset\ad-reinhardt_untitled-1938-1.jpg:   0%| | 66.0/65.9
Downloading wikiart_dataset\ad-reinhardt_study-for-a-painting-1939.jpg:   0%| |
Downloading wikiart_dataset\ad-reinhardt_study-for-a-painting-1939-1.jpg:   0%|
Downloading wikiart_dataset\ad-reinhardt_study-for-a-painting-1939-2.jpg:   0%|
Downloading wikiart_dataset\ad-reinhardt

https://www.wikiart.org/en/alekos-kontopoulos/all-works/text-list


Extracting images: 100%|█████████████████████| 79/79 [00:00<00:00, 7178.92it/s]
Downloading wikiart_dataset\alekos-kontopoulos_portrait-of-a-lady-1931.jpg:   0
Downloading wikiart_dataset\alekos-kontopoulos_loads-with-a-crane-barge-on-the-
Downloading wikiart_dataset\alekos-kontopoulos_naked-in-the-woods-1932.jpg:   0
Downloading wikiart_dataset\alekos-kontopoulos_the-forest-1938.jpg:   0%| | 176
Downloading wikiart_dataset\alekos-kontopoulos_le-canal-bethune-1939.jpg:   0%|
Downloading wikiart_dataset\alekos-kontopoulos_head-1939.jpg:   0%| | 163/163k 
Downloading wikiart_dataset\alekos-kontopoulos_mother-and-child-1943.jpg:   0%|
Downloading wikiart_dataset\alekos-kontopoulos_landscape-1945.jpg:   0%| | 138/
Downloading wikiart_dataset\alekos-kontopoulos_mountain-1946.jpg:   0%| | 123/1
Downloading wikiart_dataset\alekos-kontopoulos_armchair-1951.jpg:   0%| | 118/1
Downloading wikiart_dataset\alekos-kontopoulos_theokriteio-romance-1951.jpg:   
Downloading wikiart_dataset\alekos-konto

https://www.wikiart.org/en/alexander-liberman/all-works/text-list


Extracting images: 100%|█████████████████████| 16/16 [00:00<00:00, 7993.91it/s]
Downloading wikiart_dataset\alexander-liberman_two-circles-1950.jpg:   0%| | 10
Downloading wikiart_dataset\alexander-liberman_beat-1952.jpg:   0%| | 150/150k 
Downloading wikiart_dataset\alexander-liberman_path-iv-1952.jpg:   0%| | 76.0/7
Downloading wikiart_dataset\alexander-liberman_time-1952.jpg:   0%| | 38.0/37.1
Downloading wikiart_dataset\alexander-liberman_revolving-1959.jpg:   0%| | 44.0
Downloading wikiart_dataset\alexander-liberman_omega-ix-1961.jpg:   0%| | 23.0/
Downloading wikiart_dataset\alexander-liberman_omicron-v-1961.jpg:   0%| | 27.0
Downloading wikiart_dataset\alexander-liberman_omicron-vii-1961.jpg:   0%| | 64
Downloading wikiart_dataset\alexander-liberman_air-1962.jpg:   0%| | 86.0/85.4k
Downloading wikiart_dataset\alexander-liberman_socrates-1962.jpg:   0%| | 93.0/
Downloading wikiart_dataset\alexander-liberman_untitled-abstract-1962.jpg:   0%
Downloading wikiart_dataset\alexander-li

https://www.wikiart.org/en/alfred-jensen/all-works/text-list


Extracting images: 100%|█████████████████████| 27/27 [00:00<00:00, 8999.22it/s]
Downloading wikiart_dataset\alfred-jensen_my-oneness-a-universe-of-colours-1957
Downloading wikiart_dataset\alfred-jensen_ascending-i-1958.jpg:   0%| | 106/106
Downloading wikiart_dataset\alfred-jensen_family-portrait-1958.jpg:   0%| | 129
Downloading wikiart_dataset\alfred-jensen_tattoed-1958.jpg:   0%| | 46.0/45.3k 
Downloading wikiart_dataset\alfred-jensen_interlacing-colors-i-1959.jpg:   0%| 
Downloading wikiart_dataset\alfred-jensen_spanish-door-design-purple-1959.jpg: 
Downloading wikiart_dataset\alfred-jensen_a-perfect-equal-area-i-1960.jpg:   0%
Downloading wikiart_dataset\alfred-jensen_magic-14-1960.jpg:   0%| | 175/175k [
Downloading wikiart_dataset\alfred-jensen_the-apex-is-nothing-1960.jpg:   0%| |
Downloading wikiart_dataset\alfred-jensen_the-great-mystery-ii-1960.jpg:   0%| 
Downloading wikiart_dataset\alfred-jensen_untitled-1960.jpg:   0%| | 36.0/35.2k
Downloading wikiart_dataset\alfred-jense

https://www.wikiart.org/en/alfred-jensen/all-works/text-list


Extracting images: 100%|█████████████████████| 27/27 [00:00<00:00, 8994.22it/s]
Downloading wikiart_dataset\alfred-jensen_my-oneness-a-universe-of-colours-1957
Downloading wikiart_dataset\alfred-jensen_ascending-i-1958.jpg:   0%| | 106/106
Downloading wikiart_dataset\alfred-jensen_family-portrait-1958.jpg:   0%| | 129
Downloading wikiart_dataset\alfred-jensen_tattoed-1958.jpg:   0%| | 46.0/45.3k 
Downloading wikiart_dataset\alfred-jensen_interlacing-colors-i-1959.jpg:   0%| 
Downloading wikiart_dataset\alfred-jensen_spanish-door-design-purple-1959.jpg: 
Downloading wikiart_dataset\alfred-jensen_a-perfect-equal-area-i-1960.jpg:   0%
Downloading wikiart_dataset\alfred-jensen_magic-14-1960.jpg:   0%| | 175/175k [
Downloading wikiart_dataset\alfred-jensen_the-apex-is-nothing-1960.jpg:   0%| |
Downloading wikiart_dataset\alfred-jensen_the-great-mystery-ii-1960.jpg:   0%| 
Downloading wikiart_dataset\alfred-jensen_untitled-1960.jpg:   0%| | 36.0/35.2k
Downloading wikiart_dataset\alfred-jense

https://www.wikiart.org/en/alma-woodsey-thomas/all-works/text-list


Extracting images: 100%|█████████████████████| 29/29 [00:00<00:00, 7248.80it/s]
Downloading wikiart_dataset\alma-woodsey-thomas_the-stormy-sea-1958.jpg:   0%| 
Downloading wikiart_dataset\alma-woodsey-thomas_red-abstraction-1960.jpg:   0%|
Downloading wikiart_dataset\alma-woodsey-thomas_watusi-hard-edge-1963.jpg:   0%
Downloading wikiart_dataset\alma-woodsey-thomas_air-view-of-a-spring-nursery-19
Downloading wikiart_dataset\alma-woodsey-thomas_ressurection-1966.jpg:   0%| | 
Downloading wikiart_dataset\alma-woodsey-thomas_a-glimpse-of-mars-1969.jpg:   0
Downloading wikiart_dataset\alma-woodsey-thomas_iris-tulips-jonquils-and-crocus
Downloading wikiart_dataset\alma-woodsey-thomas_atmospheric-effects-i-1970.jpg:
Downloading wikiart_dataset\alma-woodsey-thomas_new-galaxy-1970.jpg:   0%| | 10
Downloading wikiart_dataset\alma-woodsey-thomas_red-violet-nursery-viewed-from-
Downloading wikiart_dataset\alma-woodsey-thomas_snoopy-sees-earth-wrapped-in-su
Downloading wikiart_dataset\alma-woodsey

https://www.wikiart.org/en/andre-masson/all-works/text-list


Extracting images: 100%|███████████████████| 122/122 [00:00<00:00, 6095.28it/s]
Downloading wikiart_dataset\andre-masson_portrait-of-roland-tual.jpg:   0%| | 9
Downloading wikiart_dataset\andre-masson_pedestal-table-in-the-studio.jpg:   0%
Downloading wikiart_dataset\andre-masson_card-trick.jpg:   0%| | 53.0/52.4k [00
Downloading wikiart_dataset\andre-masson_landscape-with-rocks.jpg:   0%| | 140/
Downloading wikiart_dataset\andre-masson_the-meals.jpg:   0%| | 128/127k [00:00
Downloading wikiart_dataset\andre-masson_guitar-and-profile-1924.jpg:   0%| | 1
Downloading wikiart_dataset\andre-masson_automatic-drawing-1924.jpg:   0%| | 2.
Downloading wikiart_dataset\andre-masson_woman-holding-a-bird.jpg:   0%| | 14.0
Downloading wikiart_dataset\andre-masson_the-sleeper.jpg:   0%| | 11.0/10.1k [0
Downloading wikiart_dataset\andre-masson_two-naked.jpg:   0%| | 10.0/9.82k [00:
Downloading wikiart_dataset\andre-masson_birds-and-masks.jpg:   0%| | 11.0/10.7
Downloading wikiart_dataset\andre-masson

https://www.wikiart.org/en/arshile-gorky/all-works/text-list


Extracting images: 100%|█████████████████████| 71/71 [00:00<00:00, 6450.26it/s]
Downloading wikiart_dataset\arshile-gorky_park-street-church.jpg:   0%| | 248/2
Downloading wikiart_dataset\arshile-gorky_bound-in-sleep.jpg:   0%| | 165/164k 
Downloading wikiart_dataset\arshile-gorky_man-s-head.jpg:   0%| | 199/199k [00:
Downloading wikiart_dataset\arshile-gorky_portrait-of-azadoohi-liberty-miller.j
Downloading wikiart_dataset\arshile-gorky_pears-peaches-and-pitcher-1927.jpg:  
Downloading wikiart_dataset\arshile-gorky_portrait-of-a-young-man-1927.jpg:   0
Downloading wikiart_dataset\arshile-gorky_staten-island.jpg:   0%| | 163/162k [
Downloading wikiart_dataset\arshile-gorky_still-life-with-skull.jpg:   0%| | 23
Downloading wikiart_dataset\arshile-gorky_portrait-of-a-woman.jpg:   0%| | 71.0
Downloading wikiart_dataset\arshile-gorky_self-portrait-at-the-age-of-nine.jpg:
Downloading wikiart_dataset\arshile-gorky_portrait-of-master-bill-1929.jpg:   0
Downloading wikiart_dataset\arshile-gork

https://www.wikiart.org/en/balcomb-greene/all-works/text-list


Extracting images: 100%|████████████████████| 22/22 [00:00<00:00, 10990.32it/s]
Downloading wikiart_dataset\balcomb-greene_untitled-34-8-1934.jpg:   0%| | 204/
Downloading wikiart_dataset\balcomb-greene_black-and-red-tension-1935.jpg:   0%
Downloading wikiart_dataset\balcomb-greene_untitled-35-4-1935.jpg:   0%| | 223/
Downloading wikiart_dataset\balcomb-greene_untitled-35-7-1935.jpg:   0%| | 135/
Downloading wikiart_dataset\balcomb-greene_untitled-35-14-1935.jpg:   0%| | 241
Downloading wikiart_dataset\balcomb-greene_untitled-1936.jpg:   0%| | 164/164k 
Downloading wikiart_dataset\balcomb-greene_untitled-from-the-american-abstract-
Downloading wikiart_dataset\balcomb-greene_untitled-39-03-1939.jpg:   0%| | 119
Downloading wikiart_dataset\balcomb-greene_the-nautical-land-1943.jpg:   0%| | 
Downloading wikiart_dataset\balcomb-greene_way-down-blue-1945.jpg:   0%| | 83.0
Downloading wikiart_dataset\balcomb-greene_walking-woman-1949.jpg:   0%| | 32.0
Downloading wikiart_dataset\balcomb-gree

https://www.wikiart.org/en/barnett-newman/all-works/text-list


Extracting images: 100%|████████████████████| 86/86 [00:00<00:00, 10741.81it/s]
Downloading wikiart_dataset\barnett-newman_the-blessing-1944.jpg:   0%| | 82.0/
Downloading wikiart_dataset\barnett-newman_untitled-1945.jpg:   0%| | 23.0/22.0
Downloading wikiart_dataset\barnett-newman_untitled-1945-1.jpg:   0%| | 245/245
Downloading wikiart_dataset\barnett-newman_untitled-1946.jpg:   0%| | 46.0/45.3
Downloading wikiart_dataset\barnett-newman_moment-1946.jpg:   0%| | 85.0/84.9k 
Downloading wikiart_dataset\barnett-newman_onement-i-1948.jpg:   0%| | 247/246k
Downloading wikiart_dataset\barnett-newman_two-edges-1948.jpg:   0%| | 34.0/33.
Downloading wikiart_dataset\barnett-newman_untitled-2-1948.jpg:   0%| | 54.0/53
Downloading wikiart_dataset\barnett-newman_concord-1949.jpg:   0%| | 56.0/55.3k
Downloading wikiart_dataset\barnett-newman_covenant-1949.jpg:   0%| | 45.0/44.6
Downloading wikiart_dataset\barnett-newman_dionysius-1949.jpg:   0%| | 2.09k/2.
Downloading wikiart_dataset\barnett-newm

https://www.wikiart.org/en/burgoyne-diller/all-works/text-list


Extracting images:   0%|                                | 0/26 [00:00<?, ?it/s]


AttributeError: 'NoneType' object has no attribute 'get'

### Sort the .txt file alphabetically

In [8]:
txt = "wikiart_artists.txt"

f = open(txt , "r")
lines = f.readlines()
f.close()
lines.sort()

f = open(txt , "w")
for line in lines:
    f.write(line)
    
f.flush()
f.close()