In [1]:
import json
import pathlib
import re
import time

import pandas as pd
import requests

## Artsy.net

In [24]:
ARTSY_URL = "https://metaphysics-production.artsy.net/v2"

saving_path = pathlib.Path("../downloads/artsynet/images")
saving_path.mkdir(parents=True, exist_ok=True)

headers_from_chrome: str = """
accept: */*
accept-encoding: gzip, deflate, br
accept-language: ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7
content-length: 4346
content-type: application/json
origin: https://www.artsy.net
referer: https://www.artsy.net/
sec-ch-ua: "Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"
sec-ch-ua-mobile: ?0
sec-ch-ua-platform: "macOS"
sec-fetch-dest: empty
sec-fetch-mode: cors
sec-fetch-site: same-site
user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36
x-timezone: Europe/Moscow
""".strip()
headers = {}
for line in headers_from_chrome.split("\n"):
    header_name, *header_value = line.split(": ")
    headers[header_name] = ": ".join(header_value)

In [3]:
def make_query(page: int):
    return {
        "id": "ArtworkQueryFilterQuery",
        "query": 'query ArtworkQueryFilterQuery(\n  $input: FilterArtworksInput\n) {\n  viewer {\n    ...ArtworkFilter_viewer_2VV6jB\n  }\n}\n\nfragment ArtworkFilterArtworkGrid_filtered_artworks on FilterArtworksConnection {\n  id\n  pageInfo {\n    hasNextPage\n    endCursor\n  }\n  pageCursors {\n    ...Pagination_pageCursors\n  }\n  edges {\n    node {\n      id\n    }\n  }\n  ...ArtworkGrid_artworks\n}\n\nfragment ArtworkFilter_viewer_2VV6jB on Viewer {\n  filtered_artworks: artworksConnection(input: $input) {\n    ...ArtworkFilterArtworkGrid_filtered_artworks\n    counts {\n      total(format: "0,0")\n    }\n    id\n  }\n}\n\nfragment ArtworkGrid_artworks on ArtworkConnectionInterface {\n  __isArtworkConnectionInterface: __typename\n  edges {\n    __typename\n    node {\n      id\n      slug\n      href\n      internalID\n      image(includeAll: false) {\n        aspectRatio\n      }\n      ...GridItem_artwork\n      ...FlatGridItem_artwork\n    }\n    ... on Node {\n      __isNode: __typename\n      id\n    }\n  }\n}\n\nfragment Badge_artwork on Artwork {\n  is_biddable: isBiddable\n  href\n  sale {\n    is_preview: isPreview\n    display_timely_at: displayTimelyAt\n    id\n  }\n}\n\nfragment DeprecatedSaveButton_artwork on Artwork {\n  id\n  internalID\n  slug\n  isSaved\n  title\n}\n\nfragment Details_artwork on Artwork {\n  internalID\n  href\n  title\n  date\n  sale_message: saleMessage\n  cultural_maker: culturalMaker\n  artist {\n    targetSupply {\n      isP1\n    }\n    id\n  }\n  marketPriceInsights {\n    demandRank\n  }\n  artists(shallow: true) {\n    id\n    href\n    name\n  }\n  collecting_institution: collectingInstitution\n  partner(shallow: true) {\n    name\n    href\n    id\n  }\n  sale {\n    endAt\n    cascadingEndTimeIntervalMinutes\n    extendedBiddingIntervalMinutes\n    startAt\n    is_auction: isAuction\n    is_closed: isClosed\n    id\n  }\n  sale_artwork: saleArtwork {\n    lotID\n    lotLabel\n    endAt\n    extendedBiddingEndAt\n    formattedEndDateTime\n    counts {\n      bidder_positions: bidderPositions\n    }\n    highest_bid: highestBid {\n      display\n    }\n    opening_bid: openingBid {\n      display\n    }\n    id\n  }\n  ...SaveButton_artwork\n  ...SaveArtworkToListsButton_artwork\n  ...HoverDetails_artwork\n}\n\nfragment FlatGridItem_artwork on Artwork {\n  ...Metadata_artwork\n  ...DeprecatedSaveButton_artwork\n  sale {\n    extendedBiddingPeriodMinutes\n    extendedBiddingIntervalMinutes\n    startAt\n    id\n  }\n  saleArtwork {\n    endAt\n    extendedBiddingEndAt\n    lotID\n    id\n  }\n  internalID\n  title\n  image_title: imageTitle\n  image(includeAll: false) {\n    resized(width: 445, version: ["larger", "large"]) {\n      src\n      srcSet\n      width\n      height\n    }\n  }\n  artistNames\n  href\n  isSaved\n}\n\nfragment GridItem_artwork on Artwork {\n  internalID\n  title\n  imageTitle\n  image(includeAll: false) {\n    internalID\n    placeholder\n    url(version: ["larger", "large"])\n    aspectRatio\n    versions\n  }\n  artistNames\n  href\n  ...Metadata_artwork\n  ...Badge_artwork\n}\n\nfragment HoverDetails_artwork on Artwork {\n  internalID\n  attributionClass {\n    name\n    id\n  }\n  mediumType {\n    filterGene {\n      name\n      id\n    }\n  }\n}\n\nfragment Metadata_artwork on Artwork {\n  ...Details_artwork\n  internalID\n  href\n}\n\nfragment Pagination_pageCursors on PageCursors {\n  around {\n    cursor\n    page\n    isCurrent\n  }\n  first {\n    cursor\n    page\n    isCurrent\n  }\n  last {\n    cursor\n    page\n    isCurrent\n  }\n  previous {\n    cursor\n    page\n  }\n}\n\nfragment SaveArtworkToListsButton_artwork on Artwork {\n  id\n  internalID\n  isSaved\n  slug\n  title\n  date\n  preview: image {\n    url(version: "square")\n  }\n  customCollections: collectionsConnection(first: 0, default: false, saves: true) {\n    totalCount\n  }\n}\n\nfragment SaveButton_artwork on Artwork {\n  id\n  internalID\n  slug\n  isSaved\n  title\n}\n',
        "variables": {
            "input": {
                "first": 30,
                "majorPeriods": [],
                "page": page,
                "sizes": [],
                "sort": "-decayed_merch",
                "artistIDs": [],
                "attributionClass": [],
                "partnerIDs": [],
                "additionalGeneIDs": [],
                "colors": [],
                "locationCities": [],
                "artistNationalities": [],
                "materialsTerms": [],
                "height": "*-*",
                "width": "*-*",
                "priceRange": "*-*",
            }
        },
    }

In [45]:
all_periods = """
2020
2010
2000
1990
1980
1970
1960
1950
1940
1930
1920
1910
1900
Late 19th Century
Mid 19th Century
Early 19th Century
18th Century & Earlier
""".strip().split(
    "\n"
)


def make_query_for_location(page: int, period: str):
    return {
        "id": "ArtworkQueryFilterQuery",
        "query": 'query ArtworkQueryFilterQuery(\n  $input: FilterArtworksInput\n) {\n  viewer {\n    ...ArtworkFilter_viewer_2VV6jB\n  }\n}\n\nfragment ArtworkFilterArtworkGrid_filtered_artworks on FilterArtworksConnection {\n  id\n  pageInfo {\n    hasNextPage\n    endCursor\n  }\n  pageCursors {\n    ...Pagination_pageCursors\n  }\n  edges {\n    node {\n      id\n    }\n  }\n  ...ArtworkGrid_artworks\n}\n\nfragment ArtworkFilter_viewer_2VV6jB on Viewer {\n  filtered_artworks: artworksConnection(input: $input) {\n    ...ArtworkFilterArtworkGrid_filtered_artworks\n    counts {\n      total(format: "0,0")\n    }\n    id\n  }\n}\n\nfragment ArtworkGrid_artworks on ArtworkConnectionInterface {\n  __isArtworkConnectionInterface: __typename\n  edges {\n    __typename\n    node {\n      id\n      slug\n      href\n      internalID\n      image(includeAll: false) {\n        aspectRatio\n      }\n      ...GridItem_artwork\n      ...FlatGridItem_artwork\n    }\n    ... on Node {\n      __isNode: __typename\n      id\n    }\n  }\n}\n\nfragment Badge_artwork on Artwork {\n  is_biddable: isBiddable\n  href\n  sale {\n    is_preview: isPreview\n    display_timely_at: displayTimelyAt\n    id\n  }\n}\n\nfragment DeprecatedSaveButton_artwork on Artwork {\n  id\n  internalID\n  slug\n  isSaved\n  title\n}\n\nfragment Details_artwork on Artwork {\n  internalID\n  href\n  title\n  date\n  sale_message: saleMessage\n  cultural_maker: culturalMaker\n  artist {\n    targetSupply {\n      isP1\n    }\n    id\n  }\n  marketPriceInsights {\n    demandRank\n  }\n  artists(shallow: true) {\n    id\n    href\n    name\n  }\n  collecting_institution: collectingInstitution\n  partner(shallow: true) {\n    name\n    href\n    id\n  }\n  sale {\n    endAt\n    cascadingEndTimeIntervalMinutes\n    extendedBiddingIntervalMinutes\n    startAt\n    is_auction: isAuction\n    is_closed: isClosed\n    id\n  }\n  sale_artwork: saleArtwork {\n    lotID\n    lotLabel\n    endAt\n    extendedBiddingEndAt\n    formattedEndDateTime\n    counts {\n      bidder_positions: bidderPositions\n    }\n    highest_bid: highestBid {\n      display\n    }\n    opening_bid: openingBid {\n      display\n    }\n    id\n  }\n  ...SaveButton_artwork\n  ...SaveArtworkToListsButton_artwork\n  ...HoverDetails_artwork\n}\n\nfragment FlatGridItem_artwork on Artwork {\n  ...Metadata_artwork\n  ...DeprecatedSaveButton_artwork\n  sale {\n    extendedBiddingPeriodMinutes\n    extendedBiddingIntervalMinutes\n    startAt\n    id\n  }\n  saleArtwork {\n    endAt\n    extendedBiddingEndAt\n    lotID\n    id\n  }\n  internalID\n  title\n  image_title: imageTitle\n  image(includeAll: false) {\n    resized(width: 445, version: ["larger", "large"]) {\n      src\n      srcSet\n      width\n      height\n    }\n  }\n  artistNames\n  href\n  isSaved\n}\n\nfragment GridItem_artwork on Artwork {\n  internalID\n  title\n  imageTitle\n  image(includeAll: false) {\n    internalID\n    placeholder\n    url(version: ["larger", "large"])\n    aspectRatio\n    versions\n  }\n  artistNames\n  href\n  ...Metadata_artwork\n  ...Badge_artwork\n}\n\nfragment HoverDetails_artwork on Artwork {\n  internalID\n  attributionClass {\n    name\n    id\n  }\n  mediumType {\n    filterGene {\n      name\n      id\n    }\n  }\n}\n\nfragment Metadata_artwork on Artwork {\n  ...Details_artwork\n  internalID\n  href\n}\n\nfragment Pagination_pageCursors on PageCursors {\n  around {\n    cursor\n    page\n    isCurrent\n  }\n  first {\n    cursor\n    page\n    isCurrent\n  }\n  last {\n    cursor\n    page\n    isCurrent\n  }\n  previous {\n    cursor\n    page\n  }\n}\n\nfragment SaveArtworkToListsButton_artwork on Artwork {\n  id\n  internalID\n  isSaved\n  slug\n  title\n  date\n  preview: image {\n    url(version: "square")\n  }\n  customCollections: collectionsConnection(first: 0, default: false, saves: true) {\n    totalCount\n  }\n}\n\nfragment SaveButton_artwork on Artwork {\n  id\n  internalID\n  slug\n  isSaved\n  title\n}\n',
        "variables": {
            "input": {
                "first": 30,
                "majorPeriods": [period],
                "page": page,
                "sizes": [],
                "sort": "-decayed_merch",
                "artistIDs": [],
                "attributionClass": [],
                "partnerIDs": [],
                "additionalGeneIDs": [],
                "colors": [],
                "locationCities": [],
                "artistNationalities": [],
                "materialsTerms": [],
                "height": "*-*",
                "width": "*-*",
                "priceRange": "*-*",
            }
        },
    }

In [5]:
def parse_page(json_data):
    df = pd.DataFrame([x['node'] for x in json_data['data']['viewer']['filtered_artworks']['edges']])
    df["image_url"] = df.image.apply(lambda x: x['url'])
    return df

In [43]:
session = requests.Session()

In [8]:
metadata = []
image_names = {}

In [9]:
for page in range(1, 100):
    print(f"Page {page}", end="\t")
    try:
        json_data = session.post(ARTSY_URL, json=make_query(page)).json()
    except Exception as exc:
        print(exc)
        time.sleep(10)
        continue

    page_items = parse_page(json_data)
    for i, item in page_items.iterrows():
        if item.image_url in image_names:
            continue
            
        file_name = item.image_url.rpartition("/")[2]
        extension = file_name.rpartition(".")[2]
        new_name = f"image-{len(image_names)}.{extension}"
        try:
            image_content = session.get(item.image_url).content
            saving_path.joinpath(new_name).write_bytes(image_content)
            image_names[item.image_url] = new_name
            print(".", end="")
        except Exception as exc:
            print(exc)
        time.sleep(1)
    metadata.append(page_items.assign(local_image=lambda x: x.image_url.apply(image_names.get)))
    pd.concat(metadata).to_csv(saving_path.parent / "artsynet.csv", index=False)
    print("")

Page 1	
Page 2	..............................
Page 3	..............................
Page 4	..............................
Page 5	..............................
Page 6	..............................
Page 7	..............................
Page 8	..............................
Page 9	..............................
Page 10	..............................
Page 11	..............................
Page 12	..............................
Page 13	..............................
Page 14	..............................
Page 15	..............................
Page 16	..............................
Page 17	..............................
Page 18	..............................
Page 19	..............................
Page 20	..............................
Page 21	..............................
Page 22	..............................
Page 23	.............................
Page 24	..............................
Page 25	..............................
Page 26	..............................
Page 27	..................

TypeError: 'NoneType' object is not subscriptable

In [None]:
for page in range(7, 100):
    print(f"Page {page}", end="\t")
    for period in all_periods:
        try:
            json_data = session.post(ARTSY_URL, json=make_query_for_location(page, period)).json()
        except Exception as exc:
            print(exc)
            time.sleep(10)
            continue

        page_items = parse_page(json_data)
        for i, item in page_items.iterrows():
            if item.image_url in image_names:
                continue

            file_name = item.image_url.rpartition("/")[2]
            extension = file_name.rpartition(".")[2]
            new_name = f"image-{len(image_names)}.{extension}"
            try:
                image_content = session.get(item.image_url).content
                saving_path.joinpath(new_name).write_bytes(image_content)
                image_names[item.image_url] = new_name
                print(".", end="")
            except Exception as exc:
                print(exc)
            time.sleep(0.05)
        metadata.append(page_items.assign(local_image=lambda x: x.image_url.apply(image_names.get)))
        pd.concat(metadata).to_csv(saving_path.parent / "artsynet.csv", index=False)
        print("")

Page 7	






..
..............................
..............................
..............................
......