In [1]:
from wc_simd.utility import spark_path

In [2]:
from pyspark.sql import SparkSession


# Build (or reuse) the notebook's Spark session: 12g for driver and executor,
# vectorized ORC reading switched off, and columnar reader batches of 1024
# for both Parquet and ORC.
spark = (
    SparkSession.builder
    .appName("test_pyspark")
    .config("spark.driver.memory", "12g")
    .config("spark.executor.memory", "12g")
    .config("spark.sql.orc.enableVectorizedReader", "false")
    .config("spark.sql.parquet.columnarReaderBatchSize", "1024")
    .config("spark.sql.orc.columnarReaderBatchSize", "1024")
    .getOrCreate()
)

# Only surface errors from Spark from this point on.
spark.sparkContext.setLogLevel("ERROR")

# Last expression of the cell: display the session summary.
spark

25/05/01 16:22:45 WARN Utils: Your hostname, Daniels-MacBook-Pro-1035.local resolves to a loopback address: 127.0.0.1; using 172.26.8.178 instead (on interface en0)
25/05/01 16:22:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/01 16:22:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/01 16:22:46 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
import json
from pprint import pp

In [4]:
def print_json(id: str):
    """Print one row of the ``works`` table as indented JSON.

    Looks up the first row whose ``id`` column equals ``id``, converts it
    (including nested rows) to plain dicts and prints it with 2-space indent.

    Args:
        id: Value to match against the ``id`` column of ``works``. The name
            shadows the ``id`` builtin; it is kept so existing calls still work.

    Raises:
        ValueError: If ``works`` contains no row with that id.
    """
    row = spark.table("works").where(F.col("id") == id).first()
    # first() yields None when nothing matches; fail with a clear message
    # instead of an AttributeError on `None.asDict`.
    if row is None:
        raise ValueError(f"No row in table 'works' with id {id!r}")
    print(json.dumps(row.asDict(recursive=True), indent=2))

# IIIF libraries

- https://iiif-prezi.github.io/iiif-prezi3/
- https://github.com/zimeon/iiif


# Collection Examples

- Theme/Concept page: https://wellcomecollection.org/concepts/jabjbmgj

## Work with many parts
https://wellcomecollection.org/works/rqyum5sv

In [6]:
# Example for the heading above (a work with many parts): dump its full `works` row as JSON.
print_json("rqyum5sv")

[Stage 1:>                                                          (0 + 1) / 1]

{
  "alternativeTitles": [],
  "availabilities": [],
  "contributors": [
    {
      "agent": {
        "id": "wzggp2bj",
        "identifiers": [
          {
            "identifierType": {
              "id": "label-derived",
              "label": "Identifier derived from the label of the referent",
              "type": "IdentifierType"
            },
            "type": "Identifier",
            "value": "twort, frederick william (1877-1950)"
          }
        ],
        "label": "Twort, Frederick William (1877-1950)",
        "type": "Agent"
      },
      "primary": true,
      "roles": [],
      "type": "Contributor"
    }
  ],
  "createdDate": null,
  "currentFrequency": null,
  "description": "The material here consists largely of papers of Twort or about him, mainly his work at the Brown Institution. The bulk was arranged in files created by his son for the purpose of writing a biography. It includes: laboratory notes and correspondence, etc from the period 1903-1950,  inc

                                                                                

## Work (Archives and manuscripts) (locationType.id: `iiif-presentation`)
https://wellcomecollection.org/works/gumbfr2w 

In [24]:
# Archives-and-manuscripts example: dump its full `works` row as JSON.
print_json("gumbfr2w")



{
  "alternativeTitles": [],
  "availabilities": [
    {
      "id": "closed-stores",
      "label": "Closed stores",
      "type": "Availability"
    },
    {
      "id": "online",
      "label": "Online",
      "type": "Availability"
    }
  ],
  "contributors": [],
  "createdDate": null,
  "currentFrequency": null,
  "description": "Correspondence, diagrams, press-cuttings, patent applications.",
  "designation": [],
  "duration": null,
  "edition": null,
  "formerFrequency": [],
  "genres": [],
  "holdings": [],
  "id": "gumbfr2w",
  "identifiers": [
    {
      "identifierType": {
        "id": "calm-record-id",
        "label": "Calm RecordIdentifier",
        "type": "IdentifierType"
      },
      "type": "Identifier",
      "value": "94ee05ca-b065-40f5-9e5e-78085048f0cb"
    },
    {
      "identifierType": {
        "id": "calm-ref-no",
        "label": "Calm RefNo",
        "type": "IdentifierType"
      },
      "type": "Identifier",
      "value": "GC176/B/2"
    },
    {


                                                                                

25/04/22 15:06:14 WARN HikariPool: HikariPool-1 - Thread starvation or clock leap detected (housekeeper delta=14m57s49ms).
25/04/22 15:06:14 WARN HikariPool: HikariPool-2 - Thread starvation or clock leap detected (housekeeper delta=14m57s48ms).


## Work (Digital Images) with one image (locationType.id: `iiif-image`)
- https://wellcomecollection.org/works/nf7cjmsg
- Image: https://iiif.wellcomecollection.org/image/N0014023/full/600,/0/default.jpg (change `600` in the URL to request a different width)

In [25]:
# Single-image example: dump its full `works` row as JSON.
print_json("nf7cjmsg")



{
  "alternativeTitles": [],
  "availabilities": [
    {
      "id": "online",
      "label": "Online",
      "type": "Availability"
    }
  ],
  "contributors": [],
  "createdDate": null,
  "currentFrequency": null,
  "description": "Scan - axial, brain",
  "designation": [],
  "duration": null,
  "edition": null,
  "formerFrequency": [],
  "genres": [],
  "holdings": [],
  "id": "nf7cjmsg",
  "identifiers": [
    {
      "identifierType": {
        "id": "miro-image-number",
        "label": "Miro image number",
        "type": "IdentifierType"
      },
      "type": "Identifier",
      "value": "N0014023"
    },
    {
      "identifierType": {
        "id": "miro-library-reference",
        "label": "Miro library reference",
        "type": "IdentifierType"
      },
      "type": "Identifier",
      "value": "NMSB Frame No NOVIDEO"
    }
  ],
  "images": [
    {
      "id": "msg3gj54",
      "type": "Image"
    }
  ],
  "items": [
    {
      "id": null,
      "identifiers": [],
   

                                                                                

## Work (Book) (No digital) (locationType.id: `open-shelves`)
- https://wellcomecollection.org/works/m6uy6knc

In [8]:
# Book without a digital copy: dump its full `works` row as JSON.
print_json("m6uy6knc")



{
  "alternativeTitles": [],
  "availabilities": [
    {
      "id": "open-shelves",
      "label": "Open shelves",
      "type": "Availability"
    }
  ],
  "contributors": [
    {
      "agent": {
        "id": "rcncxsjb",
        "identifiers": [
          {
            "identifierType": {
              "id": "lc-names",
              "label": "Library of Congress Name authority records",
              "type": "IdentifierType"
            },
            "type": "Identifier",
            "value": "n80117079"
          }
        ],
        "label": "Simoons, Frederick J.",
        "type": "Person"
      },
      "primary": true,
      "roles": [],
      "type": "Contributor"
    }
  ],
  "createdDate": null,
  "currentFrequency": null,
  "description": null,
  "designation": [],
  "duration": null,
  "edition": null,
  "formerFrequency": [],
  "genres": [],
  "holdings": [],
  "id": "m6uy6knc",
  "identifiers": [
    {
      "identifierType": {
        "id": "sierra-system-number",
  

                                                                                

## Work (Book) (locationType.id: `online-resource`) (go.gale.com)
- https://wellcomecollection.org/works/mtbrsqa8
- https://go.gale.com/ps/i.do?p=ECCO&u=wellcome&id=GALE%7CCW0123958160&v=2.1&it=r 

In [None]:
# Online-resource book (Gale link in the heading above): dump its full `works` row as JSON.
print_json("mtbrsqa8")

                                                                                

{
  "alternativeTitles": [
    "Case of His Majesties sugar plantations (Online)"
  ],
  "availabilities": [
    {
      "id": "online",
      "label": "Online",
      "type": "Availability"
    }
  ],
  "contributors": [],
  "createdDate": null,
  "currentFrequency": null,
  "description": null,
  "designation": [],
  "duration": null,
  "edition": null,
  "formerFrequency": [],
  "genres": [],
  "holdings": [
    {
      "enumeration": [
        "Full text available: 1670."
      ],
      "location": {
        "accessConditions": [
          {
            "method": {
              "id": "view-online",
              "label": "View online",
              "type": "AccessMethod"
            },
            "status": {
              "id": "licensed-resources",
              "label": "Licensed resources",
              "type": "AccessStatus"
            },
            "type": "AccessCondition"
          }
        ],
        "label": null,
        "linkText": "Available in Eighteenth Century

## Work (Book) (locationType.id: `online-resource`) (www.proquest.com/eebo)
- https://wellcomecollection.org/works/u7xtg8mj

In [10]:
# Online-resource book (ProQuest/EEBO per the heading above): dump its full `works` row as JSON.
print_json("u7xtg8mj")



{
  "alternativeTitles": [],
  "availabilities": [
    {
      "id": "online",
      "label": "Online",
      "type": "Availability"
    }
  ],
  "contributors": [
    {
      "agent": {
        "id": "ckk9ffrq",
        "identifiers": [
          {
            "identifierType": {
              "id": "lc-names",
              "label": "Library of Congress Name authority records",
              "type": "IdentifierType"
            },
            "type": "Identifier",
            "value": "n84177453"
          }
        ],
        "label": "Coley, Henry, 1633-1704?",
        "type": "Person"
      },
      "primary": true,
      "roles": [],
      "type": "Contributor"
    }
  ],
  "createdDate": null,
  "currentFrequency": null,
  "description": null,
  "designation": [],
  "duration": null,
  "edition": null,
  "formerFrequency": [],
  "genres": [
    {
      "concepts": [
        {
          "id": "kqkakf66",
          "identifiers": [
            {
              "identifierType": {
 

                                                                                

## Work (Book) (locationType.id: `iiif-presentation`) (embedded PDF)
- https://wellcomecollection.org/works/deybwyaf

In [5]:
# Book with an embedded PDF (per the heading above): dump its full `works` row as JSON.
print_json("deybwyaf")

                                                                                

{
  "alternativeTitles": [
    "Galen commentary on Hippocrates Epidemics book 1 parts I-III"
  ],
  "availabilities": [
    {
      "id": "online",
      "label": "Online",
      "type": "Availability"
    }
  ],
  "contributors": [
    {
      "agent": {
        "id": "ch4dx5r2",
        "identifiers": [
          {
            "identifierType": {
              "id": "lc-names",
              "label": "Library of Congress Name authority records",
              "type": "IdentifierType"
            },
            "type": "Identifier",
            "value": "n79059644"
          }
        ],
        "label": "Galen",
        "type": "Person"
      },
      "primary": true,
      "roles": [
        {
          "label": "author",
          "type": "ContributionRole"
        }
      ],
      "type": "Contributor"
    },
    {
      "agent": {
        "id": "mx4zvzk6",
        "identifiers": [
          {
            "identifierType": {
              "id": "lc-names",
              "label": 

# Works rows where an item location URL contains "/presentation/"

In [28]:
# Spark SQL higher-order predicate: a work qualifies when at least one of its
# items has at least one location whose URL contains "/presentation/".
items_filter = """
  exists(
    items, i ->
        exists(i.locations, l ->
            l.url LIKE '%/presentation/%')
  )
"""

# Works that satisfy the predicate above.
df_works_w_presentation = spark.table("works").where(F.expr(items_filter))

# Number of works with IIIF Presentations

In [None]:
# Runs a Spark job over `works`; counts works (not URLs) that pass the presentation filter.
df_works_w_presentation.count()

In [32]:
# Look at one matching work in full to see where the presentation URL sits.
first_presentation_work = df_works_w_presentation.first()
print(json.dumps(first_presentation_work.asDict(recursive=True), indent=2))

{
  "alternativeTitles": [],
  "availabilities": [
    {
      "id": "closed-stores",
      "label": "Closed stores",
      "type": "Availability"
    },
    {
      "id": "online",
      "label": "Online",
      "type": "Availability"
    }
  ],
  "contributors": [],
  "createdDate": null,
  "currentFrequency": null,
  "description": null,
  "designation": [],
  "duration": null,
  "edition": null,
  "formerFrequency": [],
  "genres": [],
  "holdings": [],
  "id": "xjjnw2zf",
  "identifiers": [
    {
      "identifierType": {
        "id": "calm-record-id",
        "label": "Calm RecordIdentifier",
        "type": "IdentifierType"
      },
      "type": "Identifier",
      "value": "c660fa26-3b12-4993-ae16-ff2a56e0aa87"
    },
    {
      "identifierType": {
        "id": "calm-ref-no",
        "label": "Calm RefNo",
        "type": "IdentifierType"
      },
      "type": "Identifier",
      "value": "SAEUG/D/100"
    },
    {
      "identifierType": {
        "id": "calm-altref-no",


In [42]:
# Flatten works -> items -> locations into one row per presentation URL.
df_works_item_urls = (
    df_works_w_presentation
    # One row per (work, item): keep the work id next to each item struct.
    .select("id", F.explode("items").alias("item"))
    # One row per matching location: the higher-order `filter` keeps only
    # locations whose URL contains "/presentation/" before they are exploded.
    .select(
        "id",
        F.col("item.id").alias("item_id"),
        F.explode(
            F.expr("filter(item.locations, l -> l.url LIKE '%/presentation/%')")
        ).alias("location"),
    )
    # Keep just the two identifiers and the URL string.
    .select("id", "item_id", F.col("location.url").alias("url"))
)


# Number of Presentation URLs

In [None]:
# One row per (work, item, matching location), so this counts presentation URLs rather than works.
df_works_item_urls.count()

                                                                                

338128

In [44]:
# Preview some rows; truncate=False keeps the full URL visible.
df_works_item_urls.show(truncate=False)

+--------+--------+-------------------------------------------------------------+
|id      |item_id |url                                                          |
+--------+--------+-------------------------------------------------------------+
|xjjnw2zf|NULL    |https://iiif.wellcomecollection.org/presentation/v2/b16235484|
|t4rtqnma|NULL    |https://iiif.wellcomecollection.org/presentation/v2/b22422547|
|ukw5685u|NULL    |https://iiif.wellcomecollection.org/presentation/v2/b29104956|
|rjfdmzt4|NULL    |https://iiif.wellcomecollection.org/presentation/v2/b29748859|
|y6rya34t|NULL    |https://iiif.wellcomecollection.org/presentation/v2/b2494628x|
|c3782xby|gux39adm|https://iiif.wellcomecollection.org/presentation/v2/b28109892|
|m3bpbasa|uk6jn3dr|https://iiif.wellcomecollection.org/presentation/v2/b24853367|
|k89n5png|c45295z6|https://iiif.wellcomecollection.org/presentation/v2/b13134218|
|xd8cw6mc|NULL    |https://iiif.wellcomecollection.org/presentation/v2/b31497305|
|ep3say76|uft6gt

In [60]:
# Sampling parameters for the page-count estimate, named so the fraction used
# here is visible to whoever scales the sample total back up later.
SAMPLE_FRACTION = 0.01  # share of presentation URLs to fetch
SAMPLE_SEED = 42        # fixed seed for repeatability

df_works_item_urls_subset = df_works_item_urls.sample(
    withReplacement=False,
    fraction=SAMPLE_FRACTION,
    seed=SAMPLE_SEED,
)

# `sample` does not guarantee exactly `fraction` of the rows, so the count
# below is only approximately SAMPLE_FRACTION * total.
df_works_item_urls_subset.count()

                                                                                

3398

In [None]:
from typing import Optional

import requests
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# 1) UDF to fetch JSON text


def fetch_json(url: str) -> Optional[str]:
    """Fetch ``url`` over HTTP and return the response body as text.

    Best-effort: request-level failures (connection errors, timeouts, invalid
    URLs, non-2xx statuses) yield ``None`` so one bad URL does not abort the
    whole batch. Anything else, such as a programming error, is allowed to
    propagate rather than being hidden.

    Args:
        url: Address to GET. ``None`` or an empty string means "nothing to
            fetch" and returns ``None`` without issuing a request.

    Returns:
        The response body (``resp.text``) on success, otherwise ``None``.
    """
    if not url:
        return None
    try:
        # 10 s timeout so a stalled server cannot hang the calling task.
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return None


# Wrap the fetcher as a Spark UDF that returns the response body as a string.
fetch_json_udf = F.udf(fetch_json, StringType())

# -------------------------------------------------------
# Input: `df_works_item_urls_subset` from the sampling step,
#   columns: id (work id), item_id, url
# -------------------------------------------------------

# 2) Attach raw JSON string.
# Cached because the UDF performs an HTTP request per row: without it every
# action below (schema inference, show(), later aggregations) would
# re-evaluate the UDF and fetch the URLs again, and the schema could be
# inferred from different responses than the ones that get parsed.
with_json_str = (
    df_works_item_urls_subset
    .withColumn("json_str", fetch_json_udf(F.col("url")))
    .cache()
)

# 3) Infer schema once (optional; run this just one time)
# Rows whose fetch failed carry None and are excluded from inference.
raw_json_rdd = (
    with_json_str
    .select("json_str")
    .rdd
    .map(lambda r: r.json_str)
    .filter(lambda txt: txt is not None)
)
inferred_schema = spark.read.json(raw_json_rdd).schema

# 4) Parse into a Struct column, drop the raw text
result_df = (
    with_json_str
    .withColumn("json_data", F.from_json(F.col("json_str"), inferred_schema))
    .drop("json_str")
)

result_df.printSchema()
result_df.show(truncate=False)

In [51]:
# Inspect the full schema, including the struct inferred for `json_data`.
result_df.schema

StructType([StructField('id', StringType(), True), StructField('item_id', StringType(), True), StructField('url', StringType(), True), StructField('json_data', StructType([StructField('@context', StringType(), True), StructField('@id', StringType(), True), StructField('@type', StringType(), True), StructField('label', StringType(), True), StructField('license', StringType(), True), StructField('logo', StringType(), True), StructField('metadata', ArrayType(StructType([StructField('label', StringType(), True), StructField('value', StringType(), True)]), True), True), StructField('otherContent', ArrayType(StructType([StructField('@id', StringType(), True), StructField('@type', StringType(), True), StructField('label', StringType(), True)]), True), True), StructField('related', StructType([StructField('@id', StringType(), True), StructField('format', StringType(), True), StructField('label', StringType(), True)]), True), StructField('seeAlso', StructType([StructField('@id', StringType(), T

In [62]:
# Page count per presentation URL = number of canvases in the first sequence.
pages_df = result_df.withColumn(
    "pages",
    # With default settings Spark's size() returns -1 (not NULL) for a NULL
    # array, which happens when the fetch failed or the JSON has no
    # sequences/canvases. nullif() turns that sentinel into NULL so sum()
    # skips those rows instead of subtracting 1 for each of them.
    F.expr("nullif(size(json_data.sequences[0].canvases), -1)")
).select("id", "item_id", "url", "pages")

# Estimate number of scanned pages with a 1% sample

In [63]:
# Total canvases across the sampled presentation URLs.
pages_df.agg(F.sum("pages")).show()



+----------+
|sum(pages)|
+----------+
|    427656|
+----------+



                                                                                

In [64]:
# Scale the 1% sample total up to the full set of presentation URLs.
427656 / 0.01

42765600.0

# Estimate number of scanned pages with a 0.1% sample (output from an earlier run, presumably with `fraction=0.001`)

In [58]:
# Total canvases across the sampled presentation URLs (same aggregation as the 1% cell).
pages_df.agg(F.sum("pages")).show()



+----------+
|sum(pages)|
+----------+
|     36511|
+----------+



                                                                                

In [59]:
# Scale the 0.1% sample total up to the full set of presentation URLs.
36511/0.001

36511000.0