In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("test_pyspark") \
    .config("spark.driver.memory", "100g") \
    .config("spark.executor.memory", "100g") \
    .config("spark.sql.orc.enableVectorizedReader", "false") \
    .config("spark.sql.parquet.columnarReaderBatchSize", "256") \
    .config("spark.sql.orc.columnarReaderBatchSize", "256") \
    .config("spark.sql.shuffle.partitions", "1024") \
    .config("spark.driver.maxResultSize", "4g") \
    .getOrCreate()

spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/03 13:25:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark.table("works").printSchema()

root
 |-- alternativeTitles: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- availabilities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- contributors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- agent: struct (nullable = true)
 |    |    |    |-- id: string (nullable = true)
 |    |    |    |-- identifiers: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- identifierType: struct (nullable = true)
 |    |    |    |    |    |    |-- id: string (nullable = true)
 |    |    |    |    |    |    |-- label: string (nullable = true)
 |    |    |    |    |    |    |-- type: string (nullable = true)
 |    |    |    |    |    |-- type: string (nullable = true)
 |    |    |    |    |    |-- val

In [4]:
# Explore the genres column
# First, let's see some sample data from the genres column
print("Sample genres data:")
spark.table("works").select("genres").show(10, truncate=False)

Sample genres data:
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|genres                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [3]:
from pyspark.sql.functions import explode, col

# Extract unique labels from the genres column
# The genres column appears to be an array of structs with 'label' field
print("Unique genre labels:")
unique_labels = spark.table("works") \
    .select(explode(col("genres")).alias("genre")) \
    .select("genre.label") \
    .distinct() \
    .orderBy("label")

unique_labels.show(100, truncate=False)

# Count the number of unique labels
label_count = unique_labels.count()
print(f"\nTotal number of unique genre labels: {label_count}")

Unique genre labels:


                                                                                

+--------------------------------------------------------------------+
|label                                                               |
+--------------------------------------------------------------------+
|AB.AH                                                               |
|Abstract journals (form)                                            |
|Abstracts                                                           |
|Academic Dissertation                                               |
|Academic Dissertations                                              |
|Academic addresses                                                  |
|Academic addresses - Netherlands - Leiden                           |
|Academic catalogues                                                 |
|Academic dissertations                                              |
|Academic dissertations - Austria - Vienna                           |
|Academic dissertations - Denmark - Copenhagen - 18th cent           |
|Acade

In [6]:
# Show frequency of each genre label
print("Genre label frequencies (top 50):")
label_frequencies = spark.table("works") \
    .select(explode(col("genres")).alias("genre")) \
    .select("genre.label") \
    .groupBy("label") \
    .count() \
    .orderBy(col("count").desc())

label_frequencies.show(50, truncate=False)

print(f"\nTotal number of genre label occurrences: {label_frequencies.agg({'count': 'sum'}).collect()[0][0]}")

Genre label frequencies (top 50):
+----------------------+------+
|label                 |count |
+----------------------+------+
|Electronic books      |191342|
|Annual reports        |82239 |
|MOH reports           |71897 |
|Pamphlets             |37280 |
|Statistics            |23553 |
|Ephemera              |20110 |
|Academic dissertations|16479 |
|Engravings            |14165 |
|Lithographs           |12570 |
|Periodicals           |11403 |
|Etchings              |9914  |
|Portrait prints       |9756  |
|Photographs           |9675  |
|Conference proceedings|8006  |
|Posters               |6469  |
|Photographic prints   |6034  |
|Leaflets              |5933  |
|Biographies           |5850  |
|Paintings             |5806  |
|Advertising cards     |3928  |
|Videocassettes        |3772  |
|Obituaries            |3515  |
|Watercolors           |3472  |
|Portrait photographs  |3429  |
|Colonial reports      |3322  |
|Hospital reports      |3315  |
|Quick Ref. Collection |2930  |
|Drawi

In [7]:
# Explore the subjects column
# First, let's see some sample data from the subjects column
print("Sample subjects data:")
spark.table("works").select("subjects").show(10, truncate=False)

Sample subjects data:
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [8]:
# Extract unique labels from the subjects column
# The subjects column appears to be an array of structs with 'label' field
print("Unique subject labels:")
unique_subject_labels = spark.table("works") \
    .select(explode(col("subjects")).alias("subject")) \
    .select("subject.label") \
    .distinct() \
    .orderBy("label")

unique_subject_labels.show(100, truncate=False)

# Count the number of unique subject labels
subject_label_count = unique_subject_labels.count()
print(f"\nTotal number of unique subject labels: {subject_label_count}")

Unique subject labels:
+---------------------------------------------------------------+
|label                                                          |
+---------------------------------------------------------------+
|!Kung (African people)                                         |
|"Degewop" Gesellschaft wissenschaftlicher Organpräparate (Firm)|
|"Erma" di Bretschneider                                        |
|"Not Forgotten" Association                                    |
|"St. Ronan's Wells" & Mineral Water Company                    |
|"Stud" Company (Manchester) Ltd                                |
|"Thread-horned" flies                                          |
|"Vita" Glass Marketing Board                                   |
|#                                                              |
|#UTATION                                                       |
|'AT HOME'                                                      |
|'Alī ibn Sahl Rabbān, al-Ṭabarī. Firdaws al-ḥikmah. 

In [9]:
# Show frequency of each subject label
print("Subject label frequencies (top 50):")
subject_label_frequencies = spark.table("works") \
    .select(explode(col("subjects")).alias("subject")) \
    .select("subject.label") \
    .groupBy("label") \
    .count() \
    .orderBy(col("count").desc())

subject_label_frequencies.show(50, truncate=False)

print(f"\nTotal number of subject label occurrences: {subject_label_frequencies.agg({'count': 'sum'}).collect()[0][0]}")

Subject label frequencies (top 50):
+-------------------------------+-----+
|label                          |count|
+-------------------------------+-----+
|Sanitation                     |66922|
|Public Health                  |66007|
|Water Supply                   |63415|
|Disease Outbreaks              |63156|
|Great Britain                  |20033|
|London (England)               |10386|
|20th century                   |8498 |
|United States                  |8372 |
|Medicine                       |6852 |
|19th century                   |6300 |
|Physicians                     |5890 |
|Hospitals                      |4709 |
|Germany                        |4650 |
|Ophthalmology                  |4402 |
|England                        |4278 |
|Medicine - History             |4159 |
|Sex                            |4038 |
|France                         |3973 |
|19th-20th centuries            |3839 |
|Ancient                        |3819 |
|Science                        |3598 |
|Ind

In [11]:
# Survey concept types in the subjects column
# First, let's see what fields are available in the subjects structure
print("Sample subjects data with all fields:")
spark.table("works").select("subjects").show(5, truncate=False)

# Let's also look at the schema of the subjects column
print("\nSubjects column schema:")
subjects_schema = spark.table("works").select("subjects").schema
print(subjects_schema)

Sample subjects data with all fields:
+--------+
|subjects|
+--------+
|[]      |
|[]      |
|[]      |
|[]      |
|[]      |
+--------+
only showing top 5 rows


Subjects column schema:
StructType([StructField('subjects', ArrayType(StructType([StructField('concepts', ArrayType(StructType([StructField('id', StringType(), True), StructField('identifiers', ArrayType(StructType([StructField('identifierType', StructType([StructField('id', StringType(), True), StructField('label', StringType(), True), StructField('type', StringType(), True)]), True), StructField('type', StringType(), True), StructField('value', StringType(), True)]), True), True), StructField('label', StringType(), True), StructField('type', StringType(), True)]), True), True), StructField('id', StringType(), True), StructField('identifiers', ArrayType(StructType([StructField('identifierType', StructType([StructField('id', StringType(), True), StructField('label', StringType(), True), StructField('type', StringType(), True)

In [12]:
# Extract unique concept types from the subjects column
# The subjects column contains structs with 'concepts' field that has concept types
print("Unique concept types in subjects:")
concept_types = spark.table("works") \
    .select(explode(col("subjects")).alias("subject")) \
    .select(explode(col("subject.concepts")).alias("concept")) \
    .select("concept.type") \
    .distinct() \
    .orderBy("type")

concept_types.show(100, truncate=False)

# Count the number of unique concept types
concept_type_count = concept_types.count()
print(f"\nTotal number of unique concept types: {concept_type_count}")

Unique concept types in subjects:
+------------+
|type        |
+------------+
|Concept     |
|Meeting     |
|Organisation|
|Period      |
|Person      |
|Place       |
+------------+


Total number of unique concept types: 6


In [13]:
# Show frequency of each concept type
print("Concept type frequencies:")
concept_type_frequencies = spark.table("works") \
    .select(explode(col("subjects")).alias("subject")) \
    .select(explode(col("subject.concepts")).alias("concept")) \
    .select("concept.type") \
    .groupBy("type") \
    .count() \
    .orderBy(col("count").desc())

concept_type_frequencies.show(truncate=False)

print(f"\nTotal number of concept type occurrences: {concept_type_frequencies.agg({'count': 'sum'}).collect()[0][0]}")

Concept type frequencies:
+------------+-------+
|type        |count  |
+------------+-------+
|Concept     |1765932|
|Place       |272764 |
|Person      |103407 |
|Period      |59629  |
|Organisation|46018  |
|Meeting     |705    |
+------------+-------+


Total number of concept type occurrences: 2248455


In [15]:
# Explore identifierType.label in the subjects column
# Let's first check what's in the identifiers field within subjects
print("Sample identifiers data from subjects:")
spark.table("works") \
    .select(explode(col("subjects")).alias("subject")) \
    .select("subject.identifiers") \
    .show(10, truncate=False)

# Extract unique identifierType labels from the identifiers field
print("\nUnique identifierType labels in subjects:")
identifier_type_labels = spark.table("works") \
    .select(explode(col("subjects")).alias("subject")) \
    .select(explode(col("subject.identifiers")).alias("identifier")) \
    .select("identifier.identifierType.label") \
    .distinct() \
    .orderBy("label")

identifier_type_labels.show(100, truncate=False)

# Count the number of unique identifierType labels
identifier_type_count = identifier_type_labels.count()
print(f"\nTotal number of unique identifierType labels: {identifier_type_count}")

Sample identifiers data from subjects:
+-------------------------------------------------------------------------------------------------+
|identifiers                                                                                      |
+-------------------------------------------------------------------------------------------------+
|[{{nlm-mesh, Medical Subject Headings (MeSH) identifier, IdentifierType}, Identifier, D006298}]  |
|[{{nlm-mesh, Medical Subject Headings (MeSH) identifier, IdentifierType}, Identifier, D011635}]  |
|[{{nlm-mesh, Medical Subject Headings (MeSH) identifier, IdentifierType}, Identifier, D012499}]  |
|[{{lc-names, Library of Congress Name authority records, IdentifierType}, Identifier, n79100617}]|
|[{{lc-names, Library of Congress Name authority records, IdentifierType}, Identifier, n80126261}]|
|[{{nlm-mesh, Medical Subject Headings (MeSH) identifier, IdentifierType}, Identifier, D006973}]  |
|[{{nlm-mesh, Medical Subject Headings (MeSH) identifier, Ide

In [16]:
# Show frequency of each identifierType label
print("IdentifierType label frequencies:")
identifier_type_frequencies = spark.table("works") \
    .select(explode(col("subjects")).alias("subject")) \
    .select(explode(col("subject.identifiers")).alias("identifier")) \
    .select("identifier.identifierType.label") \
    .groupBy("label") \
    .count() \
    .orderBy(col("count").desc())

identifier_type_frequencies.show(truncate=False)

print(f"\nTotal number of identifierType label occurrences: {identifier_type_frequencies.agg({'count': 'sum'}).collect()[0][0]}")

IdentifierType label frequencies:
+-------------------------------------------------+------+
|label                                            |count |
+-------------------------------------------------+------+
|Medical Subject Headings (MeSH) identifier       |811002|
|Library of Congress Subject Headings (LCSH)      |449876|
|Identifier derived from the label of the referent|330754|
|Library of Congress Name authority records       |273785|
+-------------------------------------------------+------+


Total number of identifierType label occurrences: 1865417


In [6]:
# Explore concepts.label in the subjects column
print("Sample concepts data from subjects:")
spark.table("works") \
    .select(explode(col("subjects")).alias("subject")) \
    .select("subject.concepts") \
    .show(10, truncate=False)

# Extract unique concept labels from the concepts field
print("\nUnique concept labels in subjects:")
concept_labels = spark.table("works") \
    .select(explode(col("subjects")).alias("subject")) \
    .select(explode(col("subject.concepts")).alias("concept")) \
    .select("concept.label") \
    .distinct() \
    .orderBy("label")

concept_labels.show(100, truncate=False)

# Count the number of unique concept labels
concept_label_count = concept_labels.count()
print(f"\nTotal number of unique concept labels: {concept_label_count}")

Sample concepts data from subjects:
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|concepts                                                                                                                                                         |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[{qsazb8t4, [{{nlm-mesh, Medical Subject Headings (MeSH) identifier, IdentifierType}, Identifier, D006298}], Health Services Administration, Concept}]           |
|[{z2eup69w, [{{nlm-mesh, Medical Subject Headings (MeSH) identifier, IdentifierType}, Identifier, D011635}], Public Health Administration, Concept}]             |
|[{eva7r2dw, [{{nlm-mesh, Medical Subject Headings (MeSH) identifier, IdentifierType}, Identifier, D012499}], Sanitation, Concept}]             

In [7]:
# Show frequency of each concept label (top 50)
print("Concept label frequencies (top 50):")
concept_label_frequencies = spark.table("works") \
    .select(explode(col("subjects")).alias("subject")) \
    .select(explode(col("subject.concepts")).alias("concept")) \
    .select("concept.label") \
    .groupBy("label") \
    .count() \
    .orderBy(col("count").desc())

concept_label_frequencies.show(50, truncate=False)

print(f"\nTotal number of concept label occurrences: {concept_label_frequencies.agg({'count': 'sum'}).collect()[0][0]}")

Concept label frequencies (top 50):
+----------------------------+-----+
|label                       |count|
+----------------------------+-----+
|history                     |72789|
|Public Health               |67797|
|Sanitation                  |67289|
|Disease Outbreaks           |63900|
|Water Supply                |63539|
|History                     |46476|
|Great Britain               |28001|
|Medicine                    |19286|
|Early works to 1800         |15904|
|United States               |13951|
|19th century                |12275|
|20th century                |12257|
|England                     |11720|
|Science                     |10536|
|London (England)            |10467|
|Physicians                  |8400 |
|prevention & control        |8350 |
|Hospitals                   |7963 |
|drug therapy                |6291 |
|therapy                     |6250 |
|18th century                |6085 |
|Women                       |6078 |
|Germany                     |6002 |
|F

In [4]:
# Export unique concept labels with their IDs, identifierType, and concept type to CSV
print("Extracting unique concept labels with IDs, identifierType, and concept type...")
concept_labels_with_details = spark.table("works") \
    .select(explode(col("subjects")).alias("subject")) \
    .select(
        explode(col("subject.concepts")).alias("concept"),
        explode(col("subject.identifiers")).alias("identifier"),
        col("subject.label").alias("subject_label")
    ) \
    .select(
        "concept.id", 
        "concept.label", 
        "concept.type",
        "identifier.identifierType.label",
        "subject_label"
    ) \
    .distinct() \
    .orderBy("concept.label")

# Show sample data
print("Sample concept labels with details:")
concept_labels_with_details.show(20, truncate=False)

# Count total unique concept labels with details
total_unique_concepts = concept_labels_with_details.count()
print(f"\nTotal unique concept labels with details: {total_unique_concepts}")

# Export to CSV
output_path = "../data/tmp/unique_concept_labels_with_details.csv"
print(f"\nExporting to: {output_path}")

concept_labels_with_details.toPandas().to_csv(output_path, index=False)

print("Export completed successfully!")

Extracting unique concept labels with IDs, identifierType, and concept type...
Sample concept labels with details:


                                                                                

+--------+---------------------------------------------------------------+------------+-------------------------------------------------+---------------------------------------------------------------+
|id      |label                                                          |type        |label                                            |subject_label                                                  |
+--------+---------------------------------------------------------------+------------+-------------------------------------------------+---------------------------------------------------------------+
|yrxf2hsf|!Kung (African people)                                         |Concept     |Library of Congress Subject Headings (LCSH)      |!Kung (African people)                                         |
|f67f2qvs|"Degewop" Gesellschaft wissenschaftlicher Organpräparate (Firm)|Organisation|Identifier derived from the label of the referent|"Degewop" Gesellschaft wissenschaftlicher Organpräparat

                                                                                


Total unique concept labels with details: 354910

Exporting to: ../data/tmp/unique_concept_labels_with_details.csv


                                                                                

Export completed successfully!
