# Patch required to work with selected LLM

The LLM used returns table names with "\n"s which causes an error when calling `InfoSparkSQLTool`.

Edit the file `.venv/lib/python3.13/site-packages/langchain_community/tools/spark_sql/tool.py`

In the `InfoSparkSQLTool` class, edit the method `_run` with this code:
```python
        """Get the schema for tables in a comma-separated list."""
        split = table_names.split(", ")
        # remove /n from the splits
        split = [s.replace("\n", "") for s in split]

        return self.db.get_table_info_no_throw(split)
```

## Reimplementation of SparkSQLToolkit

In [10]:
"""Toolkit for interacting with Spark SQL."""

from langchain_community.tools.spark_sql.prompt import QUERY_CHECKER
from langchain_core.prompts import PromptTemplate
from langchain_core.callbacks import (
    AsyncCallbackManagerForToolRun,
    CallbackManagerForToolRun,
)
from pydantic import BaseModel, Field, root_validator, model_validator, ConfigDict
from typing import Any, Dict, Optional
from typing import List

from langchain_core.language_models import BaseLanguageModel
from langchain_core.tools import BaseTool
from langchain_core.tools.base import BaseToolkit
from pydantic import ConfigDict, Field

from langchain_community.utilities.spark_sql import SparkSQL

# flake8: noqa
"""Tools for interacting with Spark SQL."""


class BaseSparkSQLTool(BaseModel):
    """Base tool for interacting with Spark SQL."""

    db: SparkSQL = Field(exclude=True)

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
    )


class QuerySparkSQLTool(BaseSparkSQLTool, BaseTool):
    """Tool for querying a Spark SQL."""

    name: str = "query_sql_db"
    description: str = """
    Input to this tool is a detailed and correct SQL query, output is a result from the Spark SQL.
    If the query is not correct, an error message will be returned.
    If an error is returned, rewrite the query, check the query, and try again.
    """

    def _run(
        self,
        query: str,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Execute the query, return the results or an error message."""
        return self.db.run_no_throw(query)


class InfoSparkSQLTool(BaseSparkSQLTool, BaseTool):
    """Tool for getting metadata about a Spark SQL."""

    name: str = "schema_sql_db"
    description: str = """
    Input to this tool is a comma-separated list of tables, output is the schema and sample rows for those tables.
    Be sure that the tables actually exist by calling list_tables_sql_db first!

    Example Input: "table1, table2, table3"
    """

    def _run(
        self,
        table_names: str,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Get the schema for tables in a comma-separated list."""
        split = table_names.split(", ")
        # remove /n from the splits
        split = [s.replace("\n", "") for s in split]

        return self.db.get_table_info_no_throw(split)


class ListSparkSQLTool(BaseSparkSQLTool, BaseTool):
    """Tool for getting tables names."""

    name: str = "list_tables_sql_db"
    description: str = "Input is an empty string, output is a comma separated list of tables in the Spark SQL."

    def _run(
        self,
        tool_input: str = "",
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Get the schema for a specific table."""
        return ", ".join(self.db.get_usable_table_names())


class QueryCheckerTool(BaseSparkSQLTool, BaseTool):
    """Use an LLM to check if a query is correct.
    Adapted from https://www.patterns.app/blog/2023/01/18/crunchbot-sql-analyst-gpt/"""

    template: str = QUERY_CHECKER
    llm: BaseLanguageModel
    llm_chain: Any = Field(init=False)
    name: str = "query_checker_sql_db"
    description: str = """
    Use this tool to double check if your query is correct before executing it.
    Always use this tool before executing a query with query_sql_db!
    """

    @model_validator(mode="before")
    @classmethod
    def initialize_llm_chain(cls, values: Dict[str, Any]) -> Any:
        if "llm_chain" not in values:
            from langchain.chains.llm import LLMChain

            values["llm_chain"] = LLMChain(
                llm=values.get("llm"),  # type: ignore[arg-type]
                prompt=PromptTemplate(
                    template=QUERY_CHECKER, input_variables=["query"]
                ),
            )

        if values["llm_chain"].prompt.input_variables != ["query"]:
            raise ValueError(
                "LLM chain for QueryCheckerTool need to use ['query'] as input_variables "
                "for the embedded prompt")

        return values

    def _run(
        self,
        query: str,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Use the LLM to check the query."""
        return self.llm_chain.predict(
            query=query, callbacks=run_manager.get_child()
            if run_manager else None)

    async def _arun(
        self,
        query: str,
        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
    ) -> str:
        return await self.llm_chain.apredict(
            query=query, callbacks=run_manager.get_child() if run_manager else None
        )


class SparkSQLToolkit(BaseToolkit):
    """Toolkit for interacting with Spark SQL.

    Parameters:
        db: SparkSQL. The Spark SQL database.
        llm: BaseLanguageModel. The language model.
    """

    db: SparkSQL = Field(exclude=True)
    llm: BaseLanguageModel = Field(exclude=True)

    model_config = ConfigDict(
        arbitrary_types_allowed=True,
    )

    def get_tools(self) -> List[BaseTool]:
        """Get the tools in the toolkit."""
        return [
            QuerySparkSQLTool(db=self.db),
            InfoSparkSQLTool(db=self.db),
            ListSparkSQLTool(db=self.db),
            QueryCheckerTool(db=self.db, llm=self.llm),
        ]

# Initialise Bedrock LLM

In [None]:
from wc_simd.llm import get_llm
import os

# Set LANGSMITH_PROJECT environment variable
os.environ['LANGSMITH_PROJECT'] = "describe_works"


# Claude is a good alternative to GPT-4o: https://blog.promptlayer.com/big-differences-claude-3-5-vs-gpt-4o/
llm = get_llm()

# Create Spark Context

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("test_pyspark") \
    .config("spark.driver.memory", "12g") \
    .config("spark.executor.memory", "12g") \
    .config("spark.sql.orc.enableVectorizedReader", "false") \
    .config("spark.sql.parquet.columnarReaderBatchSize", "1024") \
    .config("spark.sql.orc.columnarReaderBatchSize", "1024") \
    .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/06 13:25:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/06 13:25:16 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
from langchain_community.utilities.spark_sql import SparkSQL
# Create a Spark SQL connection
# Note: You'll need to have Spark and a database set up
spark_sql = SparkSQL(spark_session=spark, include_tables=["works"])

# Test Spark SQL Toolkit

In [11]:
# Create individual Spark SQL tools
info_tool = InfoSparkSQLTool(db=spark_sql)
list_tool = ListSparkSQLTool(db=spark_sql)
query_checker_tool = QueryCheckerTool(db=spark_sql, llm=llm)
query_tool = QuerySparkSQLTool(db=spark_sql)

# Example usage of each tool

# 1. Get information about the database
print("Database Info:")
print(info_tool.run("works"))

# 2. List tables in the database
print("\nDatabase Tables:")
print(list_tool.run(""))

# 3. Check a SQL query (uses LLM to validate)
print("\nQuery Validation:")
query_to_check = "SELECT * FROM works WHERE Age > 30"
print(query_checker_tool.run(query_to_check))

# 4. Execute a SQL query
print("\nQuery Results:")
query = "SELECT * FROM works LIMIT 5"
print(query_tool.run(query))

  values["llm_chain"] = LLMChain(


Database Info:
CREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, type: STRING>,
  currentFrequency STRING,
  description STRING,
  designation ARRAY<STRING>,
  duration BIGINT,
  edition STRING,
  formerFrequency ARRAY<STRING>,
  genres ARRAY<STRUCT<concepts: ARRAY<STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>>, label: STRING, type: STRING>>,
  holdings ARRAY<STRUCT<enumeration: ARRAY<STRING>, location: STRUCT<accessConditions: ARRAY<STRUCT<met

# Create Spark SQL Agent

In [12]:
from langchain_community.agent_toolkits import create_spark_sql_agent

toolkit = SparkSQLToolkit(db=spark_sql, llm=llm)
agent_executor = create_spark_sql_agent(llm=llm, toolkit=toolkit, verbose=True, max_iterations=200)

In [None]:
agent_executor.run("Describe the works table")

  agent_executor.run("Describe the works table")




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll help you describe the works table by examining its schema and sample data.

Action: list_tables_sql_db
Action Input: 
[0m
Observation: [38;5;200m[1;3mworks[0m
Thought:[32;1m[1;3mI need to examine the schema of the works table to provide a description.

Action: schema_sql_db
Action Input: works
[0m
Observation: [33;1m[1;3mCREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, type: STRING>,
  currentFrequency STRING,
  description STRING,
  designation ARRAY<STRING>,
  duration BIGINT,
  edition ST

'The works table is a complex database table that stores information about various works or items in a collection. Based on the schema and sample data, here\'s a description:\n\nThe works table contains detailed metadata about different types of works including books, archives, manuscripts, and e-books. Key fields include:\n\n1. **Basic Identification**:\n   - `id`: Unique identifier for each work\n   - `title`: The title of the work\n   - `type`: The type of item (e.g., "Work")\n   - `workType`: Structured data about the format (e.g., E-books, Archives and manuscripts)\n   - `referenceNumber`: Reference or catalog number\n\n2. **Content Description**:\n   - `description`: Textual description of the work\n   - `physicalDescription`: Physical characteristics\n   - `alternativeTitles`: Array of alternative titles\n   - `lettering`: Text information about lettering\n   - `edition`: Edition information\n   - `duration`: Duration (for time-based media)\n\n3. **Contributors and Creation**:\n

25/04/14 16:58:01 WARN HikariPool: HikariPool-1 - Thread starvation or clock leap detected (housekeeper delta=1m19s57ms).
25/04/14 16:58:01 WARN HikariPool: HikariPool-2 - Thread starvation or clock leap detected (housekeeper delta=1m19s58ms).
25/04/14 16:58:50 WARN HikariPool: HikariPool-1 - Thread starvation or clock leap detected (housekeeper delta=48s901ms).
25/04/14 16:58:50 WARN HikariPool: HikariPool-2 - Thread starvation or clock leap detected (housekeeper delta=48s901ms).
25/04/14 17:00:56 WARN HikariPool: HikariPool-2 - Thread starvation or clock leap detected (housekeeper delta=2m5s555ms).
25/04/14 17:00:56 WARN HikariPool: HikariPool-1 - Thread starvation or clock leap detected (housekeeper delta=2m5s600ms).
25/04/14 17:03:16 WARN HikariPool: HikariPool-2 - Thread starvation or clock leap detected (housekeeper delta=2m20s817ms).
25/04/14 17:03:16 WARN HikariPool: HikariPool-1 - Thread starvation or clock leap detected (housekeeper delta=2m20s784ms).
25/04/14 17:21:53 WARN H

In [6]:
agent_executor.run("Describe each sub-structure in works table")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll help you describe the sub-structures in the works table. Let me first check what tables are available in the database.

Action: list_tables_sql_db
Action Input: 
[0m
Observation: [38;5;200m[1;3mworks[0m
Thought:[32;1m[1;3mI need to examine the schema of the works table to understand its sub-structures.

Action: schema_sql_db
Action Input: works
[0m
Observation: [33;1m[1;3mCREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, type: STRING>,
  currentFrequency STRING,
  description STRING,
  design

'The works table contains numerous complex sub-structures that represent various aspects of works in what appears to be a library or archive catalog system. Here are the main sub-structures:\n\n1. **Simple Array Fields**:\n   - `alternativeTitles`: Array of alternative titles for the work\n   - `designation`: Array of designations\n   - `formerFrequency`: Array of former publication frequencies\n\n2. **Complex Array Structures**:\n   - `availabilities`: Array of structured availability information (id, label, type)\n   - `contributors`: Array of contributor information with nested structures including agent details, roles, and whether they are primary contributors\n   - `genres`: Array of genre information with nested concept structures\n   - `holdings`: Array of holding information including enumeration, location details, and notes\n   - `identifiers`: Array of identifier information with type details\n   - `images`: Array of image references\n   - `items`: Array of item information w

In [9]:
agent_executor.run("For each \"Complex Array Structure\" in works table, give a summary of the data in it.")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll help you analyze the "Complex Array Structure" in the works table. Let's start by exploring the database structure.

Action: list_tables_sql_db
Action Input: 
[0m
Observation: [38;5;200m[1;3mworks[0m
Thought:[32;1m[1;3mI need to examine the schema of the works table to understand what "Complex Array Structure" might refer to.

Action: schema_sql_db
Action Input: works
[0m

                                                                                


Observation: [33;1m[1;3mCREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, type: STRING>,
  currentFrequency STRING,
  description STRING,
  designation ARRAY<STRING>,
  duration BIGINT,
  edition STRING,
  formerFrequency ARRAY<STRING>,
  genres ARRAY<STRUCT<concepts: ARRAY<STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>>, label: STRING, type: STRING>>,
  holdings ARRAY<STRUCT<enumeration: ARRAY<STRING>, location: STRUCT<accessConditions: ARRA

                                                                                


Observation: [36;1m[1;3m[('alternativeTitles', '1166657', '235633', '0.2148626374332816'), ('availabilities', '1166657', '1106148', '1.112278930311137'), ('contributors', '1166657', '769602', '0.979999262851035'), ('genres', '1166657', '431157', '0.6070747443335959'), ('identifiers', '1166657', '1166657', '3.4292838426375534'), ('languages', '1166657', '737846', '0.6593068913999574')][0m
Thought:[32;1m[1;3mI need to analyze more complex array structures in the works table. Let me check some additional ones.

Action: query_checker_sql_db
Action Input: SELECT 
  'items' as array_name, 
  COUNT(*) as total_records, 
  COUNT(CASE WHEN size(items) > 0 THEN 1 END) as non_empty_records,
  AVG(size(items)) as avg_elements
FROM works
UNION ALL
SELECT 
  'subjects' as array_name, 
  COUNT(*) as total_records, 
  COUNT(CASE WHEN size(subjects) > 0 THEN 1 END) as non_empty_records,
  AVG(size(subjects)) as avg_elements
FROM works
UNION ALL
SELECT 
  'production' as array_name, 
  COUNT(*) as

                                                                                


Observation: [36;1m[1;3m[('items', '1166657', '928750', '0.9965413999144564'), ('subjects', '1166657', '600117', '1.5978886682203939'), ('production', '1166657', '1101887', '0.9461349822612816'), ('notes', '1166657', '748644', '1.054298735618095'), ('partOf', '1166657', '682162', '0.6352852637921858'), ('parts', '1166657', '26898', '0.22384728330606168')][0m
Thought:[32;1m[1;3mI need to examine the structure of one of these complex arrays to understand their contents better.

Action: query_checker_sql_db
Action Input: SELECT 
  'contributors' as array_name,
  explode(contributors) as contributor_details
FROM works
WHERE size(contributors) > 0
LIMIT 5
[0m
Observation: [36;1m[1;3mThe query looks correct. It selects from the "works" table, filters for rows where the "contributors" array has at least one element, and uses the EXPLODE function to flatten the array elements into separate rows. The query also adds a column to identify the source array name and limits the results to 5

'# Summary of Complex Array Structures in the Works Table\n\nBased on my analysis, here are the key complex array structures in the works table and their characteristics:\n\n## Array Usage Statistics\n| Array Name | Total Records | Non-Empty Records | % With Data | Avg Elements |\n|------------|---------------|-------------------|------------|--------------|\n| identifiers | 1,166,657 | 1,166,657 | 100% | 3.43 |\n| availabilities | 1,166,657 | 1,106,148 | 94.8% | 1.11 |\n| production | 1,166,657 | 1,101,887 | 94.4% | 0.95 |\n| items | 1,166,657 | 928,750 | 79.6% | 1.00 |\n| contributors | 1,166,657 | 769,602 | 66.0% | 0.98 |\n| notes | 1,166,657 | 748,644 | 64.2% | 1.05 |\n| languages | 1,166,657 | 737,846 | 63.2% | 0.66 |\n| partOf | 1,166,657 | 682,162 | 58.5% | 0.64 |\n| subjects | 1,166,657 | 600,117 | 51.4% | 1.60 |\n| genres | 1,166,657 | 431,157 | 37.0% | 0.61 |\n| alternativeTitles | 1,166,657 | 235,633 | 20.2% | 0.21 |\n| parts | 1,166,657 | 26,898 | 2.3% | 0.22 |\n\n## Struct

# Now with Memory

In [7]:
from langgraph.checkpoint.memory import MemorySaver
from langchain_community.agent_toolkits import SparkSQLToolkit, create_spark_sql_agent

toolkit = SparkSQLToolkit(db=spark_sql, llm=llm)
memory = MemorySaver()
agent_executor_mem = create_spark_sql_agent(
    llm=llm,
    toolkit=toolkit,
    verbose=True,
    max_iterations=200,
    checkpointer=memory,
    config=dict(configurable=dict(thread_id="user_conversation_1"))
)

In [5]:
from IPython.display import Markdown

display(Markdown("""# Header
*some markdown* $\\phi$"""))

# Header
*some markdown* $\phi$

In [11]:
result = agent_executor_mem.run(
    "List each \"Complex Array Structure\" in works table")
display(Markdown(result))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll help you find the "Complex Array Structure" in the works table.

Action: list_tables_sql_db
Action Input: 
[0m
Observation: [38;5;200m[1;3mworks[0m
Thought:[32;1m[1;3mI need to examine the schema of the works table to understand what complex array structures it contains.

Action: schema_sql_db
Action Input: works
[0m
Observation: [33;1m[1;3mCREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, type: STRING>,
  currentFrequency STRING,
  description STRING,
  designation ARRAY<STRING>,
  duration 

                                                                                


Observation: [36;1m[1;3m[('alternativeTitles', '235633'), ('availabilities', '1106148'), ('contributors', '769602'), ('designation', '12282'), ('formerFrequency', '728'), ('genres', '431157'), ('holdings', '210979'), ('identifiers', '1166657'), ('images', '84699'), ('items', '928750')][0m
Thought:[32;1m[1;3mI need to continue getting the counts for the remaining array fields.

Action: query_sql_db
Action Input: SELECT 
  'languages' as array_name, 
  COUNT(*) as count 
FROM works 
WHERE size(languages) > 0
UNION ALL
SELECT 
  'notes' as array_name, 
  COUNT(*) as count 
FROM works 
WHERE size(notes) > 0
UNION ALL
SELECT 
  'partOf' as array_name, 
  COUNT(*) as count 
FROM works 
WHERE size(partOf) > 0
UNION ALL
SELECT 
  'parts' as array_name, 
  COUNT(*) as count 
FROM works 
WHERE size(parts) > 0
UNION ALL
SELECT 
  'precededBy' as array_name, 
  COUNT(*) as count 
FROM works 
WHERE size(precededBy) > 0
UNION ALL
SELECT 
  'production' as array_name, 
  COUNT(*) as count 
FROM

                                                                                


Observation: [36;1m[1;3m[('languages', '737846'), ('notes', '748644'), ('partOf', '682162'), ('parts', '26898'), ('precededBy', '234251'), ('production', '1101887'), ('subjects', '600117'), ('succeededBy', '234261')][0m
Thought:[32;1m[1;3mI need to examine the complex array structures in the works table.

Action: query_sql_db
Action Input: SELECT column_name, data_type 
FROM information_schema.columns 
WHERE table_name = 'works' 
AND data_type LIKE 'ARRAY%'
ORDER BY column_name
[0m
Observation: [36;1m[1;3mError: [TABLE_OR_VIEW_NOT_FOUND] The table or view `information_schema`.`columns` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 2 pos 5;
'Sort ['column_name ASC NULLS FIRST], true
+- 'Project ['column_name, 'data_type]
  

25/04/15 13:43:14 WARN ObjectStore: Failed to get database information_schema, returning NoSuchObjectException


[32;1m[1;3mI need to use a different approach to get the schema information. Let me try to list all the complex array structures in the works table.

Action: query_sql_db
Action Input: SELECT 
  'alternativeTitles' as array_name, 
  'ARRAY<STRING>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(alternativeTitles) > 0
UNION ALL
SELECT 
  'availabilities' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(availabilities) > 0
UNION ALL
SELECT 
  'contributors' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(contributors) > 0
UNION ALL
SELECT 
  'designation' as array_name, 
  'ARRAY<STRING>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(designation) > 0
UNION ALL
SELECT 
  'formerFrequency' as array_name, 
  'ARRAY<STRING>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(formerFrequency) > 0
UNION ALL
SELECT 
  'genres' as array_name, 
  'ARRAY<STRUCT>' as array_typ

                                                                                


Observation: [36;1m[1;3m[('availabilities', 'ARRAY<STRUCT>', '1106148'), ('contributors', 'ARRAY<STRUCT>', '769602'), ('genres', 'ARRAY<STRUCT>', '431157'), ('alternativeTitles', 'ARRAY<STRING>', '235633'), ('designation', 'ARRAY<STRING>', '12282'), ('formerFrequency', 'ARRAY<STRING>', '728')][0m
Thought:[32;1m[1;3mI need to continue getting information about the complex array structures in the works table.

Action: query_sql_db
Action Input: SELECT 
  array_name,
  array_type,
  count
FROM (
  SELECT 
    'alternativeTitles' as array_name, 
    'ARRAY<STRING>' as array_type,
    COUNT(*) as count 
  FROM works 
  WHERE size(alternativeTitles) > 0
  UNION ALL
  SELECT 
    'availabilities' as array_name, 
    'ARRAY<STRUCT>' as array_type,
    COUNT(*) as count 
  FROM works 
  WHERE size(availabilities) > 0
  UNION ALL
  SELECT 
    'contributors' as array_name, 
    'ARRAY<STRUCT>' as array_type,
    COUNT(*) as count 
  FROM works 
  WHERE size(contributors) > 0
  UNION ALL
  

                                                                                


Observation: [36;1m[1;3m[('availabilities', 'ARRAY<STRUCT>', '1106148'), ('contributors', 'ARRAY<STRUCT>', '769602'), ('genres', 'ARRAY<STRUCT>', '431157'), ('alternativeTitles', 'ARRAY<STRING>', '235633'), ('designation', 'ARRAY<STRING>', '12282'), ('formerFrequency', 'ARRAY<STRING>', '728')][0m
Thought:[32;1m[1;3mI need to continue with more array fields to get a complete picture.

Action: query_sql_db
Action Input: SELECT 
  'holdings' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(holdings) > 0
UNION ALL
SELECT 
  'identifiers' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(identifiers) > 0
UNION ALL
SELECT 
  'images' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(images) > 0
UNION ALL
SELECT 
  'items' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(items) > 0
UNION ALL
SELECT 
  'languages' as

                                                                                


Observation: [36;1m[1;3m[('identifiers', 'ARRAY<STRUCT>', '1166657'), ('items', 'ARRAY<STRUCT>', '928750'), ('languages', 'ARRAY<STRUCT>', '737846'), ('holdings', 'ARRAY<STRUCT>', '210979'), ('images', 'ARRAY<STRUCT>', '84699')][0m
Thought:[32;1m[1;3mI need to get the remaining array fields.

Action: query_sql_db
Action Input: SELECT 
  'notes' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(notes) > 0
UNION ALL
SELECT 
  'partOf' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(partOf) > 0
UNION ALL
SELECT 
  'parts' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(parts) > 0
UNION ALL
SELECT 
  'precededBy' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(precededBy) > 0
UNION ALL
SELECT 
  'production' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(productio

                                                                                


Observation: [36;1m[1;3m[('production', 'ARRAY<STRUCT>', '1101887'), ('notes', 'ARRAY<STRUCT>', '748644'), ('partOf', 'ARRAY<STRUCT>', '682162'), ('subjects', 'ARRAY<STRUCT>', '600117'), ('succeededBy', 'ARRAY<STRUCT>', '234261'), ('precededBy', 'ARRAY<STRUCT>', '234251'), ('parts', 'ARRAY<STRUCT>', '26898')][0m
Thought:[32;1m[1;3mI now know the final answer

Final Answer: Here's a list of all the complex array structures in the works table, ordered by the number of records containing each structure:

1. identifiers (ARRAY<STRUCT>) - 1,166,657 records
2. availabilities (ARRAY<STRUCT>) - 1,106,148 records
3. production (ARRAY<STRUCT>) - 1,101,887 records
4. items (ARRAY<STRUCT>) - 928,750 records
5. contributors (ARRAY<STRUCT>) - 769,602 records
6. notes (ARRAY<STRUCT>) - 748,644 records
7. languages (ARRAY<STRUCT>) - 737,846 records
8. partOf (ARRAY<STRUCT>) - 682,162 records
9. subjects (ARRAY<STRUCT>) - 600,117 records
10. genres (ARRAY<STRUCT>) - 431,157 records
11. alternativ

Here's a list of all the complex array structures in the works table, ordered by the number of records containing each structure:

1. identifiers (ARRAY<STRUCT>) - 1,166,657 records
2. availabilities (ARRAY<STRUCT>) - 1,106,148 records
3. production (ARRAY<STRUCT>) - 1,101,887 records
4. items (ARRAY<STRUCT>) - 928,750 records
5. contributors (ARRAY<STRUCT>) - 769,602 records
6. notes (ARRAY<STRUCT>) - 748,644 records
7. languages (ARRAY<STRUCT>) - 737,846 records
8. partOf (ARRAY<STRUCT>) - 682,162 records
9. subjects (ARRAY<STRUCT>) - 600,117 records
10. genres (ARRAY<STRUCT>) - 431,157 records
11. alternativeTitles (ARRAY<STRING>) - 235,633 records
12. succeededBy (ARRAY<STRUCT>) - 234,261 records
13. precededBy (ARRAY<STRUCT>) - 234,251 records
14. holdings (ARRAY<STRUCT>) - 210,979 records
15. images (ARRAY<STRUCT>) - 84,699 records
16. parts (ARRAY<STRUCT>) - 26,898 records
17. designation (ARRAY<STRING>) - 12,282 records
18. formerFrequency (ARRAY<STRING>) - 728 records

The table contains 15 complex array structures of type ARRAY<STRUCT> and 3 simple array structures of type ARRAY<STRING>.

In [16]:
result = agent_executor_mem.run(
    input="Write helpful documentation for the \"availabilities\" \"Complex Array Structure\" in works table. Include sample values for each field. If a field is an enumerated or has set of distinct values, list them all.",
    handle_parsing_errors=True)
display(Markdown(result))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll help you understand the "availabilities" complex array structure in the works table.

Action: list_tables_sql_db
Action Input: 
[0m
Observation: [38;5;200m[1;3mworks[0m
Thought:[32;1m[1;3mI need to examine the schema of the works table to understand the "availabilities" complex array structure.

Action: schema_sql_db
Action Input: works
[0m
Observation: [33;1m[1;3mCREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, type: STRING>,
  currentFrequency STRING,
  description STRING,
  designation AR

                                                                                


Observation: [36;1m[1;3m[('closed-stores', 'Closed stores', 'Availability'), ('online', 'Online', 'Availability'), ('open-shelves', 'Open shelves', 'Availability')][0m
Thought:[32;1m[1;3mI now know the final answer

Final Answer: # Documentation: "availabilities" Complex Array Structure in works table

## Overview
The "availabilities" field in the works table is an array of structured objects that indicates where and how a work can be accessed within the collection.

## Structure
Each element in the "availabilities" array has the following structure:

```
{
  id: STRING,       // Identifier for the availability type
  label: STRING,    // Human-readable description of the availability
  type: STRING      // Always "Availability" - indicates the field type
}
```

## Possible Values
The "availabilities" array can contain the following distinct values:

| id | label | type |
|----|-------|------|
| closed-stores | Closed stores | Availability |
| online | Online | Availability |
| o

# Documentation: "availabilities" Complex Array Structure in works table

## Overview
The "availabilities" field in the works table is an array of structured objects that indicates where and how a work can be accessed within the collection.

## Structure
Each element in the "availabilities" array has the following structure:

```
{
  id: STRING,       // Identifier for the availability type
  label: STRING,    // Human-readable description of the availability
  type: STRING      // Always "Availability" - indicates the field type
}
```

## Possible Values
The "availabilities" array can contain the following distinct values:

| id | label | type |
|----|-------|------|
| closed-stores | Closed stores | Availability |
| online | Online | Availability |
| open-shelves | Open shelves | Availability |

## Sample Values
Examples of how this array appears in records:

```json
// Example 1: Work available in closed stores only
[
  {
    "id": "closed-stores",
    "label": "Closed stores",
    "type": "Availability"
  }
]

// Example 2: Work available both online and in closed stores
[
  {
    "id": "closed-stores",
    "label": "Closed stores",
    "type": "Availability"
  },
  {
    "id": "online",
    "label": "Online",
    "type": "Availability"
  }
]

// Example 3: Work available on open shelves
[
  {
    "id": "open-shelves",
    "label": "Open shelves",
    "type": "Availability"
  }
]
```

## Usage
The "availabilities" field is used to determine where users can access a particular work. This information helps users understand if they need to request items from closed storage, if they can access them directly on open shelves, or if digital versions are available online.

In [17]:
result = agent_executor_mem.run(
    input="Write helpful documentation for the \"production\" \"Complex Array Structure\" in works table. Include sample values for each field. If a field is an enumerated or has set of distinct values, list them all.",
    handle_parsing_errors=True)
display(Markdown(result))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll help you create documentation for the "Complex Array Structure" in the "works" table.

Action: list_tables_sql_db
Action Input: 
[0m
Observation: [38;5;200m[1;3mworks[0m
Thought:[32;1m[1;3mI need to examine the schema of the "works" table to understand its structure, particularly focusing on the "production" field which appears to contain a complex array structure.

Action: schema_sql_db
Action Input: works
[0m
Observation: [33;1m[1;3mCREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, type: ST

                                                                                


Observation: [36;1m[1;3m[('Publication',), ('Manufacture',), ('Production',), ('Distribution',)][0m
Thought:[32;1m[1;3mI need to get more examples of the production field structure to create comprehensive documentation.

Action: query_sql_db
Action Input: SELECT 
  COUNT(*) as total_count,
  COUNT(CASE WHEN size(production) > 0 THEN 1 END) as with_production,
  COUNT(CASE WHEN size(production) > 0 AND production[0].function IS NOT NULL THEN 1 END) as with_function
FROM works
[0m
Observation: [36;1m[1;3m[('1167387', '1102958', '125556')][0m
Thought:[32;1m[1;3mI need to understand the structure of the agents field within production.

Action: query_sql_db
Action Input: SELECT 
  COUNT(CASE WHEN size(production) > 0 AND size(production[0].agents) > 0 THEN 1 END) as with_agents,
  COUNT(CASE WHEN size(production) > 0 AND size(production[0].places) > 0 THEN 1 END) as with_places,
  COUNT(CASE WHEN size(production) > 0 AND size(production[0].dates) > 0 THEN 1 END) as with_dates
FR

# Documentation for "production" Complex Array Structure in Works Table

## Overview
The `production` field in the works table is a complex array structure that contains information about the production events associated with a work, such as publication, manufacture, distribution, or production details.

## Structure
The `production` field is an ARRAY of STRUCT with the following structure:

```
ARRAY<STRUCT<
  agents: ARRAY<STRUCT<label: STRING, type: STRING>>,
  dates: ARRAY<STRUCT<label: STRING, type: STRING>>,
  function: STRUCT<label: STRING, type: STRING>,
  label: STRING,
  places: ARRAY<STRUCT<label: STRING, type: STRING>>,
  type: STRING
>>
```

## Field Descriptions

### Top-level Fields
- **agents**: Array of entities involved in the production event
- **dates**: Array of dates associated with the production event
- **function**: The role or purpose of the production event
- **label**: A formatted string representation of the production event
- **places**: Array of locations associated with the production event
- **type**: Always "ProductionEvent"

### Sub-field Details

#### agents
- **label**: Name of the agent (e.g., publisher, manufacturer)
- **type**: Always "Agent"

#### dates
- **label**: Date or date range of the production event (e.g., "1981", "[2008]", "between 1880 and 1900?")
- **type**: Always "Period"

#### function
- **label**: Type of production event
- **type**: Always "Concept"
- **Enumerated values for label**:
  - "Publication"
  - "Manufacture"
  - "Production"
  - "Distribution"

#### places
- **label**: Location name (e.g., "London", "Philadelphia")
- **type**: Always "Place"

## Usage Statistics
- Total works: 1,167,387
- Works with production data: 1,102,958 (94.5%)
- Works with function specified: 125,556 (10.8%)
- Works with agents specified: 520,607 (44.6%)
- Works with places specified: 746,916 (64.0%)
- Works with dates specified: 926,959 (79.4%)
- Works with multiple production events: 1,513 (0.1%)

## Sample Values

### Example 1: Basic Publication
```
[Row(
  agents=[Row(label="Women's Aid Federation England", type='Agent')],
  dates=[Row(label='1981', type='Period')],
  function=None,
  label="[London] : Women's Aid Federation England, 1981.",
  places=[Row(label='[London]', type='Place')],
  type='ProductionEvent'
)]
```

### Example 2: Date-only Production
```
[Row(
  agents=[],
  dates=[Row(label='18th July 1923', type='Period')],
  function=None,
  label='18th July 1923',
  places=[],
  type='ProductionEvent'
)]
```

### Example 3: Publication with Function
```
[Row(
  agents=[Row(label='Orion Spring', type='Agent')],
  dates=[Row(label='2022', type='Period')],
  function=Row(label='Publication', type='Concept'),
  label='London : Orion Spring, 2022.',
  places=[Row(label='London', type='Place')],
  type='ProductionEvent'
)]
```

### Example 4: Multiple Production Events
```
[Row(
  agents=[],
  dates=[Row(label='©2014', type='Period')],
  function=Row(label='Production', type='Concept'),
  label='©2014.',
  places=[],
  type='ProductionEvent'
), 
Row(
  agents=[Row(label='Stanford University Press', type='Agent')],
  dates=[Row(label='2014', type='Period')],
  function=Row(label='Publication', type='Concept'),
  label='Stanford, California : Stanford University Press, 2014.',
  places=[Row(label='Stanford, California', type='Place')],
  type='ProductionEvent'
)]
```

###

In [11]:
result = agent_executor_mem.run(
    input="Write helpful documentation for the \"Items\" \"Complex Array Structure\" in works table. Include sample values for each field. If a field is an enumerated or has set of distinct values, list them all."
    , handle_parsing_errors=True)
display(Markdown(result))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll help you understand the "Items" complex array structure in the works table. Let me first check what tables are available in the database.

Action: list_tables_sql_db
Action Input: 
[0m
Observation: [38;5;200m[1;3mworks[0m
Thought:[32;1m[1;3mI need to examine the schema of the works table to understand the "Items" complex array structure.

Action: schema_sql_db
Action Input: works
[0m
Observation: [33;1m[1;3mCREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, type: STRING>,
  currentFrequency ST

# Documentation for "Items" Complex Array Structure in Works Table

The "items" field in the works table is a complex array structure that contains information about individual physical or digital items associated with a work. Each item in the array has the following structure:

## Structure Overview
```
items: ARRAY<STRUCT<
  id: STRING,
  identifiers: ARRAY<STRUCT<...>>,
  locations: ARRAY<STRUCT<...>>,
  note: STRING,
  title: STRING,
  type: STRING
>>
```

## Field Descriptions and Sample Values

### id
- **Description**: Unique identifier for the item
- **Sample Values**: 'ere6ew33', 'drxrd6n5', 'kcm9u6dh', etc.
- **Note**: Can be null for some digital items

### identifiers
- **Description**: Array of identifier structures for the item
- **Structure**:
  ```
  identifiers: ARRAY<STRUCT<
    identifierType: STRUCT<id: STRING, label: STRING, type: STRING>,
    type: STRING,
    value: STRING
  >>
  ```
- **Sample Values**:
  - Sierra system number: 'i17016629', 'i19309223', 'i14728278'
  - Sierra identifier: '1701662', '1930922', '1472827'

### locations
- **Description**: Array of location information for the item
- **Structure**:
  ```
  locations: ARRAY<STRUCT<
    accessConditions: ARRAY<STRUCT<
      method: STRUCT<id: STRING, label: STRING, type: STRING>,
      note: STRING,
      status: STRUCT<id: STRING, label: STRING, type: STRING>,
      terms: STRING,
      type: STRING
    >>,
    credit: STRING,
    label: STRING,
    license: STRUCT<id: STRING, label: STRING, type: STRING, url: STRING>,
    linkText: STRING,
    locationType: STRUCT<id: STRING, label: STRING, type: STRING>,
    shelfmark: STRING,
    type: STRING,
    url: STRING
  >>
  ```

#### locationType (distinct values)
- 'closed-stores' (Physical location)
- 'iiif-presentation' (Digital location)
- 'iiif-image' (Digital location)
- 'online-resource' (Digital location)

#### accessConditions.method (distinct values)
- 'online-request' (For physical items)
- 'view-online' (For digital items)

#### accessConditions.status (distinct values)
- 'open'
- 'restricted'
- 'licensed-resources'

#### license (distinct values)
- 'pdm' (Public Domain Mark)
- 'inc' (In Copyright)

### note
- **Description**: Additional information about the item
- **Sample Values**: Typically null, but can contain notes like volume information

### title
- **Description**: Title specific to the item (may differ from the work title)
- **Sample Values**: 'Vol. 1 1905', or null when same as work title

### type
- **Description**: The type of item
- **Sample Value**: 'Item' (appears to be the standard value)

## Usage Examples
1. Physical items typically have:
   - Sierra identifiers
   - 'closed-stores' locationType
   - 'online-request' access method
   - Shelfmark (e.g., 'M12242', '545970i.1')

2. Digital items typically have:
   - No id or empty identifiers array
   - Digital locationType ('iiif-presentation', 'iiif-image', 'online-resource')
   - 'view-online' access method
   - URL to access the digital resource
   - License information

3. Some works have both physical and digital representations of the same item, shown as separate entries in the items array.

In [19]:
result = agent_executor_mem.run(
    input="Write helpful documentation for the \"contributors\" \"Complex Array Structure\" in works table. Include sample values for each field. If a field is an enumerated or has set of distinct values, list them all."
    , handle_parsing_errors=True)
display(Markdown(result))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll help you explore the "contributors" complex array structure in the "works" table.

Action: list_tables_sql_db
Action Input: 
[0m
Observation: [38;5;200m[1;3mworks[0m
Thought:[32;1m[1;3mI need to examine the schema of the "works" table to understand the "contributors" complex array structure.

Action: schema_sql_db
Action Input: works
[0m
Observation: [33;1m[1;3mCREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, type: STRING>,
  currentFrequency STRING,
  description STRING,
  designation ARRAY

ValueError: An output parsing error occurred. In order to pass this error back to the agent and have it try again, pass `handle_parsing_errors=True` to the AgentExecutor. This is the error: Could not parse LLM output: `# Contributors Complex Array Structure Documentation

## Overview
The `contributors` field in the `works` table is a complex array structure that stores information about individuals and organizations who contributed to a work. Each entry in the array represents a single contributor with detailed information about their identity, role, and relationship to the work.

## Structure
The `contributors` array contains objects with the following fields:

### 1. `agent` (STRUCT)
Contains information about the contributor entity:

- **`id`** (STRING): Unique identifier for the agent
- **`identifiers`** (ARRAY\<STRUCT>): Array of identifier objects with:
  - **`identifierType`** (STRUCT): Contains identifier type information:
    - **`id`** (STRING): Identifier type code (e.g., 'lc-names', 'label-derived')
    - **`label`** (STRING): Human-readable identifier type (e.g., 'Library of Congress Name authority records')
    - **`type`** (STRING): Always 'IdentifierType'
  - **`type`** (STRING): Always 'Identifier'
  - **`value`** (STRING): The actual identifier value (e.g., 'n79067826')
- **`label`** (STRING): Display name of the contributor (e.g., 'Binney, Val.', 'World Health Organization. Regional Office for Europe.')
- **`type`** (STRING): Entity type, either 'Person' or 'Organisation'

### 2. `primary` (BOOLEAN)
Indicates whether this contributor is the primary contributor to the work (TRUE/FALSE)

### 3. `roles` (ARRAY\<STRUCT>)
Array of roles the contributor played in relation to the work:
- **`label`** (STRING): Role name (e.g., 'author', 'editor', 'dedicatee', 'publisher', 'data contributor')
- **`type`** (STRING): Always 'ContributionRole'

### 4. `type` (STRING)
Always 'Contributor'

## Sample Values

### Complete Contributor Examples:

1. **Primary Author (Person)**:
```
{
  agent: {
    id: 'up3qag2d',
    identifiers: [{
      identifierType: {
        id: 'lc-names',
        label: 'Library of Congress Name authority records',
        type: 'IdentifierType'
      },
      type: 'Identifier',
      value: 'n88253144'
    }],
    label: 'Yeo, Giles',
    type: 'Person'
  },
  primary: true,
  roles: [{
    label: 'author',
    type: 'ContributionRole'
  }],
  type: 'Contributor'
}
```

2. **Organization with Multiple Roles**:
```
{
  agent: {
    id: 'vawcuuxj',
    identifiers: [{
      identifierType: {
        id: 'lc-names',
        label: 'Library of Congress Name authority records',
        type: 'IdentifierType'
      },
      type: 'Identifier',
      value: 'n81056556'
    }],
    label: 'Agence nationale de recherches sur le SIDA (France)',
    type: 'Organisation'
  },
  primary: false,
  roles: [{
    label: 'author,',
    type: 'ContributionRole'
  }, {
    label: 'publisher',
    type: 'ContributionRole'
  }],
  type: 'Contributor'
}
```

### Distinct Role Values:
- author
- editor
- dedicatee
- publisher
- data contributor

### Agent Types:
- Person
- Organisation

## Usage Notes
1. A work can have multiple contributors, with one typically marked as `primary: true`
2. Contributors may have empty `roles` arrays or multiple roles
3. The `identifiers` array typically contains standardized identifiers like Library of Congress Name authority records
4. Both individuals (Person) and organizations (Organisation) can be contributors`
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE 

25/05/03 05:03:13 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

In [None]:
result = agent_executor_mem.run(
    input="Write helpful documentation for the \"Genre\" \"Complex Array Structure\" in works table. Include sample values for each field. If a field is an enumerated or has set of distinct values, list them all.",
    handle_parsing_errors=True)
display(Markdown(result))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll help you understand the "Genre" "Complex Array Structure" in the works table. Let me first check what tables are available in the database.

Action: list_tables_sql_db
Action Input: 
[0m
Observation: [38;5;200m[1;3mworks[0m
Thought:[32;1m[1;3mI need to examine the schema of the works table to understand the Genre complex array structure.

Action: schema_sql_db
Action Input: works
[0m
Observation: [33;1m[1;3mCREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, type: STRING>,
  currentFrequency ST

                                                                                


Observation: [36;1m[1;3m[('AB.AH',), ('Abstracts',), ('Academic Dissertations',), ('Academic addresses',), ('Academic addresses - Netherlands - Leiden',), ('Academic catalogues',), ('Academic dissertations',), ('Academic dissertations - Austria - Vienna',), ('Academic dissertations - Denmark - Copenhagen - 18th cent',), ('Academic dissertations - England - Cambridge - 18th cent',), ('Academic dissertations - France - 17th cent',), ('Academic dissertations - France - 19th cent',), ('Academic dissertations - France - Montpellier',), ('Academic dissertations - France - Montpellier - 18th cent',), ('Academic dissertations - France - Paris',), ('Academic dissertations - France - Paris - 18th cent',), ('Academic dissertations - France - Paris - 19th cent',), ('Academic dissertations - France - Strasbourg - 17th cent',), ('Academic dissertations - France - Strasbourg - 18th cent',), ('Academic dissertations - Germany - 18th cent',), ('Academic dissertations - Germany - Altdorf - 17th cent'

# Documentation: "Genre" Complex Array Structure in Works Table

## Overview
The "Genre" field in the works table is implemented as a complex array structure that categorizes works by their genre types. This structure allows for multiple genres to be associated with a single work.

## Structure
The `genres` field is an array of structs with the following structure:

```
genres: ARRAY<STRUCT<
  concepts: ARRAY<STRUCT<
    id: STRING,
    identifiers: ARRAY<STRUCT<
      identifierType: STRUCT<
        id: STRING,
        label: STRING,
        type: STRING
      >,
      type: STRING,
      value: STRING
    >>,
    label: STRING,
    type: STRING
  >>,
  label: STRING,
  type: STRING
>>
```

## Field Descriptions

### Top-level Fields
- **label**: The display name of the genre (e.g., "Group portraits", "Pamphlets")
- **type**: Always "Genre" for this structure
- **concepts**: An array of concept structs that provide additional information about the genre

### Concept Fields
- **id**: A unique identifier for the genre concept (e.g., "fnmpvt78", "pg4zm9nx")
- **label**: The display name of the genre concept (same as the parent label)
- **type**: Always "Genre" for genre concepts
- **identifiers**: An array of identifier structs that provide external references

### Identifier Fields
- **value**: The actual identifier value (e.g., "group portraits", "engravings")
- **type**: Always "Identifier" for this structure
- **identifierType**: A struct containing metadata about the identifier type
  - **id**: Usually "label-derived" for genres
  - **label**: Description of the identifier source (e.g., "Identifier derived from the label of the referent")
  - **type**: Always "IdentifierType" for this structure

## Sample Values
Here are some examples of genre values found in the database:

- Group portraits
- Pamphlets
- Electronic books
- Chromolithographs
- Calendars
- Mezzotints
- Portrait prints
- Science fiction
- Short stories
- Academic dissertations (with various geographic and temporal qualifiers)
- Abstracts

## Usage Notes
1. A work can have multiple genres (the field is an array)
2. Each genre has its own unique identifier and may have additional metadata
3. Genre labels are often standardized terms from controlled vocabularies
4. Some genres include geographic and temporal qualifiers (e.g., "Academic dissertations - Germany - Leipzig - 18th cent")

This structure allows for rich genre classification and enables faceted searching and filtering of works by their genre characteristics.

In [None]:
result = agent_executor_mem.run(
    input="Write helpful documentation for the \"Works\" table and all of its nested \"Complex Array Structures\" and arrays. Include sample values for each field. If a field is an enumerated or has set of distinct values, list them all."
    , handle_parsing_errors=True)
display(Markdown(result))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll help you explore the "Works" table and its nested complex array structure.

Action: list_tables_sql_db
Action Input: 
[0m
Observation: [38;5;200m[1;3mworks[0m
Thought:[32;1m[1;3mI need to examine the schema of the "Works" table to understand its structure, including any nested complex array structures.

Action: schema_sql_db
Action Input: works
[0m
Observation: [33;1m[1;3mCREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, type: STRING>,
  currentFrequency STRING,
  description STRING,
  design

                                                                                


Observation: [36;1m[1;3m[('closed-stores', 'Closed stores', 'Availability'), ('online', 'Online', 'Availability'), ('open-shelves', 'Open shelves', 'Availability')][0m
Thought:[32;1m[1;3mI now know the final answer

Final Answer: # Works Table Documentation

The "Works" table contains detailed information about various works in a collection, with a rich structure of nested arrays and complex data types. Below is comprehensive documentation of the table structure and its nested components.

## Basic Fields

| Field | Type | Description | Sample Values |
|-------|------|-------------|--------------|
| id | STRING | Unique identifier for the work | "upv6vqtt", "xpk6b63t", "ue6gc5rt" |
| title | STRING | Title of the work | "Leaving violent men : a study of refuges and housing for battered women", "Sales catalogue: Phillips, Son and Neale" |
| type | STRING | Type classification of the record | "Work" |
| description | STRING | Description of the work | "Decorative and Household Furn

# Works Table Documentation

The "Works" table contains detailed information about various works in a collection, with a rich structure of nested arrays and complex data types. Below is comprehensive documentation of the table structure and its nested components.

## Basic Fields

| Field | Type | Description | Sample Values |
|-------|------|-------------|--------------|
| id | STRING | Unique identifier for the work | "upv6vqtt", "xpk6b63t", "ue6gc5rt" |
| title | STRING | Title of the work | "Leaving violent men : a study of refuges and housing for battered women", "Sales catalogue: Phillips, Son and Neale" |
| type | STRING | Type classification of the record | "Work" |
| description | STRING | Description of the work | "Decorative and Household Furniture" |
| physicalDescription | STRING | Physical description of the work | "xvii, 112 pages : illustrations, 1 map, portraits ; 21 cm", "1 print : engraving ; image 42.5 x 59.5 cm" |
| referenceNumber | STRING | Reference number | "WA/HMM/CM/Sal/15/247", "545970i" |
| edition | STRING | Edition information | null |
| lettering | STRING | Lettering information | "To the King's Most Excellent Majesty this plate the Death of General Wolfe..." |
| currentFrequency | STRING | Current frequency for periodicals | null |
| duration | BIGINT | Duration (likely in seconds) | null |
| alternativeTitles | ARRAY<STRING> | Alternative titles for the work | [] |
| designation | ARRAY<STRING> | Designation information | [] |
| formerFrequency | ARRAY<STRING> | Former frequency for periodicals | [] |

## Complex Nested Structures

### workType (STRUCT)
Contains format information about the work.

| Field | Type | Description | Sample Values |
|-------|------|-------------|--------------|
| id | STRING | Format identifier | "a", "h", "k" |
| label | STRING | Format label | "Books", "Archives and manuscripts", "Pictures", "E-books" |
| type | STRING | Type of format | "Format" |

### availabilities (ARRAY<STRUCT>)
Indicates where and how the work is available.

| Field | Type | Description | Sample Values |
|-------|------|-------------|--------------|
| id | STRING | Availability identifier | "closed-stores", "online", "open-shelves" |
| label | STRING | Availability label | "Closed stores", "Online", "Open shelves" |
| type | STRING | Type of availability | "Availability" |

### contributors (ARRAY<STRUCT>)
People or organizations who contributed to the work.

| Field | Type | Description | Sample Values |
|-------|------|-------------|--------------|
| agent | STRUCT | Information about the contributor | Contains id, identifiers, label, type |
| primary | BOOLEAN | Whether this is a primary contributor | true, false |
| roles | ARRAY<STRUCT> | Roles of the contributor | Contains label, type |
| type | STRING | Type of contributor | "Contributor" |

#### agent (STRUCT within contributors)
| Field | Type | Description | Sample Values |
|-------|------|-------------|--------------|
| id | STRING | Agent identifier | "mhwbzbxm", "w39pck74" |
| identifiers | ARRAY<STRUCT> | Identifiers for the agent | Contains identifierType, type, value |
| label | STRING | Name of the agent | "Binney, Val.", "Women's Aid Federation of England." |
| type | STRING | Type of agent | "Person", "Organisation" |

### languages (ARRAY<STRUCT>)
Languages associated with the work.

| Field | Type | Description | Sample Values |
|-------|------|-------------|--------------|
| id | STRING | Language code | "eng" |
| label | STRING | Language name | "English" |
| type | STRING | Type of language entry | "Language" |

### identifiers (ARRAY<STRUCT>)
Various identifiers for the work.

| Field | Type | Description | Sample Values |
|-------|------|-------------|--------------|
| identifierType | STRUCT | Type of identifier |

## Flattened Useful Text Dataframe

In [17]:
from IPython.display import Markdown

In [24]:
from langchain_community.agent_toolkits import create_spark_sql_agent

# Read json file into a string
with open("../data/mappings.works_indexed.2024-11-14.json", "r") as f:
    index_map_json = f.read()

prompt = f"""
Make flattened dataframe of the works table with text fields that could have
useful information for discovering understanding a work.

Use the following ElasticSearch index map as a guide for the fields to include:
{index_map_json}
"""
# prompt
toolkit = SparkSQLToolkit(db=spark_sql, llm=llm)
agent_executor = create_spark_sql_agent(
    llm=llm,
    toolkit=toolkit,
    verbose=True,
    max_iterations=200, top_k=30)
result = agent_executor.run(
    input=prompt,
    handle_parsing_errors=True)
display(Markdown(result))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll help you create a flattened dataframe of the works table with text fields that could be useful for understanding a work. Let me first check what tables are available in the database.

Action: list_tables_sql_db
Action Input: 
[0m
Observation: [38;5;200m[1;3mworks[0m
Thought:[32;1m[1;3mI need to examine the schema of the works table to understand what fields are available.

Action: schema_sql_db
Action Input: works
[0m
Observation: [33;1m[1;3mCREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, t

I've created a flattened dataframe of the works table with text fields that could be useful for understanding a work. The dataframe includes the following fields:

1. `id` - Unique identifier for the work
2. `title` - Primary title of the work
3. `description` - Detailed description of the work's content
4. `physicalDescription` - Physical characteristics of the work
5. `lettering` - Any lettering information
6. `referenceNumber` - Reference number for the work
7. `edition` - Edition information
8. `contributor_names` - Names of contributors with their roles
9. `alternative_titles` - Alternative titles for the work
10. `subject_labels` - Subject classifications
11. `genre_labels` - Genre classifications
12. `language_labels` - Languages of the work
13. `production_info` - Production information including dates and places
14. `note_contents` - Additional notes about the work
15. `work_type` - Type of work (Books, Archives, etc.)
16. `availability_labels` - Availability information (Online, Closed stores, etc.)

This flattened structure makes it easier to search and analyze the works collection by extracting the most important textual information from the nested structure. The fields align with the ElasticSearch index map provided, focusing on elements that help with discovery and understanding of works.

The dataframe includes a mix of bibliographic information (title, contributors), physical attributes, subject classification, and availability information - all key elements for understanding and discovering works in a collection.

In [15]:
spark.sql("""
SELECT
  id,
  title,
  description,
  physicalDescription,
  referenceNumber,
  lettering,
  CASE WHEN size(alternativeTitles) > 0 THEN concat_ws('; ', alternativeTitles) ELSE NULL END AS alternativeTitles,
  CASE WHEN size(notes) > 0 THEN
    (SELECT concat_ws('; ', collect_list(content))
     FROM (SELECT explode(note_struct.contents) AS content FROM (SELECT explode(notes) AS note_struct)))
  ELSE NULL END AS note_contents,
  CASE WHEN size(subjects) > 0 THEN
    (SELECT concat_ws('; ', collect_list(subject_struct.label))
     FROM (SELECT explode(subjects) AS subject_struct)
     WHERE subject_struct.label IS NOT NULL)
  ELSE NULL END AS subject_labels,
  CASE WHEN size(genres) > 0 THEN
    (SELECT concat_ws('; ', collect_list(genre_struct.label))
     FROM (SELECT explode(genres) AS genre_struct)
     WHERE genre_struct.label IS NOT NULL)
  ELSE NULL END AS genre_labels,
  CASE WHEN size(contributors) > 0 THEN
    (SELECT concat_ws('; ', collect_list(contributor_struct.agent.label))
     FROM (SELECT explode(contributors) AS contributor_struct)
     WHERE contributor_struct.agent.label IS NOT NULL)
  ELSE NULL END AS contributor_labels,
  CASE WHEN workType IS NOT NULL THEN workType.label ELSE NULL END AS work_type,
  type
FROM works
""").show(truncate=False)

                                                                                

+--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------+---------------+----------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------

In [21]:
spark.sql("""
SELECT
  id,
  title,
  description,
  physicalDescription,
  lettering,
  referenceNumber,
  CASE WHEN size(alternativeTitles) > 0 THEN concat_ws('; ', alternativeTitles) ELSE NULL END AS alternativeTitles,
  CASE WHEN size(notes) > 0 THEN
    array_join(transform(notes, n -> concat(n.noteType.label, ': ', array_join(n.contents, ' '))), '; ')
    ELSE NULL
  END AS notes,
  CASE WHEN size(subjects) > 0 THEN
    array_join(transform(subjects, s -> s.label), '; ')
    ELSE NULL
  END AS subjects,
  CASE WHEN size(genres) > 0 THEN
    array_join(transform(genres, g -> g.label), '; ')
    ELSE NULL
  END AS genres,
  CASE WHEN size(contributors) > 0 THEN
    array_join(transform(contributors, c -> c.agent.label), '; ')
    ELSE NULL
  END AS contributors,
  CASE WHEN size(languages) > 0 THEN
    array_join(transform(languages, l -> l.label), '; ')
    ELSE NULL
  END AS languages,
  CASE WHEN size(production) > 0 THEN
    array_join(transform(production, p -> p.label), '; ')
    ELSE NULL
  END AS production,
  workType.label AS workType
FROM works
""").show(truncate=False)

+--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------+---------+--------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------

# Attempt at using a JSON agent

In [21]:
from langchain_community.agent_toolkits import JsonToolkit, create_json_agent
from langchain_community.tools.json.tool import JsonSpec


# Load JSON file
import json
with open("data/works.json", "r") as f:
    data = json.load(f)
# Create JSON spec
json_spec = JsonSpec(data=data)


: 

: 

In [None]:
json_toolkit = JsonToolkit(spec=json_spec)

json_agent_executor = create_json_agent(
    llm=llm, toolkit=json_toolkit, verbose=True
)