# Initialise Bedrock LLM

In [None]:
# from simd.llm import llm

from langchain_aws import ChatBedrock
import dotenv
import os

# Pull settings from a local .env file into the process environment.
dotenv.load_dotenv()
# Assigned after load_dotenv() so this project name overrides any value already present.
os.environ["LANGSMITH_PROJECT"] = "describe_works"


# Claude is a good alternative to GPT-4o: https://blog.promptlayer.com/big-differences-claude-3-5-vs-gpt-4o/
# Alternative Bedrock model ids kept for reference:
#   us.meta.llama3-3-70b-instruct-v1:0
#   us.meta.llama3-2-3b-instruct-v1:0
#   us.deepseek.r1-v1:0
llm = ChatBedrock(
    model_id="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
    model_kwargs={"temperature": 0.2},
)

# Create Spark Session

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the session; a parenthesised chain avoids fragile backslash continuations.
spark = (
    SparkSession.builder
    .appName("test_pyspark")
    # Driver and executor memory are both set to 12g.
    .config("spark.driver.memory", "12g")
    .config("spark.executor.memory", "12g")
    # Vectorized ORC reading is turned off; columnar reader batch sizes are set to 1024.
    .config("spark.sql.orc.enableVectorizedReader", "false")
    .config("spark.sql.parquet.columnarReaderBatchSize", "1024")
    .config("spark.sql.orc.columnarReaderBatchSize", "1024")
    .getOrCreate()
)
# Bare last expression so the notebook renders the session summary.
spark

25/04/15 13:36:45 WARN Utils: Your hostname, WTH2X9T5QYWX resolves to a loopback address: 127.0.0.1; using 10.21.24.66 instead (on interface en0)
25/04/15 13:36:45 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/15 13:36:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
from langchain_community.utilities.spark_sql import SparkSQL
# Wrap the Spark session for LangChain's Spark SQL tools, restricted to the `works` table.
# Note: Spark and a database containing `works` must already be set up.
spark_sql = SparkSQL(
    spark_session=spark,
    include_tables=["works"],
)

25/04/15 13:39:04 WARN HiveConf: HiveConf of name hive.repl.dump.metadata.only.for.external.table does not exist
25/04/15 13:39:04 WARN HiveConf: HiveConf of name hive.druid.rollup does not exist
25/04/15 13:39:04 WARN HiveConf: HiveConf of name hive.repl.retain.prev.dump.dir does not exist
25/04/15 13:39:04 WARN HiveConf: HiveConf of name hive.log.explain.output.to.console does not exist
25/04/15 13:39:04 WARN HiveConf: HiveConf of name hive.server2.wm.delayed.move.timeout does not exist
25/04/15 13:39:04 WARN HiveConf: HiveConf of name hive.privilege.synchronizer does not exist
25/04/15 13:39:04 WARN HiveConf: HiveConf of name hive.metastore.wm.default.pool.size does not exist
25/04/15 13:39:04 WARN HiveConf: HiveConf of name hive.llap.task.scheduler.preempt.independent does not exist
25/04/15 13:39:04 WARN HiveConf: HiveConf of name hive.llap.io.cvb.memory.consumption. does not exist
25/04/15 13:39:04 WARN HiveConf: HiveConf of name hive.llap.output.format.arrow does not exist
25/04

# Test Spark SQL Toolkit

In [13]:
# Import necessary Spark SQL tools
from langchain_community.tools.spark_sql.tool import (
    InfoSparkSQLTool,
    ListSparkSQLTool,
    QueryCheckerTool,
    QuerySparkSQLTool
)


# Create individual Spark SQL tools; each one wraps the shared `spark_sql` connection.
info_tool = InfoSparkSQLTool(db=spark_sql)
list_tool = ListSparkSQLTool(db=spark_sql)
# The checker additionally takes the LLM, which it uses to review the query text.
query_checker_tool = QueryCheckerTool(db=spark_sql, llm=llm)
query_tool = QuerySparkSQLTool(db=spark_sql)

# Example usage of each tool

# 1. Get information about the database (schema of the `works` table)
print("Database Info:")
print(info_tool.run("works"))

# 2. List tables in the database
print("\nDatabase Tables:")
print(list_tool.run(""))

# 3. Check a SQL query (uses LLM to validate)
# Filter on `duration` (a BIGINT column in the `works` schema) so the example
# references a column that actually exists in the table.
print("\nQuery Validation:")
query_to_check = "SELECT * FROM works WHERE duration > 30"
print(query_checker_tool.run(query_to_check))

# 4. Execute a SQL query (LIMIT keeps the printed result small)
print("\nQuery Results:")
query = "SELECT * FROM works LIMIT 5"
print(query_tool.run(query))

Database Info:


                                                                                

CREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, type: STRING>,
  currentFrequency STRING,
  description STRING,
  designation ARRAY<STRING>,
  duration BIGINT,
  edition STRING,
  formerFrequency ARRAY<STRING>,
  genres ARRAY<STRUCT<concepts: ARRAY<STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>>, label: STRING, type: STRING>>,
  holdings ARRAY<STRUCT<enumeration: ARRAY<STRING>, location: STRUCT<accessConditions: ARRAY<STRUCT<method: STRUCT<id:

                                                                                

# Create Spark SQL Agent

In [8]:
from langchain_community.agent_toolkits import SparkSQLToolkit, create_spark_sql_agent

# Bundle the Spark SQL connection and the LLM into a toolkit for the agent.
toolkit = SparkSQLToolkit(db=spark_sql, llm=llm)
# verbose=True prints each Thought/Action/Observation step; max_iterations caps the agent loop at 200.
agent_executor = create_spark_sql_agent(
    llm=llm,
    toolkit=toolkit,
    verbose=True,
    max_iterations=200,
)

In [None]:
# Chain.run() is deprecated in LangChain; invoke() takes the question under the "input" key
# and returns a dict whose "output" entry is the agent's final answer string.
agent_executor.invoke({"input": "Describe the works table"})["output"]

  agent_executor.run("Describe the works table")




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll help you describe the works table by examining its schema and sample data.

Action: list_tables_sql_db
Action Input: 
[0m
Observation: [38;5;200m[1;3mworks[0m
Thought:[32;1m[1;3mI need to examine the schema of the works table to provide a description.

Action: schema_sql_db
Action Input: works
[0m
Observation: [33;1m[1;3mCREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, type: STRING>,
  currentFrequency STRING,
  description STRING,
  designation ARRAY<STRING>,
  duration BIGINT,
  edition ST

'The works table is a complex database table that stores information about various works or items in a collection. Based on the schema and sample data, here\'s a description:\n\nThe works table contains detailed metadata about different types of works including books, archives, manuscripts, and e-books. Key fields include:\n\n1. **Basic Identification**:\n   - `id`: Unique identifier for each work\n   - `title`: The title of the work\n   - `type`: The type of item (e.g., "Work")\n   - `workType`: Structured data about the format (e.g., E-books, Archives and manuscripts)\n   - `referenceNumber`: Reference or catalog number\n\n2. **Content Description**:\n   - `description`: Textual description of the work\n   - `physicalDescription`: Physical characteristics\n   - `alternativeTitles`: Array of alternative titles\n   - `lettering`: Text information about lettering\n   - `edition`: Edition information\n   - `duration`: Duration (for time-based media)\n\n3. **Contributors and Creation**:\n

25/04/14 16:58:01 WARN HikariPool: HikariPool-1 - Thread starvation or clock leap detected (housekeeper delta=1m19s57ms).
25/04/14 16:58:01 WARN HikariPool: HikariPool-2 - Thread starvation or clock leap detected (housekeeper delta=1m19s58ms).
25/04/14 16:58:50 WARN HikariPool: HikariPool-1 - Thread starvation or clock leap detected (housekeeper delta=48s901ms).
25/04/14 16:58:50 WARN HikariPool: HikariPool-2 - Thread starvation or clock leap detected (housekeeper delta=48s901ms).
25/04/14 17:00:56 WARN HikariPool: HikariPool-2 - Thread starvation or clock leap detected (housekeeper delta=2m5s555ms).
25/04/14 17:00:56 WARN HikariPool: HikariPool-1 - Thread starvation or clock leap detected (housekeeper delta=2m5s600ms).
25/04/14 17:03:16 WARN HikariPool: HikariPool-2 - Thread starvation or clock leap detected (housekeeper delta=2m20s817ms).
25/04/14 17:03:16 WARN HikariPool: HikariPool-1 - Thread starvation or clock leap detected (housekeeper delta=2m20s784ms).
25/04/14 17:21:53 WARN H

In [6]:
# Chain.run() is deprecated in LangChain; invoke() takes the question under the "input" key
# and returns a dict whose "output" entry is the agent's final answer string.
agent_executor.invoke({"input": "Describe each sub-structure in works table"})["output"]



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll help you describe the sub-structures in the works table. Let me first check what tables are available in the database.

Action: list_tables_sql_db
Action Input: 
[0m
Observation: [38;5;200m[1;3mworks[0m
Thought:[32;1m[1;3mI need to examine the schema of the works table to understand its sub-structures.

Action: schema_sql_db
Action Input: works
[0m
Observation: [33;1m[1;3mCREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, type: STRING>,
  currentFrequency STRING,
  description STRING,
  design

'The works table contains numerous complex sub-structures that represent various aspects of works in what appears to be a library or archive catalog system. Here are the main sub-structures:\n\n1. **Simple Array Fields**:\n   - `alternativeTitles`: Array of alternative titles for the work\n   - `designation`: Array of designations\n   - `formerFrequency`: Array of former publication frequencies\n\n2. **Complex Array Structures**:\n   - `availabilities`: Array of structured availability information (id, label, type)\n   - `contributors`: Array of contributor information with nested structures including agent details, roles, and whether they are primary contributors\n   - `genres`: Array of genre information with nested concept structures\n   - `holdings`: Array of holding information including enumeration, location details, and notes\n   - `identifiers`: Array of identifier information with type details\n   - `images`: Array of image references\n   - `items`: Array of item information w

In [9]:
# Chain.run() is deprecated in LangChain; invoke() takes the question under the "input" key
# and returns a dict whose "output" entry is the agent's final answer string.
agent_executor.invoke(
    {"input": "For each \"Complex Array Structure\" in works table, give a summary of the data in it."}
)["output"]



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll help you analyze the "Complex Array Structure" in the works table. Let's start by exploring the database structure.

Action: list_tables_sql_db
Action Input: 
[0m
Observation: [38;5;200m[1;3mworks[0m
Thought:[32;1m[1;3mI need to examine the schema of the works table to understand what "Complex Array Structure" might refer to.

Action: schema_sql_db
Action Input: works
[0m

                                                                                


Observation: [33;1m[1;3mCREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, type: STRING>,
  currentFrequency STRING,
  description STRING,
  designation ARRAY<STRING>,
  duration BIGINT,
  edition STRING,
  formerFrequency ARRAY<STRING>,
  genres ARRAY<STRUCT<concepts: ARRAY<STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>>, label: STRING, type: STRING>>,
  holdings ARRAY<STRUCT<enumeration: ARRAY<STRING>, location: STRUCT<accessConditions: ARRA

                                                                                


Observation: [36;1m[1;3m[('alternativeTitles', '1166657', '235633', '0.2148626374332816'), ('availabilities', '1166657', '1106148', '1.112278930311137'), ('contributors', '1166657', '769602', '0.979999262851035'), ('genres', '1166657', '431157', '0.6070747443335959'), ('identifiers', '1166657', '1166657', '3.4292838426375534'), ('languages', '1166657', '737846', '0.6593068913999574')][0m
Thought:[32;1m[1;3mI need to analyze more complex array structures in the works table. Let me check some additional ones.

Action: query_checker_sql_db
Action Input: SELECT 
  'items' as array_name, 
  COUNT(*) as total_records, 
  COUNT(CASE WHEN size(items) > 0 THEN 1 END) as non_empty_records,
  AVG(size(items)) as avg_elements
FROM works
UNION ALL
SELECT 
  'subjects' as array_name, 
  COUNT(*) as total_records, 
  COUNT(CASE WHEN size(subjects) > 0 THEN 1 END) as non_empty_records,
  AVG(size(subjects)) as avg_elements
FROM works
UNION ALL
SELECT 
  'production' as array_name, 
  COUNT(*) as

                                                                                


Observation: [36;1m[1;3m[('items', '1166657', '928750', '0.9965413999144564'), ('subjects', '1166657', '600117', '1.5978886682203939'), ('production', '1166657', '1101887', '0.9461349822612816'), ('notes', '1166657', '748644', '1.054298735618095'), ('partOf', '1166657', '682162', '0.6352852637921858'), ('parts', '1166657', '26898', '0.22384728330606168')][0m
Thought:[32;1m[1;3mI need to examine the structure of one of these complex arrays to understand their contents better.

Action: query_checker_sql_db
Action Input: SELECT 
  'contributors' as array_name,
  explode(contributors) as contributor_details
FROM works
WHERE size(contributors) > 0
LIMIT 5
[0m
Observation: [36;1m[1;3mThe query looks correct. It selects from the "works" table, filters for rows where the "contributors" array has at least one element, and uses the EXPLODE function to flatten the array elements into separate rows. The query also adds a column to identify the source array name and limits the results to 5

'# Summary of Complex Array Structures in the Works Table\n\nBased on my analysis, here are the key complex array structures in the works table and their characteristics:\n\n## Array Usage Statistics\n| Array Name | Total Records | Non-Empty Records | % With Data | Avg Elements |\n|------------|---------------|-------------------|------------|--------------|\n| identifiers | 1,166,657 | 1,166,657 | 100% | 3.43 |\n| availabilities | 1,166,657 | 1,106,148 | 94.8% | 1.11 |\n| production | 1,166,657 | 1,101,887 | 94.4% | 0.95 |\n| items | 1,166,657 | 928,750 | 79.6% | 1.00 |\n| contributors | 1,166,657 | 769,602 | 66.0% | 0.98 |\n| notes | 1,166,657 | 748,644 | 64.2% | 1.05 |\n| languages | 1,166,657 | 737,846 | 63.2% | 0.66 |\n| partOf | 1,166,657 | 682,162 | 58.5% | 0.64 |\n| subjects | 1,166,657 | 600,117 | 51.4% | 1.60 |\n| genres | 1,166,657 | 431,157 | 37.0% | 0.61 |\n| alternativeTitles | 1,166,657 | 235,633 | 20.2% | 0.21 |\n| parts | 1,166,657 | 26,898 | 2.3% | 0.22 |\n\n## Struct

# Now with Memory

In [10]:
from langgraph.checkpoint.memory import MemorySaver
from langchain_community.agent_toolkits import SparkSQLToolkit, create_spark_sql_agent

# Rebuild the toolkit and create a second executor intended to keep conversation state.
toolkit = SparkSQLToolkit(db=spark_sql, llm=llm)
memory = MemorySaver()
# NOTE(review): `checkpointer` and `config` are not documented parameters of
# create_spark_sql_agent; they are presumably forwarded as extra keyword arguments
# to the underlying agent. Confirm that this executor really retains memory between
# calls — MemorySaver is a LangGraph checkpointer and may have no effect here.
agent_executor_mem = create_spark_sql_agent(
    llm=llm,
    toolkit=toolkit,
    verbose=True,
    max_iterations=200,
    checkpointer=memory,
    config={"configurable": {"thread_id": "user_conversation_1"}},
)

In [7]:
from IPython.display import Markdown

# Smoke test: check that Markdown (including inline LaTeX) renders in the cell output.
display(
    Markdown(
        """# Header
*some markdown* $\\phi$"""
    )
)

# Header
*some markdown* $\phi$

In [11]:
# Chain.run() is deprecated in LangChain; invoke() takes the question under the "input" key
# and returns a dict whose "output" entry is the agent's final answer string.
result = agent_executor_mem.invoke(
    {"input": "List each \"Complex Array Structure\" in works table"}
)["output"]
# Render the answer as Markdown rather than showing the raw string repr.
display(Markdown(result))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll help you find the "Complex Array Structure" in the works table.

Action: list_tables_sql_db
Action Input: 
[0m
Observation: [38;5;200m[1;3mworks[0m
Thought:[32;1m[1;3mI need to examine the schema of the works table to understand what complex array structures it contains.

Action: schema_sql_db
Action Input: works
[0m
Observation: [33;1m[1;3mCREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, type: STRING>,
  currentFrequency STRING,
  description STRING,
  designation ARRAY<STRING>,
  duration 

                                                                                


Observation: [36;1m[1;3m[('alternativeTitles', '235633'), ('availabilities', '1106148'), ('contributors', '769602'), ('designation', '12282'), ('formerFrequency', '728'), ('genres', '431157'), ('holdings', '210979'), ('identifiers', '1166657'), ('images', '84699'), ('items', '928750')][0m
Thought:[32;1m[1;3mI need to continue getting the counts for the remaining array fields.

Action: query_sql_db
Action Input: SELECT 
  'languages' as array_name, 
  COUNT(*) as count 
FROM works 
WHERE size(languages) > 0
UNION ALL
SELECT 
  'notes' as array_name, 
  COUNT(*) as count 
FROM works 
WHERE size(notes) > 0
UNION ALL
SELECT 
  'partOf' as array_name, 
  COUNT(*) as count 
FROM works 
WHERE size(partOf) > 0
UNION ALL
SELECT 
  'parts' as array_name, 
  COUNT(*) as count 
FROM works 
WHERE size(parts) > 0
UNION ALL
SELECT 
  'precededBy' as array_name, 
  COUNT(*) as count 
FROM works 
WHERE size(precededBy) > 0
UNION ALL
SELECT 
  'production' as array_name, 
  COUNT(*) as count 
FROM

                                                                                


Observation: [36;1m[1;3m[('languages', '737846'), ('notes', '748644'), ('partOf', '682162'), ('parts', '26898'), ('precededBy', '234251'), ('production', '1101887'), ('subjects', '600117'), ('succeededBy', '234261')][0m
Thought:[32;1m[1;3mI need to examine the complex array structures in the works table.

Action: query_sql_db
Action Input: SELECT column_name, data_type 
FROM information_schema.columns 
WHERE table_name = 'works' 
AND data_type LIKE 'ARRAY%'
ORDER BY column_name
[0m
Observation: [36;1m[1;3mError: [TABLE_OR_VIEW_NOT_FOUND] The table or view `information_schema`.`columns` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 2 pos 5;
'Sort ['column_name ASC NULLS FIRST], true
+- 'Project ['column_name, 'data_type]
  

25/04/15 13:43:14 WARN ObjectStore: Failed to get database information_schema, returning NoSuchObjectException


[32;1m[1;3mI need to use a different approach to get the schema information. Let me try to list all the complex array structures in the works table.

Action: query_sql_db
Action Input: SELECT 
  'alternativeTitles' as array_name, 
  'ARRAY<STRING>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(alternativeTitles) > 0
UNION ALL
SELECT 
  'availabilities' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(availabilities) > 0
UNION ALL
SELECT 
  'contributors' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(contributors) > 0
UNION ALL
SELECT 
  'designation' as array_name, 
  'ARRAY<STRING>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(designation) > 0
UNION ALL
SELECT 
  'formerFrequency' as array_name, 
  'ARRAY<STRING>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(formerFrequency) > 0
UNION ALL
SELECT 
  'genres' as array_name, 
  'ARRAY<STRUCT>' as array_typ

                                                                                


Observation: [36;1m[1;3m[('availabilities', 'ARRAY<STRUCT>', '1106148'), ('contributors', 'ARRAY<STRUCT>', '769602'), ('genres', 'ARRAY<STRUCT>', '431157'), ('alternativeTitles', 'ARRAY<STRING>', '235633'), ('designation', 'ARRAY<STRING>', '12282'), ('formerFrequency', 'ARRAY<STRING>', '728')][0m
Thought:[32;1m[1;3mI need to continue getting information about the complex array structures in the works table.

Action: query_sql_db
Action Input: SELECT 
  array_name,
  array_type,
  count
FROM (
  SELECT 
    'alternativeTitles' as array_name, 
    'ARRAY<STRING>' as array_type,
    COUNT(*) as count 
  FROM works 
  WHERE size(alternativeTitles) > 0
  UNION ALL
  SELECT 
    'availabilities' as array_name, 
    'ARRAY<STRUCT>' as array_type,
    COUNT(*) as count 
  FROM works 
  WHERE size(availabilities) > 0
  UNION ALL
  SELECT 
    'contributors' as array_name, 
    'ARRAY<STRUCT>' as array_type,
    COUNT(*) as count 
  FROM works 
  WHERE size(contributors) > 0
  UNION ALL
  

                                                                                


Observation: [36;1m[1;3m[('availabilities', 'ARRAY<STRUCT>', '1106148'), ('contributors', 'ARRAY<STRUCT>', '769602'), ('genres', 'ARRAY<STRUCT>', '431157'), ('alternativeTitles', 'ARRAY<STRING>', '235633'), ('designation', 'ARRAY<STRING>', '12282'), ('formerFrequency', 'ARRAY<STRING>', '728')][0m
Thought:[32;1m[1;3mI need to continue with more array fields to get a complete picture.

Action: query_sql_db
Action Input: SELECT 
  'holdings' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(holdings) > 0
UNION ALL
SELECT 
  'identifiers' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(identifiers) > 0
UNION ALL
SELECT 
  'images' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(images) > 0
UNION ALL
SELECT 
  'items' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(items) > 0
UNION ALL
SELECT 
  'languages' as

                                                                                


Observation: [36;1m[1;3m[('identifiers', 'ARRAY<STRUCT>', '1166657'), ('items', 'ARRAY<STRUCT>', '928750'), ('languages', 'ARRAY<STRUCT>', '737846'), ('holdings', 'ARRAY<STRUCT>', '210979'), ('images', 'ARRAY<STRUCT>', '84699')][0m
Thought:[32;1m[1;3mI need to get the remaining array fields.

Action: query_sql_db
Action Input: SELECT 
  'notes' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(notes) > 0
UNION ALL
SELECT 
  'partOf' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(partOf) > 0
UNION ALL
SELECT 
  'parts' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(parts) > 0
UNION ALL
SELECT 
  'precededBy' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(precededBy) > 0
UNION ALL
SELECT 
  'production' as array_name, 
  'ARRAY<STRUCT>' as array_type,
  COUNT(*) as count 
FROM works 
WHERE size(productio

                                                                                


Observation: [36;1m[1;3m[('production', 'ARRAY<STRUCT>', '1101887'), ('notes', 'ARRAY<STRUCT>', '748644'), ('partOf', 'ARRAY<STRUCT>', '682162'), ('subjects', 'ARRAY<STRUCT>', '600117'), ('succeededBy', 'ARRAY<STRUCT>', '234261'), ('precededBy', 'ARRAY<STRUCT>', '234251'), ('parts', 'ARRAY<STRUCT>', '26898')][0m
Thought:[32;1m[1;3mI now know the final answer

Final Answer: Here's a list of all the complex array structures in the works table, ordered by the number of records containing each structure:

1. identifiers (ARRAY<STRUCT>) - 1,166,657 records
2. availabilities (ARRAY<STRUCT>) - 1,106,148 records
3. production (ARRAY<STRUCT>) - 1,101,887 records
4. items (ARRAY<STRUCT>) - 928,750 records
5. contributors (ARRAY<STRUCT>) - 769,602 records
6. notes (ARRAY<STRUCT>) - 748,644 records
7. languages (ARRAY<STRUCT>) - 737,846 records
8. partOf (ARRAY<STRUCT>) - 682,162 records
9. subjects (ARRAY<STRUCT>) - 600,117 records
10. genres (ARRAY<STRUCT>) - 431,157 records
11. alternativ

Here's a list of all the complex array structures in the works table, ordered by the number of records containing each structure:

1. identifiers (ARRAY<STRUCT>) - 1,166,657 records
2. availabilities (ARRAY<STRUCT>) - 1,106,148 records
3. production (ARRAY<STRUCT>) - 1,101,887 records
4. items (ARRAY<STRUCT>) - 928,750 records
5. contributors (ARRAY<STRUCT>) - 769,602 records
6. notes (ARRAY<STRUCT>) - 748,644 records
7. languages (ARRAY<STRUCT>) - 737,846 records
8. partOf (ARRAY<STRUCT>) - 682,162 records
9. subjects (ARRAY<STRUCT>) - 600,117 records
10. genres (ARRAY<STRUCT>) - 431,157 records
11. alternativeTitles (ARRAY<STRING>) - 235,633 records
12. succeededBy (ARRAY<STRUCT>) - 234,261 records
13. precededBy (ARRAY<STRUCT>) - 234,251 records
14. holdings (ARRAY<STRUCT>) - 210,979 records
15. images (ARRAY<STRUCT>) - 84,699 records
16. parts (ARRAY<STRUCT>) - 26,898 records
17. designation (ARRAY<STRING>) - 12,282 records
18. formerFrequency (ARRAY<STRING>) - 728 records

The table contains 15 complex array structures of type ARRAY<STRUCT> and 3 simple array structures of type ARRAY<STRING>.

In [None]:
# Chain.run() is deprecated in LangChain; invoke() takes the question under the "input" key
# and returns a dict whose "output" entry is the agent's final answer string.
result = agent_executor_mem.invoke(
    {"input": 'Describe the "genres" complex array structure in the works table, including sample values and/or enums for each field.'}
)["output"]

# Render the answer as Markdown rather than showing the raw string repr.
display(Markdown(result))



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI'll help you understand the "genres" complex array structure in the works table.

Action: list_tables_sql_db
Action Input: 
[0m
Observation: [38;5;200m[1;3mworks[0m
Thought:[32;1m[1;3mI need to examine the schema of the works table to understand the "genres" complex array structure.

Action: schema_sql_db
Action Input: works
[0m
Observation: [33;1m[1;3mCREATE TABLE spark_catalog.default.works (
  alternativeTitles ARRAY<STRING>,
  availabilities ARRAY<STRUCT<id: STRING, label: STRING, type: STRING>>,
  contributors ARRAY<STRUCT<agent: STRUCT<id: STRING, identifiers: ARRAY<STRUCT<identifierType: STRUCT<id: STRING, label: STRING, type: STRING>, type: STRING, value: STRING>>, label: STRING, type: STRING>, primary: BOOLEAN, roles: ARRAY<STRUCT<label: STRING, type: STRING>>, type: STRING>>,
  createdDate STRUCT<label: STRING, type: STRING>,
  currentFrequency STRING,
  description STRING,
  designation ARRAY<STRING>,
  d

The "genres" field in the works table is a complex array structure that contains information about the genres associated with each work. Here's a detailed description of its structure:

The "genres" field is an ARRAY of STRUCT objects, where each struct has the following fields:

1. `concepts`: An ARRAY of STRUCT objects containing detailed information about the genre concept:
   - `id`: A string identifier for the genre concept (e.g., 'kqkakf66')
   - `identifiers`: An ARRAY of STRUCT objects with identification information:
     - `identifierType`: A STRUCT with:
       - `id`: String identifier for the identifier type (e.g., 'label-derived')
       - `label`: Human-readable label (e.g., 'Identifier derived from the label of the referent')
       - `type`: Type classification (e.g., 'IdentifierType')
     - `type`: String indicating the identifier type (e.g., 'Identifier')
     - `value`: The actual identifier value (e.g., 'electronic books')
   - `label`: Human-readable label for the genre concept (e.g., 'Electronic books')
   - `type`: Type classification (e.g., 'Genre')

2. `label`: Human-readable label for the genre (e.g., 'Electronic books')
3. `type`: Type classification (always 'Genre' in the samples)

Sample genre values found in the database include:
- Electronic books
- Annual reports
- MOH reports
- Statistics
- Wood engravings
- Steel engravings
- Prints
- Dictionaries
- Colonial reports
- Periodicals
- Etchings
- Engravings

This structure allows for detailed classification and identification of work genres with standardized identifiers and human-readable labels.

25/04/15 15:02:01 WARN HikariPool: HikariPool-1 - Thread starvation or clock leap detected (housekeeper delta=1m6s286ms).
25/04/15 15:02:01 WARN HikariPool: HikariPool-2 - Thread starvation or clock leap detected (housekeeper delta=1m6s254ms).
25/04/15 15:51:20 WARN HikariPool: HikariPool-1 - Thread starvation or clock leap detected (housekeeper delta=1m48s792ms).
25/04/15 15:51:20 WARN HikariPool: HikariPool-2 - Thread starvation or clock leap detected (housekeeper delta=1m48s792ms).
25/04/15 15:54:18 WARN HikariPool: HikariPool-2 - Thread starvation or clock leap detected (housekeeper delta=2m57s100ms).
25/04/15 15:54:18 WARN HikariPool: HikariPool-1 - Thread starvation or clock leap detected (housekeeper delta=2m57s95ms).
25/04/15 16:09:53 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 931844 ms exceeds timeout 120000 ms
25/04/15 16:09:53 WARN SparkContext: Killing executors is not supported by current scheduler.
25/04/15 16:10:38 ERROR Inbox: Ignoring e

# Attempt at using a JSON agent

In [21]:
from langchain_community.agent_toolkits import JsonToolkit, create_json_agent
from langchain_community.tools.json.tool import JsonSpec


# Load JSON file
import json

# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open("data/works.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Create JSON spec. JsonSpec stores its payload in the `dict_` field (it has no
# `data` field), so the parsed document must be passed under that name.
# NOTE(review): JsonSpec expects a dict; if works.json is a top-level list or
# JSON Lines, this cell needs adapting — confirm the file's shape.
json_spec = JsonSpec(dict_=data)


: 

: 

In [None]:
# Wrap the JSON spec in a toolkit and build an agent over it.
json_toolkit = JsonToolkit(spec=json_spec)

# verbose=True so the agent's intermediate steps are printed while it runs.
json_agent_executor = create_json_agent(
    llm=llm,
    toolkit=json_toolkit,
    verbose=True,
)