From 6b253f5dfbe9eb4717aa172c445ef674e0305de4 Mon Sep 17 00:00:00 2001
From: James Broadhead
Date: Wed, 15 Apr 2026 11:39:04 +0000
Subject: [PATCH 1/3] fix: handle ARROW_STREAM attachment in type generator

When serverless warehouses return ARROW_STREAM format, the DESCRIBE
QUERY result comes back as an inline base64 Arrow IPC attachment rather
than as data_array. This caused convertToQueryType to generate empty
types ({}).

Add a fallback that decodes the Arrow IPC attachment schema to extract
column names and types when data_array is empty.

Co-authored-by: Isaac
Signed-off-by: James Broadhead
---
 .../src/type-generator/query-registry.ts      | 73 ++++++++++++++++++-
 packages/appkit/src/type-generator/types.ts   |  2 +
 2 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/packages/appkit/src/type-generator/query-registry.ts b/packages/appkit/src/type-generator/query-registry.ts
index 196690c2d..0285d883c 100644
--- a/packages/appkit/src/type-generator/query-registry.ts
+++ b/packages/appkit/src/type-generator/query-registry.ts
@@ -1,6 +1,7 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { WorkspaceClient } from "@databricks/sdk-experimental";
+import { tableFromIPC } from "apache-arrow";
 import pc from "picocolors";
 import { createLogger } from "../logging/logger";
 import { CACHE_VERSION, hashSQL, loadCache, saveCache } from "./cache";
@@ -129,18 +130,85 @@ function formatParametersType(sql: string): string {
     : "Record";
 }
 
+/**
+ * Map Arrow DataType IDs to Databricks SQL type names.
+ * Arrow type IDs come from the Arrow spec (apache-arrow TypeId enum).
+ * We only need to cover the types that DESCRIBE QUERY can return.
+ */
+function arrowTypeToSqlName(arrowType: { typeId: number }): string {
+  switch (arrowType.typeId) {
+    case 1: // Bool
+      return "BOOLEAN";
+    case 2: // Int (covers TINYINT, SMALLINT, INT, BIGINT depending on bitWidth)
+      return "INT";
+    case 3: // Float (covers FLOAT, DOUBLE)
+      return "DOUBLE";
+    case 4: // Decimal
+      return "DECIMAL";
+    case 5: // Utf8
+      return "STRING";
+    case 6: // Binary
+      return "BINARY";
+    case 7: // FixedSizeBinary
+      return "BINARY";
+    case 8: // Date
+      return "DATE";
+    case 10: // Timestamp
+      return "TIMESTAMP";
+    case 12: // List
+      return "ARRAY";
+    case 14: // Struct
+      return "STRUCT";
+    case 15: // Map
+      return "MAP";
+    default:
+      return "STRING";
+  }
+}
+
+/**
+ * Decode a base64 Arrow IPC attachment and extract column metadata.
+ * Returns the same shape as rows parsed from DESCRIBE QUERY data_array.
+ */
+function columnsFromArrowAttachment(
+  attachment: string,
+): Array<{ name: string; type_name: string; comment: string | undefined }> {
+  const buf = Buffer.from(attachment, "base64");
+  const table = tableFromIPC(buf);
+  return table.schema.fields.map((field) => ({
+    name: field.name,
+    type_name: arrowTypeToSqlName(field.type),
+    comment: undefined,
+  }));
+}
+
 export function convertToQueryType(
   result: DatabricksStatementExecutionResponse,
   sql: string,
   queryName: string,
 ): { type: string; hasResults: boolean } {
   const dataRows = result.result?.data_array || [];
-  const columns = dataRows.map((row) => ({
+  let columns = dataRows.map((row) => ({
     name: row[0] || "",
     type_name: row[1]?.toUpperCase() || "STRING",
     comment: row[2] || undefined,
   }));
 
+  // Fallback: serverless warehouses may return ARROW_STREAM format with an
+  // inline base64 attachment instead of data_array. Decode the Arrow IPC
+  // schema to extract column names and types.
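+  // Illustrative response shape handled by this branch (per the types.ts
+  // change in this patch): { result: { attachment: "<base64 Arrow IPC>" } },
+  // with no data_array present at all.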
+  if (columns.length === 0 && result.result?.attachment) {
+    logger.debug("data_array empty, decoding Arrow IPC attachment for schema");
+    try {
+      columns = columnsFromArrowAttachment(result.result.attachment);
+    } catch (err) {
+      logger.warn(
+        "Failed to decode Arrow IPC attachment: %s",
+        err instanceof Error ? err.message : String(err),
+      );
+    }
+  }
+
   const paramsType = formatParametersType(sql);
 
   // generate result fields with JSDoc
@@ -397,10 +465,11 @@ export async function generateQueriesFromDescribe(
     );
 
     logger.debug(
-      "DESCRIBE result for %s: state=%s, rows=%d",
+      "DESCRIBE result for %s: state=%s, rows=%d, hasAttachment=%s",
       queryName,
       result.status.state,
       result.result?.data_array?.length ?? 0,
+      !!result.result?.attachment,
     );
 
     if (result.status.state === "FAILED") {
diff --git a/packages/appkit/src/type-generator/types.ts b/packages/appkit/src/type-generator/types.ts
index 5af43591a..9a591f512 100644
--- a/packages/appkit/src/type-generator/types.ts
+++ b/packages/appkit/src/type-generator/types.ts
@@ -12,6 +12,8 @@ export interface DatabricksStatementExecutionResponse {
   };
   result?: {
     data_array?: (string | null)[][];
+    /** Base64-encoded Arrow IPC bytes (returned by serverless warehouses using ARROW_STREAM format) */
+    attachment?: string;
   };
 }

From 55a3a97df5fec489f59b4e6bf4f9500742f6c29a Mon Sep 17 00:00:00 2001
From: James Broadhead
Date: Mon, 27 Apr 2026 17:30:28 +0000
Subject: [PATCH 2/3] chore(appkit): declare apache-arrow as direct dependency

The Arrow IPC schema-decoding code in this PR imports apache-arrow from
packages/appkit/src/type-generator/query-registry.ts. Until now it
resolved transitively via packages/appkit-ui, which knip flags as an
unlisted dependency. Declare it directly to satisfy knip and make the
dependency explicit.

Signed-off-by: James Broadhead
---
 packages/appkit/package.json | 1 +
 pnpm-lock.yaml               | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/packages/appkit/package.json b/packages/appkit/package.json
index 3b57014c0..04232f88b 100644
--- a/packages/appkit/package.json
+++ b/packages/appkit/package.json
@@ -69,6 +69,7 @@
     "@opentelemetry/sdk-trace-base": "2.6.0",
     "@opentelemetry/semantic-conventions": "1.38.0",
     "@types/semver": "7.7.1",
+    "apache-arrow": "21.1.0",
     "dotenv": "16.6.1",
     "express": "4.22.0",
     "obug": "2.1.1",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 9ca11b818..46096f433 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -299,6 +299,9 @@ importers:
       '@types/semver':
         specifier: 7.7.1
         version: 7.7.1
+      apache-arrow:
+        specifier: 21.1.0
+        version: 21.1.0
       dotenv:
         specifier: 16.6.1
         version: 16.6.1
@@ -5539,7 +5542,7 @@ packages:
 
   basic-ftp@5.0.5:
     resolution: {integrity: sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==}
    engines: {node: '>=10.0.0'}
    deprecated: Security vulnerability fixed in 5.2.1, please upgrade
 
   batch@0.6.1:
     resolution: {integrity: sha512-x+VAiMRL6UPkx+kudNvxTl6hB2XNNCG2r+7wixVfIYwu/2HKRXimwQyaumLjMveWvT2Hkd/cAJw+QBMfJ/EKVw==}
 
@@ -6653,6 +6656,7 @@ packages:
 
   dottie@2.0.6:
     resolution: {integrity: sha512-iGCHkfUc5kFekGiqhe8B/mdaurD+lakO9txNnTvKtA6PISrw86LgqHvRzWYPyoE2Ph5aMIrCw9/uko6XHTKCwA==}
+    deprecated: Package no longer supported. Contact Support at https://www.npmjs.com/support for more info.
 
   drizzle-orm@0.45.1:
     resolution: {integrity: sha512-Te0FOdKIistGNPMq2jscdqngBRfBpC8uMFVwqjf6gtTVJHIQ/dosgV/CLBU2N4ZJBsXL5savCba9b0YJskKdcA==}

From 52e9cf47831da5619905eecabad16c8adb72eccb Mon Sep 17 00:00:00 2001
From: James Broadhead
Date: Mon, 27 Apr 2026 20:01:07 +0000
Subject: [PATCH 3/3] fix: read DESCRIBE QUERY data rows from Arrow attachment, not schema
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous implementation read `table.schema.fields` from the
ARROW_STREAM attachment. A DESCRIBE QUERY response is a result *table*
whose own schema is `(col_name, data_type, comment)` — so this would
generate the same bogus type `{ col_name: string; data_type: string;
comment: string }` for every query routed through serverless.

- Replace columnsFromArrowAttachment with one that iterates
  table.toArray() and reads the col_name / data_type / comment values
  per row, matching the legacy data_array path.
- Drop arrowTypeToSqlName entirely. Its numeric TypeId map was wrong
  (e.g. case 1 -> Bool, but apache-arrow's Type.Null is 1; case 6 ->
  Binary, but Type.Bool is 6; case 14 -> Struct, but Type.Union is 14).
  Since data_type already carries the SQL type name as a string, the
  helper is no longer needed.
- Add tests covering the attachment fallback, the
  data_array-preferred-over-attachment case, lowercase data_type
  normalization, and the malformed-attachment path.

Signed-off-by: James Broadhead
---
 .../src/type-generator/query-registry.ts      | 76 ++++++---------
 .../tests/query-registry.test.ts              | 96 +++++++++++++++++++
 2 files changed, 126 insertions(+), 46 deletions(-)

diff --git a/packages/appkit/src/type-generator/query-registry.ts b/packages/appkit/src/type-generator/query-registry.ts
index 0285d883c..df1f1e4cb 100644
--- a/packages/appkit/src/type-generator/query-registry.ts
+++ b/packages/appkit/src/type-generator/query-registry.ts
@@ -131,55 +131,39 @@ function formatParametersType(sql: string): string {
 }
 
 /**
- * Map Arrow DataType IDs to Databricks SQL type names.
- * Arrow type IDs come from the Arrow spec (apache-arrow TypeId enum).
- * We only need to cover the types that DESCRIBE QUERY can return.
- */
-function arrowTypeToSqlName(arrowType: { typeId: number }): string {
-  switch (arrowType.typeId) {
-    case 1: // Bool
-      return "BOOLEAN";
-    case 2: // Int (covers TINYINT, SMALLINT, INT, BIGINT depending on bitWidth)
-      return "INT";
-    case 3: // Float (covers FLOAT, DOUBLE)
-      return "DOUBLE";
-    case 4: // Decimal
-      return "DECIMAL";
-    case 5: // Utf8
-      return "STRING";
-    case 6: // Binary
-      return "BINARY";
-    case 7: // FixedSizeBinary
-      return "BINARY";
-    case 8: // Date
-      return "DATE";
-    case 10: // Timestamp
-      return "TIMESTAMP";
-    case 12: // List
-      return "ARRAY";
-    case 14: // Struct
-      return "STRUCT";
-    case 15: // Map
-      return "MAP";
-    default:
-      return "STRING";
-  }
-}
-
-/**
- * Decode a base64 Arrow IPC attachment and extract column metadata.
- * Returns the same shape as rows parsed from DESCRIBE QUERY data_array.
+ * Decode a base64 Arrow IPC attachment from a DESCRIBE QUERY response and
+ * extract column metadata. Returns the same shape as rows parsed from the
+ * legacy data_array path.
+ *
+ * IMPORTANT: a DESCRIBE QUERY response is itself a result *table* with rows
+ * shaped like `(col_name, data_type, comment)` describing the user query's
+ * output schema. We must read those rows — NOT `table.schema.fields`, which
+ * would describe DESCRIBE QUERY's own output (`col_name`, `data_type`,
+ * `comment`) and yield bogus types for every query.
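+ *
+ * Illustrative example (not taken from Databricks docs): for
+ * `SELECT id, name FROM users` the attachment decodes to a table whose
+ * data rows are ("id", "bigint", null) and ("name", "string", null);
+ * those rows, not the table's own schema, carry the user query's columns.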
 */
 function columnsFromArrowAttachment(
   attachment: string,
 ): Array<{ name: string; type_name: string; comment: string | undefined }> {
   const buf = Buffer.from(attachment, "base64");
   const table = tableFromIPC(buf);
-  return table.schema.fields.map((field) => ({
-    name: field.name,
-    type_name: arrowTypeToSqlName(field.type),
-    comment: undefined,
-  }));
+  return table.toArray().map((row) => {
+    const obj = row.toJSON() as {
+      col_name?: unknown;
+      data_type?: unknown;
+      comment?: unknown;
+    };
+    return {
+      name: typeof obj.col_name === "string" ? obj.col_name : "",
+      type_name:
+        typeof obj.data_type === "string"
+          ? obj.data_type.toUpperCase()
+          : "STRING",
+      comment:
+        typeof obj.comment === "string" && obj.comment !== ""
+          ? obj.comment
+          : undefined,
+    };
+  });
 }
 
 export function convertToQueryType(
@@ -194,9 +178,9 @@ export function convertToQueryType(
     comment: row[2] || undefined,
   }));
 
-  // Fallback: serverless warehouses may return ARROW_STREAM format with an
-  // inline base64 attachment instead of data_array. Decode the Arrow IPC
-  // schema to extract column names and types.
+  // Fallback: serverless warehouses return ARROW_STREAM format with an inline
+  // base64 attachment instead of data_array. Decode the Arrow IPC rows (the
+  // DESCRIBE QUERY result table) to extract column names and types.
   if (columns.length === 0 && result.result?.attachment) {
     logger.debug("data_array empty, decoding Arrow IPC attachment for schema");
     try {
diff --git a/packages/appkit/src/type-generator/tests/query-registry.test.ts b/packages/appkit/src/type-generator/tests/query-registry.test.ts
index 8d46f98e9..d3c5be55e 100644
--- a/packages/appkit/src/type-generator/tests/query-registry.test.ts
+++ b/packages/appkit/src/type-generator/tests/query-registry.test.ts
@@ -1,3 +1,4 @@
+import { Table, tableToIPC, vectorFromArray } from "apache-arrow";
 import { describe, expect, test } from "vitest";
 import {
   convertToQueryType,
@@ -11,6 +12,20 @@ import {
 } from "../query-registry";
 import type { DatabricksStatementExecutionResponse } from "../types";
 
+// Build a base64 Arrow IPC payload that mimics a DESCRIBE QUERY response —
+// a result *table* with columns (col_name, data_type, comment) describing
+// the user query's output schema.
+function describeQueryAttachment(
+  rows: Array<{ col_name: string; data_type: string; comment: string | null }>,
+): string {
+  const table = new Table({
+    col_name: vectorFromArray(rows.map((r) => r.col_name)),
+    data_type: vectorFromArray(rows.map((r) => r.data_type)),
+    comment: vectorFromArray(rows.map((r) => r.comment ?? "")),
+  });
+  return Buffer.from(tableToIPC(table, "stream")).toString("base64");
+}
+
 describe("normalizeTypeName", () => {
   test("returns simple types unchanged", () => {
     expect(normalizeTypeName("STRING")).toBe("STRING");
@@ -346,6 +361,87 @@ SELECT * FROM users WHERE date = :startDate AND count = :count AND name = :name`
     );
     expect(hasResults).toBe(false);
   });
+
+  describe("ARROW_STREAM attachment fallback (serverless warehouses)", () => {
+    test("decodes column metadata from Arrow IPC data rows, not schema fields", () => {
+      // Critical regression test: it would be a bug to read
+      // `table.schema.fields` here, which would generate types like
+      // { col_name: string; data_type: string; comment: string } for every
+      // query (those are DESCRIBE QUERY's own output columns). We must read
+      // the data rows.
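+      // Note: describeQueryAttachment (test-only helper above) builds the
+      // IPC bytes with apache-arrow's own Table/tableToIPC, so this
+      // exercises the real tableFromIPC decode path end-to-end without a
+      // live serverless warehouse.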
+ const attachment = describeQueryAttachment([ + { col_name: "user_id", data_type: "BIGINT", comment: null }, + { col_name: "name", data_type: "STRING", comment: "display name" }, + { col_name: "active", data_type: "BOOLEAN", comment: null }, + ]); + const response: DatabricksStatementExecutionResponse = { + statement_id: "test-arrow", + status: { state: "SUCCEEDED" }, + result: { attachment }, + }; + + const { type, hasResults } = convertToQueryType( + response, + "SELECT user_id, name, active FROM users", + "users", + ); + + expect(hasResults).toBe(true); + // Real query columns appear in the generated type: + expect(type).toContain("user_id: number"); + expect(type).toContain("name: string"); + expect(type).toContain("active: boolean"); + // Column comments survive: + expect(type).toContain("/** display name"); + // The DESCRIBE QUERY metadata column names must NOT leak as user types: + expect(type).not.toContain("col_name: string"); + expect(type).not.toContain("data_type: string"); + }); + + test("normalizes lowercase data_type values to uppercase", () => { + const attachment = describeQueryAttachment([ + { col_name: "id", data_type: "int", comment: null }, + ]); + const response: DatabricksStatementExecutionResponse = { + statement_id: "test-arrow", + status: { state: "SUCCEEDED" }, + result: { attachment }, + }; + + const { type } = convertToQueryType(response, "SELECT 1", "test"); + expect(type).toContain("@sqlType INT"); + expect(type).toContain("id: number"); + }); + + test("prefers data_array over attachment when both are present", () => { + const attachment = describeQueryAttachment([ + { col_name: "from_arrow", data_type: "STRING", comment: null }, + ]); + const response: DatabricksStatementExecutionResponse = { + statement_id: "test-both", + status: { state: "SUCCEEDED" }, + result: { + data_array: [["from_data_array", "INT", null]], + attachment, + }, + }; + + const { type } = convertToQueryType(response, "SELECT 1", "test"); + expect(type).toContain("from_data_array: number"); + expect(type).not.toContain("from_arrow"); + }); + + test("logs a warning and yields no columns on malformed attachment", () => { + const response: DatabricksStatementExecutionResponse = { + statement_id: "test-bad", + status: { state: "SUCCEEDED" }, + result: { attachment: "not-valid-arrow-ipc" }, + }; + + const { hasResults } = convertToQueryType(response, "SELECT 1", "test"); + expect(hasResults).toBe(false); + }); + }); }); describe("inferParameterTypes", () => {