From 6b253f5dfbe9eb4717aa172c445ef674e0305de4 Mon Sep 17 00:00:00 2001
From: James Broadhead
Date: Wed, 15 Apr 2026 11:39:04 +0000
Subject: [PATCH 1/3] fix: handle ARROW_STREAM attachment in type generator

When serverless warehouses return ARROW_STREAM format, the DESCRIBE
QUERY result comes back as an inline base64 Arrow IPC attachment rather
than as data_array. This caused convertToQueryType to generate empty
types ({}).

Add a fallback that decodes the Arrow IPC attachment schema to extract
column names and types when data_array is empty.

Co-authored-by: Isaac
Signed-off-by: James Broadhead
---
 .../src/type-generator/query-registry.ts      | 73 ++++++++++++++++++-
 packages/appkit/src/type-generator/types.ts   |  2 +
 2 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/packages/appkit/src/type-generator/query-registry.ts b/packages/appkit/src/type-generator/query-registry.ts
index 196690c2d..0285d883c 100644
--- a/packages/appkit/src/type-generator/query-registry.ts
+++ b/packages/appkit/src/type-generator/query-registry.ts
@@ -1,6 +1,7 @@
 import fs from "node:fs/promises";
 import path from "node:path";
 import { WorkspaceClient } from "@databricks/sdk-experimental";
+import { tableFromIPC } from "apache-arrow";
 import pc from "picocolors";
 import { createLogger } from "../logging/logger";
 import { CACHE_VERSION, hashSQL, loadCache, saveCache } from "./cache";
@@ -129,18 +130,85 @@ function formatParametersType(sql: string): string {
     : "Record";
 }
 
+/**
+ * Map Arrow DataType IDs to Databricks SQL type names.
+ * Arrow type IDs come from the Arrow spec (apache-arrow TypeId enum).
+ * We only need to cover the types that DESCRIBE QUERY can return.
+ */
+function arrowTypeToSqlName(arrowType: { typeId: number }): string {
+  switch (arrowType.typeId) {
+    case 1: // Bool
+      return "BOOLEAN";
+    case 2: // Int (covers TINYINT, SMALLINT, INT, BIGINT depending on bitWidth)
+      return "INT";
+    case 3: // Float (covers FLOAT, DOUBLE)
+      return "DOUBLE";
+    case 4: // Decimal
+      return "DECIMAL";
+    case 5: // Utf8
+      return "STRING";
+    case 6: // Binary
+      return "BINARY";
+    case 7: // FixedSizeBinary
+      return "BINARY";
+    case 8: // Date
+      return "DATE";
+    case 10: // Timestamp
+      return "TIMESTAMP";
+    case 12: // List
+      return "ARRAY";
+    case 14: // Struct
+      return "STRUCT";
+    case 15: // Map
+      return "MAP";
+    default:
+      return "STRING";
+  }
+}
+
+/**
+ * Decode a base64 Arrow IPC attachment and extract column metadata.
+ * Returns the same shape as rows parsed from DESCRIBE QUERY data_array.
+ */
+function columnsFromArrowAttachment(
+  attachment: string,
+): Array<{ name: string; type_name: string; comment: string | undefined }> {
+  const buf = Buffer.from(attachment, "base64");
+  const table = tableFromIPC(buf);
+  return table.schema.fields.map((field) => ({
+    name: field.name,
+    type_name: arrowTypeToSqlName(field.type),
+    comment: undefined,
+  }));
+}
+
 export function convertToQueryType(
   result: DatabricksStatementExecutionResponse,
   sql: string,
   queryName: string,
 ): { type: string; hasResults: boolean } {
   const dataRows = result.result?.data_array || [];
-  const columns = dataRows.map((row) => ({
+  let columns = dataRows.map((row) => ({
     name: row[0] || "",
     type_name: row[1]?.toUpperCase() || "STRING",
     comment: row[2] || undefined,
   }));
 
+  // Fallback: serverless warehouses may return ARROW_STREAM format with an
+  // inline base64 attachment instead of data_array. Decode the Arrow IPC
+  // schema to extract column names and types.
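+  // Illustrative response shape handled by this branch (per the types.ts
+  // change in this patch): { result: { attachment: "<base64 Arrow IPC>" } },
+  // with no data_array present at all.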
+  if (columns.length === 0 && result.result?.attachment) {
+    logger.debug("data_array empty, decoding Arrow IPC attachment for schema");
+    try {
+      columns = columnsFromArrowAttachment(result.result.attachment);
+    } catch (err) {
+      logger.warn(
+        "Failed to decode Arrow IPC attachment: %s",
+        err instanceof Error ? err.message : String(err),
+      );
+    }
+  }
+
   const paramsType = formatParametersType(sql);
 
   // generate result fields with JSDoc
@@ -397,10 +465,11 @@ export async function generateQueriesFromDescribe(
     );
 
     logger.debug(
-      "DESCRIBE result for %s: state=%s, rows=%d",
+      "DESCRIBE result for %s: state=%s, rows=%d, hasAttachment=%s",
       queryName,
       result.status.state,
       result.result?.data_array?.length ?? 0,
+      !!result.result?.attachment,
     );
 
     if (result.status.state === "FAILED") {
diff --git a/packages/appkit/src/type-generator/types.ts b/packages/appkit/src/type-generator/types.ts
index 5af43591a..9a591f512 100644
--- a/packages/appkit/src/type-generator/types.ts
+++ b/packages/appkit/src/type-generator/types.ts
@@ -12,6 +12,8 @@ export interface DatabricksStatementExecutionResponse {
   };
   result?: {
     data_array?: (string | null)[][];
+    /** Base64-encoded Arrow IPC bytes (returned by serverless warehouses using ARROW_STREAM format) */
+    attachment?: string;
   };
 }

From 55a3a97df5fec489f59b4e6bf4f9500742f6c29a Mon Sep 17 00:00:00 2001
From: James Broadhead
Date: Mon, 27 Apr 2026 17:30:28 +0000
Subject: [PATCH 2/3] chore(appkit): declare apache-arrow as direct dependency

The Arrow IPC schema-decoding code in this PR imports apache-arrow from
packages/appkit/src/type-generator/query-registry.ts. Until now it
resolved transitively via packages/appkit-ui, which knip flags as an
unlisted dependency. Declare it directly to satisfy knip and make the
dependency explicit.

Signed-off-by: James Broadhead
---
 packages/appkit/package.json | 1 +
 pnpm-lock.yaml               | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/packages/appkit/package.json b/packages/appkit/package.json
index 3b57014c0..04232f88b 100644
--- a/packages/appkit/package.json
+++ b/packages/appkit/package.json
@@ -69,6 +69,7 @@
     "@opentelemetry/sdk-trace-base": "2.6.0",
     "@opentelemetry/semantic-conventions": "1.38.0",
     "@types/semver": "7.7.1",
+    "apache-arrow": "21.1.0",
     "dotenv": "16.6.1",
     "express": "4.22.0",
     "obug": "2.1.1",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 9ca11b818..46096f433 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -299,6 +299,9 @@ importers:
       '@types/semver':
         specifier: 7.7.1
         version: 7.7.1
+      apache-arrow:
+        specifier: 21.1.0
+        version: 21.1.0
       dotenv:
         specifier: 16.6.1
         version: 16.6.1
@@ -5539,7 +5542,7 @@ packages:
 
   basic-ftp@5.0.5:
     resolution: {integrity: sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==}
    engines: {node: '>=10.0.0'}
    deprecated: Security vulnerability fixed in 5.2.1, please upgrade
 
   batch@0.6.1:
     resolution: {integrity: sha512-x+VAiMRL6UPkx+kudNvxTl6hB2XNNCG2r+7wixVfIYwu/2HKRXimwQyaumLjMveWvT2Hkd/cAJw+QBMfJ/EKVw==}
 
@@ -6653,6 +6656,7 @@ packages:
 
   dottie@2.0.6:
     resolution: {integrity: sha512-iGCHkfUc5kFekGiqhe8B/mdaurD+lakO9txNnTvKtA6PISrw86LgqHvRzWYPyoE2Ph5aMIrCw9/uko6XHTKCwA==}
+    deprecated: Package no longer supported. Contact Support at https://www.npmjs.com/support for more info.
 
   drizzle-orm@0.45.1:
     resolution: {integrity: sha512-Te0FOdKIistGNPMq2jscdqngBRfBpC8uMFVwqjf6gtTVJHIQ/dosgV/CLBU2N4ZJBsXL5savCba9b0YJskKdcA==}

From 52e9cf47831da5619905eecabad16c8adb72eccb Mon Sep 17 00:00:00 2001
From: James Broadhead
Date: Mon, 27 Apr 2026 20:01:07 +0000
Subject: [PATCH 3/3] fix: read DESCRIBE QUERY data rows from Arrow attachment, not schema
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous implementation read `table.schema.fields` from the
ARROW_STREAM attachment. A DESCRIBE QUERY response is a result *table*
whose own schema is `(col_name, data_type, comment)` — so this would
generate the same bogus type `{ col_name: string; data_type: string;
comment: string }` for every query routed through serverless.

- Replace columnsFromArrowAttachment with one that iterates
  table.toArray() and reads the col_name / data_type / comment values
  per row, matching the legacy data_array path.
- Drop arrowTypeToSqlName entirely. Its numeric TypeId map was wrong
  (e.g. case 1 -> Bool, but apache-arrow's Type.Null is 1; case 6 ->
  Binary, but Type.Bool is 6; case 14 -> Struct, but Type.Union is 14).
  Since data_type already carries the SQL type name as a string, the
  helper is no longer needed.
- Add tests covering the attachment fallback, the
  data_array-preferred-over-attachment case, lowercase data_type
  normalization, and the malformed-attachment path.

Signed-off-by: James Broadhead
---
 .../src/type-generator/query-registry.ts      | 76 ++++++---------
 .../tests/query-registry.test.ts              | 96 +++++++++++++++++++
 2 files changed, 126 insertions(+), 46 deletions(-)

diff --git a/packages/appkit/src/type-generator/query-registry.ts b/packages/appkit/src/type-generator/query-registry.ts
index 0285d883c..df1f1e4cb 100644
--- a/packages/appkit/src/type-generator/query-registry.ts
+++ b/packages/appkit/src/type-generator/query-registry.ts
@@ -131,55 +131,39 @@ function formatParametersType(sql: string): string {
 }
 
 /**
- * Map Arrow DataType IDs to Databricks SQL type names.
- * Arrow type IDs come from the Arrow spec (apache-arrow TypeId enum).
- * We only need to cover the types that DESCRIBE QUERY can return.
- */
-function arrowTypeToSqlName(arrowType: { typeId: number }): string {
-  switch (arrowType.typeId) {
-    case 1: // Bool
-      return "BOOLEAN";
-    case 2: // Int (covers TINYINT, SMALLINT, INT, BIGINT depending on bitWidth)
-      return "INT";
-    case 3: // Float (covers FLOAT, DOUBLE)
-      return "DOUBLE";
-    case 4: // Decimal
-      return "DECIMAL";
-    case 5: // Utf8
-      return "STRING";
-    case 6: // Binary
-      return "BINARY";
-    case 7: // FixedSizeBinary
-      return "BINARY";
-    case 8: // Date
-      return "DATE";
-    case 10: // Timestamp
-      return "TIMESTAMP";
-    case 12: // List
-      return "ARRAY";
-    case 14: // Struct
-      return "STRUCT";
-    case 15: // Map
-      return "MAP";
-    default:
-      return "STRING";
-  }
-}
-
-/**
- * Decode a base64 Arrow IPC attachment and extract column metadata.
- * Returns the same shape as rows parsed from DESCRIBE QUERY data_array.
+ * Decode a base64 Arrow IPC attachment from a DESCRIBE QUERY response and
+ * extract column metadata. Returns the same shape as rows parsed from the
+ * legacy data_array path.
+ *
+ * IMPORTANT: a DESCRIBE QUERY response is itself a result *table* with rows
+ * shaped like `(col_name, data_type, comment)` describing the user query's
+ * output schema. We must read those rows — NOT `table.schema.fields`, which
+ * would describe DESCRIBE QUERY's own output (`col_name`, `data_type`,
+ * `comment`) and yield bogus types for every query.
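+ *
+ * Illustrative example (not taken from Databricks docs): for
+ * `SELECT id, name FROM users` the attachment decodes to a table whose
+ * data rows are ("id", "bigint", null) and ("name", "string", null);
+ * those rows, not the table's own schema, carry the user query's columns.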
 */
 function columnsFromArrowAttachment(
   attachment: string,
 ): Array<{ name: string; type_name: string; comment: string | undefined }> {
   const buf = Buffer.from(attachment, "base64");
   const table = tableFromIPC(buf);
-  return table.schema.fields.map((field) => ({
-    name: field.name,
-    type_name: arrowTypeToSqlName(field.type),
-    comment: undefined,
-  }));
+  return table.toArray().map((row) => {
+    const obj = row.toJSON() as {
+      col_name?: unknown;
+      data_type?: unknown;
+      comment?: unknown;
+    };
+    return {
+      name: typeof obj.col_name === "string" ? obj.col_name : "",
+      type_name:
+        typeof obj.data_type === "string"
+          ? obj.data_type.toUpperCase()
+          : "STRING",
+      comment:
+        typeof obj.comment === "string" && obj.comment !== ""
+          ? obj.comment
+          : undefined,
+    };
+  });
 }
 
 export function convertToQueryType(
@@ -194,9 +178,9 @@ export function convertToQueryType(
     comment: row[2] || undefined,
   }));
 
-  // Fallback: serverless warehouses may return ARROW_STREAM format with an
-  // inline base64 attachment instead of data_array. Decode the Arrow IPC
-  // schema to extract column names and types.
+  // Fallback: serverless warehouses return ARROW_STREAM format with an inline
+  // base64 attachment instead of data_array. Decode the Arrow IPC rows (the
+  // DESCRIBE QUERY result table) to extract column names and types.
   if (columns.length === 0 && result.result?.attachment) {
     logger.debug("data_array empty, decoding Arrow IPC attachment for schema");
     try {
diff --git a/packages/appkit/src/type-generator/tests/query-registry.test.ts b/packages/appkit/src/type-generator/tests/query-registry.test.ts
index 8d46f98e9..d3c5be55e 100644
--- a/packages/appkit/src/type-generator/tests/query-registry.test.ts
+++ b/packages/appkit/src/type-generator/tests/query-registry.test.ts
@@ -1,3 +1,4 @@
+import { Table, tableToIPC, vectorFromArray } from "apache-arrow";
 import { describe, expect, test } from "vitest";
 import {
   convertToQueryType,
@@ -11,6 +12,20 @@ import {
 } from "../query-registry";
 import type { DatabricksStatementExecutionResponse } from "../types";
 
+// Build a base64 Arrow IPC payload that mimics a DESCRIBE QUERY response —
+// a result *table* with columns (col_name, data_type, comment) describing
+// the user query's output schema.
+function describeQueryAttachment(
+  rows: Array<{ col_name: string; data_type: string; comment: string | null }>,
+): string {
+  const table = new Table({
+    col_name: vectorFromArray(rows.map((r) => r.col_name)),
+    data_type: vectorFromArray(rows.map((r) => r.data_type)),
+    comment: vectorFromArray(rows.map((r) => r.comment ?? "")),
+  });
+  return Buffer.from(tableToIPC(table, "stream")).toString("base64");
+}
+
 describe("normalizeTypeName", () => {
   test("returns simple types unchanged", () => {
     expect(normalizeTypeName("STRING")).toBe("STRING");
@@ -346,6 +361,87 @@ SELECT * FROM users WHERE date = :startDate AND count = :count AND name = :name`
     );
     expect(hasResults).toBe(false);
   });
+
+  describe("ARROW_STREAM attachment fallback (serverless warehouses)", () => {
+    test("decodes column metadata from Arrow IPC data rows, not schema fields", () => {
+      // Critical regression test: it would be a bug to read
+      // `table.schema.fields` here, which would generate types like
+      // { col_name: string; data_type: string; comment: string } for every
+      // query (those are DESCRIBE QUERY's own output columns). We must read
+      // the data rows.
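+      // Note: describeQueryAttachment (test-only helper above) builds the
+      // IPC bytes with apache-arrow's own Table/tableToIPC, so this
+      // exercises the real tableFromIPC decode path end-to-end without a
+      // live serverless warehouse.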
+ const attachment = describeQueryAttachment([ + { col_name: "user_id", data_type: "BIGINT", comment: null }, + { col_name: "name", data_type: "STRING", comment: "display name" }, + { col_name: "active", data_type: "BOOLEAN", comment: null }, + ]); + const response: DatabricksStatementExecutionResponse = { + statement_id: "test-arrow", + status: { state: "SUCCEEDED" }, + result: { attachment }, + }; + + const { type, hasResults } = convertToQueryType( + response, + "SELECT user_id, name, active FROM users", + "users", + ); + + expect(hasResults).toBe(true); + // Real query columns appear in the generated type: + expect(type).toContain("user_id: number"); + expect(type).toContain("name: string"); + expect(type).toContain("active: boolean"); + // Column comments survive: + expect(type).toContain("/** display name"); + // The DESCRIBE QUERY metadata column names must NOT leak as user types: + expect(type).not.toContain("col_name: string"); + expect(type).not.toContain("data_type: string"); + }); + + test("normalizes lowercase data_type values to uppercase", () => { + const attachment = describeQueryAttachment([ + { col_name: "id", data_type: "int", comment: null }, + ]); + const response: DatabricksStatementExecutionResponse = { + statement_id: "test-arrow", + status: { state: "SUCCEEDED" }, + result: { attachment }, + }; + + const { type } = convertToQueryType(response, "SELECT 1", "test"); + expect(type).toContain("@sqlType INT"); + expect(type).toContain("id: number"); + }); + + test("prefers data_array over attachment when both are present", () => { + const attachment = describeQueryAttachment([ + { col_name: "from_arrow", data_type: "STRING", comment: null }, + ]); + const response: DatabricksStatementExecutionResponse = { + statement_id: "test-both", + status: { state: "SUCCEEDED" }, + result: { + data_array: [["from_data_array", "INT", null]], + attachment, + }, + }; + + const { type } = convertToQueryType(response, "SELECT 1", "test"); + expect(type).toContain("from_data_array: number"); + expect(type).not.toContain("from_arrow"); + }); + + test("logs a warning and yields no columns on malformed attachment", () => { + const response: DatabricksStatementExecutionResponse = { + statement_id: "test-bad", + status: { state: "SUCCEEDED" }, + result: { attachment: "not-valid-arrow-ipc" }, + }; + + const { hasResults } = convertToQueryType(response, "SELECT 1", "test"); + expect(hasResults).toBe(false); + }); + }); }); describe("inferParameterTypes", () => {