From 5496746f1aff29c3109cd251cf51b40575fd9ee1 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Sun, 28 Sep 2025 18:20:06 +0800 Subject: [PATCH 01/22] chore: add ChunkCollectorOutputFormat.cpp --- programs/local/ChunkCollectorOutputFormat.cpp | 49 +++++++++++++++++++ programs/local/ChunkCollectorOutputFormat.h | 44 +++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 programs/local/ChunkCollectorOutputFormat.cpp create mode 100644 programs/local/ChunkCollectorOutputFormat.h diff --git a/programs/local/ChunkCollectorOutputFormat.cpp b/programs/local/ChunkCollectorOutputFormat.cpp new file mode 100644 index 00000000000..3e4451dc09d --- /dev/null +++ b/programs/local/ChunkCollectorOutputFormat.cpp @@ -0,0 +1,49 @@ +#include "ChunkCollectorOutputFormat.h" +#include "PandasDataFrameBuilder.h" + +#include +#include +#include + +namespace DB +{ + +NullWriteBuffer ChunkCollectorOutputFormat::out; + +ChunkCollectorOutputFormat::ChunkCollectorOutputFormat( + const Block & header, + PandasDataFrameBuilder & builder) + : IOutputFormat(header, out) + , dataframe_builder(builder) +{} + +void ChunkCollectorOutputFormat::consume(Chunk chunk) +{ + chunks.emplace_back(std::move(chunk)); +} + +void ChunkCollectorOutputFormat::consumeTotals(Chunk totals) +{ + chunks.emplace_back(std::move(totals)); +} + +void ChunkCollectorOutputFormat::consumeExtremes(Chunk extremes) +{ + chunks.emplace_back(std::move(extremes)); +} + +void ChunkCollectorOutputFormat::finalizeImpl() +{ + // Add all collected chunks to the builder + for (const auto & chunk : chunks) + { + dataframe_builder.addChunk(chunk); + } + + // Finalize the DataFrame generation + dataframe_builder.finalize(); + + chunks.clear(); +} + +} diff --git a/programs/local/ChunkCollectorOutputFormat.h b/programs/local/ChunkCollectorOutputFormat.h new file mode 100644 index 00000000000..707b2ae6bb3 --- /dev/null +++ b/programs/local/ChunkCollectorOutputFormat.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class NullWriteBuffer; +class PandasDataFrameBuilder; + +/// OutputFormat that collects all chunks into memory for further processing +/// Does not write to WriteBuffer, instead accumulates data for conversion to pandas DataFrame objects +class ChunkCollectorOutputFormat : public IOutputFormat +{ +public: + ChunkCollectorOutputFormat(const Block & header, PandasDataFrameBuilder & builder); + + String getName() const override { return "ChunkCollectorOutputFormat"; } + + void onCancel() noexcept override + { + chunks.clear(); + } + +protected: + void consume(Chunk chunk) override; + + void consumeTotals(Chunk totals) override; + + void consumeExtremes(Chunk extremes) override; + + void finalizeImpl() override; + +private: + std::vector chunks; + + PandasDataFrameBuilder & dataframe_builder; + + /// Is not used. 
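+    /// IOutputFormat's constructor requires a WriteBuffer, so a shared
+    /// NullWriteBuffer is supplied; collected rows stay in `chunks` instead.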
+ static NullWriteBuffer out; +}; + +} From bd998e2b988d014c388daeba3e3f98d16cb04398 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Mon, 29 Sep 2025 15:57:24 +0800 Subject: [PATCH 02/22] chore: adjust Python-C interface to support direct DataFrame return --- chdb/__init__.py | 30 +-------- chdb/state/sqlitelike.py | 44 +------------ programs/local/CMakeLists.txt | 2 + programs/local/ChunkCollectorOutputFormat.cpp | 37 +++++++++++ programs/local/ChunkCollectorOutputFormat.h | 12 ++++ programs/local/LocalChdb.cpp | 63 ++++++++++++++----- programs/local/LocalChdb.h | 2 +- programs/local/LocalServer.cpp | 4 +- programs/local/QueryResult.h | 3 + src/Client/ClientBase.cpp | 31 +++++++++ src/Client/ClientBase.h | 13 ++++ 11 files changed, 151 insertions(+), 90 deletions(-) diff --git a/chdb/__init__.py b/chdb/__init__.py index 0674a46927c..2ad5d96d720 100644 --- a/chdb/__init__.py +++ b/chdb/__init__.py @@ -38,9 +38,8 @@ class ChdbError(Exception): """ -_arrow_format = set({"dataframe", "arrowtable"}) +_arrow_format = set({"arrowtable"}) _process_result_format_funs = { - "dataframe": lambda x: to_df(x), "arrowtable": lambda x: to_arrowTable(x), } @@ -106,33 +105,6 @@ def to_arrowTable(res): return pa.RecordBatchFileReader(res.bytes()).read_all() -# return pandas dataframe -def to_df(r): - """Convert query result to pandas DataFrame. - - Converts a chDB query result to a pandas DataFrame by first converting to - PyArrow Table and then to pandas using multi-threading for better performance. - - Args: - r: chDB query result object containing binary Arrow data - - Returns: - pd.DataFrame: pandas DataFrame containing the query results - - Raises: - ImportError: If pyarrow or pandas are not installed - - Example: - >>> result = chdb.query("SELECT 1 as id, 'hello' as msg", "Arrow") - >>> df = chdb.to_df(result) - >>> print(df) - id msg - 0 1 hello - """ - t = to_arrowTable(r) - return t.to_pandas(use_threads=True) - - # global connection lock, for multi-threading use of legacy chdb.query() g_conn_lock = threading.Lock() diff --git a/chdb/state/sqlitelike.py b/chdb/state/sqlitelike.py index 7694cb42ece..e9016a8417c 100644 --- a/chdb/state/sqlitelike.py +++ b/chdb/state/sqlitelike.py @@ -10,9 +10,8 @@ raise ImportError("Failed to import pyarrow") from None -_arrow_format = set({"dataframe", "arrowtable"}) +_arrow_format = set({"arrowtable"}) _process_result_format_funs = { - "dataframe": lambda x: to_df(x), "arrowtable": lambda x: to_arrowTable(x), } @@ -65,47 +64,6 @@ def to_arrowTable(res): return pa.RecordBatchFileReader(res.bytes()).read_all() -# return pandas dataframe -def to_df(r): - """Convert query result to Pandas DataFrame. - - This function converts chdb query results to a Pandas DataFrame format - by first converting to PyArrow Table and then to DataFrame. This provides - convenient data analysis capabilities with Pandas API. - - Args: - r: Query result object from chdb containing Arrow format data - - Returns: - pandas.DataFrame: DataFrame containing the query results with - appropriate column names and data types - - Raises: - ImportError: If pyarrow or pandas packages are not installed - - .. note:: - This function uses multi-threading for the Arrow to Pandas conversion - to improve performance on large datasets. - - .. 
seealso:: - :func:`to_arrowTable` - For PyArrow Table format conversion - - Examples: - >>> import chdb - >>> result = chdb.query("SELECT 1 as num, 'hello' as text", "Arrow") - >>> df = to_df(result) - >>> print(df) - num text - 0 1 hello - >>> print(df.dtypes) - num int64 - text object - dtype: object - """ - t = to_arrowTable(r) - return t.to_pandas(use_threads=True) - - class StreamingResult: def __init__(self, c_result, conn, result_func, supports_record_batch): self._result = c_result diff --git a/programs/local/CMakeLists.txt b/programs/local/CMakeLists.txt index 83095fe2dd0..614d70f1cbc 100644 --- a/programs/local/CMakeLists.txt +++ b/programs/local/CMakeLists.txt @@ -12,6 +12,7 @@ endif() if (USE_PYTHON) set (CHDB_SOURCES chdb.cpp + ChunkCollectorOutputFormat.cpp FormatHelper.cpp ListScan.cpp LocalChdb.cpp @@ -19,6 +20,7 @@ if (USE_PYTHON) NumpyType.cpp PandasAnalyzer.cpp PandasDataFrame.cpp + PandasDataFrameBuilder.cpp PandasScan.cpp PybindWrapper.cpp PythonConversion.cpp diff --git a/programs/local/ChunkCollectorOutputFormat.cpp b/programs/local/ChunkCollectorOutputFormat.cpp index 3e4451dc09d..38d31883a60 100644 --- a/programs/local/ChunkCollectorOutputFormat.cpp +++ b/programs/local/ChunkCollectorOutputFormat.cpp @@ -3,6 +3,7 @@ #include #include +#include #include namespace DB @@ -46,4 +47,40 @@ void ChunkCollectorOutputFormat::finalizeImpl() chunks.clear(); } +/// Global dataframe builder +static std::unique_ptr g_dataframe_builder = nullptr; + +PandasDataFrameBuilder * getGlobalDataFrameBuilder() +{ + return g_dataframe_builder.get(); +} + +void setGlobalDataFrameBuilder(std::unique_ptr builder) +{ + g_dataframe_builder = std::move(builder); +} + +void resetGlobalDataFrameBuilder() +{ + g_dataframe_builder.reset(); +} + +/// create ChunkCollectorOutputFormat for use with function pointer +std::shared_ptr createDataFrameOutputFormat(const Block & header) +{ + /// Create a PandasDataFrameBuilder and set it globally + auto dataframe_builder = std::make_unique(header); + PandasDataFrameBuilder * builder_ptr = dataframe_builder.get(); + setGlobalDataFrameBuilder(std::move(dataframe_builder)); + + /// Create and return the format with the builder + return std::make_shared(header, *builder_ptr); +} + +/// Registration function to be called during initialization +void registerDataFrameOutputFormat() +{ + ClientBase::setDataFrameFormatCreator(&createDataFrameOutputFormat); +} + } diff --git a/programs/local/ChunkCollectorOutputFormat.h b/programs/local/ChunkCollectorOutputFormat.h index 707b2ae6bb3..8c588cd9711 100644 --- a/programs/local/ChunkCollectorOutputFormat.h +++ b/programs/local/ChunkCollectorOutputFormat.h @@ -41,4 +41,16 @@ class ChunkCollectorOutputFormat : public IOutputFormat static NullWriteBuffer out; }; +/// Registration function to be called during initialization +void registerDataFrameOutputFormat(); + +/// Get the global dataframe builder +PandasDataFrameBuilder * getGlobalDataFrameBuilder(); + +/// Set the global dataframe builder +void setGlobalDataFrameBuilder(std::unique_ptr builder); + +/// Reset the global dataframe builder +void resetGlobalDataFrameBuilder(); + } diff --git a/programs/local/LocalChdb.cpp b/programs/local/LocalChdb.cpp index 7d430f75bbf..e40346c4ddb 100644 --- a/programs/local/LocalChdb.cpp +++ b/programs/local/LocalChdb.cpp @@ -1,14 +1,13 @@ #include "LocalChdb.h" -#include +#include "chdb-internal.h" +#include "ChunkCollectorOutputFormat.h" +#include "PandasDataFrameBuilder.h" #include "PythonImporter.h" #include "PythonTableCache.h" 
#include "StoragePython.h" -#include "chdb-internal.h" -#include "chdb.h" #include #include - #include namespace py = pybind11; @@ -76,13 +75,31 @@ chdb_result * queryToBuffer( // Pybind11 will take over the ownership of the `query_result` object // using smart ptr will cause early free of the object -query_result * query( +py::object query( const std::string & queryStr, const std::string & output_format = "CSV", const std::string & path = {}, const std::string & udfPath = {}) { - return new query_result(queryToBuffer(queryStr, output_format, path, udfPath)); + auto * result = queryToBuffer(queryStr, output_format, path, udfPath); + + if (output_format == "dataframe") + { + chdb_destroy_query_result(result); + + auto * builder = DB::getGlobalDataFrameBuilder(); + if (builder && builder->hasData()) + { + return builder->getDataFrame(); + } + else + { + throw std::runtime_error("DataFrame not available - query may have failed"); + } + } + + // Default behavior - return query_result + return py::cast(new query_result(result)); } // The `query_result` and `memoryview_wrapper` will hold `local_result_wrapper` with shared_ptr @@ -260,17 +277,12 @@ void connection_wrapper::commit() // do nothing } -query_result * connection_wrapper::query(const std::string & query_str, const std::string & format) +py::object connection_wrapper::query(const std::string & query_str, const std::string & format) { CHDB::PythonTableCache::findQueryableObjFromQuery(query_str); py::gil_scoped_release release; auto * result = chdb_query_n(*conn, query_str.data(), query_str.size(), format.data(), format.size()); - if (chdb_result_length(result)) - { - LOG_DEBUG(getLogger("CHDB"), "Empty result returned for query: {}", query_str); - } - auto error_msg = CHDB::chdb_result_error_string(result); if (!error_msg.empty()) { @@ -278,7 +290,28 @@ query_result * connection_wrapper::query(const std::string & query_str, const st chdb_destroy_query_result(result); throw std::runtime_error(msg_copy); } - return new query_result(result, false); + + if (format == "dataframe") + { + chdb_destroy_query_result(result); + + auto * builder = DB::getGlobalDataFrameBuilder(); + if (builder && builder->hasData()) + { + return builder->getDataFrame(); + } + else + { + throw std::runtime_error("DataFrame not available - query may have failed"); + } + } + + if (chdb_result_length(result)) + { + LOG_DEBUG(getLogger("CHDB"), "Empty result returned for query: {}", query_str); + } + + return py::cast(new query_result(result, false)); } streaming_query_result * connection_wrapper::send_query(const std::string & query_str, const std::string & format) @@ -480,7 +513,7 @@ PYBIND11_MODULE(_chdb, m) &connection_wrapper::query, py::arg("query_str"), py::arg("format") = "CSV", - "Execute a query and return a query_result object") + "Execute a query and return a query_result object or DataFrame") .def( "send_query", &connection_wrapper::send_query, @@ -506,7 +539,7 @@ PYBIND11_MODULE(_chdb, m) py::kw_only(), py::arg("path") = "", py::arg("udf_path") = "", - "Query chDB and return a query_result object"); + "Query chDB and return a query_result object or DataFrame"); auto destroy_import_cache = []() { diff --git a/programs/local/LocalChdb.h b/programs/local/LocalChdb.h index 5cf30058135..076103cca19 100644 --- a/programs/local/LocalChdb.h +++ b/programs/local/LocalChdb.h @@ -30,7 +30,7 @@ class connection_wrapper cursor_wrapper * cursor(); void commit(); void close(); - query_result * query(const std::string & query_str, const std::string & format = "CSV"); + 
py::object query(const std::string & query_str, const std::string & format = "CSV"); streaming_query_result * send_query(const std::string & query_str, const std::string & format = "CSV"); query_result * streaming_fetch_result(streaming_query_result * streaming_result); void streaming_cancel_query(streaming_query_result * streaming_result); diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index edf3f67ad20..04023c82b2a 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -1,13 +1,12 @@ #include "LocalServer.h" - #include "chdb-internal.h" #if USE_PYTHON +#include "ChunkCollectorOutputFormat.h" #include "TableFunctionPython.h" #include #include #endif #include - #include #include #include @@ -656,6 +655,7 @@ try #if USE_PYTHON auto & storage_factory = StorageFactory::instance(); registerStoragePython(storage_factory); + registerDataFrameOutputFormat(); #endif registerDictionaries(); diff --git a/programs/local/QueryResult.h b/programs/local/QueryResult.h index ebd79ec042e..bbd924e3931 100644 --- a/programs/local/QueryResult.h +++ b/programs/local/QueryResult.h @@ -64,6 +64,9 @@ class MaterializedQueryResult : public QueryResult { String string() { + if (!result_buffer) + return {}; + return String(result_buffer->begin(), result_buffer->end()); } diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 0f1d0c3ae22..87f3c02d1be 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -138,6 +138,9 @@ namespace ErrorCodes extern const int CANNOT_WRITE_TO_FILE; } +/// Custom DataFrame format creator function pointer +static CustomOutputFormatCreator g_dataframe_format_creator = nullptr; + } namespace ProfileEvents @@ -615,6 +618,22 @@ try { if (!output_format) { +#if USE_PYTHON + if (default_output_format == "dataframe") + { + auto creator = getDataFrameFormatCreator(); + if (creator) + { + output_format = creator(block); + return; + } + else + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "DataFrame output format creator not set"); + } + } +#endif + /// Ignore all results when fuzzing as they can be huge. if (query_fuzzer_runs) { @@ -3755,4 +3774,16 @@ void ClientBase::showClientVersion() output_stream << VERSION_NAME << " " + getName() + " version " << VERSION_STRING << VERSION_OFFICIAL << "." << std::endl; } +#if USE_PYTHON +void ClientBase::setDataFrameFormatCreator(CustomOutputFormatCreator creator) +{ + g_dataframe_format_creator = std::move(creator); +} + +CustomOutputFormatCreator ClientBase::getDataFrameFormatCreator() +{ + return g_dataframe_format_creator; +} +#endif + } diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index cb3e0a939ac..4d1a5a897bd 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -88,6 +88,11 @@ struct StreamingQueryContext StreamingQueryContext() = default; }; +#if USE_PYTHON +/// Function pointer type for creating custom output formats (e.g. DataFrame) +using CustomOutputFormatCreator = std::function(const Block &)>; +#endif + /** * The base class which encapsulates the core functionality of a client. 
* Can be used in a standalone application (clickhouse-client or clickhouse-local), @@ -329,6 +334,14 @@ class ClientBase String appendSmileyIfNeeded(const String & prompt); +#if USE_PYTHON + /// Set custom DataFrame format creator + static void setDataFrameFormatCreator(CustomOutputFormatCreator creator); + + /// Get custom DataFrame format creator + static CustomOutputFormatCreator getDataFrameFormatCreator(); +#endif + /// Should be one of the first, to be destroyed the last, /// since other members can use them. /// This holder may not be initialized in case if we run the client in the embedded mode (SSH). From 4b0276b2ef3121967ea6aaf9fd21a9f5f6392672 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Mon, 29 Sep 2025 22:18:32 +0800 Subject: [PATCH 03/22] chore: update NumpyType.cpp --- programs/local/NumpyType.cpp | 115 +++++++++++++++++++++++++++++++++++ programs/local/NumpyType.h | 1 + 2 files changed, 116 insertions(+) diff --git a/programs/local/NumpyType.cpp b/programs/local/NumpyType.cpp index cf92c8dece9..0112731e02a 100644 --- a/programs/local/NumpyType.cpp +++ b/programs/local/NumpyType.cpp @@ -231,4 +231,119 @@ std::shared_ptr NumpyToDataType(const NumpyType & col_type) } } +String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type) +{ + if (!data_type) + return "object"; + + /// First, try to handle most types efficiently using getTypeId() + TypeIndex type_id = data_type->getTypeId(); + switch (type_id) + { + case TypeIndex::Int8: + return "int8"; + case TypeIndex::UInt8: + /// Special case: UInt8 could be Bool type, need to check getName() + { + const String & type_name = data_type->getName(); + return (type_name == "Bool") ? "bool" : "uint8"; + } + case TypeIndex::Int16: + return "int16"; + case TypeIndex::UInt16: + return "uint16"; + case TypeIndex::Int32: + return "int32"; + case TypeIndex::UInt32: + return "uint32"; + case TypeIndex::Int64: + return "int64"; + case TypeIndex::UInt64: + return "uint64"; + case TypeIndex::Float32: + return "float32"; + case TypeIndex::Float64: + return "float64"; + case TypeIndex::String: + case TypeIndex::FixedString: + return "object"; + case TypeIndex::DateTime: + return "datetime64[s]"; + case TypeIndex::DateTime64: + // DateTime64 needs precision info from the actual type + { + if (const auto * dt64 = typeid_cast(data_type.get())) + { + UInt32 scale = dt64->getScale(); + if (scale == 0) + return "datetime64[s]"; + else if (scale == 3) + return "datetime64[ms]"; + else if (scale == 6) + return "datetime64[us]"; + else if (scale == 9) + return "datetime64[ns]"; + else + return "datetime64[ns]"; // Default to nanoseconds + } + return "datetime64[ns]"; // Default fallback + } + case TypeIndex::Date: + case TypeIndex::Date32: + return "datetime64[D]"; + case TypeIndex::UUID: + case TypeIndex::IPv4: + case TypeIndex::IPv6: + return "object"; + case TypeIndex::Decimal32: + case TypeIndex::Decimal64: + case TypeIndex::Decimal128: + case TypeIndex::Decimal256: + return "float64"; // Decimals are converted to float64 + case TypeIndex::Array: + case TypeIndex::Tuple: + case TypeIndex::Map: + return "object"; + case TypeIndex::Nullable: + // Handle Nullable types - need to check inner type + { + const String & type_name = data_type->getName(); + if (startsWith(type_name, "Nullable(")) + { + // Extract the inner type from "Nullable(InnerType)" + size_t start = 9; // Length of "Nullable(" + size_t end = type_name.length() - 1; // Exclude the closing ")" + if (end > start) + { + String inner_type_name = type_name.substr(start, end - start); + 
// Nullable integers become float64 in pandas + if (inner_type_name == "Int64" || inner_type_name == "Int32" || + inner_type_name == "Int16" || inner_type_name == "Int8" || + inner_type_name == "UInt64" || inner_type_name == "UInt32" || + inner_type_name == "UInt16" || inner_type_name == "UInt8") + return "float64"; + else if (inner_type_name == "Float64") + return "float64"; + else if (inner_type_name == "Float32") + return "float32"; + else if (inner_type_name == "String") + return "object"; + } + } + return "object"; + } + default: + // For other complex types, fall back to getName() parsing + { + const String & type_name = data_type->getName(); + if (startsWith(type_name, "Array(") || startsWith(type_name, "Tuple(") || + startsWith(type_name, "Map(")) + return "object"; + + // Default fallback for unknown types + return "object"; + } + } +} + } // namespace CHDB diff --git a/programs/local/NumpyType.h b/programs/local/NumpyType.h index c58fee13768..91f0d3e3a85 100644 --- a/programs/local/NumpyType.h +++ b/programs/local/NumpyType.h @@ -49,5 +49,6 @@ enum class NumpyObjectType : uint8_t { NumpyType ConvertNumpyType(const py::handle & col_type); std::shared_ptr NumpyToDataType(const NumpyType & col_type); +String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type); } // namespace CHDB From fbaf1c775439df6a37eb4c0e471c7108a4f636ad Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Wed, 29 Oct 2025 22:28:44 +0800 Subject: [PATCH 04/22] chore: add NumpyArray --- programs/local/LocalServer.cpp | 1 + programs/local/NumpyType.cpp | 111 +++++++++++++++++++++------------ 2 files changed, 73 insertions(+), 39 deletions(-) diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 297f33d8040..32dac7fa278 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -1,6 +1,7 @@ #include "LocalServer.h" #include "chdb-internal.h" #if USE_PYTHON +#include "ChunkCollectorOutputFormat.h" #include "StoragePython.h" #include "TableFunctionPython.h" #else diff --git a/programs/local/NumpyType.cpp b/programs/local/NumpyType.cpp index 0112731e02a..98fb1bf76a0 100644 --- a/programs/local/NumpyType.cpp +++ b/programs/local/NumpyType.cpp @@ -1,11 +1,12 @@ #include "NumpyType.h" -#include #include #include +#include #include #include #include +#include using namespace DB; @@ -231,7 +232,7 @@ std::shared_ptr NumpyToDataType(const NumpyType & col_type) } } -String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type) +String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type) { if (!data_type) return "object"; @@ -270,79 +271,111 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type) case TypeIndex::DateTime: return "datetime64[s]"; case TypeIndex::DateTime64: - // DateTime64 needs precision info from the actual type { if (const auto * dt64 = typeid_cast(data_type.get())) { UInt32 scale = dt64->getScale(); if (scale == 0) return "datetime64[s]"; - else if (scale == 3) + else if (scale == 3) return "datetime64[ms]"; else if (scale == 6) return "datetime64[us]"; else if (scale == 9) return "datetime64[ns]"; else - return "datetime64[ns]"; // Default to nanoseconds + return "datetime64[ns]"; } - return "datetime64[ns]"; // Default fallback + return "datetime64[ns]"; } case TypeIndex::Date: case TypeIndex::Date32: return "datetime64[D]"; + case TypeIndex::Time: + return "timedelta64[s]"; + case TypeIndex::Time64: + { + if (const auto * time64 = typeid_cast(data_type.get())) + { + UInt32 scale = time64->getScale(); + if (scale == 0) + return 
"timedelta64[s]"; + else if (scale == 3) + return "timedelta64[ms]"; + else if (scale == 6) + return "timedelta64[us]"; + else if (scale == 9) + return "timedelta64[ns]"; + else + return "timedelta64[ns]"; + } + return "timedelta64[ns]"; + } + case TypeIndex::Interval: + { + if (const auto * interval = typeid_cast(data_type.get())) + { + IntervalKind kind = interval->getKind(); + switch (kind.kind) + { + case IntervalKind::Kind::Nanosecond: + return "timedelta64[ns]"; + case IntervalKind::Kind::Microsecond: + return "timedelta64[us]"; + case IntervalKind::Kind::Millisecond: + return "timedelta64[ms]"; + case IntervalKind::Kind::Second: + return "timedelta64[s]"; + case IntervalKind::Kind::Minute: + return "timedelta64[m]"; + case IntervalKind::Kind::Hour: + return "timedelta64[h]"; + case IntervalKind::Kind::Day: + return "timedelta64[D]"; + case IntervalKind::Kind::Week: + return "timedelta64[W]"; + case IntervalKind::Kind::Month: + return "timedelta64[M]"; + case IntervalKind::Kind::Quarter: + return "object"; + case IntervalKind::Kind::Year: + return "timedelta64[Y]"; + default: + return "timedelta64[s]"; + } + } + return "timedelta64[s]"; + } + case TypeIndex::UUID: case TypeIndex::IPv4: case TypeIndex::IPv6: return "object"; + case TypeIndex::BFloat16: case TypeIndex::Decimal32: case TypeIndex::Decimal64: case TypeIndex::Decimal128: case TypeIndex::Decimal256: - return "float64"; // Decimals are converted to float64 + return "object"; case TypeIndex::Array: case TypeIndex::Tuple: case TypeIndex::Map: + case TypeIndex::Set: + case TypeIndex::Dynamic: + case TypeIndex::Variant: + case TypeIndex::Object: return "object"; case TypeIndex::Nullable: - // Handle Nullable types - need to check inner type { - const String & type_name = data_type->getName(); - if (startsWith(type_name, "Nullable(")) + if (const auto * nullable = typeid_cast(data_type.get())) { - // Extract the inner type from "Nullable(InnerType)" - size_t start = 9; // Length of "Nullable(" - size_t end = type_name.length() - 1; // Exclude the closing ")" - if (end > start) - { - String inner_type_name = type_name.substr(start, end - start); - // Nullable integers become float64 in pandas - if (inner_type_name == "Int64" || inner_type_name == "Int32" || - inner_type_name == "Int16" || inner_type_name == "Int8" || - inner_type_name == "UInt64" || inner_type_name == "UInt32" || - inner_type_name == "UInt16" || inner_type_name == "UInt8") - return "float64"; - else if (inner_type_name == "Float64") - return "float64"; - else if (inner_type_name == "Float32") - return "float32"; - else if (inner_type_name == "String") - return "object"; - } + return DataTypeToNumpyTypeStr(nullable->getNestedType()); } return "object"; } default: - // For other complex types, fall back to getName() parsing - { - const String & type_name = data_type->getName(); - if (startsWith(type_name, "Array(") || startsWith(type_name, "Tuple(") || - startsWith(type_name, "Map(")) - return "object"; - - // Default fallback for unknown types - return "object"; - } + return "object"; + } } } From 8391e2fcc31b472c1157a37f02da0c63ae220971 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Thu, 30 Oct 2025 01:45:02 +0800 Subject: [PATCH 05/22] chore: update NumpyArray --- programs/local/NumpyArray.cpp | 231 +++++++++++++++++ programs/local/NumpyArray.h | 45 ++++ programs/local/NumpyCacheItem.h | 66 +++++ programs/local/NumpyType.cpp | 286 +++++++++++++--------- programs/local/NumpyType.h | 4 + programs/local/PandasAnalyzer.cpp | 2 +- 
 programs/local/PandasDataFrameBuilder.cpp | 112 +++++++++
 programs/local/PandasDataFrameBuilder.h | 46 ++++
 programs/local/PythonImportCache.h | 2 +
 9 files changed, 671 insertions(+), 123 deletions(-)
 create mode 100644 programs/local/NumpyArray.cpp
 create mode 100644 programs/local/NumpyArray.h
 create mode 100644 programs/local/NumpyCacheItem.h
 create mode 100644 programs/local/PandasDataFrameBuilder.cpp
 create mode 100644 programs/local/PandasDataFrameBuilder.h

diff --git a/programs/local/NumpyArray.cpp b/programs/local/NumpyArray.cpp
new file mode 100644
index 00000000000..35440e5b03d
--- /dev/null
+++ b/programs/local/NumpyArray.cpp
@@ -0,0 +1,231 @@
+#include "NumpyArray.h"
+#include "NumpyType.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int NOT_IMPLEMENTED;
+}
+
+}
+
+using namespace DB;
+
+namespace CHDB
+{
+
+struct NumpyAppendData
+{
+public:
+    explicit NumpyAppendData(const IColumn & column)
+        : column(column)
+    {
+    }
+
+    const IColumn & column;
+
+    size_t count;
+    size_t dest_offset;
+    UInt8 * target_data;
+    bool * target_mask;
+};
+
+struct RegularConvert
+{
+    template
+    static NUMPYTYPE convertValue(CHTYPE val, NumpyAppendData & append_data)
+    {
+        (void)append_data;
+        return (NUMPYTYPE)val;
+    }
+
+    template
+    static NUMPYTYPE nullValue(bool & set_mask)
+    {
+        set_mask = true;
+        return 0;
+    }
+};
+
+template
+static bool TransformColumn(NumpyAppendData & append_data)
+{
+    bool has_null = false;
+    const IColumn * data_column = &append_data.column;
+    const ColumnNullable * nullable_column = nullptr;
+
+    /// Check if column is nullable
+    if (const auto * nullable = typeid_cast(&append_data.column))
+    {
+        nullable_column = nullable;
+        data_column = &nullable->getNestedColumn();
+    }
+
+    const auto * src_ptr = static_cast(data_column)->getRawDataBegin();
+    auto * dest_ptr = reinterpret_cast(append_data.target_data);
+    auto * mask_ptr = append_data.target_mask;
+
+    for (size_t i = 0; i < append_data.count; i++) {
+        size_t offset = append_data.dest_offset + i;
+        if (nullable_column && nullable_column->isNullAt(i)) {
+            dest_ptr[offset] = CONVERT::template nullValue(mask_ptr[offset]);
+            has_null = has_null || mask_ptr[offset];
+        } else {
+            dest_ptr[offset] = CONVERT::template convertValue(src_ptr[i], append_data);
+            mask_ptr[offset] = false;
+        }
+    }
+    return has_null;
+}
+
+template
+static bool CHColumnToNumpyArray(NumpyAppendData & append_data)
+{
+    return TransformColumn(append_data);
+}
+
+InternalNumpyArray::InternalNumpyArray(const DataTypePtr & type_)
+    : data(nullptr)
+    , type(type_)
+    , count(0)
+{
+}
+
+void InternalNumpyArray::init(size_t capacity)
+{
+    String type_str = DataTypeToNumpyTypeStr(type);
+
+    array = py::array(py::dtype(type_str), capacity);
+    data = reinterpret_cast(array.mutable_data());
+}
+
+void InternalNumpyArray::resize(size_t capacity)
+{
+    std::vector new_shape {py::ssize_t(capacity)};
+
+    array.resize(new_shape, false);
+    data = reinterpret_cast(array.mutable_data());
+}
+
+NumpyArray::NumpyArray(const DataTypePtr & type_)
+    : have_null(false)
+{
+    data_array = std::make_unique(type_);
+    mask_array = std::make_unique(DataTypeFactory::instance().get("Bool"));
+}
+
+void NumpyArray::init(size_t capacity)
+{
+    data_array->init(capacity);
+    mask_array->init(capacity);
+}
+
+void NumpyArray::resize(size_t capacity)
+{
+    data_array->resize(capacity);
+    mask_array->resize(capacity);
+}
+
+void NumpyArray::append(const ColumnPtr & column)
+{
+    chassert(data_array);
+    chassert(mask_array);
+
+    auto * data_ptr = data_array->data;
+    auto * mask_ptr = reinterpret_cast(mask_array->data);
+    chassert(data_ptr);
+    chassert(mask_ptr);
+    chassert(column->getDataType() == data_array->type->getColumnType());
+
+    size_t size = column->size();
+    data_array->count += size;
+    mask_array->count += size;
+    bool may_have_null = false;
+
+    NumpyAppendData append_data(*column);
+    append_data.count = size;
+    append_data.target_data = data_ptr;
+    append_data.target_mask = mask_ptr;
+    append_data.dest_offset = data_array->count - size;
+
+    switch (data_array->type->getTypeId())
+    {
+        case TypeIndex::Int8:
+            may_have_null = CHColumnToNumpyArray(append_data);
+            break;
+        case TypeIndex::UInt8:
+        {
+            const String & type_name = data_array->type->getName();
+            if (type_name == "Bool")
+            {
+                may_have_null = CHColumnToNumpyArray(append_data);
+            }
+            else
+            {
+                may_have_null = CHColumnToNumpyArray(append_data);
+            }
+        }
+        break;
+        case TypeIndex::Int16:
+            may_have_null = CHColumnToNumpyArray(append_data);
+            break;
+        case TypeIndex::UInt16:
+            may_have_null = CHColumnToNumpyArray(append_data);
+            break;
+        case TypeIndex::Int32:
+            may_have_null = CHColumnToNumpyArray(append_data);
+            break;
+        case TypeIndex::UInt32:
+            may_have_null = CHColumnToNumpyArray(append_data);
+            break;
+        case TypeIndex::Int64:
+            may_have_null = CHColumnToNumpyArray(append_data);
+            break;
+        case TypeIndex::UInt64:
+            may_have_null = CHColumnToNumpyArray(append_data);
+            break;
+        case TypeIndex::Float32:
+            may_have_null = CHColumnToNumpyArray(append_data);
+            break;
+        case TypeIndex::Float64:
+            may_have_null = CHColumnToNumpyArray(append_data);
+            break;
+        default:
+            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported type {}", data_array->type->getName());
+    }
+
+    if (may_have_null)
+    {
+        have_null = true;
+    }
+}
+
+py::object NumpyArray::toArray() const
+{
+    chassert(data_array && mask_array);
+
+    data_array->resize(data_array->count);
+    if (!have_null)
+    {
+        return std::move(data_array->array);
+    }
+
+    mask_array->resize(mask_array->count);
+    auto data_values = std::move(data_array->array);
+    auto null_values = std::move(mask_array->array);
+
+    auto masked_array = py::module::import("numpy.ma").attr("masked_array")(data_values, null_values);
+    return masked_array;
+}
+
+} // namespace CHDB
diff --git a/programs/local/NumpyArray.h b/programs/local/NumpyArray.h
new file mode 100644
index 00000000000..7927faf1ec0
--- /dev/null
+++ b/programs/local/NumpyArray.h
@@ -0,0 +1,45 @@
+#pragma once
+
+#include "PybindWrapper.h"
+
+#include
+#include
+#include
+
+namespace CHDB
+{
+
+class InternalNumpyArray
+{
+public:
+    explicit InternalNumpyArray(const DB::DataTypePtr & type);
+
+    void init(size_t capacity);
+
+    void resize(size_t capacity);
+
+    py::array array;
+    UInt8 * data;
+    DB::DataTypePtr type;
+    size_t count;
+};
+
+class NumpyArray {
+public:
+    explicit NumpyArray(const DB::DataTypePtr & type_);
+
+    void init(size_t capacity);
+
+    void resize(size_t capacity);
+
+    void append(const DB::ColumnPtr & column);
+
+    py::object toArray() const;
+
+private:
+    bool have_null;
+    std::unique_ptr data_array;
+    std::unique_ptr mask_array;
+};
+
+} // namespace CHDB
diff --git a/programs/local/NumpyCacheItem.h b/programs/local/NumpyCacheItem.h
new file mode 100644
index 00000000000..5d75cc5ed0a
--- /dev/null
+++ b/programs/local/NumpyCacheItem.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include "PythonImportCacheItem.h"
+
+namespace CHDB {
+
+struct NumpyMaCacheItem : public PythonImportCacheItem
+{
+public:
+    
NumpyMaCacheItem(PythonImportCacheItem * parent) + : PythonImportCacheItem("ma", parent), masked("masked", this), masked_array("masked_array", this) { + } + ~NumpyMaCacheItem() override = default; + + PythonImportCacheItem masked; + PythonImportCacheItem masked_array; +}; + +struct NumpyCacheItem : public PythonImportCacheItem +{ +public: + static constexpr const char * Name = "numpy"; + + NumpyCacheItem() + : PythonImportCacheItem("numpy"), ma(this), ndarray("ndarray", this), datetime64("datetime64", this), + generic("generic", this), int64("int64", this), bool_("bool_", this), byte("byte", this), + ubyte("ubyte", this), short_("short", this), ushort_("ushort", this), intc("intc", this), + uintc("uintc", this), int_("int_", this), uint("uint", this), longlong("longlong", this), + ulonglong("ulonglong", this), half("half", this), float16("float16", this), single("single", this), + longdouble("longdouble", this), csingle("csingle", this), cdouble("cdouble", this), + clongdouble("clongdouble", this) { + } + ~NumpyCacheItem() override = default; + + NumpyMaCacheItem ma; + PythonImportCacheItem ndarray; + PythonImportCacheItem datetime64; + PythonImportCacheItem generic; + PythonImportCacheItem int64; + PythonImportCacheItem bool_; + PythonImportCacheItem byte; + PythonImportCacheItem ubyte; + PythonImportCacheItem short_; + PythonImportCacheItem ushort_; + PythonImportCacheItem intc; + PythonImportCacheItem uintc; + PythonImportCacheItem int_; + PythonImportCacheItem uint; + PythonImportCacheItem longlong; + PythonImportCacheItem ulonglong; + PythonImportCacheItem half; + PythonImportCacheItem float16; + PythonImportCacheItem single; + PythonImportCacheItem longdouble; + PythonImportCacheItem csingle; + PythonImportCacheItem cdouble; + PythonImportCacheItem clongdouble; + +protected: + bool IsRequired() const override final + { + return false; + } +}; + +} // namespace CHDB diff --git a/programs/local/NumpyType.cpp b/programs/local/NumpyType.cpp index 98fb1bf76a0..cf487e6d644 100644 --- a/programs/local/NumpyType.cpp +++ b/programs/local/NumpyType.cpp @@ -1,4 +1,5 @@ #include "NumpyType.h" +#include "PythonImporter.h" #include #include @@ -237,146 +238,187 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type if (!data_type) return "object"; - /// First, try to handle most types efficiently using getTypeId() TypeIndex type_id = data_type->getTypeId(); switch (type_id) { - case TypeIndex::Int8: - return "int8"; - case TypeIndex::UInt8: - /// Special case: UInt8 could be Bool type, need to check getName() + case TypeIndex::Int8: + return "int8"; + case TypeIndex::UInt8: + /// Special case: UInt8 could be Bool type, need to check getName() + { + const String & type_name = data_type->getName(); + return (type_name == "Bool") ? "bool" : "uint8"; + } + case TypeIndex::Int16: + return "int16"; + case TypeIndex::UInt16: + return "uint16"; + case TypeIndex::Int32: + return "int32"; + case TypeIndex::UInt32: + return "uint32"; + case TypeIndex::Int64: + return "int64"; + case TypeIndex::UInt64: + return "uint64"; + case TypeIndex::Float32: + return "float32"; + case TypeIndex::Float64: + return "float64"; + case TypeIndex::String: + case TypeIndex::FixedString: + return "object"; + case TypeIndex::DateTime: + return "datetime64[s]"; + case TypeIndex::DateTime64: + { + if (const auto * dt64 = typeid_cast(data_type.get())) { - const String & type_name = data_type->getName(); - return (type_name == "Bool") ? 
"bool" : "uint8"; + UInt32 scale = dt64->getScale(); + if (scale == 0) + return "datetime64[s]"; + else if (scale == 3) + return "datetime64[ms]"; + else if (scale == 6) + return "datetime64[us]"; + else if (scale == 9) + return "datetime64[ns]"; + else + return "datetime64[ns]"; } - case TypeIndex::Int16: - return "int16"; - case TypeIndex::UInt16: - return "uint16"; - case TypeIndex::Int32: - return "int32"; - case TypeIndex::UInt32: - return "uint32"; - case TypeIndex::Int64: - return "int64"; - case TypeIndex::UInt64: - return "uint64"; - case TypeIndex::Float32: - return "float32"; - case TypeIndex::Float64: - return "float64"; - case TypeIndex::String: - case TypeIndex::FixedString: - return "object"; - case TypeIndex::DateTime: - return "datetime64[s]"; - case TypeIndex::DateTime64: + return "datetime64[ns]"; + } + case TypeIndex::Date: + case TypeIndex::Date32: + return "datetime64[D]"; + case TypeIndex::Time: + return "timedelta64[s]"; + case TypeIndex::Time64: + { + if (const auto * time64 = typeid_cast(data_type.get())) { - if (const auto * dt64 = typeid_cast(data_type.get())) - { - UInt32 scale = dt64->getScale(); - if (scale == 0) - return "datetime64[s]"; - else if (scale == 3) - return "datetime64[ms]"; - else if (scale == 6) - return "datetime64[us]"; - else if (scale == 9) - return "datetime64[ns]"; - else - return "datetime64[ns]"; - } - return "datetime64[ns]"; + UInt32 scale = time64->getScale(); + if (scale == 0) + return "timedelta64[s]"; + else if (scale == 3) + return "timedelta64[ms]"; + else if (scale == 6) + return "timedelta64[us]"; + else if (scale == 9) + return "timedelta64[ns]"; + else + return "timedelta64[ns]"; } - case TypeIndex::Date: - case TypeIndex::Date32: - return "datetime64[D]"; - case TypeIndex::Time: - return "timedelta64[s]"; - case TypeIndex::Time64: + return "timedelta64[ns]"; + } + case TypeIndex::Interval: + { + if (const auto * interval = typeid_cast(data_type.get())) { - if (const auto * time64 = typeid_cast(data_type.get())) + IntervalKind kind = interval->getKind(); + switch (kind.kind) { - UInt32 scale = time64->getScale(); - if (scale == 0) - return "timedelta64[s]"; - else if (scale == 3) - return "timedelta64[ms]"; - else if (scale == 6) - return "timedelta64[us]"; - else if (scale == 9) - return "timedelta64[ns]"; - else + case IntervalKind::Kind::Nanosecond: return "timedelta64[ns]"; + case IntervalKind::Kind::Microsecond: + return "timedelta64[us]"; + case IntervalKind::Kind::Millisecond: + return "timedelta64[ms]"; + case IntervalKind::Kind::Second: + return "timedelta64[s]"; + case IntervalKind::Kind::Minute: + return "timedelta64[m]"; + case IntervalKind::Kind::Hour: + return "timedelta64[h]"; + case IntervalKind::Kind::Day: + return "timedelta64[D]"; + case IntervalKind::Kind::Week: + return "timedelta64[W]"; + case IntervalKind::Kind::Month: + return "timedelta64[M]"; + case IntervalKind::Kind::Quarter: + return "object"; + case IntervalKind::Kind::Year: + return "timedelta64[Y]"; + default: + return "timedelta64[s]"; } - return "timedelta64[ns]"; - } - case TypeIndex::Interval: - { - if (const auto * interval = typeid_cast(data_type.get())) - { - IntervalKind kind = interval->getKind(); - switch (kind.kind) - { - case IntervalKind::Kind::Nanosecond: - return "timedelta64[ns]"; - case IntervalKind::Kind::Microsecond: - return "timedelta64[us]"; - case IntervalKind::Kind::Millisecond: - return "timedelta64[ms]"; - case IntervalKind::Kind::Second: - return "timedelta64[s]"; - case IntervalKind::Kind::Minute: - return 
"timedelta64[m]"; - case IntervalKind::Kind::Hour: - return "timedelta64[h]"; - case IntervalKind::Kind::Day: - return "timedelta64[D]"; - case IntervalKind::Kind::Week: - return "timedelta64[W]"; - case IntervalKind::Kind::Month: - return "timedelta64[M]"; - case IntervalKind::Kind::Quarter: - return "object"; - case IntervalKind::Kind::Year: - return "timedelta64[Y]"; - default: - return "timedelta64[s]"; - } - } - return "timedelta64[s]"; } + return "timedelta64[s]"; + } - case TypeIndex::UUID: - case TypeIndex::IPv4: - case TypeIndex::IPv6: - return "object"; - case TypeIndex::BFloat16: - case TypeIndex::Decimal32: - case TypeIndex::Decimal64: - case TypeIndex::Decimal128: - case TypeIndex::Decimal256: - return "object"; - case TypeIndex::Array: - case TypeIndex::Tuple: - case TypeIndex::Map: - case TypeIndex::Set: - case TypeIndex::Dynamic: - case TypeIndex::Variant: - case TypeIndex::Object: - return "object"; - case TypeIndex::Nullable: + case TypeIndex::UUID: + case TypeIndex::IPv4: + case TypeIndex::IPv6: + return "object"; + case TypeIndex::BFloat16: + case TypeIndex::Decimal32: + case TypeIndex::Decimal64: + case TypeIndex::Decimal128: + case TypeIndex::Decimal256: + return "object"; + case TypeIndex::Array: + case TypeIndex::Tuple: + case TypeIndex::Map: + case TypeIndex::Set: + case TypeIndex::Dynamic: + case TypeIndex::Variant: + case TypeIndex::Object: + return "object"; + case TypeIndex::Nullable: + { + if (const auto * nullable = typeid_cast(data_type.get())) { - if (const auto * nullable = typeid_cast(data_type.get())) - { - return DataTypeToNumpyTypeStr(nullable->getNestedType()); - } - return "object"; + return DataTypeToNumpyTypeStr(nullable->getNestedType()); } - default: return "object"; } + default: + return "object"; } } +py::object ConvertNumpyDtype(const py::handle & numpy_array) +{ + chassert(py::gil_check()); + + auto & import_cache = PythonImporter::ImportCache(); + + auto dtype = numpy_array.attr("dtype"); + if (!py::isinstance(numpy_array, import_cache.numpy.ma.masked_array())) + { + return dtype; + } + + auto numpy_type = ConvertNumpyType(dtype); + switch (numpy_type.type) + { + case NumpyNullableType::BOOL: + return import_cache.pandas.BooleanDtype()(); + case NumpyNullableType::UINT_8: + return import_cache.pandas.UInt8Dtype()(); + case NumpyNullableType::UINT_16: + return import_cache.pandas.UInt16Dtype()(); + case NumpyNullableType::UINT_32: + return import_cache.pandas.UInt32Dtype()(); + case NumpyNullableType::UINT_64: + return import_cache.pandas.UInt64Dtype()(); + case NumpyNullableType::INT_8: + return import_cache.pandas.Int8Dtype()(); + case NumpyNullableType::INT_16: + return import_cache.pandas.Int16Dtype()(); + case NumpyNullableType::INT_32: + return import_cache.pandas.Int32Dtype()(); + case NumpyNullableType::INT_64: + return import_cache.pandas.Int64Dtype()(); + case NumpyNullableType::FLOAT_32: + return import_cache.pandas.Float32Dtype()(); + case NumpyNullableType::FLOAT_64: + return import_cache.pandas.Float64Dtype()(); + case NumpyNullableType::FLOAT_16: + default: + return dtype; + } +} + } // namespace CHDB diff --git a/programs/local/NumpyType.h b/programs/local/NumpyType.h index 91f0d3e3a85..8a72ece2bb1 100644 --- a/programs/local/NumpyType.h +++ b/programs/local/NumpyType.h @@ -48,7 +48,11 @@ enum class NumpyObjectType : uint8_t { }; NumpyType ConvertNumpyType(const py::handle & col_type); + std::shared_ptr NumpyToDataType(const NumpyType & col_type); + String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type); +py::object 
ConvertNumpyDtype(const py::handle & numpy_array);
+
 } // namespace CHDB
diff --git a/programs/local/PandasAnalyzer.cpp b/programs/local/PandasAnalyzer.cpp
index f1c97c96772..57d6140c692 100644
--- a/programs/local/PandasAnalyzer.cpp
+++ b/programs/local/PandasAnalyzer.cpp
@@ -38,7 +38,7 @@ PandasAnalyzer::PandasAnalyzer(const DB::Settings & settings)
 bool PandasAnalyzer::Analyze(py::object column)
 {
 #if USE_JEMALLOC
-	::Memory::MemoryCheckScope memory_check_scope;
+    ::Memory::MemoryCheckScope memory_check_scope;
 #endif
     if (sample_size == 0)
         return false;
diff --git a/programs/local/PandasDataFrameBuilder.cpp b/programs/local/PandasDataFrameBuilder.cpp
new file mode 100644
index 00000000000..4878af0c9f3
--- /dev/null
+++ b/programs/local/PandasDataFrameBuilder.cpp
@@ -0,0 +1,112 @@
+#include "PandasDataFrameBuilder.h"
+#include "NumpyType.h"
+#include "PythonUtils.h"
+#include "PythonConversion.h"
+#include "PythonImporter.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace CHDB;
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
+PandasDataFrameBuilder::PandasDataFrameBuilder(const Block & sample)
+{
+    column_names.reserve(sample.columns());
+    column_types.reserve(sample.columns());
+
+    for (const auto & column : sample)
+    {
+        column_names.push_back(column.name);
+        column_types.push_back(column.type);
+    }
+}
+
+void PandasDataFrameBuilder::addChunk(const Chunk & chunk)
+{
+    if (chunk.hasRows())
+    {
+        chunks.push_back(chunk.clone());
+        total_rows += chunk.getNumRows();
+    }
+}
+
+py::object PandasDataFrameBuilder::genDataFrame(const py::handle & dict)
+{
+    auto & import_cache = PythonImporter::ImportCache();
+    auto pandas = import_cache.pandas();
+    if (!pandas)
+    {
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Pandas is not installed");
+    }
+
+    py::object items = dict.attr("items")();
+    for (const py::handle & item : items) {
+        auto key_value = py::cast(item);
+        py::handle key = key_value[0];
+        py::handle value = key_value[1];
+
+        auto dtype = ConvertNumpyDtype(value);
+        if (py::isinstance(value, import_cache.numpy.ma.masked_array()))
+        {
+            auto series = pandas.attr("Series")(value.attr("data"), py::arg("dtype") = dtype);
+            series.attr("__setitem__")(value.attr("mask"), import_cache.pandas.NA());
+            dict.attr("__setitem__")(key, series);
+        }
+    }
+
+    auto df = pandas.attr("DataFrame").attr("from_dict")(dict);
+    return df;
+}
+
+void PandasDataFrameBuilder::finalize()
+{
+    if (is_finalized)
+        return;
+
+    columns_data.reserve(column_types.size());
+    for (const auto & type : column_types)
+    {
+        columns_data.emplace_back(type);
+    }
+
+    for (auto & column_data : columns_data)
+    {
+        column_data.init(total_rows);
+    }
+
+    /// Process all chunks and append column data
+    for (const auto & chunk : chunks)
+    {
+        const auto & columns = chunk.getColumns();
+        for (size_t col_idx = 0; col_idx < columns.size(); ++col_idx)
+        {
+            columns_data[col_idx].append(columns[col_idx]);
+        }
+    }
+
+    /// Create pandas DataFrame
+    py::dict res;
+    for (size_t col_idx = 0; col_idx < column_names.size(); ++col_idx) {
+        auto & name = column_names[col_idx];
+        auto & column_data = columns_data[col_idx];
+        res[name.c_str()] = column_data.toArray();
+    }
+    final_dataframe = genDataFrame(res);
+
+    is_finalized = true;
+}
+
+}
diff --git a/programs/local/PandasDataFrameBuilder.h b/programs/local/PandasDataFrameBuilder.h
new file mode 100644
index 00000000000..cbb17811da7
--- /dev/null
+++ b/programs/local/PandasDataFrameBuilder.h
@@ -0,0 +1,46 @@
+#pragma once
+
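+/// Typical flow, as driven by ChunkCollectorOutputFormat: construct with the
+/// result header, pass each Chunk to addChunk(), call finalize() once, then
+/// retrieve the assembled object via getDataFrame().
+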
+#include "PybindWrapper.h" +#include "NumpyArray.h" + +#include +#include +#include +#include + +namespace DB +{ + +/// Builder class to convert ClickHouse Chunks to Pandas DataFrame +/// Accumulates chunks and provides conversion to Python pandas DataFrame object +class PandasDataFrameBuilder +{ +public: + explicit PandasDataFrameBuilder(const Block & sample); + + /// Add data chunk + void addChunk(const Chunk & chunk); + + /// Finalize and build pandas DataFrame from all collected chunks + void finalize(); + + /// Get the finalized pandas DataFrame + pybind11::object getDataFrame() const { return final_dataframe; } + +private: + pybind11::object genDataFrame(const pybind11::handle & dict); + + std::vector column_names; + std::vector column_types; + + std::vector chunks; + std::vector columns_data; + + size_t total_rows = 0; + bool is_finalized = false; + pybind11::object final_dataframe; + + Poco::Logger * log = &Poco::Logger::get("PandasDataFrameBuilder"); +}; + +} diff --git a/programs/local/PythonImportCache.h b/programs/local/PythonImportCache.h index 6bdf5cf7c8f..1703a5103a5 100644 --- a/programs/local/PythonImportCache.h +++ b/programs/local/PythonImportCache.h @@ -2,6 +2,7 @@ #include "DatetimeCacheItem.h" #include "DecimalCacheItem.h" +#include "NumpyCacheItem.h" #include "PandasCacheItem.h" #include "PyArrowCacheItem.h" #include "PythonImportCacheItem.h" @@ -23,6 +24,7 @@ struct PythonImportCache { PyarrowCacheItem pyarrow; DatetimeCacheItem datetime; DecimalCacheItem decimal; + NumpyCacheItem numpy; py::handle AddCache(py::object item); From 8af227408e728785cfa4e4d93d830ecde8d6cbf2 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Thu, 30 Oct 2025 02:40:00 +0800 Subject: [PATCH 06/22] chore: update NumpyArray --- programs/local/NumpyArray.cpp | 26 ++++++++++++++++++++------ programs/local/NumpyType.cpp | 10 +++++++--- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/programs/local/NumpyArray.cpp b/programs/local/NumpyArray.cpp index 35440e5b03d..a6eca447e2c 100644 --- a/programs/local/NumpyArray.cpp +++ b/programs/local/NumpyArray.cpp @@ -165,15 +165,11 @@ void NumpyArray::append(const ColumnPtr & column) break; case TypeIndex::UInt8: { - const String & type_name = data_array->type->getName(); - if (type_name == "Bool") - { + auto is_bool = isBool(data_array->type); + if (is_bool) may_have_null = CHColumnToNumpyArray(append_data); - } else - { may_have_null = CHColumnToNumpyArray(append_data); - } } break; case TypeIndex::Int16: @@ -200,6 +196,24 @@ void NumpyArray::append(const ColumnPtr & column) case TypeIndex::Float64: may_have_null = CHColumnToNumpyArray(append_data); break; + case TypeIndex::Int128: + may_have_null = TransformColumn(append_data); + break; + case TypeIndex::Int256: + may_have_null = TransformColumn(append_data); + break; + case TypeIndex::UInt128: + may_have_null = TransformColumn(append_data); + break; + case TypeIndex::UInt256: + may_have_null = TransformColumn(append_data); + break; + case TypeIndex::BFloat16: + may_have_null = TransformColumn(append_data); + break; + /// case TypeIndex::Date: + /// may_have_null = TransformColumn(append_data); + /// break; default: throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported type {}", data_array->type->getName()); } diff --git a/programs/local/NumpyType.cpp b/programs/local/NumpyType.cpp index cf487e6d644..1b5fa53c79b 100644 --- a/programs/local/NumpyType.cpp +++ b/programs/local/NumpyType.cpp @@ -246,8 +246,8 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type case 
TypeIndex::UInt8: /// Special case: UInt8 could be Bool type, need to check getName() { - const String & type_name = data_type->getName(); - return (type_name == "Bool") ? "bool" : "uint8"; + auto is_bool = isBool(data_type); + return is_bool ? "bool" : "uint8"; } case TypeIndex::Int16: return "int16"; @@ -261,8 +261,13 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type return "int64"; case TypeIndex::UInt64: return "uint64"; + case TypeIndex::BFloat16: case TypeIndex::Float32: return "float32"; + case TypeIndex::Int256: + case TypeIndex::UInt256: + case TypeIndex::Int128: + case TypeIndex::UInt128: case TypeIndex::Float64: return "float64"; case TypeIndex::String: @@ -351,7 +356,6 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type case TypeIndex::IPv4: case TypeIndex::IPv6: return "object"; - case TypeIndex::BFloat16: case TypeIndex::Decimal32: case TypeIndex::Decimal64: case TypeIndex::Decimal128: From 0255b90dfd356caf5d51f26109c27d71eb95e130 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Thu, 30 Oct 2025 11:00:40 +0800 Subject: [PATCH 07/22] chore: update NumpyArray --- programs/local/NumpyArray.cpp | 18 ++++++++++++++---- programs/local/NumpyType.cpp | 2 +- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/programs/local/NumpyArray.cpp b/programs/local/NumpyArray.cpp index a6eca447e2c..b7bf32ca7cf 100644 --- a/programs/local/NumpyArray.cpp +++ b/programs/local/NumpyArray.cpp @@ -71,7 +71,8 @@ static bool TransformColumn(NumpyAppendData & append_data) data_column = &nullable->getNestedColumn(); } - const auto * src_ptr = static_cast(data_column)->getRawDataBegin(); + const auto * tmp_ptr = static_cast(data_column)->getRawDataBegin(); + const auto * src_ptr = reinterpret_cast(tmp_ptr); auto * dest_ptr = reinterpret_cast(append_data.target_data); auto * mask_ptr = append_data.target_mask; @@ -211,9 +212,18 @@ void NumpyArray::append(const ColumnPtr & column) case TypeIndex::BFloat16: may_have_null = TransformColumn(append_data); break; - /// case TypeIndex::Date: - /// may_have_null = TransformColumn(append_data); - /// break; + case TypeIndex::Date: + may_have_null = TransformColumn(append_data); + break; + case TypeIndex::Date32: + may_have_null = TransformColumn(append_data); + break; + case TypeIndex::DateTime: + may_have_null = TransformColumn(append_data); + break; + case TypeIndex::DateTime64: + may_have_null = CHColumnToNumpyArray(append_data); + break; default: throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported type {}", data_array->type->getName()); } diff --git a/programs/local/NumpyType.cpp b/programs/local/NumpyType.cpp index 1b5fa53c79b..45d03ac786a 100644 --- a/programs/local/NumpyType.cpp +++ b/programs/local/NumpyType.cpp @@ -378,7 +378,7 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type return "object"; } default: - return "object"; + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported type {}", data_type->getName()); } } From 1b6dade168c2da7f8f43faaf6fe04851fca691af Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Fri, 31 Oct 2025 18:15:43 +0800 Subject: [PATCH 08/22] chore: support timezone --- programs/local/PandasDataFrameBuilder.cpp | 53 +++++++++++++++++++++-- programs/local/PandasDataFrameBuilder.h | 5 +++ 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/programs/local/PandasDataFrameBuilder.cpp b/programs/local/PandasDataFrameBuilder.cpp index 4878af0c9f3..5764f6eab39 100644 --- a/programs/local/PandasDataFrameBuilder.cpp +++ b/programs/local/PandasDataFrameBuilder.cpp @@ 
-1,16 +1,22 @@ #include "PandasDataFrameBuilder.h" -#include "NumpyType.h" -#include "PythonUtils.h" -#include "PythonConversion.h" #include "PythonImporter.h" +#include "NumpyType.h" +#include +#include +#include +#include +#include +#include +#include #include +#include #include #include #include #include #include -#include +#include using namespace CHDB; @@ -31,6 +37,16 @@ PandasDataFrameBuilder::PandasDataFrameBuilder(const Block & sample) { column_names.push_back(column.name); column_types.push_back(column.type); + + /// Record timezone for timezone-aware types + if (const auto * dt = typeid_cast(column.type.get())) + column_timezones[column.name] = dt->getTimeZone().getTimeZone(); + else if (const auto * dt64 = typeid_cast(column.type.get())) + column_timezones[column.name] = dt64->getTimeZone().getTimeZone(); + else if (const auto * t = typeid_cast(column.type.get())) + column_timezones[column.name] = t->getTimeZone().getTimeZone(); + else if (const auto * t64 = typeid_cast(column.type.get())) + column_timezones[column.name] = t64->getTimeZone().getTimeZone(); } } @@ -68,9 +84,38 @@ py::object PandasDataFrameBuilder::genDataFrame(const py::handle & dict) } auto df = pandas.attr("DataFrame").attr("from_dict")(dict); + + /// Apply timezone conversion for timezone-aware columns + changeToTZType(df); + return df; } +void PandasDataFrameBuilder::changeToTZType(py::object & df) +{ + if (column_timezones.empty()) + return; + + for (const auto & [column_name, timezone_str] : column_timezones) + { + /// Check if column exists in DataFrame + if (!df.attr("__contains__")(column_name).cast()) + continue; + + /// Get the column + auto column = df[column_name.c_str()]; + + /// First localize to UTC (assuming the timestamps are in UTC) + auto utc_localized = column.attr("dt").attr("tz_localize")("UTC"); + + /// Then convert to the target timezone + auto tz_converted = utc_localized.attr("dt").attr("tz_convert")(timezone_str); + + /// Update the column in DataFrame + df.attr("__setitem__")(column_name.c_str(), tz_converted); + } +} + void PandasDataFrameBuilder::finalize() { if (is_finalized) diff --git a/programs/local/PandasDataFrameBuilder.h b/programs/local/PandasDataFrameBuilder.h index cbb17811da7..2f45b08e866 100644 --- a/programs/local/PandasDataFrameBuilder.h +++ b/programs/local/PandasDataFrameBuilder.h @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB { @@ -29,10 +30,14 @@ class PandasDataFrameBuilder private: pybind11::object genDataFrame(const pybind11::handle & dict); + void changeToTZType(pybind11::object & df); std::vector column_names; std::vector column_types; + /// Map column name to timezone string for timezone-aware types + std::unordered_map column_timezones; + std::vector chunks; std::vector columns_data; From 2a09093f5174e2c2ae04d5e5fceb22dae5bed2a2 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Sat, 1 Nov 2025 21:39:35 +0800 Subject: [PATCH 09/22] chore: add more CH types --- programs/local/NumpyArray.cpp | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/programs/local/NumpyArray.cpp b/programs/local/NumpyArray.cpp index b7bf32ca7cf..a73103eb518 100644 --- a/programs/local/NumpyArray.cpp +++ b/programs/local/NumpyArray.cpp @@ -224,6 +224,35 @@ void NumpyArray::append(const ColumnPtr & column) case TypeIndex::DateTime64: may_have_null = CHColumnToNumpyArray(append_data); break; + case TypeIndex::Time: + case TypeIndex::Time64: + case TypeIndex::String: + case TypeIndex::FixedString: + case TypeIndex::Enum8: + case 
TypeIndex::Enum16: + case TypeIndex::Decimal32: + case TypeIndex::Decimal64: + case TypeIndex::Decimal128: + case TypeIndex::Decimal256: + case TypeIndex::UUID: + case TypeIndex::Array: + case TypeIndex::Tuple: + case TypeIndex::Set: + case TypeIndex::Interval: + case TypeIndex::Map: + case TypeIndex::Object: + case TypeIndex::IPv4: + case TypeIndex::IPv6: + case TypeIndex::JSONPaths: + case TypeIndex::Variant: + case TypeIndex::Dynamic: + /// TODO + break; + + case TypeIndex::ObjectDeprecated: /// Deprecated type, should not appear in normal data processing + case TypeIndex::Function: /// Function types are not data types, should not appear here + case TypeIndex::AggregateFunction: /// Aggregate function types are not data types, should not appear here + case TypeIndex::LowCardinality: /// LowCardinality should be unwrapped before reaching this point default: throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported type {}", data_array->type->getName()); } From af9cc164dbff3790f215558c2934653bef780f75 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Sun, 2 Nov 2025 23:52:35 +0800 Subject: [PATCH 10/22] chore: support time and time64 types --- programs/local/NumpyArray.cpp | 4 ++++ programs/local/PandasDataFrameBuilder.cpp | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/programs/local/NumpyArray.cpp b/programs/local/NumpyArray.cpp index a73103eb518..c4335e699c6 100644 --- a/programs/local/NumpyArray.cpp +++ b/programs/local/NumpyArray.cpp @@ -225,7 +225,11 @@ void NumpyArray::append(const ColumnPtr & column) may_have_null = CHColumnToNumpyArray(append_data); break; case TypeIndex::Time: + may_have_null = TransformColumn(append_data); + break; case TypeIndex::Time64: + may_have_null = CHColumnToNumpyArray(append_data); + break; case TypeIndex::String: case TypeIndex::FixedString: case TypeIndex::Enum8: diff --git a/programs/local/PandasDataFrameBuilder.cpp b/programs/local/PandasDataFrameBuilder.cpp index 5764f6eab39..9d81271a563 100644 --- a/programs/local/PandasDataFrameBuilder.cpp +++ b/programs/local/PandasDataFrameBuilder.cpp @@ -43,10 +43,6 @@ PandasDataFrameBuilder::PandasDataFrameBuilder(const Block & sample) column_timezones[column.name] = dt->getTimeZone().getTimeZone(); else if (const auto * dt64 = typeid_cast(column.type.get())) column_timezones[column.name] = dt64->getTimeZone().getTimeZone(); - else if (const auto * t = typeid_cast(column.type.get())) - column_timezones[column.name] = t->getTimeZone().getTimeZone(); - else if (const auto * t64 = typeid_cast(column.type.get())) - column_timezones[column.name] = t64->getTimeZone().getTimeZone(); } } From 373bd5e3556d3980a089e31cae559ca74d454816 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Mon, 3 Nov 2025 02:47:51 +0800 Subject: [PATCH 11/22] chore: support more types --- programs/local/IPAddressCacheItem.h | 25 +++ programs/local/NumpyArray.cpp | 304 +++++++++++++++++++++++++++- programs/local/NumpyType.cpp | 9 +- programs/local/PythonImportCache.h | 7 +- programs/local/UUIDCacheItem.h | 21 ++ 5 files changed, 355 insertions(+), 11 deletions(-) create mode 100644 programs/local/IPAddressCacheItem.h create mode 100644 programs/local/UUIDCacheItem.h diff --git a/programs/local/IPAddressCacheItem.h b/programs/local/IPAddressCacheItem.h new file mode 100644 index 00000000000..2d51a1a3e43 --- /dev/null +++ b/programs/local/IPAddressCacheItem.h @@ -0,0 +1,25 @@ +#pragma once + +#include "PythonImportCacheItem.h" + +namespace CHDB { + +struct IPAddressCacheItem : public PythonImportCacheItem +{ +public: + 
static constexpr const char * Name = "ipaddress"; + + IPAddressCacheItem() + : PythonImportCacheItem("ipaddress") + , ipv4_address("IPv4Address", this) + , ipv6_address("IPv6Address", this) + { + } + + ~IPAddressCacheItem() override = default; + + PythonImportCacheItem ipv4_address; + PythonImportCacheItem ipv6_address; +}; + +} // namespace CHDB diff --git a/programs/local/NumpyArray.cpp b/programs/local/NumpyArray.cpp index c4335e699c6..cb0c949f001 100644 --- a/programs/local/NumpyArray.cpp +++ b/programs/local/NumpyArray.cpp @@ -1,13 +1,23 @@ #include "NumpyArray.h" #include "NumpyType.h" +#include "PythonImporter.h" #include #include #include +#include #include +#include #include #include +#include +#include #include +#include +#include +#include +#include +#include namespace DB { @@ -15,6 +25,7 @@ namespace DB namespace ErrorCodes { extern const int NOT_IMPLEMENTED; + extern const int LOGICAL_ERROR } } @@ -76,16 +87,21 @@ static bool TransformColumn(NumpyAppendData & append_data) auto * dest_ptr = reinterpret_cast(append_data.target_data); auto * mask_ptr = append_data.target_mask; - for (size_t i = 0; i < append_data.count; i++) { + for (size_t i = 0; i < append_data.count; i++) + { size_t offset = append_data.dest_offset + i; - if (nullable_column && nullable_column->isNullAt(i)) { + if (nullable_column && nullable_column->isNullAt(i)) + { dest_ptr[offset] = CONVERT::template nullValue(mask_ptr[offset]); has_null = has_null || mask_ptr[offset]; - } else { + } + else + { dest_ptr[offset] = CONVERT::template convertValue(src_ptr[i], append_data); mask_ptr[offset] = false; } } + return has_null; } @@ -95,6 +111,243 @@ static bool CHColumnToNumpyArray(NumpyAppendData & append_data) return TransformColumn(append_data); } +template +static bool CHColumnDecimalToNumpyArray(NumpyAppendData & append_data, const DataTypePtr & data_type) +{ + bool has_null = false; + const IColumn * data_column = &append_data.column; + const ColumnNullable * nullable_column = nullptr; + + /// Check if column is nullable + if (const auto * nullable = typeid_cast(&append_data.column)) + { + nullable_column = nullable; + data_column = &nullable->getNestedColumn(); + } + + const auto * decimal_column = typeid_cast *>(data_column); + if (!decimal_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnDecimal"); + + /// Get scale from data type to convert integer to actual decimal value + const auto * decimal_type = typeid_cast *>(data_type.get()); + if (!decimal_type) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected DataTypeDecimal"); + + auto scale_multiplier = decimal_type->getScaleMultiplier(); + double scale_multiplier_double = static_cast(scale_multiplier.value); + + auto * dest_ptr = reinterpret_cast(append_data.target_data); + auto * mask_ptr = append_data.target_mask; + + for (size_t i = 0; i < append_data.count; i++) + { + size_t offset = append_data.dest_offset + i; + if (nullable_column && nullable_column->isNullAt(i)) + { + /// Set to 0.0 for null values + dest_ptr[offset] = 0.0; + mask_ptr[offset] = true; + has_null = true; + } + else + { + /// Convert decimal integer value to actual decimal by dividing by scale multiplier + auto decimal_value = decimal_column->getElement(i); + dest_ptr[offset] = static_cast(decimal_value.value) / scale_multiplier_double; + mask_ptr[offset] = false; + } + } + + return has_null; +} + +static bool CHColumnUUIDToNumpyArray(NumpyAppendData & append_data) +{ + bool has_null = false; + const IColumn * data_column = &append_data.column; + const 
ColumnNullable * nullable_column = nullptr; + + /// Check if column is nullable + if (const auto * nullable = typeid_cast(&append_data.column)) + { + nullable_column = nullable; + data_column = &nullable->getNestedColumn(); + } + + const auto * uuid_column = typeid_cast *>(data_column); + if (!uuid_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnVector"); + + auto * dest_ptr = reinterpret_cast(append_data.target_data); + auto * mask_ptr = append_data.target_mask; + + for (size_t i = 0; i < append_data.count; i++) + { + size_t offset = append_data.dest_offset + i; + if (nullable_column && nullable_column->isNullAt(i)) + { + Py_INCREF(Py_None); + dest_ptr[offset] = Py_None; + has_null = true; + mask_ptr[offset] = true; + } + else + { + /// Convert UUID to Python uuid.UUID object + UUID uuid_value = uuid_column->getElement(i); + const auto formatted_uuid = formatUUID(uuid_value); + const char * uuid_str = formatted_uuid.data(); + const size_t uuid_str_len = formatted_uuid.size(); + + /// Create Python uuid.UUID object + auto & import_cache = PythonImporter::ImportCache(); + py::handle uuid_handle = import_cache.uuid.UUID()(String(uuid_str, uuid_str_len)).release(); + dest_ptr[offset] = uuid_handle.ptr(); + mask_ptr[offset] = false; + } + } + + return has_null; +} + +static bool CHColumnIPv4ToNumpyArray(NumpyAppendData & append_data) +{ + bool has_null = false; + const IColumn * data_column = &append_data.column; + const ColumnNullable * nullable_column = nullptr; + + /// Check if column is nullable + if (const auto * nullable = typeid_cast(&append_data.column)) + { + nullable_column = nullable; + data_column = &nullable->getNestedColumn(); + } + + const auto * ipv4_column = typeid_cast *>(data_column); + if (!ipv4_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnVector"); + + auto * dest_ptr = reinterpret_cast(append_data.target_data); + + for (size_t i = 0; i < append_data.count; i++) + { + size_t offset = append_data.dest_offset + i; + if (nullable_column && nullable_column->isNullAt(i)) + { + Py_INCREF(Py_None); + dest_ptr[offset] = Py_None; + has_null = true; + } + else + { + /// Convert IPv4 to Python ipaddress.IPv4Address object + IPv4 ipv4_value = ipv4_column->getElement(i); + + char ipv4_str[IPV4_MAX_TEXT_LENGTH]; + char * ptr = ipv4_str; + formatIPv4(reinterpret_cast(&ipv4_value), ptr); + const size_t ipv4_str_len = ptr - ipv4_str; + + /// Create Python ipaddress.IPv4Address object + auto & import_cache = PythonImporter::ImportCache(); + py::handle ipv4_handle = import_cache.ipaddress.ipv4_address()(String(ipv4_str, ipv4_str_len)).release(); + dest_ptr[offset] = ipv4_handle.ptr(); + } + } + + return has_null; +} + +static bool CHColumnIPv6ToNumpyArray(NumpyAppendData & append_data) +{ + bool has_null = false; + const IColumn * data_column = &append_data.column; + const ColumnNullable * nullable_column = nullptr; + + /// Check if column is nullable + if (const auto * nullable = typeid_cast(&append_data.column)) + { + nullable_column = nullable; + data_column = &nullable->getNestedColumn(); + } + + const auto * ipv6_column = typeid_cast *>(data_column); + if (!ipv6_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnVector"); + + auto * dest_ptr = reinterpret_cast(append_data.target_data); + + for (size_t i = 0; i < append_data.count; i++) + { + size_t offset = append_data.dest_offset + i; + if (nullable_column && nullable_column->isNullAt(i)) + { + Py_INCREF(Py_None); + dest_ptr[offset] = Py_None; + has_null = true; + } + else 
+ { + /// Convert IPv6 to Python ipaddress.IPv6Address object + IPv6 ipv6_value = ipv6_column->getElement(i); + + /// Use ClickHouse's built-in IPv6 formatting function + char ipv6_str[IPV6_MAX_TEXT_LENGTH]; + char * ptr = ipv6_str; + formatIPv6(reinterpret_cast(&ipv6_value), ptr); + const size_t ipv6_str_len = ptr - ipv6_str; + + /// Create Python ipaddress.IPv6Address object + auto & import_cache = PythonImporter::ImportCache(); + py::handle ipv6_handle = import_cache.ipaddress.ipv6_address()(String(ipv6_str, ipv6_str_len)).release(); + dest_ptr[offset] = ipv6_handle.ptr(); + } + } + + return has_null; +} + +template +static bool CHColumnStringToNumpyArray(NumpyAppendData & append_data) +{ + bool has_null = false; + const IColumn * data_column = &append_data.column; + const ColumnNullable * nullable_column = nullptr; + + /// Check if column is nullable + if (const auto * nullable = typeid_cast(&append_data.column)) + { + nullable_column = nullable; + data_column = &nullable->getNestedColumn(); + } + + const auto * string_column = typeid_cast(data_column); + if (!string_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected String ColumnType"); + + auto * dest_ptr = reinterpret_cast(append_data.target_data); + + for (size_t i = 0; i < append_data.count; i++) + { + size_t offset = append_data.dest_offset + i; + if (nullable_column && nullable_column->isNullAt(i)) + { + Py_INCREF(Py_None); + dest_ptr[offset] = Py_None; + } + else + { + StringRef str_ref = string_column->getDataAt(i); + auto * str_ptr = const_cast(str_ref.data); + auto str_size = str_ref.size; + dest_ptr[offset] = PyUnicode_FromStringAndSize(str_ptr, str_size); + } + } + + return has_null; +} + InternalNumpyArray::InternalNumpyArray(const DataTypePtr & type_) : data(nullptr) , type(type_) @@ -159,7 +412,14 @@ void NumpyArray::append(const ColumnPtr & column) append_data.target_mask = mask_ptr; append_data.dest_offset = data_array->count - size; - switch (data_array->type->getTypeId()) + /// For nullable types, we need to get the nested type + DataTypePtr actual_type = data_array->type; + if (const auto * nullable_type = typeid_cast(data_array->type.get())) + { + actual_type = nullable_type->getNestedType(); + } + + switch (actual_type->getTypeId()) { case TypeIndex::Int8: may_have_null = CHColumnToNumpyArray(append_data); @@ -231,32 +491,60 @@ void NumpyArray::append(const ColumnPtr & column) may_have_null = CHColumnToNumpyArray(append_data); break; case TypeIndex::String: + may_have_null = CHColumnStringToNumpyArray(append_data); + break; case TypeIndex::FixedString: + may_have_null = CHColumnStringToNumpyArray(append_data); + break; case TypeIndex::Enum8: + may_have_null = CHColumnToNumpyArray(append_data); + break; case TypeIndex::Enum16: + may_have_null = CHColumnToNumpyArray(append_data); + break; case TypeIndex::Decimal32: + may_have_null = CHColumnDecimalToNumpyArray(append_data, actual_type); + break; case TypeIndex::Decimal64: + may_have_null = CHColumnDecimalToNumpyArray(append_data, actual_type); + break; case TypeIndex::Decimal128: + may_have_null = CHColumnDecimalToNumpyArray(append_data, actual_type); + break; case TypeIndex::Decimal256: + may_have_null = CHColumnDecimalToNumpyArray(append_data, actual_type); + break; case TypeIndex::UUID: + may_have_null = CHColumnUUIDToNumpyArray(append_data); + break; case TypeIndex::Array: case TypeIndex::Tuple: case TypeIndex::Set: case TypeIndex::Interval: + may_have_null = CHColumnToNumpyArray(append_data); + break; case TypeIndex::Map: case TypeIndex::Object: 
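+    /// Map and Object have no dedicated converters at this point; they fall through to the IPv4 branch below.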
case TypeIndex::IPv4: + may_have_null = CHColumnIPv4ToNumpyArray(append_data); + break; case TypeIndex::IPv6: + may_have_null = CHColumnIPv6ToNumpyArray(append_data); + break; case TypeIndex::JSONPaths: case TypeIndex::Variant: case TypeIndex::Dynamic: /// TODO break; - case TypeIndex::ObjectDeprecated: /// Deprecated type, should not appear in normal data processing - case TypeIndex::Function: /// Function types are not data types, should not appear here - case TypeIndex::AggregateFunction: /// Aggregate function types are not data types, should not appear here - case TypeIndex::LowCardinality: /// LowCardinality should be unwrapped before reaching this point + /// Deprecated type, should not appear in normal data processing + case TypeIndex::ObjectDeprecated: + /// Function types are not data types, should not appear here + case TypeIndex::Function: + /// Aggregate function types are not data types, should not appear here + case TypeIndex::AggregateFunction: + /// LowCardinality should be unwrapped before reaching this point + case TypeIndex::LowCardinality: default: throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported type {}", data_array->type->getName()); } diff --git a/programs/local/NumpyType.cpp b/programs/local/NumpyType.cpp index 45d03ac786a..aa5f760a79c 100644 --- a/programs/local/NumpyType.cpp +++ b/programs/local/NumpyType.cpp @@ -342,7 +342,8 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type case IntervalKind::Kind::Month: return "timedelta64[M]"; case IntervalKind::Kind::Quarter: - return "object"; + /// numpy doesn't have quarter type, use int64 + return "int64"; case IntervalKind::Kind::Year: return "timedelta64[Y]"; default: @@ -360,7 +361,7 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type case TypeIndex::Decimal64: case TypeIndex::Decimal128: case TypeIndex::Decimal256: - return "object"; + return "float64"; case TypeIndex::Array: case TypeIndex::Tuple: case TypeIndex::Map: @@ -369,6 +370,10 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type case TypeIndex::Variant: case TypeIndex::Object: return "object"; + case TypeIndex::Enum8: + return "int8"; + case TypeIndex::Enum16: + return "int16"; case TypeIndex::Nullable: { if (const auto * nullable = typeid_cast(data_type.get())) diff --git a/programs/local/PythonImportCache.h b/programs/local/PythonImportCache.h index 1703a5103a5..382bb34358d 100644 --- a/programs/local/PythonImportCache.h +++ b/programs/local/PythonImportCache.h @@ -6,6 +6,8 @@ #include "PandasCacheItem.h" #include "PyArrowCacheItem.h" #include "PythonImportCacheItem.h" +#include "UUIDCacheItem.h" +#include "IPAddressCacheItem.h" #include @@ -14,7 +16,8 @@ namespace CHDB { struct PythonImportCache; using PythonImportCachePtr = std::shared_ptr; -struct PythonImportCache { +struct PythonImportCache +{ public: explicit PythonImportCache() = default; @@ -25,6 +28,8 @@ struct PythonImportCache { DatetimeCacheItem datetime; DecimalCacheItem decimal; NumpyCacheItem numpy; + UUIDCacheItem uuid; + IPAddressCacheItem ipaddress; py::handle AddCache(py::object item); diff --git a/programs/local/UUIDCacheItem.h b/programs/local/UUIDCacheItem.h new file mode 100644 index 00000000000..ee21b48ca22 --- /dev/null +++ b/programs/local/UUIDCacheItem.h @@ -0,0 +1,21 @@ +#pragma once + +#include "PythonImportCacheItem.h" + +namespace CHDB { + +struct UUIDCacheItem : public PythonImportCacheItem +{ +public: + static constexpr const char * Name = "uuid"; + + UUIDCacheItem() : PythonImportCacheItem("uuid"), UUID("UUID", this) 
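+    /// Caches the uuid.UUID constructor so per-value conversion avoids repeated module lookups.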
+ { + } + + ~UUIDCacheItem() override = default; + + PythonImportCacheItem UUID; +}; + +} // namespace CHDB From dd55a088a1fd7f9e4177b07c2d4f1824bcd92618 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Mon, 3 Nov 2025 17:23:55 +0800 Subject: [PATCH 12/22] chore: support nested types --- programs/local/NumpyArray.cpp | 121 +++++++++++++------ programs/local/NumpyArray.h | 17 +++ programs/local/NumpyNestedTypes.cpp | 180 ++++++++++++++++++++++++++++ programs/local/NumpyNestedTypes.h | 20 ++++ programs/local/NumpyType.cpp | 4 +- programs/local/NumpyType.h | 2 +- 6 files changed, 304 insertions(+), 40 deletions(-) create mode 100644 programs/local/NumpyNestedTypes.cpp create mode 100644 programs/local/NumpyNestedTypes.h diff --git a/programs/local/NumpyArray.cpp b/programs/local/NumpyArray.cpp index cb0c949f001..470478052ea 100644 --- a/programs/local/NumpyArray.cpp +++ b/programs/local/NumpyArray.cpp @@ -1,5 +1,6 @@ #include "NumpyArray.h" #include "NumpyType.h" +#include "NumpyNestedTypes.h" #include "PythonImporter.h" #include @@ -25,7 +26,7 @@ namespace DB namespace ErrorCodes { extern const int NOT_IMPLEMENTED; - extern const int LOGICAL_ERROR + extern const int LOGICAL_ERROR; } } @@ -35,22 +36,6 @@ using namespace DB; namespace CHDB { -struct NumpyAppendData -{ -public: - explicit NumpyAppendData(const IColumn & column) - : column(column) - { - } - - const IColumn & column; - - size_t count; - size_t dest_offset; - UInt8 * target_data; - bool * target_mask; -}; - struct RegularConvert { template @@ -87,7 +72,7 @@ static bool TransformColumn(NumpyAppendData & append_data) auto * dest_ptr = reinterpret_cast(append_data.target_data); auto * mask_ptr = append_data.target_mask; - for (size_t i = 0; i < append_data.count; i++) + for (size_t i = append_data.src_offset; i < append_data.src_offset + append_data.src_count; i++) { size_t offset = append_data.dest_offset + i; if (nullable_column && nullable_column->isNullAt(i)) @@ -140,7 +125,7 @@ static bool CHColumnDecimalToNumpyArray(NumpyAppendData & append_data, const Dat auto * dest_ptr = reinterpret_cast(append_data.target_data); auto * mask_ptr = append_data.target_mask; - for (size_t i = 0; i < append_data.count; i++) + for (size_t i = append_data.src_offset; i < append_data.src_offset + append_data.src_count; i++) { size_t offset = append_data.dest_offset + i; if (nullable_column && nullable_column->isNullAt(i)) @@ -182,13 +167,12 @@ static bool CHColumnUUIDToNumpyArray(NumpyAppendData & append_data) auto * dest_ptr = reinterpret_cast(append_data.target_data); auto * mask_ptr = append_data.target_mask; - for (size_t i = 0; i < append_data.count; i++) + for (size_t i = append_data.src_offset; i < append_data.src_offset + append_data.src_count; i++) { size_t offset = append_data.dest_offset + i; if (nullable_column && nullable_column->isNullAt(i)) { - Py_INCREF(Py_None); - dest_ptr[offset] = Py_None; + dest_ptr[offset] = nullptr; has_null = true; mask_ptr[offset] = true; } @@ -229,15 +213,16 @@ static bool CHColumnIPv4ToNumpyArray(NumpyAppendData & append_data) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnVector"); auto * dest_ptr = reinterpret_cast(append_data.target_data); + auto * mask_ptr = append_data.target_mask; - for (size_t i = 0; i < append_data.count; i++) + for (size_t i = append_data.src_offset; i < append_data.src_offset + append_data.src_count; i++) { size_t offset = append_data.dest_offset + i; if (nullable_column && nullable_column->isNullAt(i)) { - Py_INCREF(Py_None); - dest_ptr[offset] = Py_None; + 
dest_ptr[offset] = nullptr; has_null = true; + mask_ptr[offset] = true; } else { @@ -253,6 +238,7 @@ static bool CHColumnIPv4ToNumpyArray(NumpyAppendData & append_data) auto & import_cache = PythonImporter::ImportCache(); py::handle ipv4_handle = import_cache.ipaddress.ipv4_address()(String(ipv4_str, ipv4_str_len)).release(); dest_ptr[offset] = ipv4_handle.ptr(); + mask_ptr[offset] = false; } } @@ -277,15 +263,16 @@ static bool CHColumnIPv6ToNumpyArray(NumpyAppendData & append_data) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnVector"); auto * dest_ptr = reinterpret_cast(append_data.target_data); + auto * mask_ptr = append_data.target_mask; - for (size_t i = 0; i < append_data.count; i++) + for (size_t i = append_data.src_offset; i < append_data.src_offset + append_data.src_count; i++) { size_t offset = append_data.dest_offset + i; if (nullable_column && nullable_column->isNullAt(i)) { - Py_INCREF(Py_None); - dest_ptr[offset] = Py_None; + dest_ptr[offset] = nullptr; has_null = true; + mask_ptr[offset] = true; } else { @@ -302,6 +289,7 @@ static bool CHColumnIPv6ToNumpyArray(NumpyAppendData & append_data) auto & import_cache = PythonImporter::ImportCache(); py::handle ipv6_handle = import_cache.ipaddress.ipv6_address()(String(ipv6_str, ipv6_str_len)).release(); dest_ptr[offset] = ipv6_handle.ptr(); + mask_ptr[offset] = false; } } @@ -328,7 +316,7 @@ static bool CHColumnStringToNumpyArray(NumpyAppendData & append_data) auto * dest_ptr = reinterpret_cast(append_data.target_data); - for (size_t i = 0; i < append_data.count; i++) + for (size_t i = append_data.src_offset; i < append_data.src_offset + append_data.src_count; i++) { size_t offset = append_data.dest_offset + i; if (nullable_column && nullable_column->isNullAt(i)) @@ -348,6 +336,16 @@ static bool CHColumnStringToNumpyArray(NumpyAppendData & append_data) return has_null; } +NumpyAppendData::NumpyAppendData(const DB::IColumn & column) + : column(column) + , src_offset(0) + , src_count(0) + , dest_offset(0) + , target_data(nullptr) + , target_mask(nullptr) +{ +} + InternalNumpyArray::InternalNumpyArray(const DataTypePtr & type_) : data(nullptr) , type(type_) @@ -390,7 +388,34 @@ void NumpyArray::resize(size_t capacity) mask_array->resize(capacity); } +static bool CHColumnNothingToNumpyArray(NumpyAppendData & append_data) +{ + /// Nothing type represents columns with no actual values, so we fill all positions with None + bool has_null = true; + auto * dest_ptr = reinterpret_cast(append_data.target_data); + auto * mask_ptr = append_data.target_mask; + + for (size_t i = append_data.src_offset; i < append_data.src_offset + append_data.src_count; i++) + { + size_t offset = append_data.dest_offset + i; + + Py_INCREF(Py_None); + dest_ptr[offset] = Py_None; + mask_ptr[offset] = true; + } + + return has_null; +} + void NumpyArray::append(const ColumnPtr & column) +{ + append(column, 0, column->size()); +} + +void NumpyArray::append( + const ColumnPtr & column, + size_t offset, + size_t count) { chassert(data_array); chassert(mask_array); @@ -407,7 +432,8 @@ void NumpyArray::append(const ColumnPtr & column) bool may_have_null = false; NumpyAppendData append_data(*column); - append_data.count = size; + append_data.src_offset = offset; + append_data.src_offset + append_data.src_count = count; append_data.target_data = data_ptr; append_data.target_mask = mask_ptr; append_data.dest_offset = data_array->count - size; @@ -421,6 +447,9 @@ void NumpyArray::append(const ColumnPtr & column) switch (actual_type->getTypeId()) { + case 
TypeIndex::Nothing: + may_have_null = CHColumnNothingToNumpyArray(append_data); + break; case TypeIndex::Int8: may_have_null = CHColumnToNumpyArray(append_data); break; @@ -518,33 +547,49 @@ void NumpyArray::append(const ColumnPtr & column) may_have_null = CHColumnUUIDToNumpyArray(append_data); break; case TypeIndex::Array: + may_have_null = CHColumnArrayToNumpyArray(append_data, actual_type); + break; case TypeIndex::Tuple: - case TypeIndex::Set: + may_have_null = CHColumnTupleToNumpyArray(append_data, actual_type); + break; case TypeIndex::Interval: may_have_null = CHColumnToNumpyArray(append_data); break; case TypeIndex::Map: + may_have_null = CHColumnMapToNumpyArray(append_data, actual_type); + break; case TypeIndex::Object: - case TypeIndex::IPv4: + may_have_null = CHColumnObjectToNumpyArray(append_data, actual_type); + break; + case TypeIndex::IPv4: may_have_null = CHColumnIPv4ToNumpyArray(append_data); break; case TypeIndex::IPv6: may_have_null = CHColumnIPv6ToNumpyArray(append_data); break; - case TypeIndex::JSONPaths: - case TypeIndex::Variant: - case TypeIndex::Dynamic: - /// TODO + case TypeIndex::Variant: + may_have_null = CHColumnVariantToNumpyArray(append_data, actual_type); + break; + case TypeIndex::Dynamic: + may_have_null = CHColumnDynamicToNumpyArray(append_data, actual_type); break; + /// Set types are used only in WHERE clauses for IN operations, not in actual data storage + case TypeIndex::Set: + /// JSONPaths is an internal type used only for JSON schema inference, + case TypeIndex::JSONPaths: /// Deprecated type, should not appear in normal data processing case TypeIndex::ObjectDeprecated: - /// Function types are not data types, should not appear here + /// Function types are not actual data types, should not appear here case TypeIndex::Function: - /// Aggregate function types are not data types, should not appear here + /// Aggregate function types are not actual data types, should not appear here case TypeIndex::AggregateFunction: /// LowCardinality should be unwrapped before reaching this point case TypeIndex::LowCardinality: + /// Nullable cannot contain another Nullable type, so this should not appear in nested conversion + case TypeIndex::Nullable: + /// QBit type is supported in newer versions of ClickHouse + /// case TypeIndex::QBit: default: throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported type {}", data_array->type->getName()); } diff --git a/programs/local/NumpyArray.h b/programs/local/NumpyArray.h index 7927faf1ec0..3c014dc79f8 100644 --- a/programs/local/NumpyArray.h +++ b/programs/local/NumpyArray.h @@ -9,6 +9,21 @@ namespace CHDB { +/// Data structure for appending column data to numpy arrays +class NumpyAppendData +{ +public: + explicit NumpyAppendData(const DB::IColumn & column); + + const DB::IColumn & column; + + size_t src_offset; + size_t src_count; + size_t dest_offset; + UInt8 * target_data; + bool * target_mask; +}; + class InternalNumpyArray { public: @@ -32,6 +47,8 @@ class NumpyArray { void resize(size_t capacity); + void append(const DB::ColumnPtr & column, size_t offset, size_t count); + void append(const DB::ColumnPtr & column); py::object toArray() const; diff --git a/programs/local/NumpyNestedTypes.cpp b/programs/local/NumpyNestedTypes.cpp new file mode 100644 index 00000000000..eafc3477fbb --- /dev/null +++ b/programs/local/NumpyNestedTypes.cpp @@ -0,0 +1,180 @@ +#include "NumpyNestedTypes.h" +#include "NumpyArray.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
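+/// Helpers that expand nested ClickHouse columns (Array, Tuple, Map, Object, Variant, Dynamic) into Python objects element by element; the non-Array specializations are still empty stubs here.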
+#include +#include +#include +#include + +namespace CHDB +{ + +using namespace DB; +namespace py = pybind11; + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int NOT_IMPLEMENTED; +} + +template +struct ColumnTraits; + +template <> +struct ColumnTraits +{ + using DataType = DataTypeArray; + + static py::object convertElement(const ColumnArray * column, const DataTypePtr & data_type, size_t index) + { + const auto * array_data_type = typeid_cast(data_type.get()); + if (!array_data_type) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected DataTypeArray"); + + const auto & offsets = column->getOffsets(); + const auto & nested_column = column->getDataPtr(); + + size_t start_offset = (index == 0) ? 0 : offsets[index - 1]; + size_t end_offset = offsets[index]; + size_t array_size = end_offset - start_offset; + + NumpyArray numpy_array(data_type); + numpy_array.init(array_size); + numpy_array.append(nested_column, start_offset, array_size); + + return numpy_array.toArray(); + } +}; + +template <> +struct ColumnTraits +{ + using DataType = DataTypeTuple; + + static py::object convertElement(const ColumnTuple * column, const DataTypePtr & data_type, size_t index) + { + } +}; + +template <> +struct ColumnTraits +{ + using DataType = DataTypeMap; + + static py::object convertElement(const ColumnMap * column, const DataTypePtr & data_type, size_t index) + { + } +}; + +template <> +struct ColumnTraits +{ + using DataType = DataTypeObject; + + static py::object convertElement(const ColumnObject * column, const DataTypePtr & data_type, size_t index) + { + } +}; + +template <> +struct ColumnTraits +{ + using DataType = DataTypeVariant; + + static py::object convertElement(const ColumnVariant * column, const DataTypePtr & data_type, size_t index) + { + } +}; + +template <> +struct ColumnTraits +{ + using DataType = DataTypeDynamic; + + static py::object convertElement(const ColumnDynamic * column, const DataTypePtr & data_type, size_t index) + { + } +}; + +template +bool CHNestedColumnToNumpyArray(NumpyAppendData & append_data, const DataTypePtr & data_type) +{ + bool has_null = false; + const IColumn * data_column = &append_data.column; + const ColumnNullable * nullable_column = nullptr; + + if (const auto * nullable = typeid_cast(&append_data.column)) + { + nullable_column = nullable; + data_column = &nullable->getNestedColumn(); + has_null = true; + } + + const auto * typed_column = typeid_cast(data_column); + if (!typed_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected specific column type"); + + auto * dest_ptr = reinterpret_cast(append_data.target_data); + auto * mask_ptr = append_data.target_mask; + + for (size_t i = append_data.src_offset; i < append_data.src_offset + append_data.src_count; i++) + { + size_t offset = append_data.dest_offset + i; + if (nullable_column && nullable_column->isNullAt(i)) + { + dest_ptr[offset] = py::none(); + mask_ptr[offset] = true; + has_null = true; + } + else + { + dest_ptr[offset] = ColumnTraits::convertElement(typed_column, data_type, i); + mask_ptr[offset] = false; + } + } + + return has_null; +} + +bool CHColumnArrayToNumpyArray(NumpyAppendData & append_data, const DataTypePtr & data_type) +{ + return CHNestedColumnToNumpyArray(append_data, data_type); +} + +bool CHColumnTupleToNumpyArray(NumpyAppendData & append_data, const DataTypePtr & data_type) +{ + return CHNestedColumnToNumpyArray(append_data, data_type); +} + +bool CHColumnMapToNumpyArray(NumpyAppendData & append_data, const DataTypePtr & data_type) +{ + return 
CHNestedColumnToNumpyArray(append_data, data_type); +} + +bool CHColumnObjectToNumpyArray(NumpyAppendData & append_data, const DataTypePtr & data_type) +{ + return CHNestedColumnToNumpyArray(append_data, data_type); +} + +bool CHColumnVariantToNumpyArray(NumpyAppendData & append_data, const DataTypePtr & data_type) +{ + return CHNestedColumnToNumpyArray(append_data, data_type); +} + +bool CHColumnDynamicToNumpyArray(NumpyAppendData & append_data, const DataTypePtr & data_type) +{ + return CHNestedColumnToNumpyArray(append_data, data_type); +} + +} // namespace CHDB diff --git a/programs/local/NumpyNestedTypes.h b/programs/local/NumpyNestedTypes.h new file mode 100644 index 00000000000..b3e0a68520e --- /dev/null +++ b/programs/local/NumpyNestedTypes.h @@ -0,0 +1,20 @@ +#pragma once + +#include "NumpyArray.h" + +namespace CHDB +{ + +bool CHColumnArrayToNumpyArray(NumpyAppendData & append_data, const DB::DataTypePtr & data_type); + +bool CHColumnTupleToNumpyArray(NumpyAppendData & append_data, const DB::DataTypePtr & data_type); + +bool CHColumnMapToNumpyArray(NumpyAppendData & append_data, const DB::DataTypePtr & data_type); + +bool CHColumnObjectToNumpyArray(NumpyAppendData & append_data, const DB::DataTypePtr & data_type); + +bool CHColumnVariantToNumpyArray(NumpyAppendData & append_data, const DB::DataTypePtr & data_type); + +bool CHColumnDynamicToNumpyArray(NumpyAppendData & append_data, const DB::DataTypePtr & data_type); + +} // namespace CHDB diff --git a/programs/local/NumpyType.cpp b/programs/local/NumpyType.cpp index aa5f760a79c..9f2bb23216a 100644 --- a/programs/local/NumpyType.cpp +++ b/programs/local/NumpyType.cpp @@ -241,6 +241,8 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type TypeIndex type_id = data_type->getTypeId(); switch (type_id) { + case TypeIndex::Nothing: + return "object"; case TypeIndex::Int8: return "int8"; case TypeIndex::UInt8: @@ -380,7 +382,7 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type { return DataTypeToNumpyTypeStr(nullable->getNestedType()); } - return "object"; + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected nullable type {}", data_type->getName()); } default: throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported type {}", data_type->getName()); diff --git a/programs/local/NumpyType.h b/programs/local/NumpyType.h index 8a72ece2bb1..787bfcd857a 100644 --- a/programs/local/NumpyType.h +++ b/programs/local/NumpyType.h @@ -51,7 +51,7 @@ NumpyType ConvertNumpyType(const py::handle & col_type); std::shared_ptr NumpyToDataType(const NumpyType & col_type); -String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type); +String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type); py::object ConvertNumpyDtype(py::handle & numpy_array); From afd902aca59ef27d5928614207ba4044414c0bf0 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Tue, 4 Nov 2025 03:49:27 +0800 Subject: [PATCH 13/22] chore: support converting field to python object --- programs/local/FieldToPython.cpp | 386 ++++++++++++++++++++++++++++ programs/local/FieldToPython.h | 14 + programs/local/NumpyArray.cpp | 257 ++++++++++++++++-- programs/local/NumpyArray.h | 15 +- programs/local/NumpyNestedTypes.cpp | 19 ++ programs/local/NumpyType.cpp | 63 +++-- programs/local/PythonImportCache.h | 2 + programs/local/PytzCacheItem.h | 19 ++ 8 files changed, 717 insertions(+), 58 deletions(-) create mode 100644 programs/local/FieldToPython.cpp create mode 100644 programs/local/FieldToPython.h create mode 100644 programs/local/PytzCacheItem.h diff --git 
a/programs/local/FieldToPython.cpp b/programs/local/FieldToPython.cpp new file mode 100644 index 00000000000..e5fb03380c2 --- /dev/null +++ b/programs/local/FieldToPython.cpp @@ -0,0 +1,386 @@ +#include "FieldToPython.h" +#include "PythonImporter.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace CHDB +{ + +using namespace DB; +namespace py = pybind11; + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +static py::object convertLocalDateToPython(const LocalDate & local_date, auto & import_cache, const Field & field) +{ + auto year = local_date.year(); + auto month = local_date.month(); + auto day = local_date.day(); + + try + { + return import_cache.datetime.date()(year, month, day); + } + catch (py::error_already_set &) + { + return py::str(toString(field)); + } +} + +py::object convertFieldToPython( + const Field & field, + const DB::DataTypePtr & type) +{ + chassert(type); + + auto field_type = field.getType(); + if (field_type == Field::Types::Null) + { + return py::none(); + } + + DataTypePtr actual_type = type; + if (const auto * nullable_type = typeid_cast(type.get())) + { + actual_type = nullable_type->getNestedType(); + } + + auto & import_cache = PythonImporter::ImportCache(); + + switch (actual_type->getTypeId()) + { + case TypeIndex::Nothing: + return py::none(); + + case TypeIndex::Int8: + return py::cast(field.safeGet()); + + case TypeIndex::UInt8: + if (field_type == Field::Types::Bool) + return py::cast(field.safeGet()); + + return py::cast(field.safeGet()); + + case TypeIndex::Int16: + return py::cast(field.safeGet()); + + case TypeIndex::UInt16: + return py::cast(field.safeGet()); + + case TypeIndex::Int32: + return py::cast(field.safeGet()); + + case TypeIndex::UInt32: + return py::cast(field.safeGet()); + + case TypeIndex::Int64: + return py::cast(field.safeGet()); + + case TypeIndex::UInt64: + return py::cast(field.safeGet()); + + case TypeIndex::Float32: + return py::cast(field.safeGet()); + + case TypeIndex::Float64: + return py::cast(field.safeGet()); + + case TypeIndex::Int128: + return py::cast((double)field.safeGet()); + + case TypeIndex::Int256: + return py::cast((double)field.safeGet()); + + case TypeIndex::UInt128: + return py::cast((double)field.safeGet()); + + case TypeIndex::UInt256: + return py::cast((double)field.safeGet()); + + case TypeIndex::BFloat16: + return py::cast((double)field.safeGet()); + + case TypeIndex::Date: + { + auto days = field.safeGet(); + LocalDate local_date(static_cast(days)); + return convertLocalDateToPython(local_date, import_cache, field); + } + + case TypeIndex::Date32: + { + auto days = field.safeGet(); + LocalDate local_date(static_cast(days)); + return convertLocalDateToPython(local_date, import_cache, field); + } + + case TypeIndex::DateTime: + { + auto seconds = field.safeGet(); + + const auto * datetime_type = typeid_cast(type.get()); + const auto & time_zone = datetime_type ? 
datetime_type->getTimeZone() : DateLUT::instance("UTC"); + + time_t timestamp = static_cast(seconds); + LocalDateTime local_dt(timestamp, time_zone); + + int year = local_dt.year(); + int month = local_dt.month(); + int day = local_dt.day(); + int hour = local_dt.hour(); + int minute = local_dt.minute(); + int second = local_dt.second(); + int microsecond = 0; + + try + { + py::object timestamp_object = import_cache.datetime.datetime()( + year, month, day, hour, minute, second, microsecond + ); + + const String & tz_name = time_zone.getTimeZone(); + auto tz_obj = import_cache.pytz.timezone()(tz_name); + return tz_obj.attr("localize")(timestamp_object); + } + catch (py::error_already_set &) + { + return py::str(toString(field)); + } + } + + case TypeIndex::DateTime64: + { + auto datetime64_field = field.safeGet>(); + auto datetime64_value = datetime64_field.getValue(); + Int64 datetime64_ticks = datetime64_value.value; + + const auto * datetime64_type = typeid_cast(type.get()); + const auto & time_zone = datetime64_type ? datetime64_type->getTimeZone() : DateLUT::instance("UTC"); + + UInt32 scale = datetime64_field.getScale(); + Int64 scale_multiplier = DecimalUtils::scaleMultiplier(scale); + + auto seconds = static_cast(datetime64_ticks / scale_multiplier); + auto fractional = datetime64_ticks % scale_multiplier; + + LocalDateTime local_dt(seconds, time_zone); + + int year = local_dt.year(); + int month = local_dt.month(); + int day = local_dt.day(); + int hour = local_dt.hour(); + int minute = local_dt.minute(); + int second = local_dt.second(); + int microsecond = static_cast((fractional * 1000000) / scale_multiplier); + + try + { + py::object timestamp_object = import_cache.datetime.datetime()( + year, month, day, hour, minute, second, microsecond + ); + + const String & tz_name = time_zone.getTimeZone(); + auto tz_obj = import_cache.pytz.timezone()(tz_name); + return tz_obj.attr("localize")(timestamp_object); + } + catch (py::error_already_set &) + { + return py::str(toString(field)); + } + } + + case TypeIndex::Time: + { + auto time_seconds = field.safeGet(); + + if (time_seconds < 0) + { + return py::str(toString(field)); + } + + /// Handle time overflow (should be within 24 hours) + /// ClickHouse Time range is [-999:59:59, 999:59:59] + time_seconds = time_seconds % 86400; + + int hour = static_cast(time_seconds / 3600); + int minute = static_cast((time_seconds % 3600) / 60); + int second = static_cast(time_seconds % 60); + int microsecond = 0; + + try + { + return import_cache.datetime.time()(hour, minute, second, microsecond); + } + catch (py::error_already_set &) + { + return py::str(toString(field)); + } + } + + case TypeIndex::Time64: + { + auto time64_field = field.safeGet>(); + auto time64_value = time64_field.getValue(); + Int64 time64_ticks = time64_value.value; + + if (time64_ticks < 0) + { + return py::str(toString(field)); + } + + UInt32 scale = time64_field.getScale(); + Int64 scale_multiplier = DecimalUtils::scaleMultiplier(scale); + + /// Convert to seconds and fractional part within a day + Int64 total_seconds = time64_ticks / scale_multiplier; + Int64 fractional = time64_ticks % scale_multiplier; + + /// Handle time overflow (should be within 24 hours) + /// ClickHouse Time range is [-999:59:59, 999:59:59] + total_seconds = total_seconds % 86400; + + int hour = static_cast(total_seconds / 3600); + int minute = static_cast((total_seconds % 3600) / 60); + int second = static_cast(total_seconds % 60); + int microsecond = static_cast((fractional * 1000000) / 
scale_multiplier); + + try + { + return import_cache.datetime.time()(hour, minute, second, microsecond); + } + catch (py::error_already_set &) + { + return py::str(toString(field)); + } + } + + case TypeIndex::String: + case TypeIndex::FixedString: + return py::cast(field.safeGet()); + + case TypeIndex::Enum8: + case TypeIndex::Enum16: + return py::cast(field.safeGet()); + + case TypeIndex::Decimal32: + { + auto decimal_field = field.safeGet>(); + auto decimal_value = decimal_field.getValue(); + UInt32 scale = decimal_field.getScale(); + double result = DecimalUtils::convertTo(decimal_value, scale); + return py::cast(result); + } + + case TypeIndex::Decimal64: + { + auto decimal_field = field.safeGet>(); + auto decimal_value = decimal_field.getValue(); + UInt32 scale = decimal_field.getScale(); + double result = DecimalUtils::convertTo(decimal_value, scale); + return py::cast(result); + } + + case TypeIndex::Decimal128: + { + auto decimal_field = field.safeGet>(); + auto decimal_value = decimal_field.getValue(); + UInt32 scale = decimal_field.getScale(); + double result = DecimalUtils::convertTo(decimal_value, scale); + return py::cast(result); + } + + case TypeIndex::Decimal256: + { + auto decimal_field = field.safeGet>(); + auto decimal_value = decimal_field.getValue(); + UInt32 scale = decimal_field.getScale(); + double result = DecimalUtils::convertTo(decimal_value, scale); + return py::cast(result); + } + + case TypeIndex::UUID: + break; + + // case TypeIndex::Array: + // may_have_null = CHColumnArrayToNumpyArray(append_data, actual_type); + // break; + + // case TypeIndex::Tuple: + // may_have_null = CHColumnTupleToNumpyArray(append_data, actual_type); + // break; + + // case TypeIndex::Interval: + // { + // const auto * interval_type = typeid_cast(actual_type.get()); + // if (interval_type && interval_type->getKind() == IntervalKind::Kind::Quarter) + // { + // may_have_null = CHColumnIntervalToNumpyArray(append_data); + // } + // else + // { + // may_have_null = CHColumnToNumpyArray(append_data); + // } + // } + // break; + + // case TypeIndex::Map: + // may_have_null = CHColumnMapToNumpyArray(append_data, actual_type); + // break; + + // case TypeIndex::Object: + // may_have_null = CHColumnObjectToNumpyArray(append_data, actual_type); + // break; + + // case TypeIndex::IPv4: + // may_have_null = CHColumnIPv4ToNumpyArray(append_data); + // break; + + // case TypeIndex::IPv6: + // may_have_null = CHColumnIPv6ToNumpyArray(append_data); + // break; + + // case TypeIndex::Variant: + // may_have_null = CHColumnVariantToNumpyArray(append_data, actual_type); + // break; + + // case TypeIndex::Dynamic: + // may_have_null = CHColumnDynamicToNumpyArray(append_data, actual_type); + // break; + + /// Set types are used only in WHERE clauses for IN operations, not in actual data storage + case TypeIndex::Set: + /// JSONPaths is an internal type used only for JSON schema inference, + case TypeIndex::JSONPaths: + /// Deprecated type, should not appear in normal data processing + case TypeIndex::ObjectDeprecated: + /// Function types are not actual data types, should not appear here + case TypeIndex::Function: + /// Aggregate function types are not actual data types, should not appear here + case TypeIndex::AggregateFunction: + /// LowCardinality should be unwrapped before reaching this point + case TypeIndex::LowCardinality: + /// Nullable cannot contain another Nullable type, so this should not appear in nested conversion + case TypeIndex::Nullable: + /// QBit type is supported in newer versions 
of ClickHouse + /// case TypeIndex::QBit: + default: + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported type {}", type->getName()); + } +} + +} // namespace CHDB diff --git a/programs/local/FieldToPython.h b/programs/local/FieldToPython.h new file mode 100644 index 00000000000..a47e6d94773 --- /dev/null +++ b/programs/local/FieldToPython.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include +#include + +namespace CHDB +{ + +pybind11::object convertFieldToPython( + const DB::Field & field, + const DB::DataTypePtr & type); + +} // namespace CHDB diff --git a/programs/local/NumpyArray.cpp b/programs/local/NumpyArray.cpp index 470478052ea..f054a95d27a 100644 --- a/programs/local/NumpyArray.cpp +++ b/programs/local/NumpyArray.cpp @@ -2,6 +2,7 @@ #include "NumpyType.h" #include "NumpyNestedTypes.h" #include "PythonImporter.h" +#include "FieldToPython.h" #include #include @@ -13,10 +14,15 @@ #include #include #include +#include +#include +#include +#include #include #include #include #include +#include #include #include @@ -53,6 +59,46 @@ struct RegularConvert } }; +struct TimeConvert +{ + template + static NUMPYTYPE convertValue(CHTYPE val, NumpyAppendData & append_data) + { + chassert(append_data.type); + + Field field(static_cast(val)); + auto time_object = convertFieldToPython(field, append_data.type); + return time_object.release().ptr(); + } + + template + static NUMPYTYPE nullValue(bool & set_mask) + { + set_mask = true; + return nullptr; + } +}; + +struct Time64Convert +{ + template + static NUMPYTYPE convertValue(CHTYPE val, NumpyAppendData & append_data) + { + chassert(append_data.type); + + Field field(val); + auto time64_object = convertFieldToPython(field, append_data.type); + return time64_object.release().ptr(); + } + + template + static NUMPYTYPE nullValue(bool & set_mask) + { + set_mask = true; + return nullptr; + } +}; + template static bool TransformColumn(NumpyAppendData & append_data) { @@ -119,8 +165,7 @@ static bool CHColumnDecimalToNumpyArray(NumpyAppendData & append_data, const Dat if (!decimal_type) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected DataTypeDecimal"); - auto scale_multiplier = decimal_type->getScaleMultiplier(); - double scale_multiplier_double = static_cast(scale_multiplier.value); + UInt32 scale = decimal_type->getScale(); auto * dest_ptr = reinterpret_cast(append_data.target_data); auto * mask_ptr = append_data.target_mask; @@ -137,9 +182,100 @@ static bool CHColumnDecimalToNumpyArray(NumpyAppendData & append_data, const Dat } else { - /// Convert decimal integer value to actual decimal by dividing by scale multiplier auto decimal_value = decimal_column->getElement(i); - dest_ptr[offset] = static_cast(decimal_value.value) / scale_multiplier_double; + dest_ptr[offset] = DecimalUtils::convertTo(decimal_value, scale); + mask_ptr[offset] = false; + } + } + + return has_null; +} + +static bool CHColumnDateTime64ToNumpyArray(NumpyAppendData & append_data) +{ + bool has_null = false; + const IColumn * data_column = &append_data.column; + const ColumnNullable * nullable_column = nullptr; + + if (const auto * nullable = typeid_cast(&append_data.column)) + { + nullable_column = nullable; + data_column = &nullable->getNestedColumn(); + } + + const auto * decimal_column = typeid_cast *>(data_column); + if (!decimal_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnDecimal"); + + auto * dest_ptr = reinterpret_cast(append_data.target_data); + auto * mask_ptr = append_data.target_mask; + + for (size_t i = append_data.src_offset; i < 
append_data.src_offset + append_data.src_count; i++) + { + size_t offset = append_data.dest_offset + i; + if (nullable_column && nullable_column->isNullAt(i)) + { + dest_ptr[offset] = 0; + mask_ptr[offset] = true; + has_null = true; + } + else + { + /// Get the DateTime64 value and convert to nanoseconds + Int64 raw_value = decimal_column->getInt(i); + auto scale = decimal_column->getScale(); + + Int64 ns_value; + chassert(scale <= 9); + Int64 multiplier = common::exp10_i32(9 - scale); + ns_value = raw_value * multiplier; + + dest_ptr[offset] = ns_value; + mask_ptr[offset] = false; + } + } + + return has_null; +} + +static bool CHColumnIntervalToNumpyArray(NumpyAppendData & append_data) +{ + bool has_null = false; + const IColumn * data_column = &append_data.column; + const ColumnNullable * nullable_column = nullptr; + + /// Check if column is nullable + if (const auto * nullable = typeid_cast(&append_data.column)) + { + nullable_column = nullable; + data_column = &nullable->getNestedColumn(); + } + + const auto * int64_column = typeid_cast *>(data_column); + if (!int64_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnVector for Interval"); + + auto * dest_ptr = reinterpret_cast(append_data.target_data); + auto * mask_ptr = append_data.target_mask; + + for (size_t i = append_data.src_offset; i < append_data.src_offset + append_data.src_count; i++) + { + size_t offset = append_data.dest_offset + i; + if (nullable_column && nullable_column->isNullAt(i)) + { + dest_ptr[offset] = 0; + mask_ptr[offset] = true; + has_null = true; + } + else + { + Int64 interval_value = int64_column->getElement(i); + + /// Convert quarter to month by multiplying by 3 + /// This function is only called for Quarter intervals + interval_value *= 3; + + dest_ptr[offset] = interval_value; mask_ptr[offset] = false; } } @@ -336,8 +472,11 @@ static bool CHColumnStringToNumpyArray(NumpyAppendData & append_data) return has_null; } -NumpyAppendData::NumpyAppendData(const DB::IColumn & column) - : column(column) +NumpyAppendData::NumpyAppendData( + const DB::IColumn & column_, + const DB::DataTypePtr & type_) + : column(column_) + , type(type_) , src_offset(0) , src_count(0) , dest_offset(0) @@ -376,16 +515,24 @@ NumpyArray::NumpyArray(const DataTypePtr & type_) mask_array = std::make_unique(DataTypeFactory::instance().get("Bool")); } -void NumpyArray::init(size_t capacity) +void NumpyArray::init(size_t capacity, bool may_have_null) { data_array->init(capacity); - mask_array->init(capacity); + + if (may_have_null) + { + mask_array->init(capacity); + } } -void NumpyArray::resize(size_t capacity) +void NumpyArray::resize(size_t capacity, bool may_have_null) { data_array->resize(capacity); - mask_array->resize(capacity); + + if (may_have_null) + { + mask_array->resize(capacity); + } } static bool CHColumnNothingToNumpyArray(NumpyAppendData & append_data) @@ -431,13 +578,6 @@ void NumpyArray::append( mask_array->count += size; bool may_have_null = false; - NumpyAppendData append_data(*column); - append_data.src_offset = offset; - append_data.src_offset + append_data.src_count = count; - append_data.target_data = data_ptr; - append_data.target_mask = mask_ptr; - append_data.dest_offset = data_array->count - size; - /// For nullable types, we need to get the nested type DataTypePtr actual_type = data_array->type; if (const auto * nullable_type = typeid_cast(data_array->type.get())) @@ -445,14 +585,23 @@ void NumpyArray::append( actual_type = nullable_type->getNestedType(); } + NumpyAppendData 
append_data(*column, actual_type); + append_data.src_offset = offset; + append_data.src_count = count; + append_data.target_data = data_ptr; + append_data.target_mask = mask_ptr; + append_data.dest_offset = data_array->count - size; + switch (actual_type->getTypeId()) { case TypeIndex::Nothing: may_have_null = CHColumnNothingToNumpyArray(append_data); break; + case TypeIndex::Int8: may_have_null = CHColumnToNumpyArray(append_data); break; + case TypeIndex::UInt8: { auto is_bool = isBool(data_array->type); @@ -462,114 +611,161 @@ void NumpyArray::append( may_have_null = CHColumnToNumpyArray(append_data); } break; + case TypeIndex::Int16: may_have_null = CHColumnToNumpyArray(append_data); break; + case TypeIndex::UInt16: may_have_null = CHColumnToNumpyArray(append_data); break; + case TypeIndex::Int32: may_have_null = CHColumnToNumpyArray(append_data); break; + case TypeIndex::UInt32: may_have_null = CHColumnToNumpyArray(append_data); break; + case TypeIndex::Int64: may_have_null = CHColumnToNumpyArray(append_data); break; + case TypeIndex::UInt64: may_have_null = CHColumnToNumpyArray(append_data); break; + case TypeIndex::Float32: may_have_null = CHColumnToNumpyArray(append_data); break; + case TypeIndex::Float64: may_have_null = CHColumnToNumpyArray(append_data); break; + case TypeIndex::Int128: may_have_null = TransformColumn(append_data); break; + case TypeIndex::Int256: may_have_null = TransformColumn(append_data); break; + case TypeIndex::UInt128: may_have_null = TransformColumn(append_data); break; + case TypeIndex::UInt256: may_have_null = TransformColumn(append_data); break; + case TypeIndex::BFloat16: may_have_null = TransformColumn(append_data); break; + case TypeIndex::Date: may_have_null = TransformColumn(append_data); break; + case TypeIndex::Date32: may_have_null = TransformColumn(append_data); break; + case TypeIndex::DateTime: may_have_null = TransformColumn(append_data); break; + case TypeIndex::DateTime64: - may_have_null = CHColumnToNumpyArray(append_data); + may_have_null = CHColumnDateTime64ToNumpyArray(append_data); break; + case TypeIndex::Time: - may_have_null = TransformColumn(append_data); + may_have_null = TransformColumn(append_data); break; + case TypeIndex::Time64: - may_have_null = CHColumnToNumpyArray(append_data); + may_have_null = TransformColumn(append_data); break; + case TypeIndex::String: may_have_null = CHColumnStringToNumpyArray(append_data); break; + case TypeIndex::FixedString: may_have_null = CHColumnStringToNumpyArray(append_data); break; + case TypeIndex::Enum8: may_have_null = CHColumnToNumpyArray(append_data); break; + case TypeIndex::Enum16: may_have_null = CHColumnToNumpyArray(append_data); break; + case TypeIndex::Decimal32: may_have_null = CHColumnDecimalToNumpyArray(append_data, actual_type); break; + case TypeIndex::Decimal64: may_have_null = CHColumnDecimalToNumpyArray(append_data, actual_type); break; + case TypeIndex::Decimal128: may_have_null = CHColumnDecimalToNumpyArray(append_data, actual_type); break; + case TypeIndex::Decimal256: may_have_null = CHColumnDecimalToNumpyArray(append_data, actual_type); break; + case TypeIndex::UUID: may_have_null = CHColumnUUIDToNumpyArray(append_data); break; + case TypeIndex::Array: may_have_null = CHColumnArrayToNumpyArray(append_data, actual_type); break; + case TypeIndex::Tuple: may_have_null = CHColumnTupleToNumpyArray(append_data, actual_type); break; + case TypeIndex::Interval: - may_have_null = CHColumnToNumpyArray(append_data); + { + const auto * interval_type = 
typeid_cast(actual_type.get()); + if (interval_type && interval_type->getKind() == IntervalKind::Kind::Quarter) + { + may_have_null = CHColumnIntervalToNumpyArray(append_data); + } + else + { + may_have_null = CHColumnToNumpyArray(append_data); + } + } break; + case TypeIndex::Map: may_have_null = CHColumnMapToNumpyArray(append_data, actual_type); break; + case TypeIndex::Object: may_have_null = CHColumnObjectToNumpyArray(append_data, actual_type); break; + case TypeIndex::IPv4: may_have_null = CHColumnIPv4ToNumpyArray(append_data); break; + case TypeIndex::IPv6: may_have_null = CHColumnIPv6ToNumpyArray(append_data); break; + case TypeIndex::Variant: may_have_null = CHColumnVariantToNumpyArray(append_data, actual_type); break; + case TypeIndex::Dynamic: may_have_null = CHColumnDynamicToNumpyArray(append_data, actual_type); break; @@ -600,9 +796,24 @@ void NumpyArray::append( } } +void NumpyArray::append(const DB::Field & field, const DB::DataTypePtr & type) +{ + chassert(data_array); + chassert(!mask_array); + + auto * data_ptr = data_array->data; + chassert(data_ptr); + + auto * dest_ptr = reinterpret_cast(data_ptr) + data_array->count; + + *dest_ptr = convertFieldToPython(field, type); + + data_array->count += 1; +} + py::object NumpyArray::toArray() const { - chassert(data_array && mask_array); + chassert(data_array); data_array->resize(data_array->count); if (!hava_null) @@ -610,6 +821,8 @@ py::object NumpyArray::toArray() const return std::move(data_array->array); } + chassert(mask_array); + mask_array->resize(mask_array->count); auto data_values = std::move(data_array->array); auto null_values = std::move(mask_array->array); diff --git a/programs/local/NumpyArray.h b/programs/local/NumpyArray.h index 3c014dc79f8..03f2fd5f360 100644 --- a/programs/local/NumpyArray.h +++ b/programs/local/NumpyArray.h @@ -4,6 +4,8 @@ #include #include +#include +#include #include namespace CHDB @@ -13,9 +15,12 @@ namespace CHDB class NumpyAppendData { public: - explicit NumpyAppendData(const DB::IColumn & column); + explicit NumpyAppendData( + const DB::IColumn & column_, + const DB::DataTypePtr & type_); const DB::IColumn & column; + const DB::DataTypePtr & type; size_t src_offset; size_t src_count; @@ -43,13 +48,15 @@ class NumpyArray { public: explicit NumpyArray(const DB::DataTypePtr & type_); - void init(size_t capacity); + void init(size_t capacity, bool may_have_null = true); - void resize(size_t capacity); + void resize(size_t capacity, bool may_have_null = true); + + void append(const DB::ColumnPtr & column); void append(const DB::ColumnPtr & column, size_t offset, size_t count); - void append(const DB::ColumnPtr & column); + void append(const DB::Field & field, const DB::DataTypePtr & type); py::object toArray() const; diff --git a/programs/local/NumpyNestedTypes.cpp b/programs/local/NumpyNestedTypes.cpp index eafc3477fbb..34468320bbe 100644 --- a/programs/local/NumpyNestedTypes.cpp +++ b/programs/local/NumpyNestedTypes.cpp @@ -64,6 +64,25 @@ struct ColumnTraits static py::object convertElement(const ColumnTuple * column, const DataTypePtr & data_type, size_t index) { + const auto * tuple_data_type = typeid_cast(data_type.get()); + if (!tuple_data_type) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected DataTypeTuple"); + + const auto & element_types = tuple_data_type->getElements(); + size_t tuple_size = column->tupleSize(); + + Field tuple_field = column->operator[](index); + const Tuple & tuple_value = tuple_field.safeGet(); + + NumpyArray numpy_array({}); + 
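+        /// Each tuple element is converted to a Python object via convertFieldToPython with its declared element type.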
numpy_array.init(tuple_size); + + for (size_t i = 0; i < tuple_size; ++i) + { + numpy_array.append(tuple_value[i], element_types[i]); + } + + return numpy_array.toArray(); } }; diff --git a/programs/local/NumpyType.cpp b/programs/local/NumpyType.cpp index 9f2bb23216a..83408d13278 100644 --- a/programs/local/NumpyType.cpp +++ b/programs/local/NumpyType.cpp @@ -243,81 +243,74 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type { case TypeIndex::Nothing: return "object"; + case TypeIndex::Int8: return "int8"; + case TypeIndex::UInt8: /// Special case: UInt8 could be Bool type, need to check getName() { auto is_bool = isBool(data_type); return is_bool ? "bool" : "uint8"; } + case TypeIndex::Int16: return "int16"; + case TypeIndex::UInt16: return "uint16"; + case TypeIndex::Int32: return "int32"; + case TypeIndex::UInt32: return "uint32"; + case TypeIndex::Int64: return "int64"; + case TypeIndex::UInt64: return "uint64"; + case TypeIndex::BFloat16: case TypeIndex::Float32: return "float32"; + case TypeIndex::Int256: case TypeIndex::UInt256: case TypeIndex::Int128: case TypeIndex::UInt128: case TypeIndex::Float64: return "float64"; + case TypeIndex::String: case TypeIndex::FixedString: return "object"; + case TypeIndex::DateTime: return "datetime64[s]"; + case TypeIndex::DateTime64: { if (const auto * dt64 = typeid_cast(data_type.get())) { UInt32 scale = dt64->getScale(); - if (scale == 0) - return "datetime64[s]"; - else if (scale == 3) - return "datetime64[ms]"; - else if (scale == 6) - return "datetime64[us]"; - else if (scale == 9) - return "datetime64[ns]"; - else + if (scale <= 9) return "datetime64[ns]"; + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported type {}, scale {}", data_type->getName(), scale); } - return "datetime64[ns]"; + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected datetime64 type {}", data_type->getName()); } + case TypeIndex::Date: case TypeIndex::Date32: return "datetime64[D]"; + case TypeIndex::Time: - return "timedelta64[s]"; case TypeIndex::Time64: - { - if (const auto * time64 = typeid_cast(data_type.get())) - { - UInt32 scale = time64->getScale(); - if (scale == 0) - return "timedelta64[s]"; - else if (scale == 3) - return "timedelta64[ms]"; - else if (scale == 6) - return "timedelta64[us]"; - else if (scale == 9) - return "timedelta64[ns]"; - else - return "timedelta64[ns]"; - } - return "timedelta64[ns]"; - } + return "object"; + case TypeIndex::Interval: { if (const auto * interval = typeid_cast(data_type.get())) @@ -344,26 +337,28 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type case IntervalKind::Kind::Month: return "timedelta64[M]"; case IntervalKind::Kind::Quarter: - /// numpy doesn't have quarter type, use int64 - return "int64"; + /// numpy doesn't have quarter type + return "timedelta64[M]"; case IntervalKind::Kind::Year: return "timedelta64[Y]"; default: - return "timedelta64[s]"; + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected interval kind {}", kind.kind); } } - return "timedelta64[s]"; + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected interval type {}", data_type->getName()); } case TypeIndex::UUID: case TypeIndex::IPv4: case TypeIndex::IPv6: return "object"; + case TypeIndex::Decimal32: case TypeIndex::Decimal64: case TypeIndex::Decimal128: case TypeIndex::Decimal256: return "float64"; + case TypeIndex::Array: case TypeIndex::Tuple: case TypeIndex::Map: @@ -372,10 +367,13 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type case TypeIndex::Variant: case 
TypeIndex::Object: return "object"; + case TypeIndex::Enum8: return "int8"; + case TypeIndex::Enum16: return "int16"; + case TypeIndex::Nullable: { if (const auto * nullable = typeid_cast(data_type.get())) @@ -384,6 +382,7 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type } throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected nullable type {}", data_type->getName()); } + default: throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported type {}", data_type->getName()); } diff --git a/programs/local/PythonImportCache.h b/programs/local/PythonImportCache.h index 382bb34358d..598069a60e2 100644 --- a/programs/local/PythonImportCache.h +++ b/programs/local/PythonImportCache.h @@ -8,6 +8,7 @@ #include "PythonImportCacheItem.h" #include "UUIDCacheItem.h" #include "IPAddressCacheItem.h" +#include "PytzCacheItem.h" #include @@ -30,6 +31,7 @@ struct PythonImportCache NumpyCacheItem numpy; UUIDCacheItem uuid; IPAddressCacheItem ipaddress; + PytzCacheItem pytz; py::handle AddCache(py::object item); diff --git a/programs/local/PytzCacheItem.h b/programs/local/PytzCacheItem.h new file mode 100644 index 00000000000..3c6fccbe858 --- /dev/null +++ b/programs/local/PytzCacheItem.h @@ -0,0 +1,19 @@ +#pragma once + +#include "PythonImportCacheItem.h" + +namespace CHDB { + +struct PytzCacheItem : public PythonImportCacheItem +{ +public: + static constexpr const char *Name = "pytz"; + + PytzCacheItem() : PythonImportCacheItem("pytz"), timezone("timezone", this) {} + + ~PytzCacheItem() override = default; + + PythonImportCacheItem timezone; +}; + +} // namespace CHDB From 670d87c53c2a22b4397a65e513ce8ba41b65bba7 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Tue, 4 Nov 2025 14:47:27 +0800 Subject: [PATCH 14/22] chore: support more types --- programs/local/FieldToPython.cpp | 200 ++++++++++++++++++++++++++----- 1 file changed, 169 insertions(+), 31 deletions(-) diff --git a/programs/local/FieldToPython.cpp b/programs/local/FieldToPython.cpp index e5fb03380c2..140b3beeced 100644 --- a/programs/local/FieldToPython.cpp +++ b/programs/local/FieldToPython.cpp @@ -6,6 +6,10 @@ #include #include #include +#include +#include +#include +#include #include #include #include @@ -13,6 +17,7 @@ #include #include #include +#include #include namespace CHDB @@ -24,6 +29,7 @@ namespace py = pybind11; namespace ErrorCodes { extern const int NOT_IMPLEMENTED; + extern const int LOGICAL_ERROR; } static py::object convertLocalDateToPython(const LocalDate & local_date, auto & import_cache, const Field & field) @@ -133,7 +139,7 @@ py::object convertFieldToPython( { auto seconds = field.safeGet(); - const auto * datetime_type = typeid_cast(type.get()); + const auto * datetime_type = typeid_cast(actual_type.get()); const auto & time_zone = datetime_type ? datetime_type->getTimeZone() : DateLUT::instance("UTC"); time_t timestamp = static_cast(seconds); @@ -169,7 +175,7 @@ py::object convertFieldToPython( auto datetime64_value = datetime64_field.getValue(); Int64 datetime64_ticks = datetime64_value.value; - const auto * datetime64_type = typeid_cast(type.get()); + const auto * datetime64_type = typeid_cast(actual_type.get()); const auto & time_zone = datetime64_type ? 
datetime64_type->getTimeZone() : DateLUT::instance("UTC"); UInt32 scale = datetime64_field.getScale(); @@ -314,45 +320,177 @@ py::object convertFieldToPython( } case TypeIndex::UUID: - break; + { + auto uuid_value = field.safeGet(); + const auto formatted_uuid = formatUUID(uuid_value); + return import_cache.uuid.UUID()(String(formatted_uuid.data(), formatted_uuid.size())); + } - // case TypeIndex::Array: - // may_have_null = CHColumnArrayToNumpyArray(append_data, actual_type); - // break; + case TypeIndex::Array: + { + auto array_field = field.safeGet(); - // case TypeIndex::Tuple: - // may_have_null = CHColumnTupleToNumpyArray(append_data, actual_type); - // break; + const auto * array_type = typeid_cast(actual_type.get()); + chassert(array_type); - // case TypeIndex::Interval: - // { - // const auto * interval_type = typeid_cast(actual_type.get()); - // if (interval_type && interval_type->getKind() == IntervalKind::Kind::Quarter) - // { - // may_have_null = CHColumnIntervalToNumpyArray(append_data); - // } - // else - // { - // may_have_null = CHColumnToNumpyArray(append_data); - // } - // } - // break; + const auto & element_type = array_type->getNestedType(); - // case TypeIndex::Map: - // may_have_null = CHColumnMapToNumpyArray(append_data, actual_type); - // break; + py::list python_list; + for (const auto & element : array_field) + { + auto python_element = convertFieldToPython(element, element_type); + python_list.append(python_element); + } + + return python_list; + } + + case TypeIndex::Tuple: + { + const auto & tuple_field = field.safeGet(); + + const auto * tuple_type = typeid_cast(actual_type.get()); + chassert(tuple_type); + + const auto & element_types = tuple_type->getElements(); + + py::tuple python_tuple(tuple_field.size()); + for (size_t i = 0; i < tuple_field.size(); ++i) + { + auto python_element = convertFieldToPython(tuple_field[i], element_types[i]); + python_tuple[i] = python_element; + } + + return python_tuple; + } + + case TypeIndex::Interval: + { + auto interval_value = field.safeGet(); + const auto * interval_type = typeid_cast(actual_type.get()); + chassert(interval_type); + IntervalKind::Kind interval_kind = interval_type->getKind(); + + switch (interval_kind) + { + case IntervalKind::Kind::Nanosecond: + return import_cache.datetime.timedelta()(py::arg("microseconds") = interval_value / 1000); + case IntervalKind::Kind::Microsecond: + return import_cache.datetime.timedelta()(py::arg("microseconds") = interval_value); + case IntervalKind::Kind::Millisecond: + return import_cache.datetime.timedelta()(py::arg("milliseconds") = interval_value); + case IntervalKind::Kind::Second: + return import_cache.datetime.timedelta()(py::arg("seconds") = interval_value); + case IntervalKind::Kind::Minute: + return import_cache.datetime.timedelta()(py::arg("minutes") = interval_value); + case IntervalKind::Kind::Hour: + return import_cache.datetime.timedelta()(py::arg("hours") = interval_value); + case IntervalKind::Kind::Day: + return import_cache.datetime.timedelta()(py::arg("days") = interval_value); + case IntervalKind::Kind::Week: + return import_cache.datetime.timedelta()(py::arg("weeks") = interval_value); + case IntervalKind::Kind::Month: + /// Approximate: 1 month = 30 days + return import_cache.datetime.timedelta()(py::arg("days") = interval_value * 30); + case IntervalKind::Kind::Quarter: + /// 1 quarter = 3 months = 90 days + return import_cache.datetime.timedelta()(py::arg("days") = interval_value * 90); + case IntervalKind::Kind::Year: + /// 1 year = 365 days + 
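/// Note: month/quarter/year intervals map to fixed-length day counts here, so calendar-exact arithmetic is not preserved. + 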
return import_cache.datetime.timedelta()(py::arg("days") = interval_value * 365); + default: + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unsupported interval kind"); + } + } + + case TypeIndex::Map: + { + const auto & map_field = field.safeGet(); + + const auto * map_type = typeid_cast(actual_type.get()); + chassert(map_type); + + const auto & key_type = map_type->getKeyType(); + const auto & value_type = map_type->getValueType(); + + py::list keys_list; + py::list values_list; + py::dict python_dict; + bool use_dict = true; + + for (const auto & entry : map_field) + { + const auto & entry_tuple = entry.safeGet(); + chassert(entry_tuple.size() == 2); + + auto python_key = convertFieldToPython(entry_tuple[0], key_type); + auto python_value = convertFieldToPython(entry_tuple[1], value_type); + + if (use_dict) + { + try + { + python_dict[python_key] = python_value; + keys_list.append(std::move(python_key)); + values_list.append(std::move(python_value)); + } + catch (const std::exception &) + { + // Key is not hashable, switch to list format + use_dict = false; + keys_list.clear(); + values_list.clear(); + keys_list.append(std::move(python_key)); + values_list.append(std::move(python_value)); + } + } + else + { + keys_list.append(std::move(python_key)); + values_list.append(std::move(python_value)); + } + } + + if (use_dict) + { + return python_dict; + } + else + { + py::dict result; + result["keys"] = keys_list; + result["values"] = values_list; + return result; + } + } // case TypeIndex::Object: // may_have_null = CHColumnObjectToNumpyArray(append_data, actual_type); // break; - // case TypeIndex::IPv4: - // may_have_null = CHColumnIPv4ToNumpyArray(append_data); - // break; + case TypeIndex::IPv4: + { + auto ipv4_value = field.safeGet(); - // case TypeIndex::IPv6: - // may_have_null = CHColumnIPv6ToNumpyArray(append_data); - // break; + char ipv4_str[IPV4_MAX_TEXT_LENGTH]; + char * ptr = ipv4_str; + formatIPv4(reinterpret_cast(&ipv4_value), ptr); + const size_t ipv4_str_len = ptr - ipv4_str; + + return import_cache.ipaddress.ipv4_address()(String(ipv4_str, ipv4_str_len)); + } + + case TypeIndex::IPv6: + { + auto ipv6_value = field.safeGet(); + + char ipv6_str[IPV6_MAX_TEXT_LENGTH]; + char * ptr = ipv6_str; + formatIPv6(reinterpret_cast(&ipv6_value), ptr); + const size_t ipv6_str_len = ptr - ipv6_str; + + return import_cache.ipaddress.ipv6_address()(String(ipv6_str, ipv6_str_len)); + } // case TypeIndex::Variant: // may_have_null = CHColumnVariantToNumpyArray(append_data, actual_type); From b5b4de63f8f2f16a18609dbeae4df5fc343fafa8 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Tue, 4 Nov 2025 15:10:09 +0800 Subject: [PATCH 15/22] chore: support map type --- programs/local/FieldToPython.cpp | 59 ++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/programs/local/FieldToPython.cpp b/programs/local/FieldToPython.cpp index 140b3beeced..010f8cfcbe2 100644 --- a/programs/local/FieldToPython.cpp +++ b/programs/local/FieldToPython.cpp @@ -32,6 +32,65 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +static bool canTypeBeUsedAsDictKey(TypeIndex key_type) +{ + switch (key_type) + { + case TypeIndex::Nothing: + case TypeIndex::Int8: + case TypeIndex::UInt8: + case TypeIndex::Int16: + case TypeIndex::UInt16: + case TypeIndex::Int32: + case TypeIndex::UInt32: + case TypeIndex::Int64: + case TypeIndex::UInt64: + case TypeIndex::Float32: + case TypeIndex::Float64: + case TypeIndex::Int128: + case TypeIndex::Int256: + case TypeIndex::UInt128: + case TypeIndex::UInt256: 
+ case TypeIndex::BFloat16: + case TypeIndex::Date: + case TypeIndex::Date32: + case TypeIndex::DateTime: + case TypeIndex::DateTime64: + case TypeIndex::Time: + case TypeIndex::Time64: + case TypeIndex::String: + case TypeIndex::FixedString: + case TypeIndex::Enum8: + case TypeIndex::Enum16: + case TypeIndex::Decimal32: + case TypeIndex::Decimal64: + case TypeIndex::Decimal128: + case TypeIndex::Decimal256: + case TypeIndex::UUID: + case TypeIndex::Interval: + case TypeIndex::IPv4: + case TypeIndex::IPv6: + return true; + + // Unsupported nested types + case TypeIndex::Array: + case TypeIndex::Tuple: + case TypeIndex::Map: + return false; + + // Other unsupported types + case TypeIndex::Set: + case TypeIndex::JSONPaths: + case TypeIndex::ObjectDeprecated: + case TypeIndex::Function: + case TypeIndex::AggregateFunction: + case TypeIndex::LowCardinality: + case TypeIndex::Nullable: + default: + return false; + } +} + static py::object convertLocalDateToPython(const LocalDate & local_date, auto & import_cache, const Field & field) { auto year = local_date.year(); From f9f19706f0233f977217e30c2ecb11206dde4ffe Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Tue, 4 Nov 2025 21:27:43 +0800 Subject: [PATCH 16/22] chore: support more nested type --- programs/local/FieldToPython.cpp | 363 ++++++++++++++++++---------- programs/local/FieldToPython.h | 6 +- programs/local/NumpyNestedTypes.cpp | 20 ++ 3 files changed, 258 insertions(+), 131 deletions(-) diff --git a/programs/local/FieldToPython.cpp b/programs/local/FieldToPython.cpp index 010f8cfcbe2..cfce3708b71 100644 --- a/programs/local/FieldToPython.cpp +++ b/programs/local/FieldToPython.cpp @@ -1,7 +1,10 @@ #include "FieldToPython.h" #include "PythonImporter.h" -#include +#include +#include +#include +#include #include #include #include @@ -10,6 +13,11 @@ #include #include #include +#include +#include +#include +#include +#include #include #include #include @@ -32,63 +40,83 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -static bool canTypeBeUsedAsDictKey(TypeIndex key_type) +static bool canTypeBeUsedAsDictKey(const DataTypePtr & type) { - switch (key_type) + DataTypePtr actual_type = type; + if (const auto * nullable_type = typeid_cast(type.get())) { - case TypeIndex::Nothing: - case TypeIndex::Int8: - case TypeIndex::UInt8: - case TypeIndex::Int16: - case TypeIndex::UInt16: - case TypeIndex::Int32: - case TypeIndex::UInt32: - case TypeIndex::Int64: + actual_type = nullable_type->getNestedType(); + } + + switch (actual_type->getTypeId()) + { + case TypeIndex::Nothing: + case TypeIndex::Int8: + case TypeIndex::UInt8: + case TypeIndex::Int16: + case TypeIndex::UInt16: + case TypeIndex::Int32: + case TypeIndex::UInt32: + case TypeIndex::Int64: case TypeIndex::UInt64: - case TypeIndex::Float32: - case TypeIndex::Float64: - case TypeIndex::Int128: - case TypeIndex::Int256: - case TypeIndex::UInt128: - case TypeIndex::UInt256: - case TypeIndex::BFloat16: - case TypeIndex::Date: - case TypeIndex::Date32: - case TypeIndex::DateTime: - case TypeIndex::DateTime64: - case TypeIndex::Time: - case TypeIndex::Time64: - case TypeIndex::String: - case TypeIndex::FixedString: - case TypeIndex::Enum8: - case TypeIndex::Enum16: - case TypeIndex::Decimal32: - case TypeIndex::Decimal64: - case TypeIndex::Decimal128: - case TypeIndex::Decimal256: - case TypeIndex::UUID: + case TypeIndex::Float32: + case TypeIndex::Float64: + case TypeIndex::Int128: + case TypeIndex::Int256: + case TypeIndex::UInt128: + case TypeIndex::UInt256: + case TypeIndex::BFloat16: 
+ case TypeIndex::Date: + case TypeIndex::Date32: + case TypeIndex::DateTime: + case TypeIndex::DateTime64: + case TypeIndex::Time: + case TypeIndex::Time64: + case TypeIndex::String: + case TypeIndex::FixedString: + case TypeIndex::Enum8: + case TypeIndex::Enum16: + case TypeIndex::Decimal32: + case TypeIndex::Decimal64: + case TypeIndex::Decimal128: + case TypeIndex::Decimal256: + case TypeIndex::UUID: case TypeIndex::Interval: case TypeIndex::IPv4: - case TypeIndex::IPv6: + case TypeIndex::IPv6: return true; - // Unsupported nested types - case TypeIndex::Array: - case TypeIndex::Tuple: - case TypeIndex::Map: + case TypeIndex::Array: + case TypeIndex::Tuple: + case TypeIndex::Map: + case TypeIndex::Object: + case TypeIndex::Dynamic: return false; - // Other unsupported types - case TypeIndex::Set: - case TypeIndex::JSONPaths: - case TypeIndex::ObjectDeprecated: - case TypeIndex::Function: - case TypeIndex::AggregateFunction: - case TypeIndex::LowCardinality: - case TypeIndex::Nullable: - default: - return false; - } + case TypeIndex::Variant: + { + const auto * variant_type = typeid_cast(type.get()); + chassert(variant_type); + + const auto & variants = variant_type->getVariants(); + for (const auto & variant : variants) + { + if (!canTypeBeUsedAsDictKey(variant)) + return false; + } + return true; + } + + case TypeIndex::Set: + case TypeIndex::JSONPaths: + case TypeIndex::ObjectDeprecated: + case TypeIndex::Function: + case TypeIndex::AggregateFunction: + case TypeIndex::LowCardinality: + case TypeIndex::Nullable: + default: + return false; + } } static py::object convertLocalDateToPython(const LocalDate & local_date, auto & import_cache, const Field & field) @@ -108,13 +136,11 @@ static py::object convertLocalDateToPython(const LocalDate & local_date, auto & } py::object convertFieldToPython( - const Field & field, - const DB::DataTypePtr & type) + const ColumnPtr & column, + const DataTypePtr & type, + size_t index) { - chassert(type); - - auto filed_type = field.getType(); - if (filed_type == Field::Types::Null) + if (column->isNullAt(index)) { return py::none(); } @@ -133,55 +159,102 @@ py::object convertFieldToPython( return py::none(); case TypeIndex::Int8: - return py::cast(field.safeGet()); + { + auto field = column->operator[](index); + return py::cast(field.safeGet()); + } case TypeIndex::UInt8: - if (filed_type == Field::Types::Bool) - return py::cast(field.safeGet()); + { + auto field = column->operator[](index); + auto filed_type = field.getType(); + if (filed_type == Field::Types::Bool) + return py::cast(field.safeGet()); - return py::cast(field.safeGet()); + return py::cast(field.safeGet()); + } case TypeIndex::Int16: - return py::cast(field.safeGet()); + { + auto field = column->operator[](index); + return py::cast(field.safeGet()); + } case TypeIndex::UInt16: - return py::cast(field.safeGet()); + { + auto field = column->operator[](index); + return py::cast(field.safeGet()); + } case TypeIndex::Int32: - return py::cast(field.safeGet()); + { + auto field = column->operator[](index); + return py::cast(field.safeGet()); + } case TypeIndex::UInt32: - return py::cast(field.safeGet()); + { + auto field = column->operator[](index); + return py::cast(field.safeGet()); + } case TypeIndex::Int64: - return py::cast(field.safeGet()); + { + auto field = column->operator[](index); + return py::cast(field.safeGet()); + } case TypeIndex::UInt64: - return py::cast(field.safeGet()); + { + auto field = column->operator[](index); + return py::cast(field.safeGet()); + } case 
TypeIndex::Float32: - return py::cast(field.safeGet()); + { + auto field = column->operator[](index); + return py::cast(field.safeGet()); + } case TypeIndex::Float64: - return py::cast(field.safeGet()); + { + auto field = column->operator[](index); + return py::cast(field.safeGet()); + } case TypeIndex::Int128: - return py::cast((double)field.safeGet()); + { + auto field = column->operator[](index); + return py::cast((double)field.safeGet()); + } case TypeIndex::Int256: - return py::cast((double)field.safeGet()); + { + auto field = column->operator[](index); + return py::cast((double)field.safeGet()); + } case TypeIndex::UInt128: - return py::cast((double)field.safeGet()); + { + auto field = column->operator[](index); + return py::cast((double)field.safeGet()); + } case TypeIndex::UInt256: - return py::cast((double)field.safeGet()); + { + auto field = column->operator[](index); + return py::cast((double)field.safeGet()); + } case TypeIndex::BFloat16: - return py::cast((double)field.safeGet()); + { + auto field = column->operator[](index); + return py::cast((double)field.safeGet()); + } case TypeIndex::Date: { + auto field = column->operator[](index); auto days = field.safeGet(); LocalDate local_date(static_cast(days)); return convertLocalDateToPython(local_date, import_cache, field); @@ -189,6 +262,7 @@ py::object convertFieldToPython( case TypeIndex::Date32: { + auto field = column->operator[](index); auto days = field.safeGet(); LocalDate local_date(static_cast(days)); return convertLocalDateToPython(local_date, import_cache, field); @@ -196,6 +270,7 @@ py::object convertFieldToPython( case TypeIndex::DateTime: { + auto field = column->operator[](index); auto seconds = field.safeGet(); const auto * datetime_type = typeid_cast(actual_type.get()); @@ -230,6 +305,7 @@ py::object convertFieldToPython( case TypeIndex::DateTime64: { + auto field = column->operator[](index); auto datetime64_field = field.safeGet>(); auto datetime64_value = datetime64_field.getValue(); Int64 datetime64_ticks = datetime64_value.value; @@ -271,6 +347,7 @@ py::object convertFieldToPython( case TypeIndex::Time: { + auto field = column->operator[](index); auto time_seconds = field.safeGet(); if (time_seconds < 0) @@ -299,6 +376,7 @@ py::object convertFieldToPython( case TypeIndex::Time64: { + auto field = column->operator[](index); auto time64_field = field.safeGet>(); auto time64_value = time64_field.getValue(); Int64 time64_ticks = time64_value.value; @@ -336,14 +414,21 @@ py::object convertFieldToPython( case TypeIndex::String: case TypeIndex::FixedString: - return py::cast(field.safeGet()); + { + auto field = column->operator[](index); + return py::cast(field.safeGet()); + } case TypeIndex::Enum8: case TypeIndex::Enum16: - return py::cast(field.safeGet()); + { + auto field = column->operator[](index); + return py::cast(field.safeGet()); + } case TypeIndex::Decimal32: { + auto field = column->operator[](index); auto decimal_field = field.safeGet>(); auto decimal_value = decimal_field.getValue(); UInt32 scale = decimal_field.getScale(); @@ -353,6 +438,7 @@ py::object convertFieldToPython( case TypeIndex::Decimal64: { + auto field = column->operator[](index); auto decimal_field = field.safeGet>(); auto decimal_value = decimal_field.getValue(); UInt32 scale = decimal_field.getScale(); @@ -362,6 +448,7 @@ py::object convertFieldToPython( case TypeIndex::Decimal128: { + auto field = column->operator[](index); auto decimal_field = field.safeGet>(); auto decimal_value = decimal_field.getValue(); UInt32 scale = 
decimal_field.getScale(); @@ -371,6 +458,7 @@ py::object convertFieldToPython( case TypeIndex::Decimal256: { + auto field = column->operator[](index); auto decimal_field = field.safeGet>(); auto decimal_value = decimal_field.getValue(); UInt32 scale = decimal_field.getScale(); @@ -380,6 +468,7 @@ py::object convertFieldToPython( case TypeIndex::UUID: { + auto field = column->operator[](index); auto uuid_value = field.safeGet(); const auto formatted_uuid = formatUUID(uuid_value); return import_cache.uuid.UUID()(String(formatted_uuid.data(), formatted_uuid.size())); @@ -387,17 +476,24 @@ py::object convertFieldToPython( case TypeIndex::Array: { - auto array_field = field.safeGet(); + const auto * array_column = typeid_cast(column.get()); + if (!array_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnArray"); const auto * array_type = typeid_cast(actual_type.get()); chassert(array_type); const auto & element_type = array_type->getNestedType(); + const auto & offsets = array_column->getOffsets(); + const auto & nested_column = array_column->getDataPtr(); + + size_t start_offset = (index == 0) ? 0 : offsets[index - 1]; + size_t end_offset = offsets[index]; py::list python_list; - for (const auto & element : array_field) + for (size_t i = start_offset; i < end_offset; ++i) { - auto python_element = convertFieldToPython(element, element_type); + auto python_element = convertFieldToPython(nested_column, element_type, i); python_list.append(python_element); } @@ -406,17 +502,20 @@ py::object convertFieldToPython( case TypeIndex::Tuple: { - const auto & tuple_field = field.safeGet(); + const auto * tuple_column = typeid_cast(column.get()); + if (!tuple_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnTuple"); const auto * tuple_type = typeid_cast(actual_type.get()); chassert(tuple_type); const auto & element_types = tuple_type->getElements(); + const auto & tuple_columns = tuple_column->getColumns(); - py::tuple python_tuple(tuple_field.size()); - for (size_t i = 0; i < tuple_field.size(); ++i) + py::tuple python_tuple(tuple_columns.size()); + for (size_t i = 0; i < tuple_columns.size(); ++i) { - auto python_element = convertFieldToPython(tuple_field[i], element_types[i]); + auto python_element = convertFieldToPython(tuple_columns[i], element_types[i], index); python_tuple[i] = python_element; } @@ -425,6 +524,7 @@ py::object convertFieldToPython( case TypeIndex::Interval: { + auto field = column->operator[](index); auto interval_value = field.safeGet(); const auto * interval_type = typeid_cast(actual_type.get()); chassert(interval_type); @@ -464,7 +564,9 @@ py::object convertFieldToPython( case TypeIndex::Map: { - const auto & map_field = field.safeGet(); + const auto * map_column = typeid_cast(column.get()); + if (!map_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnMap"); const auto * map_type = typeid_cast(actual_type.get()); chassert(map_type); @@ -472,63 +574,73 @@ py::object convertFieldToPython( const auto & key_type = map_type->getKeyType(); const auto & value_type = map_type->getValueType(); - py::list keys_list; - py::list values_list; - py::dict python_dict; - bool use_dict = true; + /// Get the nested array column containing tuples + const auto & nested_array = map_column->getNestedColumn(); + const auto * array_column = typeid_cast(&nested_array); + if (!array_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnArray in ColumnMap"); - for (const auto & entry : map_field) - { - const auto & entry_tuple = 
entry.safeGet(); - chassert(entry_tuple.size() == 2); + const auto & offsets = array_column->getOffsets(); + const auto & tuple_column_ptr = array_column->getDataPtr(); + const auto * tuple_column = typeid_cast(tuple_column_ptr.get()); + if (!tuple_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnTuple in ColumnMap"); - auto python_key = convertFieldToPython(entry_tuple[0], key_type); - auto python_value = convertFieldToPython(entry_tuple[1], value_type); + size_t start_offset = (index == 0) ? 0 : offsets[index - 1]; + size_t end_offset = offsets[index]; - if (use_dict) + const auto & key_column_ptr = tuple_column->getColumnPtr(0); + const auto & value_column_ptr = tuple_column->getColumnPtr(1); + + bool use_dict = canTypeBeUsedAsDictKey(key_type); + + if (use_dict) + { + py::dict python_dict; + for (size_t i = start_offset; i < end_offset; ++i) { - try - { - python_dict[python_key] = python_value; - keys_list.append(std::move(python_key)); - values_list.append(std::move(python_value)); - } - catch (const std::exception &) - { - // Key is not hashable, switch to list format - use_dict = false; - keys_list.clear(); - values_list.clear(); - keys_list.append(std::move(python_key)); - values_list.append(std::move(python_value)); - } + auto python_key = convertFieldToPython(key_column_ptr, key_type, i); + auto python_value = convertFieldToPython(value_column_ptr, value_type, i); + + python_dict[std::move(python_key)] = std::move(python_value); } - else + + return python_dict; + } + else + { + py::list keys_list; + py::list values_list; + for (size_t i = start_offset; i < end_offset; ++i) { + auto python_key = convertFieldToPython(key_column_ptr, key_type, i); + auto python_value = convertFieldToPython(value_column_ptr, value_type, i); + keys_list.append(std::move(python_key)); values_list.append(std::move(python_value)); } - } - if (use_dict) - { + py::dict python_dict; + python_dict["keys"] = std::move(keys_list); + python_dict["values"] = std::move(values_list); + return python_dict; } - else - { - py::dict result; - result["keys"] = keys_list; - result["values"] = values_list; - return result; - } } + case TypeIndex::Variant: + { + + } + + + // case TypeIndex::Dynamic: + // case TypeIndex::Object: - // may_have_null = CHColumnObjectToNumpyArray(append_data, actual_type); - // break; case TypeIndex::IPv4: { + auto field = column->operator[](index); auto ipv4_value = field.safeGet(); char ipv4_str[IPV4_MAX_TEXT_LENGTH]; @@ -541,6 +653,7 @@ py::object convertFieldToPython( case TypeIndex::IPv6: { + auto field = column->operator[](index); auto ipv6_value = field.safeGet(); char ipv6_str[IPV6_MAX_TEXT_LENGTH]; @@ -551,14 +664,6 @@ py::object convertFieldToPython( return import_cache.ipaddress.ipv6_address()(String(ipv6_str, ipv6_str_len)); } - // case TypeIndex::Variant: - // may_have_null = CHColumnVariantToNumpyArray(append_data, actual_type); - // break; - - // case TypeIndex::Dynamic: - // may_have_null = CHColumnDynamicToNumpyArray(append_data, actual_type); - // break; - /// Set types are used only in WHERE clauses for IN operations, not in actual data storage case TypeIndex::Set: /// JSONPaths is an internal type used only for JSON schema inference, diff --git a/programs/local/FieldToPython.h b/programs/local/FieldToPython.h index a47e6d94773..c108d4f6479 100644 --- a/programs/local/FieldToPython.h +++ b/programs/local/FieldToPython.h @@ -2,13 +2,15 @@ #include #include +#include #include namespace CHDB { pybind11::object convertFieldToPython( - const DB::Field & field, 
- const DB::DataTypePtr & type); + const DB::ColumnPtr & column, + const DB::DataTypePtr & type, + size_t index); } // namespace CHDB diff --git a/programs/local/NumpyNestedTypes.cpp b/programs/local/NumpyNestedTypes.cpp index 34468320bbe..e35774025fe 100644 --- a/programs/local/NumpyNestedTypes.cpp +++ b/programs/local/NumpyNestedTypes.cpp @@ -1,5 +1,6 @@ #include "NumpyNestedTypes.h" #include "NumpyArray.h" +#include "FieldToPython.h" #include #include @@ -13,6 +14,9 @@ #include #include #include +#include +#include +#include #include #include @@ -93,6 +97,7 @@ struct ColumnTraits static py::object convertElement(const ColumnMap * column, const DataTypePtr & data_type, size_t index) { + return convertFieldToPython(*column, data_type, index); } }; @@ -113,6 +118,19 @@ struct ColumnTraits static py::object convertElement(const ColumnVariant * column, const DataTypePtr & data_type, size_t index) { + auto discriminator = column->globalDiscriminatorAt(index); + if (discriminator == ColumnVariant::NULL_DISCRIMINATOR) + { + return py::none(); + } + + const auto * variant_type = typeid_cast(data_type.get()); + const auto & variants = variant_type->getVariants(); + const auto & actual_type = variants[discriminator]; + Field variant_field = column->operator[](index); + + /// Nested types can be arbitrary types except Variant(...), LowCardinality(Nullable(...)) and Nullable(...) types. + return convertFieldToPython(variant_field, actual_type); } }; @@ -123,6 +141,8 @@ struct ColumnTraits static py::object convertElement(const ColumnDynamic * column, const DataTypePtr & data_type, size_t index) { + Field dynamic_field = column->operator[](index); + return convertFieldToPython(dynamic_field, data_type); } }; From dab8450bd6eed758f1abeffeea10c8909e3f9929 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Tue, 4 Nov 2025 22:35:27 +0800 Subject: [PATCH 17/22] chore: support more nested type --- programs/local/FieldToPython.cpp | 289 ++++++++++++++++------------ programs/local/FieldToPython.h | 6 +- programs/local/NumpyArray.cpp | 11 +- programs/local/NumpyArray.h | 12 +- programs/local/NumpyNestedTypes.cpp | 28 +-- 5 files changed, 192 insertions(+), 154 deletions(-) diff --git a/programs/local/FieldToPython.cpp b/programs/local/FieldToPython.cpp index cfce3708b71..83a0569ad0e 100644 --- a/programs/local/FieldToPython.cpp +++ b/programs/local/FieldToPython.cpp @@ -5,6 +5,11 @@ #include #include #include +#include +#include +#include +#include +#include #include #include #include @@ -40,6 +45,73 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +py::object convertTimeFieldToPython(const Field & field) +{ + auto & import_cache = PythonImporter::ImportCache(); + auto time_seconds = field.safeGet(); + + if (time_seconds < 0) + { + return py::str(toString(field)); + } + + /// Handle time overflow (should be within 24 hours) + /// ClickHouse Time range is [-999:59:59, 999:59:59] + time_seconds = time_seconds % 86400; + + int hour = static_cast(time_seconds / 3600); + int minute = static_cast((time_seconds % 3600) / 60); + int second = static_cast(time_seconds % 60); + int microsecond = 0; + + try + { + return import_cache.datetime.time()(hour, minute, second, microsecond); + } + catch (py::error_already_set &) + { + return py::str(toString(field)); + } +} + +py::object convertTime64FieldToPython(const Field & field) +{ + auto & import_cache = PythonImporter::ImportCache(); + auto time64_field = field.safeGet>(); + auto time64_value = time64_field.getValue(); + Int64 time64_ticks = time64_value.value; + 
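+ /// datetime.time cannot represent negative times, so negative Time64 ticks fall back to their string representation below.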
+ if (time64_ticks < 0) + { + return py::str(toString(field)); + } + + UInt32 scale = time64_field.getScale(); + Int64 scale_multiplier = DecimalUtils::scaleMultiplier(scale); + + /// Convert to seconds and fractional part within a day + Int64 total_seconds = time64_ticks / scale_multiplier; + Int64 fractional = time64_ticks % scale_multiplier; + + /// Handle time overflow (should be within 24 hours) + /// ClickHouse Time range is [-999:59:59, 999:59:59] + total_seconds = total_seconds % 86400; + + int hour = static_cast(total_seconds / 3600); + int minute = static_cast((total_seconds % 3600) / 60); + int second = static_cast(total_seconds % 60); + int microsecond = static_cast((fractional * 1000000) / scale_multiplier); + + try + { + return import_cache.datetime.time()(hour, minute, second, microsecond); + } + catch (py::error_already_set &) + { + return py::str(toString(field)); + } +} + static bool canTypeBeUsedAsDictKey(const DataTypePtr & type) { DataTypePtr actual_type = type; @@ -136,11 +208,11 @@ static py::object convertLocalDateToPython(const LocalDate & local_date, auto & } py::object convertFieldToPython( - const ColumnPtr & column, + const IColumn & column, const DataTypePtr & type, size_t index) { - if (column->isNullAt(index)) + if (column.isNullAt(index)) { return py::none(); } @@ -160,13 +232,13 @@ py::object convertFieldToPython( case TypeIndex::Int8: { - auto field = column->operator[](index); + auto field = column[index]; return py::cast(field.safeGet()); } case TypeIndex::UInt8: { - auto field = column->operator[](index); + auto field = column[index]; auto filed_type = field.getType(); if (filed_type == Field::Types::Bool) return py::cast(field.safeGet()); @@ -176,85 +248,85 @@ py::object convertFieldToPython( case TypeIndex::Int16: { - auto field = column->operator[](index); + auto field = column[index]; return py::cast(field.safeGet()); } case TypeIndex::UInt16: { - auto field = column->operator[](index); + auto field = column[index]; return py::cast(field.safeGet()); } case TypeIndex::Int32: { - auto field = column->operator[](index); + auto field = column[index]; return py::cast(field.safeGet()); } case TypeIndex::UInt32: { - auto field = column->operator[](index); + auto field = column[index]; return py::cast(field.safeGet()); } case TypeIndex::Int64: { - auto field = column->operator[](index); + auto field = column[index]; return py::cast(field.safeGet()); } case TypeIndex::UInt64: { - auto field = column->operator[](index); + auto field = column[index]; return py::cast(field.safeGet()); } case TypeIndex::Float32: { - auto field = column->operator[](index); + auto field = column[index]; return py::cast(field.safeGet()); } case TypeIndex::Float64: { - auto field = column->operator[](index); + auto field = column[index]; return py::cast(field.safeGet()); } case TypeIndex::Int128: { - auto field = column->operator[](index); + auto field = column[index]; return py::cast((double)field.safeGet()); } case TypeIndex::Int256: { - auto field = column->operator[](index); + auto field = column[index]; return py::cast((double)field.safeGet()); } case TypeIndex::UInt128: { - auto field = column->operator[](index); + auto field = column[index]; return py::cast((double)field.safeGet()); } case TypeIndex::UInt256: { - auto field = column->operator[](index); + auto field = column[index]; return py::cast((double)field.safeGet()); } case TypeIndex::BFloat16: { - auto field = column->operator[](index); + auto field = column[index]; return py::cast((double)field.safeGet()); } case 
TypeIndex::Date: { - auto field = column->operator[](index); + auto field = column[index]; auto days = field.safeGet(); LocalDate local_date(static_cast(days)); return convertLocalDateToPython(local_date, import_cache, field); @@ -262,7 +334,7 @@ py::object convertFieldToPython( case TypeIndex::Date32: { - auto field = column->operator[](index); + auto field = column[index]; auto days = field.safeGet(); LocalDate local_date(static_cast(days)); return convertLocalDateToPython(local_date, import_cache, field); @@ -270,7 +342,7 @@ py::object convertFieldToPython( case TypeIndex::DateTime: { - auto field = column->operator[](index); + auto field = column[index]; auto seconds = field.safeGet(); const auto * datetime_type = typeid_cast(actual_type.get()); @@ -305,7 +377,7 @@ py::object convertFieldToPython( case TypeIndex::DateTime64: { - auto field = column->operator[](index); + auto field = column[index]; auto datetime64_field = field.safeGet>(); auto datetime64_value = datetime64_field.getValue(); Int64 datetime64_ticks = datetime64_value.value; @@ -347,88 +419,33 @@ py::object convertFieldToPython( case TypeIndex::Time: { - auto field = column->operator[](index); - auto time_seconds = field.safeGet(); - - if (time_seconds < 0) - { - return py::str(toString(field)); - } - - /// Handle time overflow (should be within 24 hours) - /// ClickHouse Time range is [-999:59:59, 999:59:59] - time_seconds = time_seconds % 86400; - - int hour = static_cast(time_seconds / 3600); - int minute = static_cast((time_seconds % 3600) / 60); - int second = static_cast(time_seconds % 60); - int microsecond = 0; - - try - { - return import_cache.datetime.time()(hour, minute, second, microsecond); - } - catch (py::error_already_set &) - { - return py::str(toString(field)); - } + auto field = column[index]; + return convertTimeFieldToPython(field); } case TypeIndex::Time64: { - auto field = column->operator[](index); - auto time64_field = field.safeGet>(); - auto time64_value = time64_field.getValue(); - Int64 time64_ticks = time64_value.value; - - if (time64_ticks < 0) - { - return py::str(toString(field)); - } - - UInt32 scale = time64_field.getScale(); - Int64 scale_multiplier = DecimalUtils::scaleMultiplier(scale); - - /// Convert to seconds and fractional part within a day - Int64 total_seconds = time64_ticks / scale_multiplier; - Int64 fractional = time64_ticks % scale_multiplier; - - /// Handle time overflow (should be within 24 hours) - /// ClickHouse Time range is [-999:59:59, 999:59:59] - total_seconds = total_seconds % 86400; - - int hour = static_cast(total_seconds / 3600); - int minute = static_cast((total_seconds % 3600) / 60); - int second = static_cast(total_seconds % 60); - int microsecond = static_cast((fractional * 1000000) / scale_multiplier); - - try - { - return import_cache.datetime.time()(hour, minute, second, microsecond); - } - catch (py::error_already_set &) - { - return py::str(toString(field)); - } + auto field = column[index]; + return convertTime64FieldToPython(field); } case TypeIndex::String: case TypeIndex::FixedString: { - auto field = column->operator[](index); + auto field = column[index]; return py::cast(field.safeGet()); } case TypeIndex::Enum8: case TypeIndex::Enum16: { - auto field = column->operator[](index); + auto field = column[index]; return py::cast(field.safeGet()); } case TypeIndex::Decimal32: { - auto field = column->operator[](index); + auto field = column[index]; auto decimal_field = field.safeGet>(); auto decimal_value = decimal_field.getValue(); UInt32 scale = 
decimal_field.getScale(); @@ -438,7 +455,7 @@ py::object convertFieldToPython( case TypeIndex::Decimal64: { - auto field = column->operator[](index); + auto field = column[index]; auto decimal_field = field.safeGet>(); auto decimal_value = decimal_field.getValue(); UInt32 scale = decimal_field.getScale(); @@ -448,7 +465,7 @@ py::object convertFieldToPython( case TypeIndex::Decimal128: { - auto field = column->operator[](index); + auto field = column[index]; auto decimal_field = field.safeGet>(); auto decimal_value = decimal_field.getValue(); UInt32 scale = decimal_field.getScale(); @@ -458,7 +475,7 @@ py::object convertFieldToPython( case TypeIndex::Decimal256: { - auto field = column->operator[](index); + auto field = column[index]; auto decimal_field = field.safeGet>(); auto decimal_value = decimal_field.getValue(); UInt32 scale = decimal_field.getScale(); @@ -468,7 +485,7 @@ py::object convertFieldToPython( case TypeIndex::UUID: { - auto field = column->operator[](index); + auto field = column[index]; auto uuid_value = field.safeGet(); const auto formatted_uuid = formatUUID(uuid_value); return import_cache.uuid.UUID()(String(formatted_uuid.data(), formatted_uuid.size())); @@ -476,16 +493,14 @@ py::object convertFieldToPython( case TypeIndex::Array: { - const auto * array_column = typeid_cast(column.get()); - if (!array_column) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnArray"); + const auto & array_column = typeid_cast(column); const auto * array_type = typeid_cast(actual_type.get()); chassert(array_type); const auto & element_type = array_type->getNestedType(); - const auto & offsets = array_column->getOffsets(); - const auto & nested_column = array_column->getDataPtr(); + const auto & offsets = array_column.getOffsets(); + const auto & nested_column = array_column.getDataPtr(); size_t start_offset = (index == 0) ? 
0 : offsets[index - 1]; size_t end_offset = offsets[index]; @@ -493,7 +508,7 @@ py::object convertFieldToPython( py::list python_list; for (size_t i = start_offset; i < end_offset; ++i) { - auto python_element = convertFieldToPython(nested_column, element_type, i); + auto python_element = convertFieldToPython(*nested_column, element_type, i); python_list.append(python_element); } @@ -502,20 +517,18 @@ py::object convertFieldToPython( case TypeIndex::Tuple: { - const auto * tuple_column = typeid_cast(column.get()); - if (!tuple_column) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnTuple"); + const auto & tuple_column = typeid_cast(column); const auto * tuple_type = typeid_cast(actual_type.get()); chassert(tuple_type); const auto & element_types = tuple_type->getElements(); - const auto & tuple_columns = tuple_column->getColumns(); + const auto & tuple_columns = tuple_column.getColumns(); py::tuple python_tuple(tuple_columns.size()); for (size_t i = 0; i < tuple_columns.size(); ++i) { - auto python_element = convertFieldToPython(tuple_columns[i], element_types[i], index); + auto python_element = convertFieldToPython(*(tuple_columns[i]), element_types[i], index); python_tuple[i] = python_element; } @@ -524,7 +537,7 @@ py::object convertFieldToPython( case TypeIndex::Interval: { - auto field = column->operator[](index); + auto field = column[index]; auto interval_value = field.safeGet(); const auto * interval_type = typeid_cast(actual_type.get()); chassert(interval_type); @@ -564,9 +577,7 @@ py::object convertFieldToPython( case TypeIndex::Map: { - const auto * map_column = typeid_cast(column.get()); - if (!map_column) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnMap"); + const auto & map_column = typeid_cast(column); const auto * map_type = typeid_cast(actual_type.get()); chassert(map_type); @@ -575,22 +586,18 @@ py::object convertFieldToPython( const auto & value_type = map_type->getValueType(); /// Get the nested array column containing tuples - const auto & nested_array = map_column->getNestedColumn(); - const auto * array_column = typeid_cast(&nested_array); - if (!array_column) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnArray in ColumnMap"); + const auto & nested_array = map_column.getNestedColumn(); + const auto & array_column = typeid_cast(nested_array); - const auto & offsets = array_column->getOffsets(); - const auto & tuple_column_ptr = array_column->getDataPtr(); - const auto * tuple_column = typeid_cast(tuple_column_ptr.get()); - if (!tuple_column) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnTuple in ColumnMap"); + const auto & offsets = array_column.getOffsets(); + const auto & tuple_column_ptr = array_column.getDataPtr(); + const auto & tuple_column = typeid_cast(tuple_column_ptr); size_t start_offset = (index == 0) ? 
0 : offsets[index - 1]; size_t end_offset = offsets[index]; - const auto & key_column_ptr = tuple_column->getColumnPtr(0); - const auto & value_column_ptr = tuple_column->getColumnPtr(1); + const auto & key_column = tuple_column.getColumn(0); + const auto & value_column = tuple_column.getColumn(1); bool use_dict = canTypeBeUsedAsDictKey(key_type); @@ -599,8 +606,8 @@ py::object convertFieldToPython( py::dict python_dict; for (size_t i = start_offset; i < end_offset; ++i) { - auto python_key = convertFieldToPython(key_column_ptr, key_type, i); - auto python_value = convertFieldToPython(value_column_ptr, value_type, i); + auto python_key = convertFieldToPython(key_column, key_type, i); + auto python_value = convertFieldToPython(value_column, value_type, i); python_dict[std::move(python_key)] = std::move(python_value); } @@ -613,8 +620,8 @@ py::object convertFieldToPython( py::list values_list; for (size_t i = start_offset; i < end_offset; ++i) { - auto python_key = convertFieldToPython(key_column_ptr, key_type, i); - auto python_value = convertFieldToPython(value_column_ptr, value_type, i); + auto python_key = convertFieldToPython(key_column, key_type, i); + auto python_value = convertFieldToPython(value_column, value_type, i); keys_list.append(std::move(python_key)); values_list.append(std::move(python_value)); @@ -630,17 +637,55 @@ py::object convertFieldToPython( case TypeIndex::Variant: { - + const auto & variant_column = typeid_cast(column); + auto discriminator = variant_column.globalDiscriminatorAt(index); + if (discriminator == ColumnVariant::NULL_DISCRIMINATOR) + { + return py::none(); + } + + const auto & variant_type = typeid_cast(actual_type); + const auto & variants = variant_type.getVariants(); + const auto & variant_data_type = variants[discriminator]; + + auto offset = variant_column.offsetAt(index); + const auto & variant_inner_column = variant_column.getVariantByGlobalDiscriminator(discriminator); + + return convertFieldToPython(variant_inner_column, variant_data_type, offset); } - // case TypeIndex::Dynamic: + case TypeIndex::Dynamic: + { + const auto & dynamic_column = typeid_cast(column); + const auto & variant_column = dynamic_column.getVariantColumn(); + + /// Check if this row has value in shared variant + if (variant_column.globalDiscriminatorAt(index) == dynamic_column.getSharedVariantDiscriminator()) + { + /// Get data from shared variant and deserialize it + auto value = dynamic_column.getSharedVariant().getDataAt(variant_column.offsetAt(index)); + ReadBufferFromMemory buf(value.data, value.size); + auto variant_type = decodeDataType(buf); + auto tmp_variant_column = variant_type->createColumn(); + auto variant_serialization = variant_type->getDefaultSerialization(); + variant_serialization->deserializeBinary(*tmp_variant_column, buf, FormatSettings{}); + + /// Convert the deserialized value + return convertFieldToPython(*tmp_variant_column, variant_type, 0); + } + else + { + /// Use variant conversion logic directly + return convertFieldToPython(variant_column, dynamic_column.getVariantInfo().variant_type, index); + } + } - // case TypeIndex::Object: + case TypeIndex::Object: case TypeIndex::IPv4: { - auto field = column->operator[](index); + auto field = column[index]; auto ipv4_value = field.safeGet(); char ipv4_str[IPV4_MAX_TEXT_LENGTH]; @@ -653,7 +698,7 @@ py::object convertFieldToPython( case TypeIndex::IPv6: { - auto field = column->operator[](index); + auto field = column[index]; auto ipv6_value = field.safeGet(); char ipv6_str[IPV6_MAX_TEXT_LENGTH]; diff 
--git a/programs/local/FieldToPython.h b/programs/local/FieldToPython.h index c108d4f6479..f175ceb0866 100644 --- a/programs/local/FieldToPython.h +++ b/programs/local/FieldToPython.h @@ -8,8 +8,12 @@ namespace CHDB { +pybind11::object convertTimeFieldToPython(const DB::Field & field); + +pybind11::object convertTime64FieldToPython(const DB::Field & field); + pybind11::object convertFieldToPython( - const DB::ColumnPtr & column, + const DB::IColumn & column, const DB::DataTypePtr & type, size_t index); diff --git a/programs/local/NumpyArray.cpp b/programs/local/NumpyArray.cpp index f054a95d27a..89bbc334343 100644 --- a/programs/local/NumpyArray.cpp +++ b/programs/local/NumpyArray.cpp @@ -67,7 +67,7 @@ struct TimeConvert chassert(append_data.type); Field field(static_cast(val)); - auto time_object = convertFieldToPython(field, append_data.type); + auto time_object = convertTimeFieldToPython(field); return time_object.release().ptr(); } @@ -87,7 +87,7 @@ struct Time64Convert chassert(append_data.type); Field field(val); - auto time64_object = convertFieldToPython(field, append_data.type); + auto time64_object = convertTime64FieldToPython(field); return time64_object.release().ptr(); } @@ -796,7 +796,10 @@ void NumpyArray::append( } } -void NumpyArray::append(const DB::Field & field, const DB::DataTypePtr & type) +void NumpyArray::append( + const DB::IColumn & column, + const DB::DataTypePtr & type, + size_t index) { chassert(data_array); chassert(!mask_array); @@ -806,7 +809,7 @@ void NumpyArray::append(const DB::Field & field, const DB::DataTypePtr & type) auto * dest_ptr = reinterpret_cast(data_ptr) + data_array->count; - *dest_ptr = convertFieldToPython(field, type); + *dest_ptr = convertFieldToPython(column, type, index); data_array->count += 1; } diff --git a/programs/local/NumpyArray.h b/programs/local/NumpyArray.h index 03f2fd5f360..bea70774732 100644 --- a/programs/local/NumpyArray.h +++ b/programs/local/NumpyArray.h @@ -54,9 +54,15 @@ class NumpyArray { void append(const DB::ColumnPtr & column); - void append(const DB::ColumnPtr & column, size_t offset, size_t count); - - void append(const DB::Field & field, const DB::DataTypePtr & type); + void append( + const DB::ColumnPtr & column, + size_t offset, + size_t count); + + void append( + const DB::IColumn & column, + const DB::DataTypePtr & type, + size_t index); py::object toArray() const; diff --git a/programs/local/NumpyNestedTypes.cpp b/programs/local/NumpyNestedTypes.cpp index e35774025fe..3459c65c903 100644 --- a/programs/local/NumpyNestedTypes.cpp +++ b/programs/local/NumpyNestedTypes.cpp @@ -42,10 +42,6 @@ struct ColumnTraits static py::object convertElement(const ColumnArray * column, const DataTypePtr & data_type, size_t index) { - const auto * array_data_type = typeid_cast(data_type.get()); - if (!array_data_type) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected DataTypeArray"); - const auto & offsets = column->getOffsets(); const auto & nested_column = column->getDataPtr(); @@ -75,15 +71,12 @@ struct ColumnTraits const auto & element_types = tuple_data_type->getElements(); size_t tuple_size = column->tupleSize(); - Field tuple_field = column->operator[](index); - const Tuple & tuple_value = tuple_field.safeGet(); - NumpyArray numpy_array({}); numpy_array.init(tuple_size); for (size_t i = 0; i < tuple_size; ++i) { - numpy_array.append(tuple_value[i], element_types[i]); + numpy_array.append(column->getColumn(i), element_types[i], index); } return numpy_array.toArray(); @@ -108,6 +101,7 @@ struct ColumnTraits static 
py::object convertElement(const ColumnObject * column, const DataTypePtr & data_type, size_t index) { + return convertFieldToPython(*column, data_type, index); } }; @@ -118,19 +112,7 @@ struct ColumnTraits static py::object convertElement(const ColumnVariant * column, const DataTypePtr & data_type, size_t index) { - auto discriminator = column->globalDiscriminatorAt(index); - if (discriminator == ColumnVariant::NULL_DISCRIMINATOR) - { - return py::none(); - } - - const auto * variant_type = typeid_cast(data_type.get()); - const auto & variants = variant_type->getVariants(); - const auto & actual_type = variants[discriminator]; - Field variant_field = column->operator[](index); - - /// Nested types can be arbitrary types except Variant(...), LowCardinality(Nullable(...)) and Nullable(...) types. - return convertFieldToPython(variant_field, actual_type); + return convertFieldToPython(*column, data_type, index); } }; @@ -141,8 +123,7 @@ struct ColumnTraits static py::object convertElement(const ColumnDynamic * column, const DataTypePtr & data_type, size_t index) { - Field dynamic_field = column->operator[](index); - return convertFieldToPython(dynamic_field, data_type); + return convertFieldToPython(*column, data_type, index); } }; @@ -157,7 +138,6 @@ bool CHNestedColumnToNumpyArray(NumpyAppendData & append_data, const DataTypePtr { nullable_column = nullable; data_column = &nullable->getNestedColumn(); - has_null = true; } const auto * typed_column = typeid_cast(data_column); From 9894ace29d85ce4f9f8366261ac882374af2ebc6 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Wed, 5 Nov 2025 01:15:45 +0800 Subject: [PATCH 18/22] chore: support object type --- programs/local/FieldToPython.cpp | 6 ++ programs/local/ObjectToPython.cpp | 143 ++++++++++++++++++++++++++++++ programs/local/ObjectToPython.h | 15 ++++ 3 files changed, 164 insertions(+) create mode 100644 programs/local/ObjectToPython.cpp create mode 100644 programs/local/ObjectToPython.h diff --git a/programs/local/FieldToPython.cpp b/programs/local/FieldToPython.cpp index 83a0569ad0e..cc02eeac945 100644 --- a/programs/local/FieldToPython.cpp +++ b/programs/local/FieldToPython.cpp @@ -1,11 +1,13 @@ #include "FieldToPython.h" #include "PythonImporter.h" +#include "ObjectToPython.h" #include #include #include #include #include +#include #include #include #include @@ -20,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -682,6 +685,9 @@ py::object convertFieldToPython( } case TypeIndex::Object: + { + return convertObjectToPython(column, actual_type, index); + } case TypeIndex::IPv4: { diff --git a/programs/local/ObjectToPython.cpp b/programs/local/ObjectToPython.cpp new file mode 100644 index 00000000000..88b51f79a3e --- /dev/null +++ b/programs/local/ObjectToPython.cpp @@ -0,0 +1,143 @@ +#include "ObjectToPython.h" +#include "FieldToPython.h" + +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} +} + +namespace CHDB +{ + +using namespace DB; +namespace py = pybind11; + +struct PathElements +{ + explicit PathElements(const String & path) + { + const char * start = path.data(); + const char * end = start + path.size(); + const char * pos = start; + const char * last_dot_pos = pos - 1; + for (pos = start; pos != end; ++pos) + { + if (*pos == '.') + { + elements.emplace_back(last_dot_pos + 1, size_t(pos - last_dot_pos - 1)); + last_dot_pos = pos; + } + } + + elements.emplace_back(last_dot_pos + 1, size_t(pos - last_dot_pos - 1)); + } + + size_t 
size() const { return elements.size(); } + + std::vector elements; +}; + +py::object convertObjectToPython( + const IColumn & column, + const DataTypePtr & type, + size_t index) +{ + const auto & column_object = typeid_cast(column); + const auto & typed_paths = column_object.getTypedPaths(); + const auto & dynamic_paths = column_object.getDynamicPaths(); + const auto & shared_data_offsets = column_object.getSharedDataOffsets(); + const auto [shared_data_paths, shared_data_values] = column_object.getSharedDataPathsAndValues(); + + size_t shared_data_offset = shared_data_offsets[static_cast(index) - 1]; + size_t shared_data_end = shared_data_offsets[static_cast(index)]; + + const auto & object_type = typeid_cast(type); + const auto & specific_typed_paths = object_type.getTypedPaths(); + const auto & dynamic_data_type = object_type.getDynamicType(); + + std::vector> path_values; + path_values.reserve(typed_paths.size() + dynamic_paths.size() + (shared_data_end - shared_data_offset)); + + for (const auto & [path, column_ptr] : typed_paths) + { + auto iter = specific_typed_paths.find(path); + if (iter == specific_typed_paths.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Path {} not found in typed paths", path); + + const auto & specific_data_type = iter->second; + auto python_value = convertFieldToPython(*column_ptr, specific_data_type, index); + path_values.emplace_back(path, python_value); + } + + for (const auto & [path, dynamic_column] : dynamic_paths) + { + if (!dynamic_column->isNullAt(index)) + { + auto python_value = convertFieldToPython(*dynamic_column, dynamic_data_type, index); + path_values.emplace_back(path, python_value); + } + } + + size_t index_in_shared_data_values = shared_data_offset; + for (size_t i = shared_data_offset; i != shared_data_end; ++i) + { + auto path = shared_data_paths->getDataAt(i).toString(); + + auto tmp_dynamic_column = ColumnDynamic::create(); + tmp_dynamic_column->reserve(1); + ColumnObject::deserializeValueFromSharedData(shared_data_values, index_in_shared_data_values++, *tmp_dynamic_column); + + auto python_value = convertFieldToPython(*tmp_dynamic_column, dynamic_data_type, 0); + path_values.emplace_back(path, python_value); + } + + py::dict result; + + for (const auto & [path, value] : path_values) + { + PathElements path_elements(path); + + if (path_elements.size() == 1) + { + String key(path_elements.elements[0]); + result[key.c_str()] = value; + } + else + { + py::dict * current_dict = &result; + + for (size_t i = 0; i < path_elements.size() - 1; ++i) + { + String key(path_elements.elements[i]); + + if (current_dict->contains(key.c_str())) + { + py::object nested = (*current_dict)[key.c_str()]; + current_dict = &nested.cast(); + } + else + { + py::dict new_dict; + (*current_dict)[key.c_str()] = new_dict; + current_dict = &new_dict; + } + } + + chassert(current_dict); + String final_key(path_elements.elements[path_elements.size() - 1]); + (*current_dict)[final_key.c_str()] = value; + } + } + + return result; +} + +} // namespace CHDB diff --git a/programs/local/ObjectToPython.h b/programs/local/ObjectToPython.h new file mode 100644 index 00000000000..64d79e218fd --- /dev/null +++ b/programs/local/ObjectToPython.h @@ -0,0 +1,15 @@ +#pragma once + +#include +#include +#include + +namespace CHDB +{ + +pybind11::object convertObjectToPython( + const DB::IColumn & column, + const DB::DataTypePtr & type, + size_t index); + +} // namespace CHDB From 95ad2d4553ea51fa33872743c125a7cd1fa90f31 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Wed, 5 Nov 
2025 04:28:57 +0800 Subject: [PATCH 19/22] fix: fix build issues --- chdb/__init__.py | 5 + programs/local/CMakeLists.txt | 4 + programs/local/ChunkCollectorOutputFormat.cpp | 27 +++--- programs/local/ChunkCollectorOutputFormat.h | 27 +++--- programs/local/FieldToPython.cpp | 20 ++-- programs/local/LocalChdb.cpp | 35 +++---- programs/local/LocalServer.cpp | 2 +- programs/local/NumpyArray.h | 1 - programs/local/NumpyNestedTypes.cpp | 16 ++-- programs/local/NumpyType.h | 2 +- programs/local/ObjectToPython.cpp | 17 ++-- programs/local/PandasDataFrameBuilder.cpp | 37 +++++++- programs/local/PandasDataFrameBuilder.h | 14 +-- src/Client/ClientBase.cpp | 6 +- src/Client/ClientBase.h | 2 +- tests/test_complex_pyobj.py | 91 +++++++++++++++---- tests/test_dataframe_column_types.py | 60 ++++++++++++ 17 files changed, 264 insertions(+), 102 deletions(-) create mode 100644 tests/test_dataframe_column_types.py diff --git a/chdb/__init__.py b/chdb/__init__.py index ecf29e43df7..6d4d516a3e7 100644 --- a/chdb/__init__.py +++ b/chdb/__init__.py @@ -194,6 +194,11 @@ def query(sql, output_format="CSV", path="", udf_path=""): with g_conn_lock: conn = _chdb.connect(conn_str) res = conn.query(sql, output_format) + + if lower_output_format == "dataframe": + conn.close() + return res + if res.has_error(): conn.close() raise ChdbError(res.error_message()) diff --git a/programs/local/CMakeLists.txt b/programs/local/CMakeLists.txt index c4e480aaac3..2cade0b59be 100644 --- a/programs/local/CMakeLists.txt +++ b/programs/local/CMakeLists.txt @@ -26,11 +26,15 @@ if (USE_PYTHON) set (CHDB_SOURCES chdb.cpp ChunkCollectorOutputFormat.cpp + FieldToPython.cpp FormatHelper.cpp ListScan.cpp LocalChdb.cpp LocalServer.cpp + NumpyArray.cpp + NumpyNestedTypes.cpp NumpyType.cpp + ObjectToPython.cpp PandasAnalyzer.cpp PandasDataFrame.cpp PandasDataFrameBuilder.cpp diff --git a/programs/local/ChunkCollectorOutputFormat.cpp b/programs/local/ChunkCollectorOutputFormat.cpp index 38d31883a60..f215f48b25d 100644 --- a/programs/local/ChunkCollectorOutputFormat.cpp +++ b/programs/local/ChunkCollectorOutputFormat.cpp @@ -6,15 +6,17 @@ #include #include -namespace DB +using namespace DB; + +namespace CHDB { NullWriteBuffer ChunkCollectorOutputFormat::out; ChunkCollectorOutputFormat::ChunkCollectorOutputFormat( - const Block & header, + SharedHeader shared_header, PandasDataFrameBuilder & builder) - : IOutputFormat(header, out) + : IOutputFormat(shared_header, out) , dataframe_builder(builder) {} @@ -48,16 +50,16 @@ void ChunkCollectorOutputFormat::finalizeImpl() } /// Global dataframe builder -static std::unique_ptr g_dataframe_builder = nullptr; +static std::shared_ptr g_dataframe_builder = nullptr; -PandasDataFrameBuilder * getGlobalDataFrameBuilder() +PandasDataFrameBuilder & getGlobalDataFrameBuilder() { - return g_dataframe_builder.get(); + return *g_dataframe_builder; } -void setGlobalDataFrameBuilder(std::unique_ptr builder) +void setGlobalDataFrameBuilder(std::shared_ptr builder) { - g_dataframe_builder = std::move(builder); + g_dataframe_builder = builder; } void resetGlobalDataFrameBuilder() @@ -66,15 +68,14 @@ void resetGlobalDataFrameBuilder() } /// create ChunkCollectorOutputFormat for use with function pointer -std::shared_ptr createDataFrameOutputFormat(const Block & header) +std::shared_ptr createDataFrameOutputFormat(SharedHeader header) { /// Create a PandasDataFrameBuilder and set it globally - auto dataframe_builder = std::make_unique(header); - PandasDataFrameBuilder * builder_ptr = dataframe_builder.get(); - 
setGlobalDataFrameBuilder(std::move(dataframe_builder)); + auto dataframe_builder = std::make_shared(*header); + setGlobalDataFrameBuilder(dataframe_builder); /// Create and return the format with the builder - return std::make_shared(header, *builder_ptr); + return std::make_shared(header, getGlobalDataFrameBuilder()); } /// Registration function to be called during initialization diff --git a/programs/local/ChunkCollectorOutputFormat.h b/programs/local/ChunkCollectorOutputFormat.h index 8c588cd9711..7dc2fe26127 100644 --- a/programs/local/ChunkCollectorOutputFormat.h +++ b/programs/local/ChunkCollectorOutputFormat.h @@ -1,20 +1,26 @@ #pragma once #include +#include #include +#include namespace DB { - class NullWriteBuffer; +} + +namespace CHDB +{ + class PandasDataFrameBuilder; /// OutputFormat that collects all chunks into memory for further processing /// Does not write to WriteBuffer, instead accumulates data for conversion to pandas DataFrame objects -class ChunkCollectorOutputFormat : public IOutputFormat +class ChunkCollectorOutputFormat : public DB::IOutputFormat { public: - ChunkCollectorOutputFormat(const Block & header, PandasDataFrameBuilder & builder); + ChunkCollectorOutputFormat(DB::SharedHeader shared_header, PandasDataFrameBuilder & builder); String getName() const override { return "ChunkCollectorOutputFormat"; } @@ -24,31 +30,30 @@ class ChunkCollectorOutputFormat : public IOutputFormat } protected: - void consume(Chunk chunk) override; + void consume(DB::Chunk chunk) override; - void consumeTotals(Chunk totals) override; + void consumeTotals(DB::Chunk totals) override; - void consumeExtremes(Chunk extremes) override; + void consumeExtremes(DB::Chunk extremes) override; void finalizeImpl() override; private: - std::vector chunks; + std::vector chunks; PandasDataFrameBuilder & dataframe_builder; - /// Is not used. - static NullWriteBuffer out; + static DB::NullWriteBuffer out; }; /// Registration function to be called during initialization void registerDataFrameOutputFormat(); /// Get the global dataframe builder -PandasDataFrameBuilder * getGlobalDataFrameBuilder(); +PandasDataFrameBuilder & getGlobalDataFrameBuilder(); /// Set the global dataframe builder -void setGlobalDataFrameBuilder(std::unique_ptr builder); +void setGlobalDataFrameBuilder(std::shared_ptr builder); /// Reset the global dataframe builder void resetGlobalDataFrameBuilder(); diff --git a/programs/local/FieldToPython.cpp b/programs/local/FieldToPython.cpp index cc02eeac945..ca76032aaf6 100644 --- a/programs/local/FieldToPython.cpp +++ b/programs/local/FieldToPython.cpp @@ -36,18 +36,22 @@ #include #include -namespace CHDB +namespace DB { -using namespace DB; -namespace py = pybind11; - namespace ErrorCodes { - extern const int NOT_IMPLEMENTED; - extern const int LOGICAL_ERROR; +extern const int NOT_IMPLEMENTED; +extern const int LOGICAL_ERROR; +} + } +namespace CHDB +{ + +using namespace DB; + py::object convertTimeFieldToPython(const Field & field) { auto & import_cache = PythonImporter::ImportCache(); @@ -594,7 +598,7 @@ py::object convertFieldToPython( const auto & offsets = array_column.getOffsets(); const auto & tuple_column_ptr = array_column.getDataPtr(); - const auto & tuple_column = typeid_cast(tuple_column_ptr); + const auto & tuple_column = typeid_cast(*tuple_column_ptr); size_t start_offset = (index == 0) ? 
0 : offsets[index - 1]; size_t end_offset = offsets[index]; @@ -647,7 +651,7 @@ py::object convertFieldToPython( return py::none(); } - const auto & variant_type = typeid_cast(actual_type); + const auto & variant_type = typeid_cast(*actual_type); const auto & variants = variant_type.getVariants(); const auto & variant_data_type = variants[discriminator]; diff --git a/programs/local/LocalChdb.cpp b/programs/local/LocalChdb.cpp index e1513d3ec4d..c2a2db34695 100644 --- a/programs/local/LocalChdb.cpp +++ b/programs/local/LocalChdb.cpp @@ -1,13 +1,14 @@ #include "LocalChdb.h" #include "chdb-internal.h" -#include "ChunkCollectorOutputFormat.h" #include "PandasDataFrameBuilder.h" +#include "ChunkCollectorOutputFormat.h" #include "PythonImporter.h" #include "PythonTableCache.h" #include "StoragePython.h" #include #include +#include #include #if USE_JEMALLOC # include @@ -86,19 +87,14 @@ py::object query( { auto * result = queryToBuffer(queryStr, output_format, path, udfPath); - if (output_format == "dataframe") + if (Poco::toLower(output_format) == "dataframe") { chdb_destroy_query_result(result); - auto * builder = DB::getGlobalDataFrameBuilder(); - if (builder && builder->hasData()) - { - return builder->getDataFrame(); - } - else - { - throw std::runtime_error("DataFrame not available - query may have failed"); - } + auto & builder = CHDB::getGlobalDataFrameBuilder(); + auto ret = builder.getDataFrame(); + CHDB::resetGlobalDataFrameBuilder(); + return ret; } // Default behavior - return query_result @@ -291,22 +287,17 @@ py::object connection_wrapper::query(const std::string & query_str, const std::s { std::string msg_copy(error_msg); chdb_destroy_query_result(result); + CHDB::resetGlobalDataFrameBuilder(); throw std::runtime_error(msg_copy); } - if (format == "dataframe") + if (Poco::toLower(format) == "dataframe") { chdb_destroy_query_result(result); - - auto * builder = DB::getGlobalDataFrameBuilder(); - if (builder && builder->hasData()) - { - return builder->getDataFrame(); - } - else - { - throw std::runtime_error("DataFrame not available - query may have failed"); - } + auto & builder = CHDB::getGlobalDataFrameBuilder(); + auto ret = builder.getDataFrame(); + CHDB::resetGlobalDataFrameBuilder(); + return ret; } if (chdb_result_length(result)) diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 32dac7fa278..886fe374be0 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -670,7 +670,7 @@ try auto & storage_factory = StorageFactory::instance(); #if USE_PYTHON registerStoragePython(storage_factory); - registerDataFrameOutputFormat(); + CHDB::registerDataFrameOutputFormat(); #else registerStorageArrowStream(storage_factory); #endif diff --git a/programs/local/NumpyArray.h b/programs/local/NumpyArray.h index bea70774732..ca2af0ae6bd 100644 --- a/programs/local/NumpyArray.h +++ b/programs/local/NumpyArray.h @@ -3,7 +3,6 @@ #include "PybindWrapper.h" #include -#include #include #include #include diff --git a/programs/local/NumpyNestedTypes.cpp b/programs/local/NumpyNestedTypes.cpp index 3459c65c903..60c0c8cfc88 100644 --- a/programs/local/NumpyNestedTypes.cpp +++ b/programs/local/NumpyNestedTypes.cpp @@ -20,18 +20,22 @@ #include #include -namespace CHDB +namespace DB { -using namespace DB; -namespace py = pybind11; - namespace ErrorCodes { - extern const int LOGICAL_ERROR; - extern const int NOT_IMPLEMENTED; +extern const int LOGICAL_ERROR; +extern const int NOT_IMPLEMENTED; +} + } +namespace CHDB +{ + +using namespace DB; + 
template struct ColumnTraits; diff --git a/programs/local/NumpyType.h b/programs/local/NumpyType.h index 787bfcd857a..da8ccd5eafe 100644 --- a/programs/local/NumpyType.h +++ b/programs/local/NumpyType.h @@ -53,6 +53,6 @@ std::shared_ptr NumpyToDataType(const NumpyType & col_type); String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type); -py::object ConvertNumpyDtype(py::handle & numpy_array); +py::object ConvertNumpyDtype(const py::handle & numpy_array); } // namespace CHDB diff --git a/programs/local/ObjectToPython.cpp b/programs/local/ObjectToPython.cpp index 88b51f79a3e..4ea107b9ca6 100644 --- a/programs/local/ObjectToPython.cpp +++ b/programs/local/ObjectToPython.cpp @@ -8,10 +8,12 @@ namespace DB { + namespace ErrorCodes { extern const int LOGICAL_ERROR; } + } namespace CHDB @@ -59,7 +61,7 @@ py::object convertObjectToPython( size_t shared_data_offset = shared_data_offsets[static_cast(index) - 1]; size_t shared_data_end = shared_data_offsets[static_cast(index)]; - const auto & object_type = typeid_cast(type); + const auto & object_type = typeid_cast(*type); const auto & specific_typed_paths = object_type.getTypedPaths(); const auto & dynamic_data_type = object_type.getDynamicType(); @@ -112,28 +114,27 @@ py::object convertObjectToPython( } else { - py::dict * current_dict = &result; + py::dict current_dict = result; for (size_t i = 0; i < path_elements.size() - 1; ++i) { String key(path_elements.elements[i]); - if (current_dict->contains(key.c_str())) + if (current_dict.contains(key.c_str())) { py::object nested = (*current_dict)[key.c_str()]; - current_dict = &nested.cast(); + current_dict = nested.cast(); } else { py::dict new_dict; - (*current_dict)[key.c_str()] = new_dict; - current_dict = &new_dict; + current_dict[key.c_str()] = new_dict; + current_dict = new_dict; } } - chassert(current_dict); String final_key(path_elements.elements[path_elements.size() - 1]); - (*current_dict)[final_key.c_str()] = value; + current_dict[final_key.c_str()] = value; } } diff --git a/programs/local/PandasDataFrameBuilder.cpp b/programs/local/PandasDataFrameBuilder.cpp index 9d81271a563..4992a69e994 100644 --- a/programs/local/PandasDataFrameBuilder.cpp +++ b/programs/local/PandasDataFrameBuilder.cpp @@ -17,17 +17,23 @@ #include #include #include - -using namespace CHDB; +#include namespace DB { namespace ErrorCodes { - extern const int LOGICAL_ERROR; +extern const int LOGICAL_ERROR; +} + } +using namespace DB; + +namespace CHDB +{ + PandasDataFrameBuilder::PandasDataFrameBuilder(const Block & sample) { column_names.reserve(sample.columns()); @@ -70,9 +76,9 @@ py::object PandasDataFrameBuilder::genDataFrame(const py::handle & dict) py::handle key = key_value[0]; py::handle value = key_value[1]; - auto dtype = ConvertNumpyDtype(value); if (py::isinstance(value, import_cache.numpy.ma.masked_array())) { + auto dtype = ConvertNumpyDtype(value); auto series = pandas.attr("Series")(value.attr("data"), py::arg("dtype") = dtype); series.attr("__setitem__")(value.attr("mask"), import_cache.pandas.NA()); dict.attr("__setitem__")(key, series); @@ -118,6 +124,9 @@ void PandasDataFrameBuilder::finalize() return; columns_data.reserve(column_types.size()); + + py::gil_scoped_acquire acquire; + for (const auto & type : column_types) { columns_data.emplace_back(type); @@ -134,10 +143,19 @@ void PandasDataFrameBuilder::finalize() const auto & columns = chunk.getColumns(); for (size_t col_idx = 0; col_idx < columns.size(); ++col_idx) { - columns_data[col_idx].append(columns[col_idx]); + auto column = 
columns[col_idx];
+
+            if (column->lowCardinality())
+            {
+                column = column->convertToFullColumnIfLowCardinality();
+            }
+
+            columns_data[col_idx].append(column);
         }
     }
 
+    chunks.clear();
+
     /// Create pandas DataFrame
     py::dict res;
     for (size_t col_idx = 0; col_idx < column_names.size(); ++col_idx)
     {
@@ -150,4 +168,13 @@ void PandasDataFrameBuilder::finalize()
     is_finalized = true;
 }
 
+py::object PandasDataFrameBuilder::getDataFrame()
+{
+    chassert(is_finalized);
+
+    py::gil_scoped_acquire acquire;
+
+    columns_data.clear();
+    return std::move(final_dataframe);
+}
 }
diff --git a/programs/local/PandasDataFrameBuilder.h b/programs/local/PandasDataFrameBuilder.h
index 2f45b08e866..4c6d395e0a5 100644
--- a/programs/local/PandasDataFrameBuilder.h
+++ b/programs/local/PandasDataFrameBuilder.h
@@ -9,7 +9,7 @@
 #include
 #include
 
-namespace DB
+namespace CHDB
 {
 
 /// Builder class to convert ClickHouse Chunks to Pandas DataFrame
@@ -17,28 +17,30 @@
 class PandasDataFrameBuilder
 {
 public:
-    explicit PandasDataFrameBuilder(const Block & sample);
+    explicit PandasDataFrameBuilder(const DB::Block & sample);
+
+    ~PandasDataFrameBuilder() = default;
 
     /// Add data chunk
-    void addChunk(const Chunk & chunk);
+    void addChunk(const DB::Chunk & chunk);
 
     /// Finalize and build pandas DataFrame from all collected chunks
     void finalize();
 
    /// Get the finalized pandas DataFrame
-    pybind11::object getDataFrame() const { return final_dataframe; }
+    pybind11::object getDataFrame();
 
 private:
    pybind11::object genDataFrame(const pybind11::handle & dict);
    void changeToTZType(pybind11::object & df);
 
    std::vector<String> column_names;
-    std::vector<DataTypePtr> column_types;
+    std::vector<DB::DataTypePtr> column_types;
 
    /// Map column name to timezone string for timezone-aware types
    std::unordered_map<String, String> column_timezones;
 
-    std::vector<Chunk> chunks;
+    std::vector<DB::Chunk> chunks;
    std::vector<NumpyArray> columns_data;
 
    size_t total_rows = 0;
diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp
index f767a5c50b9..1c2e8bab2e1 100644
--- a/src/Client/ClientBase.cpp
+++ b/src/Client/ClientBase.cpp
@@ -648,17 +648,17 @@ try
     if (!output_format)
     {
 #if USE_PYTHON
-        if (default_output_format == "dataframe")
+        if (Poco::toLower(default_output_format) == "dataframe")
         {
             auto creator = getDataFrameFormatCreator();
             if (creator)
             {
-                output_format = creator(block);
+                output_format = creator(std::make_shared<const Block>(block));
                 return;
             }
             else
             {
-                throw Exception(ErrorCodes::NOT_IMPLEMENTED, "DataFrame output format creator not set");
+                throw Exception(ErrorCodes::LOGICAL_ERROR, "DataFrame output format creator not set");
             }
         }
 #endif
diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h
index 7a52e50ed40..86fdb78d798 100644
--- a/src/Client/ClientBase.h
+++ b/src/Client/ClientBase.h
@@ -96,7 +96,7 @@ struct StreamingQueryContext
 
 #if USE_PYTHON
 /// Function pointer type for creating custom output formats (e.g.
DataFrame) -using CustomOutputFormatCreator = std::function(const Block &)>; +using CustomOutputFormatCreator = std::function(SharedHeader)>; #endif /** diff --git a/tests/test_complex_pyobj.py b/tests/test_complex_pyobj.py index 241fffc0105..e393f042e70 100644 --- a/tests/test_complex_pyobj.py +++ b/tests/test_complex_pyobj.py @@ -42,14 +42,43 @@ def test_df_with_na(self): self.assertEqual(ret.dtypes["E"], "object") self.assertEqual(ret.dtypes["F"], "object") self.assertEqual(ret.dtypes["G"], "object") - self.assertEqual( - str(ret), - """ A B C D E F G -0 1 4.0 True a [1, 2] {"a": 1, "b": 2} -1 2 5.0 False b [3, 4] {"c": 3, "d": 4} -2 3 6.0 True c [5, 6] {"e": 5, "f": 6} -3 """, - ) + self.assertEqual(ret.shape, (4, 7)) + + # Row 0 + self.assertEqual(ret.iloc[0]["A"], '1') + self.assertEqual(ret.iloc[0]["B"], '4.0') + self.assertEqual(ret.iloc[0]["C"], 'True') + self.assertEqual(ret.iloc[0]["D"], 'a') + self.assertEqual(ret.iloc[0]["E"], '') + self.assertEqual(ret.iloc[0]["F"], '[1, 2]') + self.assertEqual(ret.iloc[0]["G"], '{"a": 1, "b": 2}') + + # Row 1 + self.assertEqual(ret.iloc[1]["A"], '2') + self.assertEqual(ret.iloc[1]["B"], '5.0') + self.assertEqual(ret.iloc[1]["C"], 'False') + self.assertEqual(ret.iloc[1]["D"], 'b') + self.assertEqual(ret.iloc[1]["E"], '') + self.assertEqual(ret.iloc[1]["F"], '[3, 4]') + self.assertEqual(ret.iloc[1]["G"], '{"c": 3, "d": 4}') + + # Row 2 + self.assertEqual(ret.iloc[2]["A"], '3') + self.assertEqual(ret.iloc[2]["B"], '6.0') + self.assertEqual(ret.iloc[2]["C"], 'True') + self.assertEqual(ret.iloc[2]["D"], 'c') + self.assertEqual(ret.iloc[2]["E"], '') + self.assertEqual(ret.iloc[2]["F"], '[5, 6]') + self.assertEqual(ret.iloc[2]["G"], '{"e": 5, "f": 6}') + + # Row 3 + self.assertEqual(ret.iloc[3]["A"], '') + self.assertEqual(ret.iloc[3]["B"], '') + self.assertEqual(ret.iloc[3]["C"], '') + self.assertEqual(ret.iloc[3]["D"], '') + self.assertEqual(ret.iloc[3]["E"], '') + self.assertEqual(ret.iloc[3]["F"], '') + self.assertEqual(ret.iloc[3]["G"], '') def test_df_without_na(self): ret = chdb.query( @@ -65,14 +94,44 @@ def test_df_without_na(self): self.assertEqual(ret.dtypes["E"], "object") self.assertEqual(ret.dtypes["F"], "object") self.assertEqual(ret.dtypes["G"], "object") - self.assertEqual( - str(ret), - """ A B C D E F G -0 1 4.0 1 a a [1, 2] {"a": 1, "b": 2} -1 2 5.0 0 b b [3, 4] {"c": 3, "d": 4} -2 3 6.0 1 c c [5, 6] {"e": 5, "f": 6} -3 4 7.0 0 d d [7, 8] {"g": 7, "h": 8}""", - ) + + self.assertEqual(ret.shape, (4, 7)) + + # Row 0 + self.assertEqual(ret.iloc[0]["A"], 1) + self.assertEqual(ret.iloc[0]["B"], 4.0) + self.assertEqual(ret.iloc[0]["C"], 1) + self.assertEqual(ret.iloc[0]["D"], "a") + self.assertEqual(ret.iloc[0]["E"], "a") + self.assertEqual(ret.iloc[0]["F"], '[1, 2]') + self.assertEqual(ret.iloc[0]["G"], {"a": 1, "b": 2}) + + # Row 1 + self.assertEqual(ret.iloc[1]["A"], 2) + self.assertEqual(ret.iloc[1]["B"], 5.0) + self.assertEqual(ret.iloc[1]["C"], 0) + self.assertEqual(ret.iloc[1]["D"], "b") + self.assertEqual(ret.iloc[1]["E"], "b") + self.assertEqual(ret.iloc[1]["F"], '[3, 4]') + self.assertEqual(ret.iloc[1]["G"], {"c": 3, "d": 4}) + + # Row 2 + self.assertEqual(ret.iloc[2]["A"], 3) + self.assertEqual(ret.iloc[2]["B"], 6.0) + self.assertEqual(ret.iloc[2]["C"], 1) + self.assertEqual(ret.iloc[2]["D"], "c") + self.assertEqual(ret.iloc[2]["E"], "c") + self.assertEqual(ret.iloc[2]["F"], '[5, 6]') + self.assertEqual(ret.iloc[2]["G"], {"e": 5, "f": 6}) + + # Row 3 + self.assertEqual(ret.iloc[3]["A"], 4) + 
self.assertEqual(ret.iloc[3]["B"], 7.0) + self.assertEqual(ret.iloc[3]["C"], 0) + self.assertEqual(ret.iloc[3]["D"], "d") + self.assertEqual(ret.iloc[3]["E"], "d") + self.assertEqual(ret.iloc[3]["F"], '[7, 8]') + self.assertEqual(ret.iloc[3]["G"], {"g": 7, "h": 8}) if __name__ == "__main__": diff --git a/tests/test_dataframe_column_types.py b/tests/test_dataframe_column_types.py new file mode 100644 index 00000000000..62bb11fc8c4 --- /dev/null +++ b/tests/test_dataframe_column_types.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +import unittest +import pandas as pd +import chdb +from datetime import datetime, date +import numpy as np + + +class TestDataFrameColumnTypes(unittest.TestCase): + + def setUp(self): + self.session = chdb.session.Session() + + def tearDown(self): + self.session.close() + + def test_integer_types(self): + ret = self.session.query(""" + SELECT + toInt8(-128) as int8_val, + toInt16(-32768) as int16_val, + toInt32(-2147483648) as int32_val, + toInt64(-9223372036854775808) as int64_val, + toUInt8(255) as uint8_val, + toUInt16(65535) as uint16_val, + toUInt32(4294967295) as uint32_val, + toUInt64(18446744073709551615) as uint64_val + """, "DataFrame") + + self.assertEqual(ret.iloc[0]["int16_val"], -32768) + self.assertEqual(ret.iloc[0]["int32_val"], -2147483648) + self.assertEqual(ret.iloc[0]["int64_val"], -9223372036854775808) + self.assertEqual(ret.iloc[0]["uint8_val"], 255) + self.assertEqual(ret.iloc[0]["uint16_val"], 65535) + self.assertEqual(ret.iloc[0]["uint32_val"], 4294967295) + self.assertEqual(ret.iloc[0]["uint64_val"], 18446744073709551615) + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Precise data type validation + expected_types = { + "int8_val": "int8", + "int16_val": "int16", + "int32_val": "int32", + "int64_val": "int64", + "uint8_val": "uint8", + "uint16_val": "uint16", + "uint32_val": "uint32", + "uint64_val": "uint64" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + +if __name__ == "__main__": + unittest.main() From 7ab8fcbb32a194263b23fca4bd985c26a8857a13 Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Wed, 5 Nov 2025 12:18:19 +0800 Subject: [PATCH 20/22] test: update workflow --- .../workflows/build_linux_arm64_wheels-gh.yml | 22 +++++++++---------- .../workflows/build_macos_arm64_wheels.yml | 20 ++++++++--------- .github/workflows/build_macos_x86_wheels.yml | 20 ++++++++--------- .../build_musllinux_arm64_wheels.yml | 20 ++++++++--------- .../workflows/build_musllinux_x86_wheels.yml | 20 ++++++++--------- src/Client/ClientBase.cpp | 2 ++ 6 files changed, 53 insertions(+), 51 deletions(-) diff --git a/.github/workflows/build_linux_arm64_wheels-gh.yml b/.github/workflows/build_linux_arm64_wheels-gh.yml index 5fd5c76fd83..952ef01a151 100644 --- a/.github/workflows/build_linux_arm64_wheels-gh.yml +++ b/.github/workflows/build_linux_arm64_wheels-gh.yml @@ -8,16 +8,16 @@ on: required: true release: types: [created] - push: - branches: - - main - paths-ignore: - - '**/*.md' - pull_request: - branches: - - main - paths-ignore: - - '**/*.md' + # push: + # branches: + # - main + # paths-ignore: + # - '**/*.md' + # pull_request: + # branches: + # - main + # paths-ignore: + # - '**/*.md' jobs: @@ -137,7 +137,7 @@ jobs: which clang++-19 clang++-19 --version sudo apt-get install -y make cmake ccache ninja-build yasm gawk wget - # Install WebAssembly linker (wasm-ld) 
+ # Install WebAssembly linker (wasm-ld) sudo apt-get install -y lld-19 # Create symlink for wasm-ld if ! command -v wasm-ld &> /dev/null; then diff --git a/.github/workflows/build_macos_arm64_wheels.yml b/.github/workflows/build_macos_arm64_wheels.yml index 4c7b24f1ac2..f09c06edfb5 100644 --- a/.github/workflows/build_macos_arm64_wheels.yml +++ b/.github/workflows/build_macos_arm64_wheels.yml @@ -8,16 +8,16 @@ on: required: true release: types: [created] - push: - branches: - - main - paths-ignore: - - '**/*.md' - pull_request: - branches: - - main - paths-ignore: - - '**/*.md' + # push: + # branches: + # - main + # paths-ignore: + # - '**/*.md' + # pull_request: + # branches: + # - main + # paths-ignore: + # - '**/*.md' jobs: build_universal_wheel: diff --git a/.github/workflows/build_macos_x86_wheels.yml b/.github/workflows/build_macos_x86_wheels.yml index 22d597a0f6e..47ae8e490c0 100644 --- a/.github/workflows/build_macos_x86_wheels.yml +++ b/.github/workflows/build_macos_x86_wheels.yml @@ -8,16 +8,16 @@ on: required: true release: types: [created] - push: - branches: - - main - paths-ignore: - - '**/*.md' - pull_request: - branches: - - main - paths-ignore: - - '**/*.md' + # push: + # branches: + # - main + # paths-ignore: + # - '**/*.md' + # pull_request: + # branches: + # - main + # paths-ignore: + # - '**/*.md' jobs: build_universal_wheel: diff --git a/.github/workflows/build_musllinux_arm64_wheels.yml b/.github/workflows/build_musllinux_arm64_wheels.yml index d7c2819ac0a..0cfd5d2a3a1 100644 --- a/.github/workflows/build_musllinux_arm64_wheels.yml +++ b/.github/workflows/build_musllinux_arm64_wheels.yml @@ -8,16 +8,16 @@ on: required: true release: types: [created] - push: - branches: - - main - paths-ignore: - - '**/*.md' - pull_request: - branches: - - main - paths-ignore: - - '**/*.md' + # push: + # branches: + # - main + # paths-ignore: + # - '**/*.md' + # pull_request: + # branches: + # - main + # paths-ignore: + # - '**/*.md' jobs: build_musllinux_wheels: diff --git a/.github/workflows/build_musllinux_x86_wheels.yml b/.github/workflows/build_musllinux_x86_wheels.yml index 715cc816bf6..bf077181b6b 100644 --- a/.github/workflows/build_musllinux_x86_wheels.yml +++ b/.github/workflows/build_musllinux_x86_wheels.yml @@ -8,16 +8,16 @@ on: required: true release: types: [created] - push: - branches: - - main - paths-ignore: - - '**/*.md' - pull_request: - branches: - - main - paths-ignore: - - '**/*.md' + # push: + # branches: + # - main + # paths-ignore: + # - '**/*.md' + # pull_request: + # branches: + # - main + # paths-ignore: + # - '**/*.md' jobs: diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 1c2e8bab2e1..aad4ad78e78 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -151,8 +151,10 @@ namespace ErrorCodes extern const int CANNOT_WRITE_TO_FILE; } +#if USE_PYTHON /// Custom DataFrame format creator function pointer static CustomOutputFormatCreator g_dataframe_format_creator = nullptr; +#endif } From f0406e6cae67115a0f6d4bf7e26b96505aa670ed Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Wed, 5 Nov 2025 20:17:50 +0800 Subject: [PATCH 21/22] fix: fix test issues --- programs/local/ChunkCollectorOutputFormat.cpp | 6 ++- programs/local/FieldToPython.cpp | 13 ++---- programs/local/LocalChdb.cpp | 43 ++++++++++--------- programs/local/NumpyArray.cpp | 23 +++++----- programs/local/NumpyType.cpp | 15 +++---- programs/local/ObjectToPython.cpp | 10 ++++- programs/local/PandasDataFrameBuilder.cpp | 6 +-- 
programs/local/PythonImportCache.cpp          |  2 +-
 8 files changed, 59 insertions(+), 59 deletions(-)

diff --git a/programs/local/ChunkCollectorOutputFormat.cpp b/programs/local/ChunkCollectorOutputFormat.cpp
index f215f48b25d..8faa54a7ef4 100644
--- a/programs/local/ChunkCollectorOutputFormat.cpp
+++ b/programs/local/ChunkCollectorOutputFormat.cpp
@@ -64,7 +64,11 @@ void setGlobalDataFrameBuilder(std::shared_ptr<PandasDataFrameBuilder> builder)
 
 void resetGlobalDataFrameBuilder()
 {
-    g_dataframe_builder.reset();
+    if (g_dataframe_builder)
+    {
+        py::gil_scoped_acquire acquire;
+        g_dataframe_builder.reset();
+    }
 }
 
diff --git a/programs/local/FieldToPython.cpp b/programs/local/FieldToPython.cpp
index ca76032aaf6..8863f3e79ff 100644
--- a/programs/local/FieldToPython.cpp
+++ b/programs/local/FieldToPython.cpp
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -121,11 +122,7 @@ py::object convertTime64FieldToPython(const Field & field)
 
 static bool canTypeBeUsedAsDictKey(const DataTypePtr & type)
 {
-    DataTypePtr actual_type = type;
-    if (const auto * nullable_type = typeid_cast<const DataTypeNullable *>(type.get()))
-    {
-        actual_type = nullable_type->getNestedType();
-    }
+    DataTypePtr actual_type = removeLowCardinalityAndNullable(type);
 
     switch (actual_type->getTypeId())
     {
@@ -224,11 +221,7 @@ py::object convertFieldToPython(
         return py::none();
     }
 
-    DataTypePtr actual_type = type;
-    if (const auto * nullable_type = typeid_cast<const DataTypeNullable *>(type.get()))
-    {
-        actual_type = nullable_type->getNestedType();
-    }
+    DataTypePtr actual_type = removeLowCardinalityAndNullable(type);
 
     auto & import_cache = PythonImporter::ImportCache();
 
diff --git a/programs/local/LocalChdb.cpp b/programs/local/LocalChdb.cpp
index c2a2db34695..61c931270e3 100644
--- a/programs/local/LocalChdb.cpp
+++ b/programs/local/LocalChdb.cpp
@@ -280,29 +280,32 @@ py::object connection_wrapper::query(const std::string & query_str, const std::s
 {
     CHDB::PythonTableCache::findQueryableObjFromQuery(query_str);
 
-    py::gil_scoped_release release;
-    auto * result = chdb_query_n(*conn, query_str.data(), query_str.size(), format.data(), format.size());
-    auto error_msg = CHDB::chdb_result_error_string(result);
-    if (!error_msg.empty())
+    chdb_result * result = nullptr;
     {
-        std::string msg_copy(error_msg);
-        chdb_destroy_query_result(result);
-        CHDB::resetGlobalDataFrameBuilder();
-        throw std::runtime_error(msg_copy);
-    }
+        py::gil_scoped_release release;
+        result = chdb_query_n(*conn, query_str.data(), query_str.size(), format.data(), format.size());
+        auto error_msg = CHDB::chdb_result_error_string(result);
+        if (!error_msg.empty())
+        {
+            std::string msg_copy(error_msg);
+            chdb_destroy_query_result(result);
+            CHDB::resetGlobalDataFrameBuilder();
+            throw std::runtime_error(msg_copy);
+        }
 
-    if (Poco::toLower(format) == "dataframe")
-    {
-        chdb_destroy_query_result(result);
-        auto & builder = CHDB::getGlobalDataFrameBuilder();
-        auto ret = builder.getDataFrame();
-        CHDB::resetGlobalDataFrameBuilder();
-        return ret;
-    }
+        if (Poco::toLower(format) == "dataframe")
+        {
+            chdb_destroy_query_result(result);
+            auto & builder = CHDB::getGlobalDataFrameBuilder();
+            auto ret = builder.getDataFrame();
+            CHDB::resetGlobalDataFrameBuilder();
+            return ret;
+        }
 
-    if (chdb_result_length(result))
-    {
-        LOG_DEBUG(getLogger("CHDB"), "Empty result returned for query: {}", query_str);
+        if (!chdb_result_length(result))
+        {
+            LOG_DEBUG(getLogger("CHDB"), "Empty result returned for query: {}", query_str);
+        }
     }
 
     return py::cast(new
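// Scoping note: the braces introduced above bound the py::gil_scoped_release
// to the pure C++ part of the query. Helpers that touch Python state
// (getDataFrame(), resetGlobalDataFrameBuilder()) re-acquire the GIL
// internally via py::gil_scoped_acquire, which is why they remain callable
// from inside the released region.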
query_result(result, false)); diff --git a/programs/local/NumpyArray.cpp b/programs/local/NumpyArray.cpp index 89bbc334343..1b8a03568bf 100644 --- a/programs/local/NumpyArray.cpp +++ b/programs/local/NumpyArray.cpp @@ -4,6 +4,7 @@ #include "PythonImporter.h" #include "FieldToPython.h" +#include #include #include #include @@ -86,7 +87,11 @@ struct Time64Convert { chassert(append_data.type); - Field field(val); + const auto & time64_type = typeid_cast(*append_data.type); + UInt32 scale = time64_type.getScale(); + DecimalField decimal_field(static_cast(val), scale); + Field field(decimal_field); + auto time64_object = convertTime64FieldToPython(field); return time64_object.release().ptr(); } @@ -564,6 +569,9 @@ void NumpyArray::append( size_t offset, size_t count) { + auto actual_column = column->convertToFullColumnIfLowCardinality(); + DataTypePtr actual_type = removeLowCardinalityAndNullable(data_array->type); + chassert(data_array); chassert(mask_array); @@ -571,21 +579,14 @@ void NumpyArray::append( auto * mask_ptr = reinterpret_cast(mask_array->data); chassert(data_ptr); chassert(mask_ptr); - chassert(column->getDataType() == data_array->type->getColumnType()); + chassert(actual_column->getDataType() == actual_type->getColumnType()); - size_t size = column->size(); + size_t size = actual_column->size(); data_array->count += size; mask_array->count += size; bool may_have_null = false; - /// For nullable types, we need to get the nested type - DataTypePtr actual_type = data_array->type; - if (const auto * nullable_type = typeid_cast(data_array->type.get())) - { - actual_type = nullable_type->getNestedType(); - } - - NumpyAppendData append_data(*column, actual_type); + NumpyAppendData append_data(*actual_column, actual_type); append_data.src_offset = offset; append_data.src_count = count; append_data.target_data = data_ptr; diff --git a/programs/local/NumpyType.cpp b/programs/local/NumpyType.cpp index 83408d13278..682332397be 100644 --- a/programs/local/NumpyType.cpp +++ b/programs/local/NumpyType.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -238,7 +239,9 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type if (!data_type) return "object"; - TypeIndex type_id = data_type->getTypeId(); + auto actual_data_type = removeLowCardinalityAndNullable(data_type); + + TypeIndex type_id = actual_data_type->getTypeId(); switch (type_id) { case TypeIndex::Nothing: @@ -305,7 +308,7 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type case TypeIndex::Date: case TypeIndex::Date32: - return "datetime64[D]"; + return "datetime64[D]"; // pandas converts datetime64[D] to datetime64[s] internally case TypeIndex::Time: case TypeIndex::Time64: @@ -375,14 +378,6 @@ String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type return "int16"; case TypeIndex::Nullable: - { - if (const auto * nullable = typeid_cast(data_type.get())) - { - return DataTypeToNumpyTypeStr(nullable->getNestedType()); - } - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected nullable type {}", data_type->getName()); - } - default: throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported type {}", data_type->getName()); } diff --git a/programs/local/ObjectToPython.cpp b/programs/local/ObjectToPython.cpp index 4ea107b9ca6..c3caf91ef58 100644 --- a/programs/local/ObjectToPython.cpp +++ b/programs/local/ObjectToPython.cpp @@ -1,8 +1,10 @@ #include "ObjectToPython.h" #include "FieldToPython.h" +#include #include #include +#include #include #include @@ -52,7 +54,13 @@ py::object 
convertObjectToPython( const DataTypePtr & type, size_t index) { - const auto & column_object = typeid_cast(column); + const IColumn * data_column = &column; + if (const auto * nullable = typeid_cast(&column)) + { + data_column = &nullable->getNestedColumn(); + } + + const auto & column_object = typeid_cast(*data_column); const auto & typed_paths = column_object.getTypedPaths(); const auto & dynamic_paths = column_object.getDynamicPaths(); const auto & shared_data_offsets = column_object.getSharedDataOffsets(); diff --git a/programs/local/PandasDataFrameBuilder.cpp b/programs/local/PandasDataFrameBuilder.cpp index 4992a69e994..7b570cebfcb 100644 --- a/programs/local/PandasDataFrameBuilder.cpp +++ b/programs/local/PandasDataFrameBuilder.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -145,11 +146,6 @@ void PandasDataFrameBuilder::finalize() { auto column = columns[col_idx]; - if (column->lowCardinality()) - { - column = column->convertToFullColumnIfLowCardinality(); - } - columns_data[col_idx].append(column); } } diff --git a/programs/local/PythonImportCache.cpp b/programs/local/PythonImportCache.cpp index 6e24b35e934..85f30a9a732 100644 --- a/programs/local/PythonImportCache.cpp +++ b/programs/local/PythonImportCache.cpp @@ -53,7 +53,7 @@ py::handle PythonImportCacheItem::AddCache(PythonImportCache & cache, py::object void PythonImportCacheItem::LoadModule(PythonImportCache & cache) { #if USE_JEMALLOC - ::Memory::MemoryCheckScope memory_check_scope; + ::Memory::MemoryCheckScope memory_check_scope; #endif try { From c6d370d39ca18ee186e1420ae4d53b7ff99f644c Mon Sep 17 00:00:00 2001 From: wudidapaopao Date: Wed, 5 Nov 2025 20:27:02 +0800 Subject: [PATCH 22/22] test: add more test cases --- tests/test_dataframe_column_types.py | 709 ++++++++++++++++++++++++++- 1 file changed, 696 insertions(+), 13 deletions(-) diff --git a/tests/test_dataframe_column_types.py b/tests/test_dataframe_column_types.py index 62bb11fc8c4..fc1d6f6988f 100644 --- a/tests/test_dataframe_column_types.py +++ b/tests/test_dataframe_column_types.py @@ -5,39 +5,83 @@ import chdb from datetime import datetime, date import numpy as np +import math class TestDataFrameColumnTypes(unittest.TestCase): def setUp(self): - self.session = chdb.session.Session() + self.session = chdb.session.Session("./tmp") def tearDown(self): self.session.close() def test_integer_types(self): ret = self.session.query(""" - SELECT - toInt8(-128) as int8_val, - toInt16(-32768) as int16_val, - toInt32(-2147483648) as int32_val, - toInt64(-9223372036854775808) as int64_val, - toUInt8(255) as uint8_val, - toUInt16(65535) as uint16_val, - toUInt32(4294967295) as uint32_val, - toUInt64(18446744073709551615) as uint64_val + SELECT * FROM ( + SELECT + 1 as row_id, + toInt8(-128) as int8_val, + toInt16(-32768) as int16_val, + toInt32(-2147483648) as int32_val, + toInt64(-9223372036854775808) as int64_val, + toInt128('-170141183460469231731687303715884105728') as int128_val, + toInt256('-57896044618658097711785492504343953926634992332820282019728792003956564819968') as int256_val, + toUInt8(255) as uint8_val, + toUInt16(65535) as uint16_val, + toUInt32(4294967295) as uint32_val, + toUInt64(18446744073709551615) as uint64_val, + toUInt128('340282366920938463463374607431768211455') as uint128_val, + toUInt256('115792089237316195423570985008687907853269984665640564039457584007913129639935') as uint256_val + UNION ALL + SELECT + 2 as row_id, + toInt8(127) as int8_val, + toInt16(32767) as int16_val, + toInt32(2147483647) as 
int32_val, + toInt64(9223372036854775807) as int64_val, + toInt128('170141183460469231731687303715884105727') as int128_val, + toInt256('57896044618658097711785492504343953926634992332820282019728792003956564819967') as int256_val, + toUInt8(254) as uint8_val, + toUInt16(65534) as uint16_val, + toUInt32(4294967294) as uint32_val, + toUInt64(18446744073709551614) as uint64_val, + toUInt128('340282366920938463463374607431768211454') as uint128_val, + toUInt256('115792089237316195423570985008687907853269984665640564039457584007913129639934') as uint256_val + ) + ORDER BY row_id """, "DataFrame") + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row (minimum/maximum values) + self.assertEqual(ret.iloc[0]["int8_val"], -128) self.assertEqual(ret.iloc[0]["int16_val"], -32768) self.assertEqual(ret.iloc[0]["int32_val"], -2147483648) self.assertEqual(ret.iloc[0]["int64_val"], -9223372036854775808) + self.assertEqual(ret.iloc[0]["int128_val"], float(-170141183460469231731687303715884105728)) + self.assertEqual(ret.iloc[0]["int256_val"], float(-57896044618658097711785492504343953926634992332820282019728792003956564819968)) self.assertEqual(ret.iloc[0]["uint8_val"], 255) self.assertEqual(ret.iloc[0]["uint16_val"], 65535) self.assertEqual(ret.iloc[0]["uint32_val"], 4294967295) self.assertEqual(ret.iloc[0]["uint64_val"], 18446744073709551615) + self.assertEqual(ret.iloc[0]["uint128_val"], float(340282366920938463463374607431768211455)) + self.assertEqual(ret.iloc[0]["uint256_val"], float(115792089237316195423570985008687907853269984665640564039457584007913129639935)) - for col in ret.columns: - print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + # Test second row (maximum/near-maximum values) + self.assertEqual(ret.iloc[1]["int8_val"], 127) + self.assertEqual(ret.iloc[1]["int16_val"], 32767) + self.assertEqual(ret.iloc[1]["int32_val"], 2147483647) + self.assertEqual(ret.iloc[1]["int64_val"], 9223372036854775807) + self.assertEqual(ret.iloc[1]["int128_val"], float(170141183460469231731687303715884105727)) + self.assertEqual(ret.iloc[1]["int256_val"], float(57896044618658097711785492504343953926634992332820282019728792003956564819967)) + self.assertEqual(ret.iloc[1]["uint8_val"], 254) + self.assertEqual(ret.iloc[1]["uint16_val"], 65534) + self.assertEqual(ret.iloc[1]["uint32_val"], 4294967294) + self.assertEqual(ret.iloc[1]["uint64_val"], 18446744073709551614) + self.assertEqual(ret.iloc[1]["uint128_val"], float(340282366920938463463374607431768211454)) + self.assertEqual(ret.iloc[1]["uint256_val"], float(115792089237316195423570985008687907853269984665640564039457584007913129639934)) # Precise data type validation expected_types = { @@ -45,10 +89,649 @@ def test_integer_types(self): "int16_val": "int16", "int32_val": "int32", "int64_val": "int64", + "int128_val": "float64", # Int128 mapped to float64 in ClickHouse->pandas conversion + "int256_val": "float64", # Int256 mapped to float64 in ClickHouse->pandas conversion "uint8_val": "uint8", "uint16_val": "uint16", "uint32_val": "uint32", - "uint64_val": "uint64" + "uint64_val": "uint64", + "uint128_val": "float64", # UInt128 mapped to float64 in ClickHouse->pandas conversion + "uint256_val": "float64" # UInt256 mapped to float64 in ClickHouse->pandas conversion + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, 
expected_type) + + def test_float_types(self): + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + toFloat32(3.14159265) as float32_val, + toFloat32(-3.40282347e+38) as float32_min, + toFloat32(3.40282347e+38) as float32_max, + toFloat64(2.718281828459045) as float64_val, + toFloat64(-1.7976931348623157e+308) as float64_min, + toFloat64(1.7976931348623157e+308) as float64_max, + toBFloat16(1.5) as bfloat16_val, + toBFloat16(-3.389531389e+38) as bfloat16_min, + toBFloat16(3.389531389e+38) as bfloat16_max + UNION ALL + SELECT + 2 as row_id, + toFloat32(0.0) as float32_val, + toFloat32(1.175494351e-38) as float32_min, + toFloat32(-1.175494351e-38) as float32_max, + toFloat64(0.0) as float64_val, + toFloat64(2.2250738585072014e-308) as float64_min, + toFloat64(-2.2250738585072014e-308) as float64_max, + toBFloat16(0.0) as bfloat16_val, + toBFloat16(1.175494351e-38) as bfloat16_min, + toBFloat16(-1.175494351e-38) as bfloat16_max + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[1][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row - regular and extreme values + self.assertAlmostEqual(ret.iloc[0]["float32_val"], 3.14159265, places=6) + self.assertAlmostEqual(ret.iloc[0]["float32_min"], -3.40282347e+38, delta=1e30) + self.assertAlmostEqual(ret.iloc[0]["float32_max"], 3.40282347e+38, delta=1e30) + self.assertAlmostEqual(ret.iloc[0]["float64_val"], 2.718281828459045, places=15) + self.assertAlmostEqual(ret.iloc[0]["float64_min"], -1.7976931348623157e+308, delta=1e300) + self.assertAlmostEqual(ret.iloc[0]["float64_max"], 1.7976931348623157e+308, delta=1e300) + self.assertAlmostEqual(ret.iloc[0]["bfloat16_val"], 1.5, places=2) + self.assertAlmostEqual(ret.iloc[0]["bfloat16_min"], -3.389531389e+38, delta=1e30) + self.assertAlmostEqual(ret.iloc[0]["bfloat16_max"], 3.389531389e+38, delta=1e30) + + # Test second row - zero and small values + self.assertEqual(ret.iloc[1]["float32_val"], 0.0) + self.assertAlmostEqual(ret.iloc[1]["float32_min"], 1.175494351e-38, delta=1e-40) + self.assertAlmostEqual(ret.iloc[1]["float32_max"], -1.175494351e-38, delta=1e-40) + self.assertEqual(ret.iloc[1]["float64_val"], 0.0) + self.assertAlmostEqual(ret.iloc[1]["float64_min"], 2.2250738585072014e-308, delta=1e-310) + self.assertAlmostEqual(ret.iloc[1]["float64_max"], -2.2250738585072014e-308, delta=1e-310) + self.assertEqual(ret.iloc[1]["bfloat16_val"], 0.0) + self.assertAlmostEqual(ret.iloc[1]["bfloat16_min"], 1.175494351e-38, delta=1e-40) + self.assertAlmostEqual(ret.iloc[1]["bfloat16_max"], -1.175494351e-38, delta=1e-40) + + # Precise data type validation + expected_types = { + "float32_val": "float32", + "float32_min": "float32", + "float32_max": "float32", + "float64_val": "float64", + "float64_min": "float64", + "float64_max": "float64", + "bfloat16_val": "float32", # BFloat16 typically mapped to float32 in pandas + "bfloat16_min": "float32", + "bfloat16_max": "float32" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + def test_float_special_values(self): + """Test Infinity and NaN values for all float types""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + toFloat32(1.0/0.0) as float32_pos_inf, + toFloat32(-1.0/0.0) as float32_neg_inf, + toFloat32(0.0/0.0) as float32_nan, + toFloat64(1.0/0.0) as float64_pos_inf, + toFloat64(-1.0/0.0) as float64_neg_inf, + toFloat64(0.0/0.0) as 
float64_nan, + toBFloat16(1.0/0.0) as bfloat16_pos_inf, + toBFloat16(-1.0/0.0) as bfloat16_neg_inf, + toBFloat16(0.0/0.0) as bfloat16_nan + UNION ALL + SELECT + 2 as row_id, + toFloat32(1.0/0.0) as float32_pos_inf, + toFloat32(-1.0/0.0) as float32_neg_inf, + toFloat32(0.0/0.0) as float32_nan, + toFloat64(1.0/0.0) as float64_pos_inf, + toFloat64(-1.0/0.0) as float64_neg_inf, + toFloat64(0.0/0.0) as float64_nan, + toBFloat16(1.0/0.0) as bfloat16_pos_inf, + toBFloat16(-1.0/0.0) as bfloat16_neg_inf, + toBFloat16(0.0/0.0) as bfloat16_nan + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test Float32 special values + self.assertTrue(math.isinf(ret.iloc[0]["float32_pos_inf"])) + self.assertTrue(ret.iloc[0]["float32_pos_inf"] > 0) # positive infinity + self.assertTrue(math.isinf(ret.iloc[0]["float32_neg_inf"])) + self.assertTrue(ret.iloc[0]["float32_neg_inf"] < 0) # negative infinity + self.assertTrue(math.isnan(ret.iloc[0]["float32_nan"])) + + # Test Float64 special values + self.assertTrue(math.isinf(ret.iloc[0]["float64_pos_inf"])) + self.assertTrue(ret.iloc[0]["float64_pos_inf"] > 0) # positive infinity + self.assertTrue(math.isinf(ret.iloc[0]["float64_neg_inf"])) + self.assertTrue(ret.iloc[0]["float64_neg_inf"] < 0) # negative infinity + self.assertTrue(math.isnan(ret.iloc[0]["float64_nan"])) + + # Test BFloat16 special values + self.assertTrue(math.isinf(ret.iloc[0]["bfloat16_pos_inf"])) + self.assertTrue(ret.iloc[0]["bfloat16_pos_inf"] > 0) # positive infinity + self.assertTrue(math.isinf(ret.iloc[0]["bfloat16_neg_inf"])) + self.assertTrue(ret.iloc[0]["bfloat16_neg_inf"] < 0) # negative infinity + self.assertTrue(math.isnan(ret.iloc[0]["bfloat16_nan"])) + + # Test second row (same values, consistency check) + self.assertTrue(math.isinf(ret.iloc[1]["float32_pos_inf"])) + self.assertTrue(ret.iloc[1]["float32_pos_inf"] > 0) + self.assertTrue(math.isinf(ret.iloc[1]["float32_neg_inf"])) + self.assertTrue(ret.iloc[1]["float32_neg_inf"] < 0) + self.assertTrue(math.isnan(ret.iloc[1]["float32_nan"])) + + self.assertTrue(math.isinf(ret.iloc[1]["float64_pos_inf"])) + self.assertTrue(ret.iloc[1]["float64_pos_inf"] > 0) + self.assertTrue(math.isinf(ret.iloc[1]["float64_neg_inf"])) + self.assertTrue(ret.iloc[1]["float64_neg_inf"] < 0) + self.assertTrue(math.isnan(ret.iloc[1]["float64_nan"])) + + self.assertTrue(math.isinf(ret.iloc[1]["bfloat16_pos_inf"])) + self.assertTrue(ret.iloc[1]["bfloat16_pos_inf"] > 0) + self.assertTrue(math.isinf(ret.iloc[1]["bfloat16_neg_inf"])) + self.assertTrue(ret.iloc[1]["bfloat16_neg_inf"] < 0) + self.assertTrue(math.isnan(ret.iloc[1]["bfloat16_nan"])) + + # Precise data type validation + expected_types = { + "float32_pos_inf": "float32", + "float32_neg_inf": "float32", + "float32_nan": "float32", + "float64_pos_inf": "float64", + "float64_neg_inf": "float64", + "float64_nan": "float64", + "bfloat16_pos_inf": "float32", # BFloat16 typically mapped to float32 in pandas + "bfloat16_neg_inf": "float32", + "bfloat16_nan": "float32" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + def test_decimal_types(self): + """Test Decimal32, Decimal64, Decimal128, Decimal256 types""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + toDecimal32('123.456', 3) as decimal32_val, + toDecimal32('-999999.999', 3) as decimal32_min, + 
toDecimal32('999999.999', 3) as decimal32_max, + toDecimal64('123456.789012', 6) as decimal64_val, + toDecimal64('-999999999999.999999', 6) as decimal64_min, + toDecimal64('999999999999.999999', 6) as decimal64_max, + toDecimal128('12345678901234567890123456789.123456789', 9) as decimal128_val, + toDecimal128('-12345678901234567890123456789.123456789', 9) as decimal128_min, + toDecimal128('12345678901234567890123456789.123456789', 9) as decimal128_max, + toDecimal256('1234567890123456789012345678901234567890123456789012345678.123456789012345678', 18) as decimal256_val, + toDecimal256('-1234567890123456789012345678901234567890123456789012345678.123456789012345678', 18) as decimal256_min, + toDecimal256('1234567890123456789012345678901234567890123456789012345678.123456789012345678', 18) as decimal256_max + UNION ALL + SELECT + 2 as row_id, + toDecimal32('0.001', 3) as decimal32_val, + toDecimal32('0.000', 3) as decimal32_min, + toDecimal32('1.000', 3) as decimal32_max, + toDecimal64('0.000001', 6) as decimal64_val, + toDecimal64('0.000000', 6) as decimal64_min, + toDecimal64('1.000000', 6) as decimal64_max, + toDecimal128('0.000000001', 9) as decimal128_val, + toDecimal128('0.000000000', 9) as decimal128_min, + toDecimal128('1.000000000', 9) as decimal128_max, + toDecimal256('0.000000000000000001', 18) as decimal256_val, + toDecimal256('0.000000000000000000', 18) as decimal256_min, + toDecimal256('1.000000000000000000', 18) as decimal256_max + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row - regular and extreme decimal values (converted to float64) + self.assertAlmostEqual(ret.iloc[0]["decimal32_val"], 123.456, places=3) + self.assertAlmostEqual(ret.iloc[0]["decimal32_min"], -999999.999, places=3) + self.assertAlmostEqual(ret.iloc[0]["decimal32_max"], 999999.999, places=3) + + self.assertAlmostEqual(ret.iloc[0]["decimal64_val"], 123456.789012, places=6) + self.assertAlmostEqual(ret.iloc[0]["decimal64_min"], -999999999999.999999, places=6) + self.assertAlmostEqual(ret.iloc[0]["decimal64_max"], 999999999999.999999, places=6) + + self.assertAlmostEqual(ret.iloc[0]["decimal128_val"], 12345678901234567890123456789.123456789, delta=1e20) + self.assertAlmostEqual(ret.iloc[0]["decimal128_min"], -12345678901234567890123456789.123456789, delta=1e20) + self.assertAlmostEqual(ret.iloc[0]["decimal128_max"], 12345678901234567890123456789.123456789, delta=1e20) + + self.assertAlmostEqual(ret.iloc[0]["decimal256_val"], 1234567890123456789012345678901234567890123456789012345678.123456789012345678, delta=1e50) + self.assertAlmostEqual(ret.iloc[0]["decimal256_min"], -1234567890123456789012345678901234567890123456789012345678.123456789012345678, delta=1e50) + self.assertAlmostEqual(ret.iloc[0]["decimal256_max"], 1234567890123456789012345678901234567890123456789012345678.123456789012345678, delta=1e50) + + # Test second row - small decimal values (converted to float64) + self.assertAlmostEqual(ret.iloc[1]["decimal32_val"], 0.001, places=3) + self.assertEqual(ret.iloc[1]["decimal32_min"], 0.000) + self.assertAlmostEqual(ret.iloc[1]["decimal32_max"], 1.000, places=3) + + self.assertAlmostEqual(ret.iloc[1]["decimal64_val"], 0.000001, places=6) + self.assertEqual(ret.iloc[1]["decimal64_min"], 0.000000) + self.assertAlmostEqual(ret.iloc[1]["decimal64_max"], 1.000000, places=6) + + self.assertAlmostEqual(ret.iloc[1]["decimal128_val"], 0.000000001, places=9) + 
self.assertEqual(ret.iloc[1]["decimal128_min"], 0.000000000) + self.assertAlmostEqual(ret.iloc[1]["decimal128_max"], 1.000000000, places=9) + + self.assertAlmostEqual(ret.iloc[1]["decimal256_val"], 0.000000000000000001, places=18) + self.assertEqual(ret.iloc[1]["decimal256_min"], 0.000000000000000000) + self.assertAlmostEqual(ret.iloc[1]["decimal256_max"], 1.000000000000000000, places=18) + + # Precise data type validation + expected_types = { + "decimal32_val": "float64", # Decimal types mapped to float64 in ClickHouse->pandas conversion + "decimal32_min": "float64", + "decimal32_max": "float64", + "decimal64_val": "float64", + "decimal64_min": "float64", + "decimal64_max": "float64", + "decimal128_val": "float64", + "decimal128_min": "float64", + "decimal128_max": "float64", + "decimal256_val": "float64", + "decimal256_min": "float64", + "decimal256_max": "float64" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + def test_string_types(self): + """Test String, FixedString, and LowCardinality string types""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + toString('Hello World') as string_val, + toFixedString('Fixed', 10) as fixed_string_val, + toLowCardinality('Category A') as low_cardinality_val, + toString('') as empty_string, + toString('Unicode: 🌍 éñáíóú') as unicode_string, + toString('Special chars: \\t\\n\\r\\"\\\'') as special_chars, + toString('Very long string with many characters to test maximum length handling and memory allocation behavior') as long_string, + toFixedString('ABC', 5) as fixed_string_short, + toLowCardinality('') as low_cardinality_empty + UNION ALL + SELECT + 2 as row_id, + toString('Another string') as string_val, + toFixedString('Test123', 10) as fixed_string_val, + toLowCardinality('Category B') as low_cardinality_val, + toString('Non-empty') as empty_string, + toString('More Unicode: 🚀 ñáéíóú àèìòù') as unicode_string, + toString('Line breaks:\\nTab:\\tQuote:\\"') as special_chars, + toString('Short') as long_string, + toFixedString('XYZZZ', 5) as fixed_string_short, + toLowCardinality('Option 2') as low_cardinality_empty + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row - various string types + self.assertEqual(ret.iloc[0]["string_val"], "Hello World") + self.assertEqual(ret.iloc[0]["fixed_string_val"], "Fixed\x00\x00\x00\x00\x00") # FixedString pads with null bytes + self.assertEqual(ret.iloc[0]["low_cardinality_val"], "Category A") + self.assertEqual(ret.iloc[0]["empty_string"], "") + self.assertEqual(ret.iloc[0]["unicode_string"], "Unicode: 🌍 éñáíóú") + self.assertEqual(ret.iloc[0]["special_chars"], "Special chars: \t\n\r\"'") # ClickHouse interprets escape sequences + self.assertEqual(ret.iloc[0]["long_string"], "Very long string with many characters to test maximum length handling and memory allocation behavior") + self.assertEqual(ret.iloc[0]["fixed_string_short"], "ABC\x00\x00") # Padded to 5 chars + self.assertEqual(ret.iloc[0]["low_cardinality_empty"], "") + + # Test second row - different string values + self.assertEqual(ret.iloc[1]["string_val"], "Another string") + self.assertEqual(ret.iloc[1]["fixed_string_val"], "Test123\x00\x00\x00") # Padded to 10 chars + self.assertEqual(ret.iloc[1]["low_cardinality_val"], "Category B") + self.assertEqual(ret.iloc[1]["empty_string"], 
"Non-empty") + self.assertEqual(ret.iloc[1]["unicode_string"], "More Unicode: 🚀 ñáéíóú àèìòù") + self.assertEqual(ret.iloc[1]["special_chars"], "Line breaks:\nTab:\tQuote:\"") # ClickHouse interprets escape sequences + self.assertEqual(ret.iloc[1]["long_string"], "Short") + self.assertEqual(ret.iloc[1]["fixed_string_short"], "XYZZZ") # Exactly 5 chars, no padding + self.assertEqual(ret.iloc[1]["low_cardinality_empty"], "Option 2") + + # Precise data type validation + expected_types = { + "string_val": "object", # String types mapped to object in pandas + "fixed_string_val": "object", + "low_cardinality_val": "object", + "empty_string": "object", + "unicode_string": "object", + "special_chars": "object", + "long_string": "object", + "fixed_string_short": "object", + "low_cardinality_empty": "object" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + def test_date_types(self): + """Test Date and Date32 types""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + toDate('2023-12-25') as date_val, + toDate('1970-01-01') as date_min, + toDate('2149-06-06') as date_max, + toDate32('2023-12-25') as date32_val, + toDate32('1900-01-01') as date32_min, + toDate32('2299-12-31') as date32_max, + toDate('2000-02-29') as date_leap_year, + toDate32('2000-02-29') as date32_leap_year, + toDate32('1950-06-15') as date32_negative_1, + toDate32('1960-12-31') as date32_negative_2, + toDate32('1969-12-31') as date32_before_epoch + UNION ALL + SELECT + 2 as row_id, + toDate('1970-01-01') as date_val, + toDate('2023-01-01') as date_min, + toDate('2023-12-31') as date_max, + toDate32('1970-01-01') as date32_val, + toDate32('2023-01-01') as date32_min, + toDate32('2023-12-31') as date32_max, + toDate('2024-02-29') as date_leap_year, + toDate32('2024-02-29') as date32_leap_year, + toDate32('1945-05-08') as date32_negative_1, + toDate32('1955-03-20') as date32_negative_2, + toDate32('1968-07-20') as date32_before_epoch + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row - specific dates (Date types include time component 00:00:00) + self.assertIn("2023-12-25", str(ret.iloc[0]["date_val"])) + self.assertIn("1970-01-01", str(ret.iloc[0]["date_min"])) + self.assertIn("2149-06-06", str(ret.iloc[0]["date_max"])) + self.assertIn("2023-12-25", str(ret.iloc[0]["date32_val"])) + self.assertIn("1900-01-01", str(ret.iloc[0]["date32_min"])) + self.assertIn("2299-12-31", str(ret.iloc[0]["date32_max"])) + self.assertIn("2000-02-29", str(ret.iloc[0]["date_leap_year"])) + self.assertIn("2000-02-29", str(ret.iloc[0]["date32_leap_year"])) + # Test Date32 negative values (before 1970 epoch) + self.assertIn("1950-06-15", str(ret.iloc[0]["date32_negative_1"])) + self.assertIn("1960-12-31", str(ret.iloc[0]["date32_negative_2"])) + self.assertIn("1969-12-31", str(ret.iloc[0]["date32_before_epoch"])) + + # Test second row - different dates + self.assertIn("1970-01-01", str(ret.iloc[1]["date_val"])) + self.assertIn("2023-01-01", str(ret.iloc[1]["date_min"])) + self.assertIn("2023-12-31", str(ret.iloc[1]["date_max"])) + self.assertIn("1970-01-01", str(ret.iloc[1]["date32_val"])) + self.assertIn("2023-01-01", str(ret.iloc[1]["date32_min"])) + self.assertIn("2023-12-31", str(ret.iloc[1]["date32_max"])) + self.assertIn("2024-02-29", str(ret.iloc[1]["date_leap_year"])) + 
self.assertIn("2024-02-29", str(ret.iloc[1]["date32_leap_year"])) + # Test Date32 negative values (before 1970 epoch) - second row + self.assertIn("1945-05-08", str(ret.iloc[1]["date32_negative_1"])) + self.assertIn("1955-03-20", str(ret.iloc[1]["date32_negative_2"])) + self.assertIn("1968-07-20", str(ret.iloc[1]["date32_before_epoch"])) + + # Precise data type validation + expected_types = { + "date_val": "datetime64[s]", # Date types mapped to datetime64[s] in pandas + "date_min": "datetime64[s]", + "date_max": "datetime64[s]", + "date32_val": "datetime64[s]", + "date32_min": "datetime64[s]", + "date32_max": "datetime64[s]", + "date_leap_year": "datetime64[s]", + "date32_leap_year": "datetime64[s]", + "date32_negative_1": "datetime64[s]", # Date32 negative values (before 1970) + "date32_negative_2": "datetime64[s]", + "date32_before_epoch": "datetime64[s]" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + def test_time_types(self): + """Test Time and Time64 types""" + # Enable Time and Time64 types + self.session.query("SET enable_time_time64_type = 1") + + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + CAST('14:30:45' AS Time) as time_val, + CAST('00:00:00' AS Time) as time_min, + CAST('23:59:59' AS Time) as time_max, + CAST('14:30:45.123456' AS Time64(6)) as time64_val, + CAST('00:00:00.000000' AS Time64(6)) as time64_min, + CAST('23:59:59.999999' AS Time64(6)) as time64_max, + CAST('12:00:00.123' AS Time64(3)) as time64_ms, + CAST('18:45:30.987654321' AS Time64(9)) as time64_ns + UNION ALL + SELECT + 2 as row_id, + CAST('09:15:30' AS Time) as time_val, + CAST('12:00:00' AS Time) as time_min, + CAST('18:45:15' AS Time) as time_max, + CAST('09:15:30.654321' AS Time64(6)) as time64_val, + CAST('12:30:45.500000' AS Time64(6)) as time64_min, + CAST('20:15:30.111111' AS Time64(6)) as time64_max, + CAST('08:30:15.500' AS Time64(3)) as time64_ms, + CAST('16:20:10.123456789' AS Time64(9)) as time64_ns + UNION ALL + SELECT + 3 as row_id, + CAST(-3600 AS Time) as time_val, -- -1 hour as negative seconds + CAST(-7200 AS Time) as time_min, -- -2 hours as negative seconds + CAST(-1800 AS Time) as time_max, -- -30 minutes as negative seconds + CAST(-3661.123456 AS Time64(6)) as time64_val, -- -1h 1m 1.123456s + CAST(-7322.500000 AS Time64(6)) as time64_min, -- -2h 2m 2.5s + CAST(-1801.999999 AS Time64(6)) as time64_max, -- -30m 1.999999s + CAST(-3723.500 AS Time64(3)) as time64_ms, -- -1h 2m 3.5s + CAST(-5434.123456789 AS Time64(9)) as time64_ns -- -1h 30m 34.123456789s + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row - time values + self.assertIn("14:30:45", str(ret.iloc[0]["time_val"])) + self.assertIn("00:00:00", str(ret.iloc[0]["time_min"])) + self.assertIn("23:59:59", str(ret.iloc[0]["time_max"])) + self.assertIn("14:30:45", str(ret.iloc[0]["time64_val"])) + self.assertIn("00:00:00", str(ret.iloc[0]["time64_min"])) + self.assertIn("23:59:59", str(ret.iloc[0]["time64_max"])) + self.assertIn("12:00:00", str(ret.iloc[0]["time64_ms"])) + self.assertIn("18:45:30", str(ret.iloc[0]["time64_ns"])) + + # Test second row - different time values + self.assertIn("09:15:30", str(ret.iloc[1]["time_val"])) + self.assertIn("12:00:00", str(ret.iloc[1]["time_min"])) + self.assertIn("18:45:15", str(ret.iloc[1]["time_max"])) + 
self.assertIn("09:15:30", str(ret.iloc[1]["time64_val"])) + self.assertIn("12:30:45", str(ret.iloc[1]["time64_min"])) + self.assertIn("20:15:30", str(ret.iloc[1]["time64_max"])) + self.assertIn("08:30:15", str(ret.iloc[1]["time64_ms"])) + self.assertIn("16:20:10", str(ret.iloc[1]["time64_ns"])) + + # Test third row - negative time values (should be returned as string numbers) + # Since Python time types don't support negative values, they are returned as numeric strings + self.assertEqual(ret.iloc[2]["time_val"], "-3600") # -1 hour + self.assertEqual(ret.iloc[2]["time_min"], "-7200") # -2 hours + self.assertEqual(ret.iloc[2]["time_max"], "-1800") # -30 minutes + self.assertEqual(ret.iloc[2]["time64_val"], "-3661.123456") # -1h 1m 1.123456s + self.assertEqual(ret.iloc[2]["time64_min"], "-7322.5") # -2h 2m 2.5s + self.assertEqual(ret.iloc[2]["time64_max"], "-1801.999999") # -30m 1.999999s + self.assertEqual(ret.iloc[2]["time64_ms"], "-3723.5") # -1h 2m 3.5s + self.assertEqual(ret.iloc[2]["time64_ns"], "-5434.123456789") # -1h 30m 34.123456789s + + # Verify negative values are returned as strings (object dtype) + for col in ["time_val", "time_min", "time_max", "time64_val", "time64_min", "time64_max", "time64_ms", "time64_ns"]: + self.assertIsInstance(ret.iloc[2][col], str, f"{col} should be string for negative values") + + # Precise data type validation + expected_types = { + "time_val": "object", # Time types mapped to object in pandas + "time_min": "object", + "time_max": "object", + "time64_val": "object", + "time64_min": "object", + "time64_max": "object", + "time64_ms": "object", + "time64_ns": "object" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + @unittest.skip("") + def test_datetime_types(self): + """Test DateTime and DateTime64 types""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + toDateTime('2023-12-25 14:30:45') as datetime_val, + toDateTime('1970-01-01 00:00:00') as datetime_min, + toDateTime('2106-02-07 06:28:15') as datetime_max, + toDateTime64('2023-12-25 14:30:45.123456', 6) as datetime64_val, + toDateTime64('1900-01-01 00:00:00.000000', 6) as datetime64_min, + toDateTime64('2299-12-31 23:59:59.999999', 6) as datetime64_max, + toDateTime64('2023-12-25 14:30:45.123456789', 9) as datetime64_ns, + toDateTime('2023-06-15 12:00:00', 'UTC') as datetime_utc, + toDateTime('2023-06-15 15:30:00', 'Europe/London') as datetime_london, + toDateTime64('2023-06-15 12:00:00.123', 3, 'Asia/Shanghai') as datetime64_tz_sh, + toDateTime64('2023-06-15 12:00:00.456', 3, 'America/New_York') as datetime64_tz_ny + UNION ALL + SELECT + 2 as row_id, + toDateTime('2000-02-29 09:15:30') as datetime_val, + toDateTime('2023-01-01 12:30:45') as datetime_min, + toDateTime('2023-12-31 18:45:15') as datetime_max, + toDateTime64('2000-02-29 09:15:30.654321', 6) as datetime64_val, + toDateTime64('2023-01-01 08:00:00.111111', 6) as datetime64_min, + toDateTime64('2023-12-31 20:30:45.888888', 6) as datetime64_max, + toDateTime64('2000-02-29 09:15:30.987654321', 9) as datetime64_ns, + toDateTime('2024-01-15 08:30:00', 'UTC') as datetime_utc, + toDateTime('2024-01-15 20:00:00', 'Europe/London') as datetime_london, + toDateTime64('2024-01-15 16:45:30.789', 3, 'Asia/Shanghai') as datetime64_tz_sh, + toDateTime64('2024-01-15 09:15:45.987', 3, 'America/New_York') as datetime64_tz_ny + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} 
+
+        for col in ret.columns:
+            print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})")
+
+        # Test first row - exact datetime values
+        # DateTime (second precision) columns carry the ClickHouse server timezone;
+        # this test assumes the server runs in UTC, so adjust actual_tz if it does not.
+        actual_tz = 'UTC'
+
+        self.assertEqual(ret.iloc[0]["datetime_val"], pd.Timestamp('2023-12-25 14:30:45', tz=actual_tz))
+        self.assertEqual(ret.iloc[0]["datetime_min"], pd.Timestamp('1970-01-01 00:00:00', tz=actual_tz))
+        self.assertEqual(ret.iloc[0]["datetime_max"], pd.Timestamp('2106-02-07 06:28:15', tz=actual_tz))
+
+        # DateTime64 (microsecond precision) - should use same timezone as ClickHouse server
+        self.assertEqual(ret.iloc[0]["datetime64_val"], pd.Timestamp('2023-12-25 14:30:45.123456', tz=actual_tz))
+        self.assertEqual(ret.iloc[0]["datetime64_min"], pd.Timestamp('1900-01-01 00:00:00.000000', tz=actual_tz))
+        self.assertEqual(ret.iloc[0]["datetime64_max"], pd.Timestamp('2299-12-31 23:59:59.999999', tz=actual_tz))
+
+        # DateTime64 (nanosecond precision) - should use same timezone as ClickHouse server
+        self.assertEqual(ret.iloc[0]["datetime64_ns"], pd.Timestamp('2023-12-25 14:30:45.123456789', tz=actual_tz))
+
+        # UTC timezone datetime
+        expected_utc = pd.Timestamp('2023-06-15 12:00:00', tz='UTC')
+        actual_utc = ret.iloc[0]["datetime_utc"]
+        self.assertEqual(actual_utc, expected_utc)
+
+        # Europe/London timezone datetime
+        expected_london = pd.Timestamp('2023-06-15 15:30:00', tz='Europe/London')
+        actual_london = ret.iloc[0]["datetime_london"]
+        self.assertEqual(actual_london, expected_london)
+
+        # Timezone-aware datetime64 - Asia/Shanghai
+        expected_sh = pd.Timestamp('2023-06-15 12:00:00.123', tz='Asia/Shanghai')
+        actual_sh = ret.iloc[0]["datetime64_tz_sh"]
+        self.assertEqual(actual_sh, expected_sh)
+
+        # Timezone-aware datetime64 - America/New_York
+        expected_ny = pd.Timestamp('2023-06-15 12:00:00.456', tz='America/New_York')
+        actual_ny = ret.iloc[0]["datetime64_tz_ny"]
+        self.assertEqual(actual_ny, expected_ny)
+
+        # Test second row - exact datetime values with ClickHouse server timezone
+        self.assertEqual(ret.iloc[1]["datetime_val"], pd.Timestamp('2000-02-29 09:15:30', tz=actual_tz))
+        self.assertEqual(ret.iloc[1]["datetime_min"], pd.Timestamp('2023-01-01 12:30:45', tz=actual_tz))
+        self.assertEqual(ret.iloc[1]["datetime_max"], pd.Timestamp('2023-12-31 18:45:15', tz=actual_tz))
+        self.assertEqual(ret.iloc[1]["datetime64_val"], pd.Timestamp('2000-02-29 09:15:30.654321', tz=actual_tz))
+        self.assertEqual(ret.iloc[1]["datetime64_min"], pd.Timestamp('2023-01-01 08:00:00.111111', tz=actual_tz))
+        self.assertEqual(ret.iloc[1]["datetime64_max"], pd.Timestamp('2023-12-31 20:30:45.888888', tz=actual_tz))
+        self.assertEqual(ret.iloc[1]["datetime64_ns"], pd.Timestamp('2000-02-29 09:15:30.987654321', tz=actual_tz))
+
+        # Second row timezone datetime tests
+        expected_utc_2 = pd.Timestamp('2024-01-15 08:30:00', tz='UTC')
+        actual_utc_2 = ret.iloc[1]["datetime_utc"]
+        self.assertEqual(actual_utc_2, expected_utc_2)
+
+        expected_london_2 = pd.Timestamp('2024-01-15 20:00:00', tz='Europe/London')
+        actual_london_2 = ret.iloc[1]["datetime_london"]
+        self.assertEqual(actual_london_2, expected_london_2)
+
+        # Second row timezone tests (already converted by C++ code)
+        expected_sh_2 = pd.Timestamp('2024-01-15 16:45:30.789', tz='Asia/Shanghai')
+        actual_sh_2 = ret.iloc[1]["datetime64_tz_sh"]
+        self.assertEqual(actual_sh_2, expected_sh_2)
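+
+        # Timezone-aware pandas Timestamps compare by absolute instant, so these
+        # assertions hold regardless of the local timezone of the test runner.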
+
+        expected_ny_2 = pd.Timestamp('2024-01-15 09:15:45.987', tz='America/New_York')
+        actual_ny_2 = ret.iloc[1]["datetime64_tz_ny"]
+        self.assertEqual(actual_ny_2, expected_ny_2)
+
+        # Precise data type validation
+        expected_types = {
+            "row_id": "int64",
+            "datetime_val": "datetime64[s]",  # DateTime types mapped to datetime64[s] (second precision)
+            "datetime_min": "datetime64[s]",
+            "datetime_max": "datetime64[s]",
+            "datetime64_val": "datetime64[ns]",  # DateTime64 types mapped to datetime64[ns] (nanosecond precision)
+            "datetime64_min": "datetime64[ns]",
+            "datetime64_max": "datetime64[ns]",
+            "datetime64_ns": "datetime64[ns]",  # DateTime64 with 9-digit precision (nanoseconds)
+            "datetime_utc": "datetime64[s]",  # DateTime with timezone -> datetime64[s]
+            "datetime64_tz_sh": "datetime64[ns]",  # DateTime64 with Asia/Shanghai timezone
+            "datetime64_tz_ny": "datetime64[ns]"  # DateTime64 with America/New_York timezone
+        }
+
+        for col, expected_type in expected_types.items():