Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
5496746
chore: add ChunkCollectorOutputFormat.cpp
wudidapaopao Sep 28, 2025
bd998e2
chore: adjust Python-C interface to support direct DataFrame return
wudidapaopao Sep 29, 2025
4b0276b
chore: update NumpyType.cpp
wudidapaopao Sep 29, 2025
ef1d123
Merge remote-tracking branch 'origin/main' into support_export_dataframe
wudidapaopao Oct 28, 2025
fbaf1c7
chore: add NumpyArray
wudidapaopao Oct 29, 2025
8391e2f
chore: update NumpyArray
wudidapaopao Oct 29, 2025
8af2274
chore: update NumpyArray
wudidapaopao Oct 29, 2025
0255b90
chore: update NumpyArray
wudidapaopao Oct 30, 2025
1b6dade
chore: support timezone
wudidapaopao Oct 31, 2025
2a09093
chore: add more CH types
wudidapaopao Nov 1, 2025
af9cc16
chore: support time and time64 types
wudidapaopao Nov 2, 2025
373bd5e
chore: support more types
wudidapaopao Nov 2, 2025
dd55a08
chore: support nested types
wudidapaopao Nov 3, 2025
afd902a
chore: support converting field to python object
wudidapaopao Nov 3, 2025
670d87c
chore: support more types
wudidapaopao Nov 4, 2025
b5b4de6
chore: support map type
wudidapaopao Nov 4, 2025
f9f1970
chore: support more nested type
wudidapaopao Nov 4, 2025
dab8450
chore: support more nested type
wudidapaopao Nov 4, 2025
f585a22
Merge remote-tracking branch 'origin/main' into support_export_dataframe
wudidapaopao Nov 4, 2025
9894ace
chore: support object type
wudidapaopao Nov 4, 2025
95ad2d4
fix: fix build issues
wudidapaopao Nov 4, 2025
7ab8fcb
test: update workflow
wudidapaopao Nov 5, 2025
f0406e6
fix: fix test issues
wudidapaopao Nov 5, 2025
c6d370d
test: add more test cases
wudidapaopao Nov 5, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions .github/workflows/build_linux_arm64_wheels-gh.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@ on:
required: true
release:
types: [created]
push:
branches:
- main
paths-ignore:
- '**/*.md'
pull_request:
branches:
- main
paths-ignore:
- '**/*.md'
# push:
# branches:
# - main
# paths-ignore:
# - '**/*.md'
# pull_request:
# branches:
# - main
# paths-ignore:
# - '**/*.md'


jobs:
Expand Down Expand Up @@ -137,7 +137,7 @@ jobs:
which clang++-19
clang++-19 --version
sudo apt-get install -y make cmake ccache ninja-build yasm gawk wget
# Install WebAssembly linker (wasm-ld)
# Install WebAssembly linker (wasm-ld)
sudo apt-get install -y lld-19
# Create symlink for wasm-ld
if ! command -v wasm-ld &> /dev/null; then
Expand Down
20 changes: 10 additions & 10 deletions .github/workflows/build_macos_arm64_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@ on:
required: true
release:
types: [created]
push:
branches:
- main
paths-ignore:
- '**/*.md'
pull_request:
branches:
- main
paths-ignore:
- '**/*.md'
# push:
# branches:
# - main
# paths-ignore:
# - '**/*.md'
# pull_request:
# branches:
# - main
# paths-ignore:
# - '**/*.md'

jobs:
build_universal_wheel:
Expand Down
20 changes: 10 additions & 10 deletions .github/workflows/build_macos_x86_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@ on:
required: true
release:
types: [created]
push:
branches:
- main
paths-ignore:
- '**/*.md'
pull_request:
branches:
- main
paths-ignore:
- '**/*.md'
# push:
# branches:
# - main
# paths-ignore:
# - '**/*.md'
# pull_request:
# branches:
# - main
# paths-ignore:
# - '**/*.md'

jobs:
build_universal_wheel:
Expand Down
20 changes: 10 additions & 10 deletions .github/workflows/build_musllinux_arm64_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@ on:
required: true
release:
types: [created]
push:
branches:
- main
paths-ignore:
- '**/*.md'
pull_request:
branches:
- main
paths-ignore:
- '**/*.md'
# push:
# branches:
# - main
# paths-ignore:
# - '**/*.md'
# pull_request:
# branches:
# - main
# paths-ignore:
# - '**/*.md'

jobs:
build_musllinux_wheels:
Expand Down
20 changes: 10 additions & 10 deletions .github/workflows/build_musllinux_x86_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@ on:
required: true
release:
types: [created]
push:
branches:
- main
paths-ignore:
- '**/*.md'
pull_request:
branches:
- main
paths-ignore:
- '**/*.md'
# push:
# branches:
# - main
# paths-ignore:
# - '**/*.md'
# pull_request:
# branches:
# - main
# paths-ignore:
# - '**/*.md'


jobs:
Expand Down
35 changes: 6 additions & 29 deletions chdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,8 @@ class ChdbError(Exception):
"""


_arrow_format = set({"dataframe", "arrowtable"})
_arrow_format = set({"arrowtable"})
_process_result_format_funs = {
"dataframe": lambda x: to_df(x),
"arrowtable": lambda x: to_arrowTable(x),
}

Expand Down Expand Up @@ -108,33 +107,6 @@ def to_arrowTable(res):
return pa.RecordBatchFileReader(memview.view()).read_all()


# return pandas dataframe
def to_df(r):
"""Convert query result to pandas DataFrame.

Converts a chDB query result to a pandas DataFrame by first converting to
PyArrow Table and then to pandas using multi-threading for better performance.

Args:
r: chDB query result object containing binary Arrow data

Returns:
pd.DataFrame: pandas DataFrame containing the query results

Raises:
ImportError: If pyarrow or pandas are not installed

Example:
>>> result = chdb.query("SELECT 1 as id, 'hello' as msg", "Arrow")
>>> df = chdb.to_df(result)
>>> print(df)
id msg
0 1 hello
"""
t = to_arrowTable(r)
return t.to_pandas(use_threads=True)


# global connection lock, for multi-threading use of legacy chdb.query()
g_conn_lock = threading.Lock()

Expand Down Expand Up @@ -222,6 +194,11 @@ def query(sql, output_format="CSV", path="", udf_path=""):
with g_conn_lock:
conn = _chdb.connect(conn_str)
res = conn.query(sql, output_format)

if lower_output_format == "dataframe":
conn.close()
return res

if res.has_error():
conn.close()
raise ChdbError(res.error_message())
Expand Down
44 changes: 1 addition & 43 deletions chdb/state/sqlitelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@
raise ImportError("Failed to import pyarrow") from None


_arrow_format = set({"dataframe", "arrowtable"})
_arrow_format = set({"arrowtable"})
_process_result_format_funs = {
"dataframe": lambda x: to_df(x),
"arrowtable": lambda x: to_arrowTable(x),
}

Expand Down Expand Up @@ -67,47 +66,6 @@ def to_arrowTable(res):
return pa.RecordBatchFileReader(memview.view()).read_all()


# return pandas dataframe
def to_df(r):
"""Convert query result to Pandas DataFrame.

This function converts chdb query results to a Pandas DataFrame format
by first converting to PyArrow Table and then to DataFrame. This provides
convenient data analysis capabilities with Pandas API.

Args:
r: Query result object from chdb containing Arrow format data

Returns:
pandas.DataFrame: DataFrame containing the query results with
appropriate column names and data types

Raises:
ImportError: If pyarrow or pandas packages are not installed

.. note::
This function uses multi-threading for the Arrow to Pandas conversion
to improve performance on large datasets.

.. seealso::
:func:`to_arrowTable` - For PyArrow Table format conversion

Examples:
>>> import chdb
>>> result = chdb.query("SELECT 1 as num, 'hello' as text", "Arrow")
>>> df = to_df(result)
>>> print(df)
num text
0 1 hello
>>> print(df.dtypes)
num int64
text object
dtype: object
"""
t = to_arrowTable(r)
return t.to_pandas(use_threads=True)


class StreamingResult:
def __init__(self, c_result, conn, result_func, supports_record_batch):
self._result = c_result
Expand Down
6 changes: 6 additions & 0 deletions programs/local/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,19 @@ endif()
if (USE_PYTHON)
set (CHDB_SOURCES
chdb.cpp
ChunkCollectorOutputFormat.cpp
FieldToPython.cpp
FormatHelper.cpp
ListScan.cpp
LocalChdb.cpp
LocalServer.cpp
NumpyArray.cpp
NumpyNestedTypes.cpp
NumpyType.cpp
ObjectToPython.cpp
PandasAnalyzer.cpp
PandasDataFrame.cpp
PandasDataFrameBuilder.cpp
PandasScan.cpp
PyArrowStreamFactory.cpp
PyArrowTable.cpp
Expand Down
91 changes: 91 additions & 0 deletions programs/local/ChunkCollectorOutputFormat.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#include "ChunkCollectorOutputFormat.h"
#include "PandasDataFrameBuilder.h"

#include <Client/ClientBase.h>
#include <IO/NullWriteBuffer.h>
#include <Processors/Port.h>
#include <base/defines.h>

#include <cassert>
#include <memory>

using namespace DB;

namespace CHDB
{

/// Shared sink for all instances: IOutputFormat requires a WriteBuffer even
/// though this format never serializes bytes, so writes go to a NullWriteBuffer
/// that discards them.
NullWriteBuffer ChunkCollectorOutputFormat::out;

/// Constructs a format that collects chunks in memory instead of serializing
/// them, forwarding everything to `builder` when the query finalizes.
/// The static `out` NullWriteBuffer satisfies IOutputFormat's WriteBuffer
/// requirement while discarding any incidental output.
ChunkCollectorOutputFormat::ChunkCollectorOutputFormat(
    SharedHeader shared_header,
    PandasDataFrameBuilder & builder)
    : IOutputFormat(shared_header, out)
    , dataframe_builder(builder)
{}

/// Buffers one data chunk; nothing is converted until finalizeImpl() runs.
void ChunkCollectorOutputFormat::consume(Chunk chunk)
{
    chunks.push_back(std::move(chunk));
}

/// Totals are appended to the same buffer as regular data chunks.
/// NOTE(review): this means totals become ordinary rows in the resulting
/// DataFrame — confirm that is the intended presentation.
void ChunkCollectorOutputFormat::consumeTotals(Chunk totals)
{
    chunks.push_back(std::move(totals));
}

/// Extremes are appended to the same buffer as regular data chunks.
/// NOTE(review): like totals, extremes end up as ordinary DataFrame rows —
/// confirm intended.
void ChunkCollectorOutputFormat::consumeExtremes(Chunk extremes)
{
    chunks.push_back(std::move(extremes));
}

/// Hands every buffered chunk to the DataFrame builder, then asks the
/// builder to assemble the final result.
void ChunkCollectorOutputFormat::finalizeImpl()
{
    for (const auto & collected : chunks)
        dataframe_builder.addChunk(collected);

    dataframe_builder.finalize();

    /// The buffered data has been handed off; release it.
    chunks.clear();
}

/// Process-wide builder holding the result of the most recent
/// DataFrame-producing query.
/// NOTE(review): a single global means two concurrent DataFrame queries would
/// overwrite each other's builder — confirm callers serialize such queries.
static std::shared_ptr<PandasDataFrameBuilder> g_dataframe_builder = nullptr;

/// Returns the builder previously installed via setGlobalDataFrameBuilder().
///
/// Must not be called before a builder is set or after
/// resetGlobalDataFrameBuilder(): dereferencing the empty shared_ptr is
/// undefined behavior, so fail fast in debug builds instead.
PandasDataFrameBuilder & getGlobalDataFrameBuilder()
{
    assert(g_dataframe_builder && "getGlobalDataFrameBuilder() called with no builder set");
    return *g_dataframe_builder;
}

/// Installs `builder` as the process-wide DataFrame builder, replacing any
/// previously installed one.
void setGlobalDataFrameBuilder(std::shared_ptr<PandasDataFrameBuilder> builder)
{
    g_dataframe_builder = std::move(builder);
}

/// Drops the global builder, if any. The GIL is acquired first because
/// destroying the builder presumably releases Python objects it holds —
/// confirm against PandasDataFrameBuilder's destructor.
void resetGlobalDataFrameBuilder()
{
    if (!g_dataframe_builder)
        return;

    py::gil_scoped_acquire acquire;
    g_dataframe_builder.reset();
}

/// Factory invoked by ClientBase (through the function pointer registered in
/// registerDataFrameOutputFormat) to create the output format that captures a
/// query result for conversion to a pandas DataFrame.
///
/// NOTE(review): the builder is published through mutable global state, so
/// concurrent DataFrame queries would clobber each other — confirm the caller
/// serializes these queries.
std::shared_ptr<IOutputFormat> createDataFrameOutputFormat(SharedHeader header)
{
    auto dataframe_builder = std::make_shared<PandasDataFrameBuilder>(*header);

    /// Publish the builder so the caller can retrieve the finished DataFrame
    /// via getGlobalDataFrameBuilder() once the query completes.
    setGlobalDataFrameBuilder(dataframe_builder);

    /// Bind the format to the builder created above rather than re-reading it
    /// through the global: this removes the window in which another set/reset
    /// of the global would hand this format a different (or destroyed) builder.
    return std::make_shared<ChunkCollectorOutputFormat>(header, *dataframe_builder);
}

/// Registration hook to be called during initialization: gives ClientBase the
/// factory for the DataFrame output format via a function pointer, so
/// ClientBase does not depend on this module directly.
void registerDataFrameOutputFormat()
{
    ClientBase::setDataFrameFormatCreator(&createDataFrameOutputFormat);
}

}
Loading