From 8e9d419dcb283a9bc896d32089919834d9dc8e13 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 10 Oct 2023 09:49:18 -0400 Subject: [PATCH] Allow categorical column with no categories (#888) --- fastparquet/core.py | 7 ++++++- fastparquet/dataframe.py | 2 +- fastparquet/test/test_dataframe.py | 9 ++++++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/fastparquet/core.py b/fastparquet/core.py index a2156c6f..10e70c00 100644 --- a/fastparquet/core.py +++ b/fastparquet/core.py @@ -446,7 +446,12 @@ def read_col(column, schema_helper, infile, use_cat=False, column. """ cmd = column.meta_data - se = schema_helper.schema_element(cmd.path_in_schema) + try: + se = schema_helper.schema_element(cmd.path_in_schema) + except KeyError: + # column not present in this row group + assign[:] = None + return off = min((cmd.dictionary_page_offset or cmd.data_page_offset, cmd.data_page_offset)) diff --git a/fastparquet/dataframe.py b/fastparquet/dataframe.py index b9341a15..690ea0ca 100644 --- a/fastparquet/dataframe.py +++ b/fastparquet/dataframe.py @@ -191,7 +191,7 @@ def set_cats(values, i=i, col=col, **kwargs): shape[-1] = size if isinstance(bvalues, Categorical): - code = np.zeros(shape=shape, dtype=bvalues.codes.dtype) + code = np.full(fill_value=-1, shape=shape, dtype=bvalues.codes.dtype) values = Categorical.from_codes(codes=code, dtype=bvalues.dtype) diff --git a/fastparquet/test/test_dataframe.py b/fastparquet/test/test_dataframe.py index 5f586182..24da294c 100644 --- a/fastparquet/test/test_dataframe.py +++ b/fastparquet/test/test_dataframe.py @@ -1,6 +1,7 @@ import warnings from unittest import mock +import numpy as np import pandas as pd import pytest from numpy import empty as np_empty @@ -26,7 +27,7 @@ def test_empty(): df, views = empty('category', size=n, cols=['c'], cats={'c': ['one', 'two']}) views['c'][0] = 1 - assert df.c[:2].tolist() == ['two', 'one'] + assert df.c[:2].tolist() == ['two', np.nan] df, views = empty('i4,i8,f8,f8,O', size=n, cols=['i4', 'i8', 'f8_1', 'f8_2', 'O']) @@ -34,6 +35,12 @@ def test_empty(): assert len(views) == 5 +def test_no_cats(): + df, views = empty('category', size=10, cols=['c'], + cats={'c': []}) + assert (views["c"] == -1).all() + + def test_empty_tz_utc(): with warnings.catch_warnings(): warnings.simplefilter("error")