diff --git a/src/core/column.cc b/src/core/column.cc index dfb2427f3a..502f6146d4 100644 --- a/src/core/column.cc +++ b/src/core/column.cc @@ -172,6 +172,15 @@ dt::SType Column::stype() const noexcept { return impl_->type_.stype(); } +dt::SType Column::data_stype() const noexcept { + if (impl_->type_.is_categorical()) { + if (n_children()) return child(0).stype(); + else return dt::SType::VOID; + } else { + return stype(); + } +} + dt::LType Column::ltype() const noexcept { return dt::stype_to_ltype(impl_->stype()); } @@ -273,8 +282,7 @@ static inline py::oobj getelem(const Column& col, size_t i) { } py::oobj Column::get_element_as_pyobject(size_t i) const { - dt::SType st = type().is_categorical()? child(0).stype() - : stype(); + dt::SType st = data_stype(); switch (st) { case dt::SType::VOID: return py::None(); @@ -328,8 +336,7 @@ py::oobj Column::get_element_as_pyobject(size_t i) const { } bool Column::get_element_isvalid(size_t i) const { - dt::SType st = type().is_categorical()? child(0).stype() - : stype(); + dt::SType st = data_stype(); switch (st) { case dt::SType::VOID: return false; diff --git a/src/core/column.h b/src/core/column.h index 8f9f1c902d..f4048240c5 100644 --- a/src/core/column.h +++ b/src/core/column.h @@ -119,8 +119,14 @@ class Column size_t nrows() const noexcept; size_t na_count() const; const dt::Type& type() const noexcept; - dt::SType stype() const noexcept; dt::LType ltype() const noexcept; + dt::SType stype() const noexcept; + + // For categorical columns this method returns the stype of the data + // the column is backed by (`SType::VOID` if it has no child columns). + // For all other column types this method is equivalent to `stype()`. 
+ dt::SType data_stype() const noexcept; + size_t elemsize() const noexcept; bool is_fixedwidth() const noexcept; bool is_virtual() const noexcept; diff --git a/src/core/column/latent.h b/src/core/column/latent.h index 4078137c19..b23461eb46 100644 --- a/src/core/column/latent.h +++ b/src/core/column/latent.h @@ -76,8 +76,7 @@ class Latent_ColumnImpl : public Virtual_ColumnImpl { } static void vivify(const Column& col) { - dt::SType st = col.type().is_categorical()? col.child(0).stype() - : col.stype(); + dt::SType st = col.data_stype(); switch (st) { case SType::VOID: case SType::BOOL: diff --git a/src/core/frame/repr/text_column.cc b/src/core/frame/repr/text_column.cc index 9e2b2f832d..e9a393fb4a 100644 --- a/src/core/frame/repr/text_column.cc +++ b/src/core/frame/repr/text_column.cc @@ -369,8 +369,7 @@ tstring Data_TextColumn::_render_value_string(const Column& col, size_t i) const tstring Data_TextColumn::_render_value(const Column& col, size_t i) const { - SType st = col.type().is_categorical()? 
col.child(0).stype() - : col.stype(); + SType st = col.data_stype(); switch (st) { case SType::VOID: return na_value_; diff --git a/src/core/types/type_categorical.cc b/src/core/types/type_categorical.cc index 981e5c4c69..be6afbb9ac 100644 --- a/src/core/types/type_categorical.cc +++ b/src/core/types/type_categorical.cc @@ -163,10 +163,10 @@ void Type_Cat::cast_obj_column_(Column& col) const { Groupby gb = std::move(res.second); auto offsets = gb.offsets_r(); - Buffer buf = Buffer::mem(col.nrows() * sizeof(T)); - Buffer buf_cat = Buffer::mem(gb.size() * sizeof(int32_t)); - auto buf_ptr = static_cast(buf.xptr()); - auto buf_cat_ptr = static_cast(buf_cat.xptr()); + Buffer buf_codes = Buffer::mem(col.nrows() * sizeof(T)); + Buffer buf_cats = Buffer::mem(gb.size() * sizeof(int32_t)); + auto buf_codes_ptr = static_cast(buf_codes.xptr()); + auto buf_cats_ptr = static_cast(buf_cats.xptr()); const size_t MAX_CATS = std::numeric_limits::max() + size_t(1); @@ -177,22 +177,22 @@ void Type_Cat::cast_obj_column_(Column& col) const { } // Fill out two buffers: - // - `buf_cat` with row indices of unique elements (one element per category) - // - `buf` with the codes of categories (group ids). + // - `buf_cats` with row indices of unique elements (one element per category) + // - `buf_codes` with the codes of categories (group ids). 
dt::parallel_for_dynamic(gb.size(), [&](size_t i) { size_t jj; ri.get_element(static_cast(offsets[i]), &jj); - buf_cat_ptr[i] = static_cast(jj); + buf_cats_ptr[i] = static_cast(jj); for (int32_t j = offsets[i]; j < offsets[i + 1]; ++j) { ri.get_element(static_cast(j), &jj); - buf_ptr[static_cast(jj)] = static_cast(i); + buf_codes_ptr[static_cast(jj)] = static_cast(i); } }); // Modify `col` in-place by only leaving one element per a category - const RowIndex ri_cat(std::move(buf_cat), RowIndex::ARR32); + const RowIndex ri_cat(std::move(buf_cats), RowIndex::ARR32); col.apply_rowindex(ri_cat); col.materialize(); @@ -205,7 +205,7 @@ void Type_Cat::cast_obj_column_(Column& col) const { col = Column(new Categorical_ColumnImpl( nrows, std::move(val), - std::move(buf), + std::move(buf_codes), std::move(col) )); } diff --git a/tests/types/test-categorical.py b/tests/types/test-categorical.py index 03dbca0455..50986181a2 100644 --- a/tests/types/test-categorical.py +++ b/tests/types/test-categorical.py @@ -256,8 +256,36 @@ def test_create_multicolumn(t): assert_equals(DT2, DT2[:, :]) + +#------------------------------------------------------------------------------- +# Casting to other types +#------------------------------------------------------------------------------- + +@pytest.mark.parametrize('t', [dt.Type.cat8, + dt.Type.cat16, + dt.Type.cat32]) +def test_void_to_cat(t): + src = [None] * 11 + DT = dt.Frame(src) + DT[0] = t(dt.Type.str32) + DT_ref = dt.Frame(src, type=t(dt.Type.str32)) + assert_equals(DT, DT_ref) + + +@pytest.mark.parametrize('t', [dt.Type.cat8, + dt.Type.cat16, + dt.Type.cat32]) +def test_obj_to_cat(t): + src = [None, "cat", "cat", "dog", "mouse", None, "panda", "dog"] + DT = dt.Frame(A=src, type=object) + DT['A'] = t(dt.Type.str32) + DT_ref = dt.Frame(A=src, type=t(dt.Type.str32)) + assert_equals(DT, DT_ref) + + + #------------------------------------------------------------------------------- -# Conversion +# Conversion to other formats 
#------------------------------------------------------------------------------- @pytest.mark.parametrize('t', [dt.Type.cat8,