Skip to content

Commit

Permalink
Fix crash when creating frames from numpy arrays of unicode strings (#3422)
Browse files Browse the repository at this point in the history

Closes #3420
  • Loading branch information
oleksiyskononenko committed Feb 10, 2023
1 parent f7eb03a commit 0350710
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 7 deletions.
2 changes: 2 additions & 0 deletions docs/releases/v1.1.0.rst
Expand Up @@ -60,6 +60,8 @@

-[fix] Fixed groupby behavior on columns that contain missing values. [#3331]

-[fix] Fixed a crash when creating frames from numpy arrays that contain unicode strings. [#3420]

-[api] Converting a column of :attr:`void <dt.Type.void>` type into pandas
now produces a pandas ``object`` column filled with ``None`` values. Converting
such column back into datatable produces a ``void`` column again. [#3063]
Expand Down
9 changes: 5 additions & 4 deletions src/core/py_buffers.cc
@@ -1,5 +1,5 @@
//------------------------------------------------------------------------------
// Copyright 2018-2021 H2O.ai
// Copyright 2018-2023 H2O.ai
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
Expand Down Expand Up @@ -115,12 +115,13 @@ Column Column::from_pybuffer(const py::robj& pyobj) {

static Column convert_fwchararray_to_column(py::buffer&& view)
{
// Number of characters in each element (each Unicode character is 4 bytes
// in numpy).
// When calculating `k` (the number of characters in an element) and
// `maxsize` (the maximum size of the string buffer), we take into account
// that in numpy the size of each unicode character is 4 bytes.
size_t k = view.itemsize() / 4;
size_t nrows = view.nelements();
size_t stride = view.stride() * k;
size_t maxsize = nrows * k;
size_t maxsize = nrows * k * 4;
auto input = static_cast<uint32_t*>(view.data());

Buffer strbuf = Buffer::mem(maxsize);
Expand Down
33 changes: 30 additions & 3 deletions tests/frame/test-create.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#-------------------------------------------------------------------------------
# Copyright 2018-2021 H2O.ai
# Copyright 2018-2023 H2O.ai
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
Expand Down Expand Up @@ -1009,8 +1009,35 @@ def test_create_from_3d_numpy_array(numpy):
assert "Cannot create Frame from a 3-D numpy array" in str(e.value)


def test_create_from_string_numpy_array(numpy):
a = numpy.array(["alef", "bet", "gimel", "dalet", "he", "юйґї"])
def test_create_from_numpy_array_ascii_strings(numpy):
    # A numpy array holding only ASCII strings must turn into a
    # single-column Frame whose values round-trip unchanged.
    words = numpy.array(["alef", "bet", "gimel", "dalet", "he", "12345"])
    frame = dt.Frame(words)
    frame_integrity_check(frame)
    expected_values = [words.tolist()]
    assert frame.shape == (6, 1)
    assert frame.names == ("C0", )
    assert frame.to_list() == expected_values


def test_create_from_numpy_array_unicode_strings(numpy):
    # Every element here contains non-ASCII characters only; the Frame
    # built from the array must reproduce them exactly.
    words = numpy.array(["數據表", "даних", "データ表", "таблиця", "ґїґїґї"])
    frame = dt.Frame(words)
    frame_integrity_check(frame)
    expected_values = [words.tolist()]
    assert frame.shape == (5, 1)
    assert frame.names == ("C0", )
    assert frame.to_list() == expected_values


def test_create_from_numpy_array_unicode_strings_issue3420(numpy):
    # Scenario named after issue 3420: one hundred copies of a single
    # non-ASCII character in a numpy string array.
    nrows = 100
    letters = numpy.array(['ы'] * nrows)
    frame = dt.Frame(letters)
    frame_integrity_check(frame)
    expected_values = [letters.tolist()]
    assert frame.shape == (nrows, 1)
    assert frame.names == ("C0", )
    assert frame.to_list() == expected_values


def test_create_from_numpy_array_mixed_strings(numpy):
a = numpy.array(["數據表", "one", "數", "юйґї", "データ表", "five"])
d = dt.Frame(a)
frame_integrity_check(d)
assert d.shape == (6, 1)
Expand Down

0 comments on commit 0350710

Please sign in to comment.