In [1]:
import torcharrow as ta
import torcharrow.dtypes as dt

import torch
import black

def pp(s):
    """Print ``repr(s)`` reflowed over multiple lines by black's formatter."""
    # black formats Python source text, so the value is rendered via repr() first.
    source = repr(s)
    formatted = black.format_str(source, mode=black.Mode())
    print(formatted)

In [2]:
# Small demo frame: scalar columns, list columns (with and without nulls),
# a list-of-structs column, string columns, and a map column.
df = ta.DataFrame(
    {
        # Integer scalars, without and with missing values.
        "ints": [2, 3, 5, 7],
        "ints_with_null": [1, None, 2, None],
        # Lists of integers; in the second column the nulls sit inside the lists.
        "list_of_ints": [[1, 2], [3, 4], [5, 6], [7, 8]],
        "list_of_ints_with_null": [[1, 2], [3, None], [None, 6], [7, 8]],
        # Lists of (id, score) structs, built with an explicit dtype.
        "id_score_list": ta.Column(
            [[(1, 1.5), (2, 2.5)], [], [(3, 3.5)], [(4, 4.5), (5, 5.5)]],
            dtype=dt.List(
                dt.Struct(
                    [
                        dt.Field("id", dt.int64),
                        dt.Field("score", dt.float32),
                    ]
                )
            ),
        ),
        # Plain strings and lists of strings (one list is empty).
        "str": ["a", "b", "c", "d"],
        "list_of_str": ta.Column(
            [["a", "aa"], ["b"], [], ["d", "dd"]],
            dtype=dt.List(dt.string),
        ),
        # String -> int64 maps (one map is empty).
        "multi_label_map": ta.Column(
            [
                {"click": 1, "conv": 0},
                {"click": 0, "conv": 0},
                {},
                {"conv": 1},
            ],
            dtype=dt.Map(dt.string, dt.int64),
        ),
    }
)

df

  index    ints    ints_with_null  list_of_ints    list_of_ints_with_null    id_score_list         str    list_of_str    multi_label_map
-------  ------  ----------------  --------------  ------------------------  --------------------  -----  -------------  -----------------------
      0       2                 1  [1, 2]          [1, 2]                    [(1, 1.5), (2, 2.5)]  a      ['a', 'aa']    {'click': 1, 'conv': 0}
      1       3                    [3, 4]          [3, None]                 []                    b      ['b']          {'click': 0, 'conv': 0}
      2       5                 2  [5, 6]          [None, 6]                 [(3, 3.5)]            c      []             {}
      3       7                    [7, 8]          [7, 8]                    [(4, 4.5), (5, 5.5)]  d      ['d', 'dd']    {'conv': 1}
dtype: Struct([Field('ints', int64), Field('ints_with_null', Int64(nullable=True)), Field('list_of_ints', List(int64)), Field('list_of_ints_with_null', List(Int64(nullable

## `to_python()` just recovers the original data

After implementing it I realized that it's almost the same as `list(df)` :)

The only difference is that `to_python()` returns named tuples instead of plain tuples, and `OrderedDict` instead of a regular `dict`. Maybe we should collapse the two?

In [3]:
# Materialize the frame's rows via plain iteration and pretty-print them.
pp(list(df))
# TODO: re-enable once `to_python()` is restored (WS accidentally deleted it).
# pp(df.to_python())

[
    (
        2,
        1,
        [1, 2],
        [1, 2],
        [(1, 1.5), (2, 2.5)],
        "a",
        ["a", "aa"],
        {"click": 1, "conv": 0},
    ),
    (3, None, [3, 4], [3, None], [], "b", ["b"], {"click": 0, "conv": 0}),
    (5, 2, [5, 6], [None, 6], [(3, 3.5)], "c", [], {}),
    (7, None, [7, 8], [7, 8], [(4, 4.5), (5, 5.5)], "d", ["d", "dd"], {"conv": 1}),
]



## `to_torch()` converts into a very simplified columnar storage using torch.Tensors

Numerical columns just turn into tensors.

Lists become a `PackedList` with offsets and values; maps become a `PackedMap`.

In [4]:
# TODO: re-enable once `to_torch()` is restored (WS accidentally deleted it).
# pp(df["ints"].to_torch())
# pp(df["list_of_ints"].to_torch())
# pp(df["id_score_list"].to_torch())

For nullable columns we wrap the value in `WithPresence`.

Those can be nested!

In [5]:
# TODO: re-enable once `to_torch()` is restored (WS accidentally deleted it).
# pp(df["ints_with_null"].to_torch())
# pp(df["list_of_ints_with_null"].to_torch())

Since PyTorch doesn't have string tensors, string columns get converted to `List[str]` in Python.

As a special rule, we also don't use PackedList for lists of strings (as it'd be awkward). This special case is also present in F6 today.

In [6]:
# TODO: re-enable once `to_torch()` is restored (WS accidentally deleted it).
# pp(df["str"].to_torch())
# pp(df["list_of_str"].to_torch())

But we do use `PackedMap` for maps even if the keys are strings (though probably no one would want that).

In [7]:
# TODO: re-enable once `to_torch()` is restored (WS accidentally deleted it).
# pp(df["multi_label_map"].to_torch())

You can convert the entire DataFrame at once!

In [8]:
# TODO: re-enable once `to_torch()` is restored (WS accidentally deleted it).
# pp(df.to_torch())

## to be continued...

* specifying the output type, so that we can mix output formats, e.g. convert some columns of the DataFrame to tensors but keep others as plain Python objects
* reverse conversion from these simple structs back to a DataFrame
* UDFs with automatic conversion back and forth
* explore integration with `__torch_function__`