# Introduction to nested dtypes: List, Array, Object and Struct

In [1]:
from datetime import date
import polars as pl

## `pl.List` dtype

- each row is a `Series` and
- each `Series` has the same dtype

In [2]:
# One list column per inner dtype; polars infers list[i64], list[f64] and list[str].
df_lists = pl.DataFrame(
    dict(
        ints=[[0, 1], [2, 3]],
        floats=[[0.0, 1.0], [2.0, 3.0]],
        strings=[["0", "1"], ["2", "3"]],
    )
)
df_lists

ints,floats,strings
list[i64],list[f64],list[str]
"[0, 1]","[0.0, 1.0]","[""0"", ""1""]"
"[2, 3]","[2.0, 3.0]","[""2"", ""3""]"


The `pl.List` dtype can have a variable number of elements per row. 

There is also a `pl.Array` dtype, optimized for cases where all rows have the same number of elements.

In [10]:
# Cast the list column to a fixed-width array holding two Int64 values per row.
df_lists.with_columns(
    pl.col("ints").cast(pl.Array(inner=pl.Int64, shape=2)).alias("ints_array")
)

ints,floats,strings,ints_array
list[i64],list[f64],list[str],"array[i64, 2]"
"[0, 1]","[0.0, 1.0]","[""0"", ""1""]","[0, 1]"
"[2, 3]","[2.0, 3.0]","[""2"", ""3""]","[2, 3]"


## Object dtype
We get a column with the `Object` dtype when the lists cannot be cast to a homogeneous type.

In [11]:
# A single row whose list mixes int, float, bytes and date values.
df_object = pl.DataFrame(
    dict(mixed=[[1, 2.0, b"d", date(2022, 1, 1)]]),
    strict=False,
)
df_object

mixed
object
"[1, 2.0, b'd', datetime.date(2022, 1, 1)]"


In [12]:
# Pull out the single cell at row 0, column 0.
df_object[0, 0]

[1, 2.0, b'd', datetime.date(2022, 1, 1)]

In [13]:
# Inspect the Python type of the value stored in that cell.
type(df_object[0, 0])

list

## `pl.Struct` dtype
The `pl.Struct` dtype is basically a nested set of columns inside a single `DataFrame` column. 

The nesting can have multiple levels.

In [14]:
# Each dict under "trades" becomes one struct value; its keys become the struct fields.
df_struct = pl.DataFrame(
    dict(
        year=[2020, 2021],
        trades=[
            dict(exporter="India", importer="USA", quantity=0.0),
            dict(exporter="India", importer="USA", quantity=1.5),
        ],
    )
)
df_struct

year,trades
i64,struct[3]
2020,"{""India"",""USA"",0.0}"
2021,"{""India"",""USA"",1.5}"


The keys in a struct column are called `fields`.

We can list the field names with `struct.fields` on a `Series`.

In [15]:
# Field names of the "trades" struct column, in order.
df_struct.get_column("trades").struct.fields

['exporter', 'importer', 'quantity']

### Accessing `pl.Struct` fields

In [17]:
# Extract one field; the resulting column is named after the field.
df_struct.select(pl.col("trades").struct.field("exporter"))

exporter
str
"""India"""
"""India"""


### Modifying data in a `pl.Struct`
We can add or modify fields inside a `pl.Struct` column using a `struct.with_fields` expression.

In [18]:
df_struct.with_columns(
    pl.col("trades").struct.with_fields(
        # Keeps the name "exporter", so it overwrites the existing field.
        pl.col("trades").struct.field("exporter").str.to_uppercase(),
        # New name, so it is appended as a fourth field.
        pl.col("trades").struct.field("quantity").round(0).alias("round_quantity"),
    )
)

year,trades
i64,struct[4]
2020,"{""INDIA"",""USA"",0.0,0.0}"
2021,"{""INDIA"",""USA"",1.5,2.0}"


### Extracting data from a `pl.Struct`

Convert a nested `pl.Struct` column into separate top-level columns using either the `struct.unnest` method on the struct column or the `DataFrame.unnest` method.

In [None]:
# Unnest just the struct column: one output column per field.
df_struct.get_column("trades").struct.unnest()

exporter,importer,quantity
str,str,f64
"""India""","""USA""",0.0
"""India""","""USA""",1.5


In [None]:
# Unnest within the DataFrame: the struct fields replace "trades" next to the other columns.
df_struct.unnest(["trades"])

year,exporter,importer,quantity
i64,str,str,f64
2020,"""India""","""USA""",0.0
2021,"""India""","""USA""",1.5


A struct column can itself contain struct fields, giving multiple levels of nesting:

In [21]:
# Two-level struct: "countries" is itself a struct nested inside each "trades" value.
df_struct_deep = pl.DataFrame(
    {
        "trades": [
            {"countries": {"exporter": "India", "importer": "USA"}, "quantity": qty}
            for qty in (0.0, 1.5)
        ]
    }
)
df_struct_deep


trades
struct[2]
"{{""India"",""USA""},0.0}"
"{{""India"",""USA""},1.5}"
