# Creating and selecting `pl.List` and `pl.Array` columns

In [2]:
import polars as pl
import numpy as np

## Creating a sequence column
### Creating a `pl.List` column from a `list`

In [3]:
df_lists = pl.DataFrame(
    {
        'ints':[ 
            [0,1], 
            [2,3]
        ],
        'floats':[ 
            [0.0,1], 
            [2,3]
        ],
        'strings':[ 
            ["0","1"],
            ["2","3"]
        ]
    },
    strict=False
)
df_lists

ints,floats,strings
list[i64],list[f64],list[str]
"[0, 1]","[0.0, 1.0]","[""0"", ""1""]"
"[2, 3]","[2.0, 3.0]","[""2"", ""3""]"


In reality the data on each row of a `pl.List` column is a Polars `Series`.

We can see the underlying `Series` by selecting a row in a `pl.List` column

In [4]:
df_lists[0, "ints"]

0
1


We can configure how many list elements are printed with a `pl.Config` setting

In [5]:
pl.Config.set_fmt_table_cell_list_len(20)

polars.config.Config

### Creating a `pl.Array` column from a `list`
Polars also have the `pl.Array` dtype. The fundamental difference is that:
- for a `pl.List` column the sequence on each row can have `different` lengths
- for a `pl.Array` column the sequence on each row must have the `same` length

As the sequence length is predictable for the `pl.Array` dtype it may use less memory and run computation faster in some cases with large datasets.

In [6]:
df_arrays = pl.DataFrame(
    {
        'ints':[ 
            [0,1], 
            [2,3]
        ],
        'floats':[ 
            [0.0,1], 
            [2,3]
        ],
        'strings':[ 
            ["0","1"],
            ["2","3"]
        ]
    },
    schema={
        "ints":pl.Array(pl.Int32,2),
        "floats":pl.Array(pl.Float32,2),
        "strings":pl.Array(pl.String,2)
    }
)
df_arrays

ints,floats,strings
"array[i32, 2]","array[f32, 2]","array[str, 2]"
"[0, 1]","[0.0, 1.0]","[""0"", ""1""]"
"[2, 3]","[2.0, 3.0]","[""2"", ""3""]"


Create a `pl.Array` dtype we need to pass:
- the inner dtype and
- the shape of the sequence on all rows

Convert a `pl.Array` column from a `pl.List` column with the `list.to_array` expression along with the `width` of each row

In [7]:
df_lists\
.select(
    pl.col("ints"),
    pl.col("ints").list.to_array(width=2).alias("ints_array")
)

ints,ints_array
list[i64],"array[i64, 2]"
"[0, 1]","[0, 1]"
"[2, 3]","[2, 3]"


### Creating a `pl.List` column from a function

In [8]:
pl.DataFrame({"index": [0, 1, 2, 3]})\
    .with_columns(
    pl.int_ranges(0, pl.col("index") + 1).alias("range")
)

index,range
i64,list[i64]
0,[0]
1,"[0, 1]"
2,"[0, 1, 2]"
3,"[0, 1, 2, 3]"


### Creating a sequence column by horizontally-concatenating other columns

In [9]:
pl.DataFrame(
        {
            "val1":[0,1,2,3],
            "val2":[10,11,12,13],
        }
    )\
    .with_columns(
        pl.concat_list("val1", "val2").alias # list
        ("concat")
    )

val1,val2,concat
i64,i64,list[i64]
0,10,"[0, 10]"
1,11,"[1, 11]"
2,12,"[2, 12]"
3,13,"[3, 13]"


> **Note:** If we did not pass `alias` then the column would be called `val1` - this follows a general rule in Polars that if we create an expression from multiple columns then the name is set by the `first` input column listed.

In [10]:
pl.DataFrame(
        {
            "val1":[0,1,2,3],
            "val2":[10,11,12,13],
        }
    )\
    .with_columns(
        pl.concat_arr("val1", "val2").alias # array
        ("concat")
    )

val1,val2,concat
i64,i64,"array[i64, 2]"
0,10,"[0, 10]"
1,11,"[1, 11]"
2,12,"[2, 12]"
3,13,"[3, 13]"


## Selecting sequence columns
To select multiple `pl.List` columns we must pass the column dtype

In [11]:
df_lists.select(
    pl.col(pl.List(pl.Int64))
)

ints
list[i64]
"[0, 1]"
"[2, 3]"


To select multiple `pl.Array` columns we must pass the inner dtype and length

In [12]:
df_arrays.select(
    pl.col(pl.Array(pl.String, shape=2))
)

strings
"array[str, 2]"
"[""0"", ""1""]"
"[""2"", ""3""]"


### Length of `pl.List` columns

In [13]:
pl.DataFrame(
    {
        "values": [[0, 1], [2, 3, 4], [4, 5, 6, 7, 8]],
    }
)\
.with_columns(
    pl.col("values").list.len().alias("len")
)

values,len
list[i64],u32
"[0, 1]",2
"[2, 3, 4]",3
"[4, 5, 6, 7, 8]",5


## Turning sequence columns into rows
We use `explode` to expand each sequence into its own row.

In [14]:
df_list = pl.DataFrame(
    {
        "id":["a","b"],
        'values':[ 
            [0,1], 
            [2,3,4]
        ],
    }
)
df_list

id,values
str,list[i64]
"""a""","[0, 1]"
"""b""","[2, 3, 4]"


In [15]:
df_list.explode("values")

id,values
str,i64
"""a""",0
"""a""",1
"""b""",2
"""b""",3
"""b""",4


### Using `explode` to do operations on sequence columns

In [16]:
df_list.explode("values")\
    .with_columns(
        pl.col("values").rank().over("id").cast(pl.Int32).alias("rank")
    )

id,values,rank
str,i64,i32
"""a""",0,1
"""a""",1,2
"""b""",2,1
"""b""",3,2
"""b""",4,3


`rank().over()` is the window function which occurs in SQL as well.

In [19]:
df_list.explode("values")\
    .with_columns(
        pl.col("values").rank().over("id").cast(pl.Int32).alias("rank")
    )\
    .group_by("id", maintain_order=True)\
    .agg(
        pl.col("values"),
        pl.col("rank")
    )

id,values,rank
str,list[i64],list[i32]
"""a""","[0, 1]","[1, 2]"
"""b""","[2, 3, 4]","[1, 2, 3]"


## Convert a sequence column to a `pl.Struct` column
We convert a `pl.List` column to a `pl.Struct` column with `list.to_struct`.

> Be aware that data is stored in memory differently in a `pl.List` column compared to a `pl.Struct` column.
> - In a `pl.List` column each row horizontally is a `Series`
> - In a `pl.Struct` column each nested column vertically is a `Series`.
> 
> Transforming from a `pl.List` to `pl.Struct` may be memory intensive for large `DataFrames`

In [20]:
pl.DataFrame(
    {
        "values": [[0, 1], [2, 3], [4, 5]],
    }
)\
.with_columns(
    value_struct = pl.col("values").list.to_struct(fields=["value_0", "value_1"])
)

values,value_struct
list[i64],struct[2]
"[0, 1]","{0,1}"
"[2, 3]","{2,3}"
"[4, 5]","{4,5}"


Similarly we can do the same for a `pl.Array` column with `arr.to_struct`.

In [24]:
pl.DataFrame(
    {
        "values": [[0, 1], [2, 3], [4, 5]],
    }
)\
.with_columns(
    value_struct = pl.col("values").cast(pl.Array(pl.Int64, 2)).arr.to_struct(fields=["value_0", "value_1"])
)

values,value_struct
list[i64],struct[2]
"[0, 1]","{0,1}"
"[2, 3]","{2,3}"
"[4, 5]","{4,5}"


`Unnest` the struct data

In [25]:
pl.DataFrame(
    {
        "values": [[0, 1], [2, 3], [4, 5]],
    }
)\
.with_columns(
    value_struct = pl.col("values").list.to_struct(fields=["value_0", "value_1"])
)\
.unnest("value_struct")

values,value_0,value_1
list[i64],i64,i64
"[0, 1]",0,1
"[2, 3]",2,3
"[4, 5]",4,5


While a `pl.List` list can have a variable number of elements, a `pl.Struct` has a fixed number of elements on each row. 

The number of struct nested columns is set by the length of the `first` row the `pl.List` column:
- If subsequent `pl.List` rows are shorter then these become `null` values in the `pl.Struct` nested columns
- If subsequent `pl.List` rows are longer then these are dropped from the the `pl.Struct` nested columns

## Convert a sequence column to a Numpy array
A sequence column is a natural way to hold array data that we may need to conver to a two dimensional Numpy array.

### Convert a `pl.List` column to a Numpy array

In [27]:
df_embeddings = pl.DataFrame(
    {
        "embeddings": [[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]],
    }
)
df_embeddings

embeddings
list[f64]
"[0.0, 1.0]"
"[2.0, 3.0]"
"[4.0, 5.0]"


`to_numpy` only provides **one-dimensional `object`** Numpy array where each element is an array

In [28]:
df_embeddings["embeddings"].to_numpy()

array([array([0., 1.]), array([2., 3.]), array([4., 5.])], dtype=object)

`explode` the `pl.List` column and `reshape` it to have a 2D object 

In [None]:
df_embeddings["embeddings"]\
.explode()\
.to_numpy()\
.reshape(len(df_embeddings), -1) 
# -1 means calculate the dimension automatically

array([[0., 1.],
       [2., 3.],
       [4., 5.]])

### Convert a `pl.Array` column to a Numpy array
A `pl.Array` column is an even better fit for holding data that might go to/from Numpy.

In [31]:
df_embeddings\
.with_columns(
    pl.col("embeddings").list.to_array(width=2)
)\
["embeddings"].to_numpy()

array([[0., 1.],
       [2., 3.],
       [4., 5.]])

## Which one should we use?  

### `pl.List` or `pl.Array`?
- If your rows might have variable length use `pl.List`
- If your dimension is greater than 1 use `pl.Array`
- The `pl.List` pre-dates the `pl.Array` and has more functionality in the `list` expression namespace

## Exercises

### Exercise 1

In [32]:
df_lists = pl.DataFrame(
    {
        'ints':[ 
            [0,1], 
            [2,3]
        ],
        'floats':[ 
            [0.0,1], 
            [2,3]
        ],
        'strings':[ 
            ["0","1"],
            ["2","3"]
        ]

    },
    strict=False
)
df_lists

ints,floats,strings
list[i64],list[f64],list[str]
"[0, 1]","[0.0, 1.0]","[""0"", ""1""]"
"[2, 3]","[2.0, 3.0]","[""2"", ""3""]"


Select the floating point list column from `df_lists`

In [34]:
df_lists\
.select(
    pl.col(pl.List(pl.Float64))
)

floats
list[f64]
"[0.0, 1.0]"
"[2.0, 3.0]"


Select the floating point **and** integer list column from `df_lists`

In [35]:
df_lists\
.select(
    pl.col(pl.List(pl.Float64)),
    pl.col(pl.List(pl.Int64)),
)

floats,ints
list[f64],list[i64]
"[0.0, 1.0]","[0, 1]"
"[2.0, 3.0]","[2, 3]"


Convert the `strings` list column to a `pl.Array` column

In [37]:
df_lists\
.select(
    pl.col("strings").list.to_array(2)
)

strings
"array[str, 2]"
"[""0"", ""1""]"
"[""2"", ""3""]"


### Exercise 2
We create a `pl.List` column from the Titanic dataset by splitting the `Name` column on every whitespace

In [33]:
csv_file = "data/titanic.csv"
df = (
    pl.read_csv(csv_file)
    .select(
        [
            "PassengerId",
            "Pclass",
            "Name",
            pl.col("Name").str.split(" ").alias("Name_list")
        ]
    )
)
df.head(2)

PassengerId,Pclass,Name,Name_list
i64,i64,str,list[str]
1,3,"""Braund, Mr. Owen Harris""","[""Braund,"", ""Mr."", ""Owen"", ""Harris""]"
2,1,"""Cumings, Mrs. John Bradley (Fl…","[""Cumings,"", ""Mrs."", ""John"", ""Bradley"", ""(Florence"", ""Briggs"", ""Thayer)""]"


Expand the `Name_list` column into separate rows

In [39]:
df["Name_list"]\
.explode()\
.head(3)

Name_list
str
"""Braund,"""
"""Mr."""
"""Owen"""


Filter to remove rows with the titles: "Mr.","Mrs.","Miss.","Master." from the output

In [43]:
df\
.explode("Name_list")\
.filter(
    ~pl.col("Name_list").is_in(["Mr.","Mrs.","Miss.","Master."])
).head(3)

PassengerId,Pclass,Name,Name_list
i64,i64,str,str
1,3,"""Braund, Mr. Owen Harris""","""Braund,"""
1,3,"""Braund, Mr. Owen Harris""","""Owen"""
1,3,"""Braund, Mr. Owen Harris""","""Harris"""


Find the most common names:

After filtering the titles count the occurence of each name in the `Name_list` column using `.value_counts(sort=True)`

In [None]:
df\
.explode("Name_list")\
.filter(
    ~pl.col("Name_list").is_in(["Mr.","Mrs.","Miss.","Master."])
)["Name_list"]\
.value_counts(sort=True)\
.head(3)

# DataFrame doesn't have value_counts method.

SyntaxError: invalid syntax (4095833256.py, line 7)