# `Series` and `DataFrame`

In [1]:
import polars as pl

In [2]:
# Location of the Titanic passenger CSV; a relative path, so it is resolved
# against the notebook's working directory.
csv_file = "data/titanic.csv"

In [3]:
# Read the CSV into a DataFrame, then preview its first rows.
df = pl.read_csv(csv_file)
df.head()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


## Convert Between a `Series` and a `DataFrame` column

DataFrame -> Series

In [4]:
# Indexing a DataFrame with a column name returns that column as a Series
df["Age"].head(3)

Age
f64
22.0
38.0
26.0


In [6]:
# Alternative route: select() keeps the column inside a DataFrame, and
# to_series() then pulls it out as a Series.
df.select("Age").to_series().head(3)

Age
f64
22.0
38.0
26.0


Series -> DataFrame

In [7]:
# to_frame() wraps the Series in a single-column DataFrame
df["Age"].to_frame().head(3)

Age
f64
22.0
38.0
26.0


## Create a `Series` or `DataFrame` from a `list` or `dict`

list -> Series

In [8]:
values = [1, 2, 3]
# With no name supplied, the Series is created unnamed
pl.Series(values)

1
2
3


In [None]:
# The first positional argument is the Series name, the second is the data
pl.Series("values", values)

values
i64
1
2
3


Series -> list

In [10]:
# to_list() converts the Series back into a plain Python list
pl.Series("values", values).to_list()

[1, 2, 3]

list -> DataFrame

In [None]:
# Each inner list holds the values of one column.
data = [
    [1, 2, 3],
    [4, 5, 6]
]

pl.DataFrame(
    data,
    schema=["col0", "col1"],  # column names, one per inner list
    orient="col",  # state the orientation explicitly instead of relying on inference
)

col0,col1
i64,i64
1,4
2,5
3,6


dict -> DataFrame

In [None]:
# Dict keys become the column names; each list holds that column's values.
data_dict = dict(col0=[1, 2, 3], col1=[4, 5, 6])

pl.DataFrame(data_dict)

col0,col1
i64,i64
1,4
2,5
3,6


Specify dtypes when creating DataFrame with dict

In [13]:
data_dict = {"col0": [1, 2, 3], "col1": [4, 5, 6]}

# Passing schema as a dict maps each column name to an explicit dtype,
# here Int32 for both columns.
pl.DataFrame(
    data_dict,
    schema={
        "col0": pl.Int32,
        "col1": pl.Int32
    }
)

col0,col1
i32,i32
1,4
2,5
3,6


DataFrame -> list of dicts

In [16]:
# to_dicts() returns one dict per row, keyed by column name.
data_dict = dict(col0=[1, 2, 3], col1=[4, 5, 6])

pl.DataFrame(data_dict).to_dicts()

[{'col0': 1, 'col1': 4}, {'col0': 2, 'col1': 5}, {'col0': 3, 'col1': 6}]

## Exercises

### Exercise 1
Extract the `Age` column as a `Series` and then find:
- the `dtype` of the `Series`
- the median of the `Series`

In [18]:
# Reload the data so `df` holds the CSV contents as read from disk
df = pl.read_csv(csv_file)

# dtype is an attribute of the Series, so it is accessed without parentheses
df["Age"].dtype

Float64

In [20]:
# Reload the data so `df` holds the CSV contents as read from disk
df = pl.read_csv(csv_file)

# Median of the Age column
df["Age"].median()

28.0

### Exercise 2
You have the following Python `list`s of data:

In [17]:
# Sample data for the exercise: five group labels and five integers.
groups = ["a", "a", "b", "b", "c"]
values = list(range(5))

Create a `Series` called `groups_series` from the `groups` list. The name of the `Series` should be `groups`.

In [23]:
# Keyword form of the constructor: `name` labels the Series, `values` supplies the data
groups_series = pl.Series(name="groups", values=groups)

Create a `DataFrame` with column names `groups` and `values` by passing these as a Python `dict` to `pl.DataFrame`.

In [24]:
# Build the frame from a dict literal: keys are column names, lists are the columns.
pl.DataFrame({"groups": groups, "values": values})

groups,values
str,i64
"""a""",0
"""a""",1
"""b""",2
"""b""",3
"""c""",4
