# Import Libraries

In [None]:
import polars as pl
import plotly.express as px

# Define Raw Data Path

In [3]:
# Relative path to the raw Titanic CSV; reused by both the eager read and
# the lazy scans further down the notebook.
csv_file = "data/titanic.csv"

# Import Data Via Polars

In [4]:
# Read the whole CSV into an in-memory Polars DataFrame (eager mode).
df = pl.read_csv(source=csv_file)

# A bare expression on the last line renders the frame as the cell output.
df

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""
…,…,…,…,…,…,…,…,…,…,…,…
887,0,2,"""Montvila, Rev. Juozas""","""male""",27.0,0,0,"""211536""",13.0,,"""S"""
888,1,1,"""Graham, Miss. Margaret Edith""","""female""",19.0,0,0,"""112053""",30.0,"""B42""","""S"""
889,0,3,"""Johnston, Miss. Catherine Hele…","""female""",,1,2,"""W./C. 6607""",23.45,,"""S"""
890,1,1,"""Behr, Mr. Karl Howell""","""male""",26.0,0,0,"""111369""",30.0,"""C148""","""C"""


In [5]:
# Show the first five rows (five is also the default for head()).
df.head(n=5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [6]:
# Preview the data column by column: glimpse() prints the row and column
# counts, then one line per column with its dtype and its first values.
df.glimpse()

Rows: 891
Columns: 12
$ PassengerId <i64> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
$ Survived    <i64> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1
$ Pclass      <i64> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2
$ Name        <str> 'Braund, Mr. Owen Harris', 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)', 'Heikkinen, Miss. Laina', 'Futrelle, Mrs. Jacques Heath (Lily May Peel)', 'Allen, Mr. William Henry', 'Moran, Mr. James', 'McCarthy, Mr. Timothy J', 'Palsson, Master. Gosta Leonard', 'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)', 'Nasser, Mrs. Nicholas (Adele Achem)'
$ Sex         <str> 'male', 'female', 'female', 'female', 'male', 'male', 'male', 'male', 'female', 'female'
$ Age         <f64> 22.0, 38.0, 26.0, 35.0, 35.0, null, 54.0, 2.0, 27.0, 14.0
$ SibSp       <i64> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1
$ Parch       <i64> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0
$ Ticket      <str> 'A/5 21171', 'PC 17599', 'STON/O2. 3101282', '113803', '373450', '330877', '17463', '349909', '347742', '237736'
$ Fare        <f64> 7.25, 71.2833, 7

# Access Data

Use `[]` indexing to select specific rows and columns, similar to pandas.

In [7]:
# Square-bracket indexing takes a row slice and a list of column names:
# here the first three rows of the three identifying columns.
preview_columns = ["PassengerId", "Pclass", "Name"]
df[:3, preview_columns]

PassengerId,Pclass,Name
i64,i64,str
1,3,"""Braund, Mr. Owen Harris"""
2,1,"""Cumings, Mrs. John Bradley (Fl…"
3,3,"""Heikkinen, Miss. Laina"""


# Expression API

In [8]:
# Each expression produces one output column: Pclass is passed through,
# Name is lowercased, and Age is rounded to zero decimal places.
df.select(
    pl.col("Pclass"),
    pl.col("Name").str.to_lowercase(),
    pl.col("Age").round(0),
)

Pclass,Name,Age
i64,str,f64
3,"""braund, mr. owen harris""",22.0
1,"""cumings, mrs. john bradley (fl…",38.0
3,"""heikkinen, miss. laina""",26.0
1,"""futrelle, mrs. jacques heath (…",35.0
3,"""allen, mr. william henry""",35.0
…,…,…
2,"""montvila, rev. juozas""",27.0
1,"""graham, miss. margaret edith""",19.0
3,"""johnston, miss. catherine hele…",
1,"""behr, mr. karl howell""",26.0


In [9]:
# Count passengers for every (Survived, Pclass) combination.
# Note: group_by does not maintain order by default, so the row order of
# the result can differ between runs.
passenger_count = pl.col("PassengerId").count().alias("counts")
df.group_by("Survived", "Pclass").agg(passenger_count)

Survived,Pclass,counts
i64,i64,u32
0,3,372
1,2,87
1,3,119
0,2,97
1,1,136
0,1,80


# Visualization

In [10]:
# Scatter plot of ticket fare against passenger age.
# The columns are passed as bare series, so the axes would otherwise fall
# back to the generic argument names "x" and "y"; `labels` maps them to the
# real column names and `title` lets the figure stand on its own.
px.scatter(
    x=df["Age"],
    y=df["Fare"],
    labels={"x": "Age", "y": "Fare"},
    title="Titanic passengers: fare vs. age",
)

# Lazy Mode

In [None]:
# Build the aggregation lazily with scan_csv; nothing is read yet.
lazy_counts = (
    pl.scan_csv(csv_file)
    .group_by(["Survived", "Pclass"])
    .agg(pl.col("PassengerId").count().alias("counts"))
)

# Print the query plan to show the optimizations Polars applies
# (for example, scanning only the columns the query needs).
print(lazy_counts.explain())

AGGREGATE[maintain_order: false]
  [col("PassengerId").count().alias("counts")] BY [col("Survived"), col("Pclass")]
  FROM
  Csv SCAN [data/titanic.csv]
  PROJECT 3/12 COLUMNS
  ESTIMATED ROWS: 971


In [None]:
# Same aggregation as the eager group_by cell, expressed as a lazy query;
# collect() executes the plan and returns the resulting DataFrame.
counts_query = (
    pl.scan_csv(csv_file)
    .group_by(["Survived", "Pclass"])
    .agg(pl.col("PassengerId").count().alias("counts"))
)
counts_query.collect()

Survived,Pclass,counts
i64,i64,u32
0,3,372
1,3,119
1,2,87
1,1,136
0,2,97
0,1,80


In [None]:
# Run the same lazy query on Polars' streaming engine. Streaming executes
# the query in batches, which can lower memory use and is often faster on
# large inputs; a speed-up is not guaranteed for a file this small.
streaming_query = (
    pl.scan_csv(csv_file)
    .group_by(["Survived", "Pclass"])
    .agg(pl.col("PassengerId").count().alias("counts"))
)
streaming_query.collect(engine="streaming")

Survived,Pclass,counts
i64,i64,u32
1,3,119
0,2,97
1,1,136
0,3,372
1,2,87
0,1,80
