# Quantiles and histograms

In [1]:
import polars as pl
import polars.selectors as cs

In [2]:
# Relative path to the Titanic passenger CSV that the next cell reads.
csv_file = "data/titanic.csv"

In [3]:
# Read the whole CSV into an eager DataFrame, then preview the first three rows.
df = pl.read_csv(source=csv_file)
df.head(n=3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


## Quantiles

### Quantiles on a `DataFrame`

In [4]:
# 90th percentile of every column in a single call.
df.quantile(quantile=0.9)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
f64,f64,f64,str,str,f64,f64,f64,str,f64,str,str
802.0,1.0,3.0,,,50.0,1.0,2.0,,77.9583,,


In [5]:
# Same 90th percentile, restricted to `Age` through the expression API.
df.select(pl.col("Age").quantile(quantile=0.9))

Age
f64
50.0


### Multiple quantiles
We can calculate multiple quantiles in a single `select` by building the expressions with a list comprehension.

In [6]:
quantile_list = [0.1, 0.5, 0.9]

# Build one aliased `Age` expression per requested quantile.
df.select(
    [
        pl.col("Age").quantile(quantile=q).alias(name=f"Age_quantile_{q}")
        for q in quantile_list
    ]
)

Age_quantile_0.1,Age_quantile_0.5,Age_quantile_0.9
f64,f64,f64
14.0,28.0,50.0


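When an expression applies to several columns, use `name.suffix()` so that each output column gets a unique name and the columns do not collide.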

In [7]:
quantile_list = [0.1, 0.5, 0.9]

# `pl.col(pl.Float64)` selects every Float64 column, so the suffix is what
# keeps the output names distinct across quantiles.
df.select(
    [
        pl.col(pl.Float64).quantile(quantile=q).name.suffix(suffix=f"_quantile_{q}")
        for q in quantile_list
    ]
)

Age_quantile_0.1,Fare_quantile_0.1,Age_quantile_0.5,Fare_quantile_0.5,Age_quantile_0.9,Fare_quantile_0.9
f64,f64,f64,f64,f64,f64
14.0,7.55,28.0,14.4542,50.0,77.9583


Quantiles can be calculated with a fast-track algorithm when a column is already sorted, so sorting the columns once up front prevents Polars from repeating the sort for every quantile.

In [8]:
quantile_list = [0.1, 0.5, 0.9]

# First select: sort each Float64 column once.
# Second select: take every requested quantile from the sorted columns.
df.select(pl.col(pl.Float64).sort()).select(
    [
        pl.col(pl.Float64).quantile(quantile=q).name.suffix(suffix=f"_quantile_{q}")
        for q in quantile_list
    ]
)

Age_quantile_0.1,Fare_quantile_0.1,Age_quantile_0.5,Fare_quantile_0.5,Age_quantile_0.9,Fare_quantile_0.9
f64,f64,f64,f64,f64,f64
14.0,7.55,28.0,14.4542,50.0,77.9583


### Interpolation strategy for quantiles
We can use different interpolation strategies for calculating quantiles:
- `nearest` 
- `higher` 
- `lower` 
- `midpoint`
- `linear`

In [9]:
# Compare two interpolation strategies for the same 25th percentile of `Age`.
df.select(
    pl.col("Age").quantile(0.25, interpolation="nearest").alias("Age_nearest"),
    pl.col("Age").quantile(0.25, interpolation="linear").alias("Age_linear"),
)

Age_nearest,Age_linear
f64,f64
20.0,20.125


## Histogram

### Histogram method
We can calculate a histogram on a `Series`.

In [10]:
# Histogram of `Age` with the default binning.
df.get_column("Age").hist()

breakpoint,category,count
f64,cat,u32
8.378,"""[0.42, 8.378]""",54
16.336,"""(8.378, 16.336]""",46
24.294,"""(16.336, 24.294]""",177
32.252,"""(24.294, 32.252]""",169
40.21,"""(32.252, 40.21]""",118
48.168,"""(40.21, 48.168]""",70
56.126,"""(48.168, 56.126]""",45
64.084,"""(56.126, 64.084]""",24
72.042,"""(64.084, 72.042]""",9
80.0,"""(72.042, 80.0]""",2


- `breakpoint` is the right-hand value of each bin
- `category` is a categorical column showing the range of each bin
- `count` shows the number of values in each bin

In [11]:
# Explicit bin edges every 10 years: 0, 10, ..., 90.
df.get_column("Age").hist(bins=list(range(0, 100, 10)))

breakpoint,category,count
f64,cat,u32
10.0,"""[0.0, 10.0]""",64
20.0,"""(10.0, 20.0]""",115
30.0,"""(20.0, 30.0]""",230
40.0,"""(30.0, 40.0]""",155
50.0,"""(40.0, 50.0]""",86
60.0,"""(50.0, 60.0]""",42
70.0,"""(60.0, 70.0]""",17
80.0,"""(70.0, 80.0]""",5
90.0,"""(80.0, 90.0]""",0


In [13]:
# Bar chart of the 10-year `Age` histogram, one coloured bar per bin.
(
    df.get_column("Age")
    .hist(bins=list(range(0, 100, 10)))
    .plot.bar(x="category", y="count", color="category")
    .properties(width=700)
)

# Exercises

## Exercise 1 - calculating quantile
Calculate the 25th, 50th and 75th percentiles for the `Age` column.

Output the results as 3 columns (with appropriate names) in a one-row `DataFrame`.

In [14]:
# One suffixed `Age` expression per requested percentile.
df.select(
    [
        pl.col("Age").quantile(quantile=q).name.suffix(suffix=f"_quantile_{q}")
        for q in (0.25, 0.5, 0.75)
    ]
)

Age_quantile_0.25,Age_quantile_0.5,Age_quantile_0.75
f64,f64,f64
20.0,28.0,38.0


Calculate the same percentiles for all of the numeric columns.


In [15]:
# `cs` is `polars.selectors`, imported with the other imports at the top of
# the notebook. `cs.numeric()` selects every numeric column, so the suffix
# keeps the output names unique across the three percentiles.
df.select(
    [
        cs.numeric().quantile(q).name.suffix(f"_quantile_{q}")
        for q in [0.25, 0.5, 0.75]
    ]
)

PassengerId_quantile_0.25,Survived_quantile_0.25,Pclass_quantile_0.25,Age_quantile_0.25,SibSp_quantile_0.25,Parch_quantile_0.25,Fare_quantile_0.25,PassengerId_quantile_0.5,Survived_quantile_0.5,Pclass_quantile_0.5,Age_quantile_0.5,SibSp_quantile_0.5,Parch_quantile_0.5,Fare_quantile_0.5,PassengerId_quantile_0.75,Survived_quantile_0.75,Pclass_quantile_0.75,Age_quantile_0.75,SibSp_quantile_0.75,Parch_quantile_0.75,Fare_quantile_0.75
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
224.0,0.0,2.0,20.0,0.0,0.0,7.925,446.0,0.0,3.0,28.0,0.0,0.0,14.4542,669.0,1.0,3.0,38.0,1.0,0.0,31.0


Visualise the histogram of the `Fare` column with bins of width £25 up to £400.

In [20]:
# `range` excludes its stop value, so stop at 401 to include the £400 edge;
# `range(0, 400, 25)` would end the bins at £375.
df["Fare"].hist(
    bins=list(range(0, 401, 25))
).plot.bar(
    x="category",
    y="count",
    color="category"
).properties(width=600)