# Statistics

In [1]:
import polars as pl

In [2]:
# Relative path to the Titanic CSV that the cells below load with `pl.read_csv`
csv_file = "data/titanic.csv"

In [3]:
# Load the Titanic CSV into a DataFrame and preview the first three rows
df = pl.read_csv(csv_file)
df.head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


## Statistics on a `DataFrame`

In [4]:
# Column-wise mean of the whole DataFrame; the string columns come back as null
df.mean()

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
f64,f64,f64,str,str,f64,f64,f64,str,f64,str,str
446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,


In [5]:
# Summary table per column: count, null_count, mean, std, min, 25%/50%/75% and max
df.describe()

statistic,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
str,f64,f64,f64,str,str,f64,f64,f64,str,f64,str,str
"""count""",891.0,891.0,891.0,"""891""","""891""",714.0,891.0,891.0,"""891""",891.0,"""204""","""889"""
"""null_count""",0.0,0.0,0.0,"""0""","""0""",177.0,0.0,0.0,"""0""",0.0,"""687""","""2"""
"""mean""",446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
"""std""",257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
"""min""",1.0,0.0,1.0,"""Abbing, Mr. Anthony""","""female""",0.42,0.0,0.0,"""110152""",0.0,"""A10""","""C"""
"""25%""",224.0,0.0,2.0,,,20.0,0.0,0.0,,7.925,,
"""50%""",446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
"""75%""",669.0,1.0,3.0,,,38.0,1.0,0.0,,31.0,,
"""max""",891.0,1.0,3.0,"""van Melkebeke, Mr. Philemon""","""male""",80.0,8.0,6.0,"""WE/P 5735""",512.3292,"""T""","""S"""


In [6]:
# Same summary, but with custom percentiles instead of the default 25%/50%/75%
df.describe(
    percentiles=(0.1, 0.3, 0.5, 0.7, 0.9),
)

statistic,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
str,f64,f64,f64,str,str,f64,f64,f64,str,f64,str,str
"""count""",891.0,891.0,891.0,"""891""","""891""",714.0,891.0,891.0,"""891""",891.0,"""204""","""889"""
"""null_count""",0.0,0.0,0.0,"""0""","""0""",177.0,0.0,0.0,"""0""",0.0,"""687""","""2"""
"""mean""",446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
"""std""",257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
"""min""",1.0,0.0,1.0,"""Abbing, Mr. Anthony""","""female""",0.42,0.0,0.0,"""110152""",0.0,"""A10""","""C"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""30%""",268.0,0.0,2.0,,,22.0,0.0,0.0,,8.05,,
"""50%""",446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
"""70%""",624.0,1.0,3.0,,,36.0,1.0,0.0,,27.0,,
"""90%""",802.0,1.0,3.0,,,50.0,1.0,2.0,,77.9583,,


## Statistics in an expression

In [8]:
# Statistics are also available as expression methods: the result here is a
# one-row DataFrame holding the mean of the `Fare` column
df.select(pl.col("Fare").mean())

Fare
f64
32.204208


## Rolling statistics

In [9]:
# Small single-column frame (the integers 0..11) used by the rolling and
# exponentially-weighted examples below
df_rolling = pl.DataFrame({"value": range(12)})
df_rolling.head()

value
i64
0
1
2
3
4


In [10]:
# Mean over a 4-row window; rows without a full window are null
df_rolling.with_columns(
    pl.col("value").rolling_mean(window_size=4).alias("rolling_mean_value")
)

value,rolling_mean_value
i64,f64
0,
1,
2,
3,1.5
4,2.5
…,…
7,5.5
8,6.5
9,7.5
10,8.5


In [13]:
# Compare the default rolling mean with one that uses `min_samples=1`: the
# latter emits a value as soon as a single row is in the window, so the
# leading rows are filled instead of null
df_rolling.with_columns(
    pl.col("value").rolling_mean(window_size=4).alias("rolling_mean_value"),
    pl.col("value")
    .rolling_mean(window_size=4, min_samples=1)
    .alias("rolling_mean_value_min_periods"),
)

value,rolling_mean_value,rolling_mean_value_min_periods
i64,f64,f64
0,,0.0
1,,0.5
2,,1.0
3,1.5,1.5
4,2.5,2.5
…,…,…
7,5.5,5.5
8,6.5,6.5
9,7.5,7.5
10,8.5,8.5


Center the statistic: with `center=True` the window is centered on each row instead of ending at that row

In [14]:
# Side by side: a 5-row window ending at the current row versus the same
# window centered on the current row (`center=True`)
df_rolling.with_columns(
    pl.col("value").rolling_mean(window_size=5).alias("rolling_mean_value"),
    pl.col("value")
    .rolling_mean(window_size=5, center=True)
    .alias("rolling_mean_value_center"),
).head(5)

value,rolling_mean_value,rolling_mean_value_center
i64,f64,f64
0,,
1,,
2,,2.0
3,,3.0
4,2.0,4.0


## Exponentially-weighted statistics

In [15]:
# Exponentially-weighted mean of `value` with span=4
df_rolling.with_columns(
    pl.col("value").ewm_mean(span=4).alias("ewm_mean_value")
)

value,ewm_mean_value
i64,f64
0,0.0
1,0.625
2,1.326531
3,2.095588
4,2.921582
…,…
7,5.636665
8,6.591623
9,7.560834
10,8.540053


The available exponentially-weighted statistics are:
- `ewm_mean`
- `ewm_std`
- `ewm_var`

### Multiple statistics

In [16]:
# Several statistics over every Int64 column at once; `name.suffix` keeps the
# output column names distinct (`<column>_min`, `<column>_max`)
df_rolling.select(
    [
        pl.col(pl.Int64).min().name.suffix("_min"),
        pl.col(pl.Int64).max().name.suffix("_max"),
    ]
)

value_min,value_max
i64,i64
0,11


In [18]:
# Min-max scale every Int64 column to the range [0, 1] and append the results
# as `<column>_scaled`
int_col = pl.col(pl.Int64)
df.with_columns(
    ((int_col - int_col.min()) / (int_col.max() - int_col.min())).name.suffix(
        "_scaled"
    )
).head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,PassengerId_scaled,Survived_scaled,Pclass_scaled,SibSp_scaled,Parch_scaled
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str,f64,f64,f64,f64,f64
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""",0.0,0.0,1.0,0.125,0.0
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C""",0.001124,1.0,0.0,0.125,0.0
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S""",0.002247,1.0,1.0,0.0,0.0


## Horizontal computations

In [19]:
# Two small integer columns for the row-wise (horizontal) examples
df_hor = pl.DataFrame({"vals1": [0, 1, 2], "val2": [3, 4, 5]})
df_hor

vals1,val2
i64,i64
0,3
1,4
2,5


In [20]:
# Row-wise max, min and sum across all columns of the input frame
df_hor.with_columns(
    max=pl.max_horizontal(pl.all()),
    min=pl.min_horizontal(pl.all()),
    sum=pl.sum_horizontal(pl.all()),
)

vals1,val2,max,min,sum
i64,i64,i64,i64,i64
0,3,3,0,3
1,4,4,1,5
2,5,5,2,7


A `cum_sum` is not an aggregation (i.e. the output is not a scalar but a `Series` the same length as the input). The `cum_sum_horizontal` output is therefore a `pl.Struct` column with the number of fields equal to the number of columns

In [21]:
# Row-wise running total across the columns; the result is a struct column
# named `cum_sum` with one field per input column
df_hor.with_columns(pl.cum_sum_horizontal(pl.all()))

vals1,val2,cum_sum
i64,i64,struct[2]
0,3,"{0,3}"
1,4,"{1,5}"
2,5,"{2,7}"


In [23]:
# Gather the values of each row into a single list column
df_hor.with_columns(
    concat=pl.concat_list(pl.all()),
)

vals1,val2,concat
i64,i64,list[i64]
0,3,"[0, 3]"
1,4,"[1, 4]"
2,5,"[2, 5]"


# Exercises

### Exercise 1 - calculating multiple statistics
Calculate the mean and median of the `Age` column for passengers in 1st class

In [24]:
# Reload the Titanic CSV into `df` for the exercises
df = pl.read_csv(csv_file)

In [None]:
# Keep only 1st-class passengers, then aggregate `Age` two ways in one `select`
df.filter(pl.col("Pclass") == 1).select(
    Age_mean=pl.col("Age").mean(),
    Age_median=pl.col("Age").median(),
)

Age_mean,Age_median
f64,f64
38.233441,37.0


Add a new column called `Age_delta` that is the difference between the age and the average age of all passengers

In [27]:
# The scalar mean is broadcast against the column, giving each passenger's
# deviation from the average age (null where `Age` is missing)
df.with_columns(
    (pl.col("Age") - pl.col("Age").mean()).alias("Age_delta")
).select(["Age", "Age_delta"])

Age,Age_delta
f64,f64
22.0,-7.699118
38.0,8.300882
26.0,-3.699118
35.0,5.300882
35.0,5.300882
…,…
27.0,-2.699118
19.0,-10.699118
,
26.0,-3.699118


Add another column called `Age_z` containing the z-score of `Age`, where the z-score is (age - average age of the column) divided by the standard deviation of the `Age` column

In [32]:
# Age_delta: deviation from the mean age
# Age_z: that deviation divided by the standard deviation of `Age` (z-score)
df.with_columns(
    Age_delta=pl.col("Age") - pl.col("Age").mean(),
    Age_z=(pl.col("Age") - pl.col("Age").mean()) / pl.col("Age").std(),
).select(["Age", "Age_delta", "Age_z"])

Age,Age_delta,Age_z
f64,f64,f64
22.0,-7.699118,-0.530005
38.0,8.300882,0.57143
26.0,-3.699118,-0.254646
35.0,5.300882,0.364911
35.0,5.300882,0.364911
…,…,…
27.0,-2.699118,-0.185807
19.0,-10.699118,-0.736524
,,
26.0,-3.699118,-0.254646


Create these new columns for all floating point columns in the CSV. 

Add a `pipe` command if you want to sort the columns alphabetically

In [33]:
# Apply the delta and z-score calculations to every Float64 column at once,
# keep only the float columns (originals plus the new ones), and finally
# reorder the columns alphabetically via `pipe`
df.with_columns(
    (pl.col(pl.Float64) - pl.col(pl.Float64).mean()).name.suffix("_delta"),
    (
        (pl.col(pl.Float64) - pl.col(pl.Float64).mean())
        / pl.col(pl.Float64).std()
    ).name.suffix("_z"),
).select(pl.col(pl.Float64)).pipe(
    lambda frame: frame.select(sorted(frame.columns))
)

Age,Age_delta,Age_z,Fare,Fare_delta,Fare_z
f64,f64,f64,f64,f64,f64
22.0,-7.699118,-0.530005,7.25,-24.954208,-0.502163
38.0,8.300882,0.57143,71.2833,39.079092,0.786404
26.0,-3.699118,-0.254646,7.925,-24.279208,-0.48858
35.0,5.300882,0.364911,53.1,20.895792,0.420494
35.0,5.300882,0.364911,8.05,-24.154208,-0.486064
…,…,…,…,…,…
27.0,-2.699118,-0.185807,13.0,-19.204208,-0.386454
19.0,-10.699118,-0.736524,30.0,-2.204208,-0.044356
,,,23.45,-8.754208,-0.176164
26.0,-3.699118,-0.254646,30.0,-2.204208,-0.044356


### Exercise 2
We have the following `DataFrame` called `records`, with values that occur in sequences in the `values` column

In [34]:
# Input for the exercise: a single string column containing runs of repeated values
records = pl.DataFrame({"values": ['A', 'A', 'A', 'B', 'B', 'A', 'A']})
records

values
str
"""A"""
"""A"""
"""A"""
"""B"""
"""B"""
"""A"""
"""A"""


We want to identify sequences of rows with the same values in the `values` column to get the following output

In [35]:
# Target output: each run of identical consecutive values gets its own group id
pl.DataFrame(
    {
        "values": ['A', 'A', 'A', 'B', 'B', 'A', 'A'],
        "groups": [0, 0, 0, 1, 1, 2, 2],
    }
)

values,groups
str,i64
"""A""",0
"""A""",0
"""A""",0
"""B""",1
"""B""",1
"""A""",2
"""A""",2


In [37]:
# Flag the rows whose value differs from the row above. The first row has no
# predecessor, so its flag is null.
records.with_columns(
    (pl.col("values") != pl.col("values").shift(1)).alias("notEqualsPrevious")
)

values,notEqualsPrevious
str,bool
"""A""",
"""A""",False
"""A""",False
"""B""",True
"""B""",False
"""A""",True
"""A""",False


Use a cumulative function on `notEqualsPrevious` to increment an integer value whenever a row that is not equal to the previous value is encountered.

In [38]:
# Step 1: flag rows that start a new run (value differs from the previous row).
# Step 2: cast the flag to an integer and accumulate it, so the running total
#         increases by one at every run boundary. The null produced for the
#         first row is replaced by 0.
# Two `with_columns` calls are needed because the second expression reads the
# column created by the first.
records.with_columns(
    (pl.col("values") != pl.col("values").shift(1)).alias("notEqualsPrevious")
).with_columns(
    group=pl.col("notEqualsPrevious").cast(pl.Int32).cum_sum().fill_null(0)
)

values,notEqualsPrevious,group
str,bool,i32
"""A""",,0
"""A""",False,0
"""A""",False,0
"""B""",True,1
"""B""",False,1
"""A""",True,2
"""A""",False,2


### Exercise 3
We are given the following data from three weather stations over 8 months

In [39]:
# Temperature readings in °C from three stations, one dict per month (Jan-Aug 2023)
data = [
    {"Year": 2023, "Month": "Jan", "Station_A (°C)": 20.5, "Station_B (°C)": 18.0, "Station_C (°C)": 25.0},
    {"Year": 2023, "Month": "Feb", "Station_A (°C)": 21.0, "Station_B (°C)": 18.5, "Station_C (°C)": 26.0},
    {"Year": 2023, "Month": "Mar", "Station_A (°C)": 23.5, "Station_B (°C)": 20.0, "Station_C (°C)": 28.0},
    {"Year": 2023, "Month": "Apr", "Station_A (°C)": 25.0, "Station_B (°C)": 22.0, "Station_C (°C)": 29.5},
    {"Year": 2023, "Month": "May", "Station_A (°C)": 26.5, "Station_B (°C)": 23.0, "Station_C (°C)": 30.0},
    {"Year": 2023, "Month": "Jun", "Station_A (°C)": 28.0, "Station_B (°C)": 24.0, "Station_C (°C)": 32.0},
    {"Year": 2023, "Month": "Jul", "Station_A (°C)": 29.0, "Station_B (°C)": 25.5, "Station_C (°C)": 33.5},
    {"Year": 2023, "Month": "Aug", "Station_A (°C)": 30.0, "Station_B (°C)": 26.0, "Station_C (°C)": 34.0}
]
# Build the DataFrame from the list of row dicts and display it
df_weather = pl.DataFrame(data)
df_weather

Year,Month,Station_A (°C),Station_B (°C),Station_C (°C)
i64,str,f64,f64,f64
2023,"""Jan""",20.5,18.0,25.0
2023,"""Feb""",21.0,18.5,26.0
2023,"""Mar""",23.5,20.0,28.0
2023,"""Apr""",25.0,22.0,29.5
2023,"""May""",26.5,23.0,30.0
2023,"""Jun""",28.0,24.0,32.0
2023,"""Jul""",29.0,25.5,33.5
2023,"""Aug""",30.0,26.0,34.0


Add a column with the max temperature for each month

In [44]:
# Row-wise maximum across the Float64 (station temperature) columns
df_weather.with_columns(
    max=pl.max_horizontal(pl.col(pl.Float64)),
)

Year,Month,Station_A (°C),Station_B (°C),Station_C (°C),max
i64,str,f64,f64,f64,f64
2023,"""Jan""",20.5,18.0,25.0,25.0
2023,"""Feb""",21.0,18.5,26.0,26.0
2023,"""Mar""",23.5,20.0,28.0,28.0
2023,"""Apr""",25.0,22.0,29.5,29.5
2023,"""May""",26.5,23.0,30.0,30.0
2023,"""Jun""",28.0,24.0,32.0,32.0
2023,"""Jul""",29.0,25.5,33.5,33.5
2023,"""Aug""",30.0,26.0,34.0,34.0


Add another column called `std` with the standard deviation of measurements for each month rounded off to one decimal place

In [46]:
# Both expressions select the Float64 station columns of `df_weather`; the
# `max` column is created in this same call, so the `std` expression does not
# include it.
df_weather.with_columns(
    pl.max_horizontal(pl.col(pl.Float64)).alias("max"),
    # Gather each row's readings into a list, then take the standard deviation
    # of that list directly with `list.std()` (no `list.eval` + `list.get(0)`
    # round trip needed) and round to one decimal place.
    std=pl.concat_list(pl.col(pl.Float64)).list.std().round(1),
)

Year,Month,Station_A (°C),Station_B (°C),Station_C (°C),max,std
i64,str,f64,f64,f64,f64,f64
2023,"""Jan""",20.5,18.0,25.0,25.0,3.5
2023,"""Feb""",21.0,18.5,26.0,26.0,3.8
2023,"""Mar""",23.5,20.0,28.0,28.0,4.0
2023,"""Apr""",25.0,22.0,29.5,29.5,3.8
2023,"""May""",26.5,23.0,30.0,30.0,3.5
2023,"""Jun""",28.0,24.0,32.0,32.0,4.0
2023,"""Jul""",29.0,25.5,33.5,33.5,4.0
2023,"""Aug""",30.0,26.0,34.0,34.0,4.0
