In [1]:
from zlib import crc32

import polars as pl
import numpy as np

In [4]:
# Read the stored prediction dump into a polars DataFrame.
df = pl.read_parquet(
    source="../data/predictions/2024-05-03 13:34:johnson-street:LPQZ.parquet",
)

In [5]:
# Show the loaded frame; the rendered repr is abbreviated (note the "…" row).
df

predictions,targets,sequence_id,event_id
list[f32],i64,str,str
"[0.999823, 0.000177]",0,"""Brandy Bailey""","""2014-10-26 19:…"
"[0.999815, 0.000185]",0,"""Brandy Bailey""","""2017-03-23 16:…"
"[0.999733, 0.000267]",0,"""Brandy Bailey""","""2016-03-08 19:…"
"[0.999583, 0.000417]",0,"""Brandy Bailey""","""2015-09-10 08:…"
"[0.99981, 0.00019]",0,"""Brandy Bailey""","""2016-08-02 19:…"
…,…,…,…
"[0.999599, 0.000401]",0,"""Brandon Little…","""2017-08-20 11:…"
"[0.999733, 0.000267]",0,"""Melanie Brewer…","""2012-02-05 19:…"
"[0.999662, 0.000338]",0,"""Melanie Brewer…","""2011-05-03 20:…"
"[0.999735, 0.000265]",0,"""Melanie Brewer…","""2012-04-29 16:…"


In [18]:

# Deterministic, hash-based sampling key: every event_id is pushed through
# CRC32 and scaled into [0, 1), so a given event always receives the same
# value on every run (no RNG state is involved).
seed = (
    pl.col("event_id")
    .cast(pl.String)
    # Float64 represents every 32-bit CRC exactly. Float32 only has a 24-bit
    # significand, so it would quantise the hash and could round the largest
    # CRCs up to 2**32, i.e. a seed of exactly 1.0.
    # crc32() is already unsigned on Python 3, so no masking is needed.
    .map_elements(lambda x: float(crc32(f"{x}_".encode())), return_dtype=pl.Float64)
    # Dividing by a power of two is exact in floating point.
    .mul(1 / 2**32)
)

# Keep every row with targets == 1, plus the rows whose seed is below 0.1.
filtered = df.filter((seed < 0.1) | (pl.col("targets") == 1))

In [19]:
# Number of rows that survived the filter (`height` is the row count).
filtered.height

497326

In [20]:
# Summary statistics of the sampling key evaluated over the full frame;
# handy for checking how the values spread between 0 and 1.
(
    df
    .select(seed)
    .describe()
)

statistic,event_id
str,f64
"""count""",4924800.0
"""null_count""",0.0
"""mean""",0.49939
"""std""",0.288937
"""min""",1e-06
"""25%""",0.248706
"""50%""",0.49859
"""75%""",0.750504
"""max""",0.999994


In [21]:
# Materialise the two columns the metrics need as NumPy arrays:
# `yhat` holds the per-row prediction lists (indexed later as yhat[:, 1]),
# `y` holds the integer targets.
yhat, y = (
    np.asarray(filtered[column].to_list())
    for column in ("predictions", "targets")
)

In [22]:
from sklearn.metrics import average_precision_score as ap
from sklearn.metrics import roc_auc_score as auc
from sklearn.metrics import f1_score as f1
from sklearn.metrics import accuracy_score as acc


In [24]:
# Report ranking metrics on the column-1 score and thresholded metrics on its
# rounded value (hard 0/1 labels).
# NOTE(review): `filtered` keeps every targets == 1 row but only the rows with
# seed < 0.1 otherwise, so the class balance differs from `df`. AP, F1 and
# accuracy depend on class balance, so these numbers presumably describe the
# subsample rather than the full data. Confirm this is intended.
print(
    f'Average Precision: {ap(y_true=y, y_score=yhat[:,1]):.5f}',
    f'ROC AUC: {auc(y_true=y, y_score=yhat[:,1]):.5f}',
    f'F1: {f1(y_true=y, y_pred=yhat[:,1].round()):.5f}',
    f'Accuracy: {acc(y_true=y, y_pred=yhat[:,1].round()):.5f}',
    sep='\n',
)


Average Precision: 0.90920
ROC AUC: 0.99951
F1: 0.88251
Accuracy: 0.99771
