In [42]:
from zlib import crc32

import polars as pl
import numpy as np

In [43]:
df = pl.read_parquet("../data/predictions/2024-04-29 08:29:woods-hill:DCQO.parquet")

In [44]:
df

predictions,targets,sequence_id,event_id
list[f32],i64,str,str
"[0.0, 1.0]",1,"""Roger Brown""","""2010-06-02 11:…"
"[0.0, 1.0]",1,"""Roger Brown""","""2010-08-20 13:…"
"[0.0, 1.0]",1,"""Roger Brown""","""2020-01-22 10:…"
"[0.0, 1.0]",1,"""Roger Brown""","""2018-12-19 10:…"
"[0.0, 1.0]",1,"""Roger Brown""","""2019-04-16 12:…"
…,…,…,…
"[0.0, 1.0]",1,"""Haley Johnson""","""2010-12-24 11:…"
"[0.0, 1.0]",1,"""Haley Johnson""","""2012-07-01 19:…"
"[0.0, 1.0]",1,"""Haley Johnson""","""2009-12-11 09:…"
"[0.0, 1.0]",1,"""Haley Johnson""","""2012-10-19 20:…"


In [45]:

seed = (
    pl.col("sequence_id")
    .cast(pl.String)
    .map_elements(lambda x: float(crc32(str.encode(f"{x}_")) & 0xFFFFFFFF), return_dtype=pl.Float32)
    .mul(1 / 2**32)
)

filtered = df.filter((seed < 0.1) | (pl.col("targets") == 0))

In [46]:
df.select(seed).describe()

statistic,sequence_id
str,f64
"""count""",4940640.0
"""null_count""",0.0
"""mean""",0.491245
"""std""",0.292947
"""min""",0.002965
"""25%""",0.228349
"""50%""",0.51162
"""75%""",0.758447
"""max""",0.993054


In [47]:
yhat = np.array(filtered.get_column('predictions').to_list())
y = np.array(filtered.get_column("targets").to_list())

In [48]:
from sklearn.metrics import average_precision_score as ap
from sklearn.metrics import roc_auc_score as auc
from sklearn.metrics import f1_score as f1
from sklearn.metrics import accuracy_score as acc


In [49]:
print(f'Average Precision: {ap(y_true=y, y_score=yhat[:,0]):.5f}')
print(f'ROC AUC: {auc(y_true=y, y_score=yhat[:,1]):.5f}')
print(f'F1: {f1(y_true=y, y_pred=yhat[:,1].round()):.5f}')
print(f'Accuracy: {acc(y_true=y, y_pred=yhat[:,1].round()):.5f}')


Average Precision: 0.97714
ROC AUC: 0.99985
F1: 0.99956
Accuracy: 0.99912
