# Notes


In [1]:
import chance
import duckdb
import pyarrow as pa
import pyarrow.compute as pc
import pandas as pd
from graphviz import Digraph
from fields import Fields

## Data cleaning and optimisation

In [2]:
# Load the combined births table. dtype_backend="pyarrow" asks pandas to use
# Arrow-backed dtypes for the columns; the cleaning cells below rebuild them.
df = pd.read_parquet("./data/us_births_combined.parquet", dtype_backend="pyarrow")

In [3]:
def constrain_pa_series_to_uint8(series: pd.Series, min: int = 0, max: int = 255) -> pd.Series:
    """Return ``series`` as a nullable ``uint8[pyarrow]`` Series limited to ``[min, max]``.

    The values are handed to ``constrain_pa_array_to_uint8``, which truncates
    them toward zero, nulls everything outside the inclusive range and casts
    the remainder to ``uint8``.

    Args:
        series: Numeric column to convert.
        min: Smallest value to keep (inclusive).
        max: Largest value to keep (inclusive).

    Returns:
        A new ``uint8[pyarrow]`` Series. When ``series`` has an index, the
        result carries that same index.
    """
    values = pa.array(series, type=pa.float64())
    constrained = constrain_pa_array_to_uint8(values, min=min, max=max)
    # Reattach the caller's index: ``df[col] = result`` aligns on index labels,
    # so a fresh RangeIndex would misplace (or null out) rows whenever the
    # source frame is not indexed 0..n-1. Inputs without an index (plain
    # sequences) keep the default RangeIndex.
    return pd.Series(constrained, index=getattr(series, "index", None), dtype="uint8[pyarrow]")


def constrain_pa_series_to_uint16(series: pd.Series, min: int = 0, max: int = 65535) -> pd.Series:
    """Return ``series`` as a nullable ``uint16[pyarrow]`` Series limited to ``[min, max]``.

    The values are handed to ``constrain_pa_array_to_uint16``, which truncates
    them toward zero, nulls everything outside the inclusive range and casts
    the remainder to ``uint16``.

    Args:
        series: Numeric column to convert.
        min: Smallest value to keep (inclusive).
        max: Largest value to keep (inclusive).

    Returns:
        A new ``uint16[pyarrow]`` Series. When ``series`` has an index, the
        result carries that same index.
    """
    values = pa.array(series, type=pa.float64())
    constrained = constrain_pa_array_to_uint16(values, min=min, max=max)
    # Reattach the caller's index: ``df[col] = result`` aligns on index labels,
    # so a fresh RangeIndex would misplace (or null out) rows whenever the
    # source frame is not indexed 0..n-1. Inputs without an index (plain
    # sequences) keep the default RangeIndex.
    return pd.Series(constrained, index=getattr(series, "index", None), dtype="uint16[pyarrow]")


def constrain_pa_array_to_uint8(arr: pa.Array, min: int = 0, max: int = 255) -> pa.Array:
    """Truncate ``arr`` toward zero and cast it to ``uint8``, nulling rejected values.

    A value is kept only if, after truncation, it is finite and lies in the
    inclusive range ``[min, max]``; every other entry (including existing
    nulls) is null in the result.

    Args:
        arr: Floating-point Arrow array to convert.
        min: Smallest value to keep (inclusive). Values below 0 are treated as 0.
        max: Largest value to keep (inclusive). Values above 255 are treated as 255.

    Returns:
        An Arrow array of type ``uint8``.
    """
    # ``min``/``max`` shadow the builtins in this scope, so clamp with plain
    # comparisons. Tightening the bounds to what uint8 can represent matters
    # because the cast below is unchecked (``safe=False``): a value that passed
    # a too-wide range check would otherwise wrap around instead of being nulled.
    lo_bound = min if min > 0 else 0
    hi_bound = max if max < 255 else 255

    whole = pc.round(arr, ndigits=0, round_mode="towards_zero")
    lo = pa.scalar(lo_bound, type=pa.float64())
    hi = pa.scalar(hi_bound, type=pa.float64())
    within_bounds = pc.and_kleene(pc.greater_equal(whole, lo), pc.less_equal(whole, hi))
    keep = pc.and_kleene(pc.is_finite(whole), within_bounds)
    # Rows where ``keep`` is false or null become null before the cast.
    masked = pc.if_else(keep, whole, pa.scalar(None, type=pa.float64()))
    return pc.cast(masked, pa.uint8(), safe=False)


def constrain_pa_array_to_uint16(arr: pa.Array, min: int = 0, max: int = 65535) -> pa.Array:
    """Truncate ``arr`` toward zero and cast it to ``uint16``, nulling rejected values.

    A value is kept only if, after truncation, it is finite and lies in the
    inclusive range ``[min, max]``; every other entry (including existing
    nulls) is null in the result.

    Args:
        arr: Floating-point Arrow array to convert.
        min: Smallest value to keep (inclusive). Values below 0 are treated as 0.
        max: Largest value to keep (inclusive). Values above 65535 are treated
            as 65535.

    Returns:
        An Arrow array of type ``uint16``.
    """
    # ``min``/``max`` shadow the builtins in this scope, so clamp with plain
    # comparisons. Tightening the bounds to what uint16 can represent matters
    # because the cast below is unchecked (``safe=False``): a value that passed
    # a too-wide range check would otherwise wrap around instead of being nulled.
    lo_bound = min if min > 0 else 0
    hi_bound = max if max < 65535 else 65535

    whole = pc.round(arr, ndigits=0, round_mode="towards_zero")
    lo = pa.scalar(lo_bound, type=pa.float64())
    hi = pa.scalar(hi_bound, type=pa.float64())
    within_bounds = pc.and_kleene(pc.greater_equal(whole, lo), pc.less_equal(whole, hi))
    keep = pc.and_kleene(pc.is_finite(whole), within_bounds)
    # Rows where ``keep`` is false or null become null before the cast.
    masked = pc.if_else(keep, whole, pa.scalar(None, type=pa.float64()))
    return pc.cast(masked, pa.uint16(), safe=False)

In [4]:
# Birth year is the only column that needs uint16; years before 1989 become null.
df[Fields.DOB_YY] = constrain_pa_series_to_uint16(df[Fields.DOB_YY], min=1989)

# Integer-coded columns stored as uint8: (column, lowest accepted value,
# highest accepted value). Each column is rebuilt from itself, so the source
# and target can never drift apart; values outside the range become null.
UINT8_COLUMN_RANGES = [
    (Fields.DOB_MM, 1, 12),
    (Fields.BFACIL, 1, 9),
    # Previously derived from BFACIL by mistake, which discarded the flag's own values.
    (Fields.F_BFACIL, 0, 1),
    (Fields.MAGE_IMPFLG, 0, 1),
    (Fields.MAGER, 12, 50),
    (Fields.MAGER14, 1, 14),
    # NOTE(review): upper bound was 14, apparently copied from MAGER14. The
    # other recodes here (MRACE6/15/31, FAGEREC11) are bounded by their
    # suffix, so 9 is used; confirm against the data dictionary.
    (Fields.MAGER9, 1, 9),
    (Fields.MBSTATE_REC, 1, 3),
    (Fields.RESTATUS, 1, 2),
    (Fields.MRACE31, 1, 31),
    (Fields.MRACE6, 1, 6),
    (Fields.MRACE15, 1, 15),
    (Fields.MRACEIMP, 1, 2),
    (Fields.MHISPX, 0, 9),
    (Fields.MHISP_R, 0, 9),
    (Fields.F_MHISP, 0, 1),
    (Fields.MRACEHISP, 1, 8),
    (Fields.F_MAR_P, 0, 1),
    (Fields.MEDUC, 1, 9),
    (Fields.F_MEDUC, 0, 1),
    (Fields.FAGECOMB, 0, 99),
    (Fields.FAGEREC11, 0, 11),
    (Fields.DOWNS, 0, 255),
    (Fields.UCA_DOWNS, 0, 255),
]

for column, lowest, highest in UINT8_COLUMN_RANGES:
    df[column] = constrain_pa_series_to_uint8(df[column], min=lowest, max=highest)

# MAGE_REPFLG is first coerced to numeric (unparseable entries become NaN)
# and then constrained like the other 0/1 flags.
df[Fields.MAGE_REPFLG] = constrain_pa_series_to_uint8(
    pd.to_numeric(df[Fields.MAGE_REPFLG], errors="coerce"),
    min=0,
    max=1,
)

# Columns kept as Arrow-backed strings.
STRING_COLUMNS = [
    Fields.MAR_P,
    Fields.DMAR,
    Fields.MAR_IMP,
    Fields.FAGERPT_FLG,
    Fields.CA_DOWN,
    Fields.CA_DOWNS,
]

for column in STRING_COLUMNS:
    df[column] = df[column].astype(pd.ArrowDtype(pa.string()))

# Combined column: take CA_DOWN where present, otherwise fall back to CA_DOWNS.
df[Fields.CA_DOWN_C] = df[Fields.CA_DOWN].combine_first(df[Fields.CA_DOWNS])


In [5]:
# Note: MAGER is only available from 2004 onwards, so this column is only
# meaningful for those years.
# Stores the array returned by chance.get_ds_lb_nt_probability_array for the
# MAGER column (presumably a per-row probability; confirm in the chance module).
df[Fields.P_DS_LB_NT] = chance.get_ds_lb_nt_probability_array(df[Fields.MAGER])

In [10]:
# Lookup table with one value per birth year, 1989 through 2024 (36 entries,
# in year order). The last seven entries (2018-2024) repeat the same value;
# presumably the latest figure carried forward (confirm against the source).
prevalence_df = pd.DataFrame({
    Fields.DOB_YY: list(range(1989, 2025)),
    Fields.P_DS_LB_WT: [
        0.001038,
        0.001055,
        0.001077,
        0.001083,
        0.001093,
        0.001102,
        0.001121,
        0.001099,
        0.001124,
        0.001136,
        0.001153,
        0.001149,
        0.001179,
        0.001216,
        0.001219,
        0.001218,
        0.001236,
        0.001244,
        0.001261,
        0.001257,
        0.001262,
        0.001244,
        0.00127,
        0.001265,
        0.001283,
        0.001302,
        0.001265051,
        0.001295784,
        0.0013375,
        0.001324215,
        0.001324215,
        0.001324215,
        0.001324215,
        0.001324215,
        0.001324215,
        0.001324215,
    ],
})

# Make the cell safe to re-run: if an earlier execution already merged the
# column, merging again would produce suffixed "_x"/"_y" duplicates and break
# the cells that select Fields.P_DS_LB_WT.
if Fields.P_DS_LB_WT in df.columns:
    df = df.drop(columns=[Fields.P_DS_LB_WT])

# Attach the yearly value to every birth row. validate="many_to_one" makes
# pandas raise if the lookup ever contains a duplicated year, which would
# otherwise silently duplicate birth rows.
df = df.merge(
    prevalence_df,
    on=Fields.DOB_YY,
    how="left",
    validate="many_to_one",
)

In [11]:
# Subset of rows whose birth year is 2004 or later.
df_2004 = df.loc[df[Fields.DOB_YY] >= 2004]


In [12]:
# Per-year totals of the two probability columns for the 2004+ subset.
df_2004.groupby(Fields.DOB_YY)[[Fields.P_DS_LB_NT, Fields.P_DS_LB_WT]].sum()

Unnamed: 0_level_0,p_ds_lb_nt,p_ds_lb_wt
dob_yy,Unnamed: 1_level_1,Unnamed: 2_level_1
2004,7440.391929,5016.828726
2005,7493.838661,5123.978904
2006,7665.960631,5315.8919
2007,7748.428247,5452.574088
2008,7699.195333,5348.731092
2009,7565.147534,5221.949032
2010,7439.844521,4984.83862
2011,7426.414635,5030.7494
2012,7500.288505,5010.40694
2013,7568.050772,5056.000212


In [None]:
# Year-by-value table: one row per birth year, one column per CA_DOWNS value,
# cells hold the number of rows (0 where a combination never occurs).
(
    df[[Fields.DOB_YY, Fields.CA_DOWNS]]
    .groupby(Fields.DOB_YY)
    .value_counts()
    .unstack(fill_value=0)
    .sort_index()
)

In [None]:
# Display the dtype of every column in the processed frame.
df.dtypes

In [13]:
# Write the full processed frame (all years) to parquet.
df.to_parquet("./data/us_births.parquet")

In [14]:
# Write the df_2004 subset (birth year >= 2004) to its own parquet file.
df_2004.to_parquet("./data/us_births_2004_2024.parquet")

In [None]:
# Directed graph of the model's variables, rendered as PNG.
# ``Digraph`` comes from the notebook's import cell at the top, so it is not
# re-imported here.
dag = Digraph('DownSyndromeModel', format='png')

# Same typeface for the graph, its nodes and its edges.
dag.attr(fontname="Helvetica")
dag.attr("node", fontname="Helvetica")
dag.attr("edge", fontname="Helvetica")

# Overall size plus node/edge font sizes and node fill.
dag.attr(size="8,6")
dag.attr("node", fontsize="14", style="filled", fillcolor="#99ccff")
dag.attr("edge", fontsize="12")

dag.attr(rankdir="TB", splines="spline")  # Top-to-bottom flow
# Fixed-size circles so every node is drawn at the same width.
dag.attr("node", shape="circle", fixedsize="true", width="1.75")

# (source, destination) pairs; each becomes one arrow in the diagram.
edges = [
    ('Age', 'Case'),
    ('Age', 'Screening'),
    ('Age', 'Termination'),
    ('Age', 'Income'),
    ('Income', 'Case'),
    ('Case', 'Termination'),
    ('Case', 'DS birth'),
    ('Screening', 'Termination'),
    ('Termination', 'DS birth'),
    ('DS birth', 'Recorded'),
]

# Add all arrows in one call instead of looping over dag.edge().
dag.edges(edges)

# Last expression of the cell: the notebook displays the rendered graph.
dag


In [None]:
# Open the DuckDB database file in read-only mode; `con` is used by the query cell below.
con = duckdb.connect(database="./data/us_births.db", read_only=True)

In [None]:
# Row counts per (dob_yy, ca_down) pair from the us_births table, returned as
# a pandas DataFrame ordered by year and then by ca_down value.
ca_down_counts_sql = """
    SELECT dob_yy, ca_down, count(*) as counts
    FROM us_births
    group by dob_yy, ca_down
    order by dob_yy, ca_down
    """
con.execute(ca_down_counts_sql).df()

In [None]:
# Load the us_births_2012 parquet file. No dtype_backend is passed here, so
# pandas uses its default dtypes rather than the Arrow-backed ones used for `df`.
df_2012 = pd.read_parquet("./data/us_births_2012.parquet")

In [None]:
# Number of rows for each distinct value of df_2012["ca_downs"], ordered by value.
# NOTE(review): this comment previously named "uca_downs" while the code reads
# "ca_downs"; confirm which column was intended.
df_2012["ca_downs"].value_counts().sort_index()