# Notes


In [None]:
import gc
import chance
import duckdb
import pyarrow as pa
import pyarrow.compute as pc
import pandas as pd
from graphviz import Digraph
from data_utils import constrain_pa_series_to_uint8, constrain_pa_series_to_uint16
from variables import Variables as vars

In [None]:
# Load the births table from its parquet file into a pandas DataFrame.
df = pd.read_parquet(path="./data/us_births.parquet")

In [None]:
# Show the dtype of every column in df (displayed as the cell output).
df.dtypes

In [None]:
# Number of rows for each distinct value of the year column, in ascending
# order of that value.
(
    df[vars.YEAR]
    .value_counts()
    .sort_index()
)

In [None]:
# Per-year sums of the P_DS_LB_NT and P_DS_LB_WT columns.
# The grouping key must be one of the columns kept by the selection, so group
# by vars.YEAR (the column actually selected) rather than vars.DOB_YY, which
# is not part of the subset and would raise KeyError unless both constants
# happen to name the same column.
(
    df[[vars.YEAR, vars.P_DS_LB_NT, vars.P_DS_LB_WT]]
    .groupby(vars.YEAR)
    .sum()
)

In [None]:
# Per-year counts of each CA_DOWN_C value, one row per year and one column per
# value (combinations that never occur are filled with 0).
# The grouping key must be one of the columns kept by the selection, so group
# by vars.YEAR (the column actually selected) rather than vars.DOB_YY, which
# is not part of the subset and would raise KeyError unless both constants
# happen to name the same column.
(
    df[[vars.YEAR, vars.CA_DOWN_C]]
    .groupby(vars.YEAR)
    .value_counts()
    .unstack(fill_value=0)
    .sort_index()
)

In [None]:
from graphviz import Digraph

# Build the directed graph; the bare `dag` on the last line renders it as the
# cell output.
dag = Digraph()

# Use one typeface for the graph itself (target None), its nodes and its edges.
for target in (None, "node", "edge"):
    dag.attr(target, fontname="Helvetica")

# Canvas size, node fill and font sizes.
dag.attr(size="8,6")
dag.attr("node", fontsize="14", style="filled", fillcolor="#99ccff")
dag.attr("edge", fontsize="12")

# Top-to-bottom layout with spline edges; nodes are fixed-width circles.
dag.attr(rankdir="TB", splines="spline")
dag.attr("node", shape="circle", fixedsize="true", width="1.75")

# Each key points at the nodes it has an outgoing edge to. Insertion order is
# kept so the edges are added in the same sequence as a flat edge list.
successors = {
    "Age": ("Case", "Screening", "Termination", "Income"),
    "Income": ("Case",),
    "Case": ("Termination", "DS birth"),
    "Screening": ("Termination",),
    "Termination": ("DS birth",),
    "DS birth": ("Recorded",),
}
edges = [
    (tail, head)
    for tail, heads in successors.items()
    for head in heads
]

for tail, head in edges:
    dag.edge(tail, head)

dag


In [None]:
# Open the DuckDB database file in read-only mode.
con = duckdb.connect("./data/us_births.db", read_only=True)

In [None]:
# Row counts for every (dob_yy, ca_down) combination in the us_births table,
# returned as a pandas DataFrame ordered by both columns.
ca_down_counts_sql = """
    SELECT dob_yy, ca_down, count(*) as counts
    FROM us_births
    group by dob_yy, ca_down
    order by dob_yy, ca_down
    """
con.execute(ca_down_counts_sql).df()

In [None]:
# Load the separate 2012 parquet file into its own DataFrame.
df_2012 = pd.read_parquet(path="./data/us_births_2012.parquet")

In [None]:
# Counts of each value of df_2012["ca_downs"], ordered by value.
# NOTE(review): this comment previously named "uca_downs" while the code reads
# "ca_downs" — confirm which column is intended.
df_2012["ca_downs"].value_counts().sort_index()