# Notes 3


In [1]:
import duckdb
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from graphviz import Digraph
from sympy.physics.units import magnetic_density

from variables import Variables as vars

plt.style.use('../../notebook.mplstyle')

os.makedirs("./output", exist_ok=True)

In [2]:
# con.close()
con = duckdb.connect("./data/us_births.db", read_only=True)

In [3]:
meduc_df = con.execute(
    """
    SELECT b.year,
           CASE WHEN b.mage_c < 35 THEN '<35' ELSE '>=35' END as mage_group,
           b.meduc,
           COUNT(*)                                           as birth_count,
           SUM(b.down_ind)::INT as ds_recorded, SUM(b.p_ds_lb_nt) as ds_est_no_term,
           SUM(b.ds_rec_weight)                               as case_weighted,
    FROM us_births as b
    GROUP BY b.year, mage_group, b.meduc
    ORDER BY b.year, mage_group, b.meduc
    """
).df()
meduc_df.to_csv(f"./output/meduc_age_group_by_year-{datetime.now().strftime("%Y%m%d%H%M")}.csv", index=False)
meduc_df

Unnamed: 0,year,mage_group,meduc,birth_count,ds_recorded,ds_est_no_term,case_weighted
0,1989,<35,,3705383,1491,3374.286550,3078.4824
1,1989,>=35,,340310,463,2061.517526,972.0088
2,1990,<35,,3794667,1468,3474.508813,3196.5755
3,1990,>=35,,368250,515,2242.120406,1139.0436
4,1991,<35,,3730142,1272,3423.876083,3147.3382
...,...,...,...,...,...,...,...
446,2024,>=35,6,219922,240,1500.735940,572.5012
447,2024,>=35,7,142069,119,1005.395800,274.9084
448,2024,>=35,8,53331,33,394.766314,80.9871
449,2024,>=35,9,19695,27,166.382560,68.8783


In [8]:
meduc_df = con.execute(
    """
    SELECT b.year,
           CASE
               WHEN b.mage_c < 20 THEN '<20'
               WHEN b.mage_c < 25 THEN '20-24'
               WHEN b.mage_c < 30 THEN '25-29'
               WHEN b.mage_c < 35 THEN '30-34'
               WHEN b.mage_c < 40 THEN '35-39'
               WHEN b.mage_c < 45 THEN '40-44'
               ELSE '>=45'
               END              as mage_group,
           b.meduc,
           COUNT(*)             as birth_count,
           SUM(b.down_ind)::INT as ds_recorded, SUM(b.p_ds_lb_nt) as ds_est_no_term,
           SUM(b.ds_rec_weight) as case_weighted,
    FROM us_births as b
    GROUP BY b.year, mage_group, b.meduc
    ORDER BY b.year, mage_group, b.meduc
    """
).df()
meduc_df.to_csv(f"./output/meduc_age_group_2_by_year-{datetime.now().strftime("%Y%m%d%H%M")}.csv", index=False)
meduc_df

Unnamed: 0,year,mage_group,meduc,birth_count,ds_recorded,ds_est_no_term,case_weighted
0,1989,20-24,,1078787,346,753.055609,729.1293
1,1989,25-29,,1264749,506,1054.069905,1034.0787
2,1989,30-34,,843483,457,1221.312838,921.5274
3,1989,35-39,,294248,301,1347.605040,626.1874
4,1989,40-44,,44461,146,665.781783,311.1490
...,...,...,...,...,...,...,...
1550,2024,>=45,6,2757,7,86.943315,16.4873
1551,2024,>=45,7,2176,7,68.949078,15.8788
1552,2024,>=45,8,1044,1,33.243610,2.1635
1553,2024,>=45,9,720,2,23.563534,5.2309


In [21]:
meduc_df = con.execute(
    """
    SELECT b.year,
           CASE WHEN b.mage_c < 35 THEN '<35' ELSE '>=35' END as mage_group,
           CASE
               WHEN b.meduc < 6 THEN 'Less than BA'
               WHEN b.meduc > 6 AND b.meduc < 9 THEN 'BA or higher'
               ELSE NULL
               END              as meduc_group,
           CASE
               WHEN b.mracehisp_c = 1 THEN 'NH White'
               WHEN b.mracehisp_c = 2 THEN 'NH Black'
               WHEN b.mracehisp_c = 3 THEN 'NH AI/AN'
               WHEN b.mracehisp_c = 4 THEN 'NH Asian/PI'
               WHEN b.mracehisp_c = 5 THEN 'Hispanic'
               ELSE NULL
               END              as race_ethnicity,
           COUNT(*)             as birth_count,
           SUM(b.down_ind)::INT as ds_recorded, SUM(b.p_ds_lb_nt) as ds_est_no_term,
           SUM(b.ds_rec_weight) as case_weighted,
    FROM us_births as b
    WHERE b.year >= 2003
    GROUP BY b.year, mage_group, meduc_group, race_ethnicity
    ORDER BY b.year, mage_group, meduc_group, race_ethnicity
    """
).df()
meduc_df.to_csv(f"./output/meduc_age_group_3_by_year-{datetime.now().strftime("%Y%m%d%H%M")}.csv", index=False)
meduc_df

Unnamed: 0,year,mage_group,meduc_group,race_ethnicity,birth_count,ds_recorded,ds_est_no_term,case_weighted
0,2003,<35,BA or higher,Hispanic,323,,0.416603,0.0000
1,2003,<35,BA or higher,NH AI/AN,20,,0.024335,0.0000
2,2003,<35,BA or higher,NH Asian/PI,1534,,1.941197,0.0000
3,2003,<35,BA or higher,NH Black,443,,0.553964,0.0000
4,2003,<35,BA or higher,NH White,11223,3,15.082287,7.0362
...,...,...,...,...,...,...,...,...
787,2024,>=35,,NH AI/AN,513,0,3.587496,0.0000
788,2024,>=35,,NH Asian/PI,30317,10,218.566764,28.0510
789,2024,>=35,,NH Black,20148,19,155.990725,55.0582
790,2024,>=35,,NH White,138703,158,919.176509,341.8330


In [22]:
con.close()