# Notes 4


In [6]:
import duckdb
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

from variables import Variables as vars

# Apply the matplotlib style sheet referenced two directories above this notebook.
plt.style.use('../../notebook.mplstyle')

# Make sure the directory the CSV exports are written to exists;
# exist_ok=True keeps this safe to re-run.
os.makedirs("./output", exist_ok=True)

In [7]:
# Open the births database read-only.
# When this cell is re-run in a live kernel, close the connection left over
# from the previous run first, rather than leaking it (this replaces the
# manually toggled, commented-out `con.close()`).
if "con" in globals():
    con.close()
con = duckdb.connect("./data/us_births.db", read_only=True)

In [8]:
# Births and Down syndrome tallies for 2009 onward, grouped by year,
# maternal age ('<35' / '>=35') and each parent's education collapsed to
# 'Less than BA' (code < 6) or 'BA or higher' (codes 6-8). Any other
# education value, including NULL, yields a NULL group, which is written as
# an empty field in the CSV.
# Caveat: the age CASE uses a bare ELSE, so rows with a NULL mage_c (if any
# exist) are counted under '>=35'.
# The query interpolates nothing, so it is a plain string rather than an
# f-string.
meduc_df = con.execute(
    """
    SELECT
        b.year,
        CASE
            WHEN b.mage_c < 35
            THEN '<35' ELSE '>=35'
        END as mage_group,
        CASE
           WHEN b.meduc < 6 THEN 'Less than BA'
           WHEN b.meduc >= 6 AND b.meduc < 9 THEN 'BA or higher'
           ELSE NULL
        END as meduc_group,
        CASE
           WHEN b.feduc < 6 THEN 'Less than BA'
           WHEN b.feduc >= 6 AND b.feduc < 9 THEN 'BA or higher'
           ELSE NULL
        END as feduc_group,
        COUNT(*)                as birth_count,
        SUM(b.down_ind)::INT    as ds_recorded,
        SUM(b.p_ds_lb_nt)       as ds_est_no_term,
        SUM(b.ds_case_weight)   as case_weighted,
    FROM us_births as b
    WHERE b.year >= 2009
    GROUP BY b.year, mage_group, meduc_group, feduc_group
    ORDER BY b.year, mage_group, meduc_group, feduc_group
    """
).df()

# Build the timestamp outside the f-string: reusing double quotes inside a
# replacement field is a SyntaxError on Python versions before 3.12.
export_stamp = datetime.now().strftime("%Y%m%d%H%M")
meduc_df.to_csv(f"./output/year_meduc_feduc_1-{export_stamp}.csv", index=False)
meduc_df

Unnamed: 0,year,mage_group,meduc_group,feduc_group,birth_count,ds_recorded,ds_est_no_term,case_weighted
0,2009,<35,BA or higher,BA or higher,336602,124,414.195277,307.2172
1,2009,<35,BA or higher,Less than BA,171207,72,192.001757,177.0838
2,2009,<35,BA or higher,,11994,4,13.062256,12.3308
3,2009,<35,Less than BA,BA or higher,112588,38,120.504808,102.2322
4,2009,<35,Less than BA,Less than BA,1399183,418,1233.945517,1125.9874
...,...,...,...,...,...,...,...,...
283,2024,>=35,Less than BA,Less than BA,248073,474,1794.675547,1198.1089
284,2024,>=35,Less than BA,,48424,96,353.675988,246.1832
285,2024,>=35,,BA or higher,1214,1,12.643474,2.8978
286,2024,>=35,,Less than BA,1767,3,14.133211,8.1287


In [9]:
# Same tallies as the grouped-education export, at finer granularity: four
# maternal age bands and the individual meduc / feduc codes (values >= 9 or
# NULL become NULL and are written as empty fields in the CSV).
# Only births with 13.0 <= bmi < 99.0 are included.
# NOTE(review): the grouped-education query has no BMI filter, so counts from
# the two exports are not directly comparable -- confirm the filter is
# intended here.
# ORDER BY mage_group is a string sort, so bands come out as
# '30-34', '35-39', '<30', '>=40' rather than in age order.
# This cell rebinds `meduc_df`, replacing the frame from the previous cell.
# The query interpolates nothing, so it is a plain string rather than an
# f-string.
meduc_df = con.execute(
    """
    SELECT
        b.year,
           CASE
               WHEN b.mage_c < 30 THEN '<30'
               WHEN b.mage_c < 35 THEN '30-34'
               WHEN b.mage_c < 40 THEN '35-39'
               ELSE '>=40'
           END              as mage_group,
        CASE
           WHEN b.meduc < 9 THEN b.meduc
           ELSE NULL
        END as meduc,
        CASE
           WHEN b.feduc < 9 THEN b.feduc
           ELSE NULL
        END as feduc,
        COUNT(*)                as birth_count,
        SUM(b.down_ind)::INT    as ds_recorded,
        SUM(b.p_ds_lb_nt)       as ds_est_no_term,
        SUM(b.ds_case_weight)   as case_weighted,
    FROM us_births as b
    WHERE b.year >= 2009
        AND b.bmi >= 13.0 AND b.bmi < 99.0
    GROUP BY b.year, mage_group, meduc, feduc
    ORDER BY b.year, mage_group, meduc, feduc
    """
).df()

# Build the timestamp outside the f-string: reusing double quotes inside a
# replacement field is a SyntaxError on Python versions before 3.12.
export_stamp = datetime.now().strftime("%Y%m%d%H%M")
meduc_df.to_csv(f"./output/year_meduc_feduc_2-{export_stamp}.csv", index=False)
meduc_df

Unnamed: 0,year,mage_group,meduc,feduc,birth_count,ds_recorded,ds_est_no_term,case_weighted
0,2009,30-34,1,1,17756,5,26.549283,14.4608
1,2009,30-34,1,2,5750,5,8.573183,15.1860
2,2009,30-34,1,3,2964,3,4.412054,9.9278
3,2009,30-34,1,4,789,0,1.200070,0.0000
4,2009,30-34,1,5,190,1,0.277781,2.3120
...,...,...,...,...,...,...,...,...
5179,2024,>=40,,5,47,0,0.870380,0.0000
5180,2024,>=40,,6,188,0,3.787985,0.0000
5181,2024,>=40,,7,108,0,2.478102,0.0000
5182,2024,>=40,,8,76,0,1.752756,0.0000


In [10]:
# Close the DuckDB connection now that both exports have been written.
con.close()