In [54]:
import duckdb
import pandas as pd

# Open the on-disk DuckDB database file. The name `conn` matters: the `quack`
# magic defined below looks up a variable called 'conn' in the notebook
# namespace when no --db option is given.
conn = duckdb.connect('data/duck.db')

# Install and load the `spatial` and `vss` extensions, then switch on the
# experimental HNSW persistence setting.
# NOTE(review): presumably needed so vss (HNSW) indexes can live in the
# file-backed database above — confirm against the vss extension docs.
conn.sql("""\
INSTALL spatial;
LOAD spatial;
         
INSTALL vss;
LOAD vss;
         
SET hnsw_enable_experimental_persistence = true;
""")


from IPython.core.magic import (register_line_magic, register_cell_magic, register_line_cell_magic, needs_local_scope)
from IPython.core.magic_arguments import (argument, magic_arguments, parse_argstring)

@magic_arguments()
@argument('query', nargs='*', help="The SQL query to execute")
@argument('--db', '-d', default=None, help="Which connection to use")
@argument('--output', '-o', default=None, help="Name of the variable to store the resulting DuckDB relation in (call .to_df() on it for a DataFrame)")
@register_line_cell_magic
@needs_local_scope
def quack(line, cell=None, local_ns=None):
    """Run SQL on a DuckDB connection, as %quack (line) or %%quack (cell).

    Parameters:
        line: the magic's argument line. Holds the options and, in line mode,
            the SQL text itself.
        cell: the cell body in cell mode. When non-empty it is used as the
            query and any query tokens on the line are ignored.
        local_ns: namespace used to look up the connection and to store the
            result. Falls back to this module's globals when not supplied.

    Connection selection:
        1. --db NAME uses the connection stored in variable NAME.
        2. Otherwise a variable named 'conn' is used if it exists.
        3. Otherwise an in-memory connection is created and saved as 'conn'.

    Returns:
        Whatever conn.sql(query) returns. With --output NAME the same object
        is also bound to NAME in the namespace; it is not converted to a
        DataFrame.

    Raises:
        KeyError: if --db names a variable that does not exist.
    """
    args = parse_argstring(quack, line)
    # The cell body wins when present; otherwise rebuild the query from the
    # positional tokens parsed off the line.
    query = cell if cell else ' '.join(args.query)

    if local_ns is None:
        # Invoked as a plain function (no namespace passed in): use the
        # namespace this function was defined in instead of failing on None.
        local_ns = globals()

    if args.db:
        if args.db not in local_ns:
            raise KeyError(f"--db {args.db!r} is not a variable in the current namespace")
        conn = local_ns[args.db]
    elif 'conn' in local_ns:
        conn = local_ns['conn']
    else:
        # No connection available: create a throwaway in-memory database and
        # publish it as 'conn' so later invocations reuse it.
        conn = duckdb.connect(':memory:')
        local_ns['conn'] = conn

    result = conn.sql(query)

    if args.output:
        local_ns[args.output] = result
    return result

# Load the program rows and the CIP code lookup table. Rows whose CIPCODE is
# 99 are removed from `programs` straight away.
programs = (
    pd.read_csv('data/c2023_a.csv', encoding='latin1')
    .loc[lambda frame: frame['CIPCODE'] != 99]  # 99 = Grand Total
)
cip_codes = pd.read_csv('data/cip_codes.csv')

In [55]:
%%quack

-- Per-CIP-code totals for UNITID 101709: summed CTOTALT and the number of
-- source rows behind each code, largest total first.
SELECT
    p.CIPCODE,
    c.cip_name,
    sum(CTOTALT) AS total_graduates,
    count(1)
FROM programs AS p
LEFT JOIN cip_codes AS c
    ON p.CIPCODE = c.cip_id
WHERE UNITID = 101709
GROUP BY p.CIPCODE, c.cip_name
ORDER BY total_graduates DESC

┌─────────┬──────────────────────────────────────────────────────────────┬─────────────────┬──────────┐
│ CIPCODE │                           cip_name                           │ total_graduates │ count(1) │
│ double  │                           varchar                            │     int128      │  int64   │
├─────────┼──────────────────────────────────────────────────────────────┼─────────────────┼──────────┤
│ 52.0201 │ Business Administration and Management, General              │              70 │        3 │
│ 13.1202 │ Elementary Education and Teaching                            │              46 │        2 │
│ 31.0505 │ Exercise Science and Kinesiology                             │              46 │        2 │
│ 50.0701 │ Art/Art Studies, General                                     │              38 │        1 │
│ 13.1101 │ Counselor Education/School Counseling and Guidance Services  │              38 │        1 │
│ 52.1401 │ Marketing/Marketing Management, General             

In [61]:
%%quack --output program_mens

-- Rebuild the program_mens table from scratch: one row per
-- (UNITID, CIPCODE, cip_name) with that program's share of the institution's
-- CTOTALM total (pct_awards) and its summed CTOTALM (total_awards).
-- NOTE(review): CTOTALM is presumably the men's award count — confirm against
-- the data dictionary for c2023_a.csv.
DROP TABLE IF EXISTS program_mens;
CREATE TABLE program_mens AS
WITH raw AS (
  -- Share of each source row within its institution. The WHERE filter and the
  -- inner join are applied before the window, so the denominator only counts
  -- rows with CTOTALM > 0 that have a matching CIP code.
  SELECT
      UNITID,
      cipcode,
      cip_name,
      ctotalm,
      ctotalm / sum(CTOTALM) OVER (PARTITION BY UNITID) as pct_awards,
  FROM programs
  JOIN cip_codes ON programs.CIPCODE = cip_codes.cip_id
  WHERE CTOTALM > 0
  ORDER BY 3 DESC
)
-- Collapse multiple source rows for the same institution/CIP code by summing
-- both the shares and the counts.
SELECT
  unitid,
  cipcode,
  cip_name,
  SUM(pct_awards) as pct_awards,
  SUM(ctotalm) as total_awards,
FROM raw
WHERE pct_awards IS NOT NULL
GROUP BY 1, 2, 3
ORDER BY pct_awards DESC;

-- Final statement: its result is what the magic returns and, because of
-- --output, binds to the Python variable program_mens.
SELECT * FROM program_mens;

┌────────┬─────────┬───────────────────────────────────────────────────────────────────────┬─────────────────────┬──────────────┐
│ UNITID │ CIPCODE │                               cip_name                                │     pct_awards      │ total_awards │
│ int64  │ double  │                                varchar                                │       double        │    int128    │
├────────┼─────────┼───────────────────────────────────────────────────────────────────────┼─────────────────────┼──────────────┤
│ 494603 │  5.0122 │ Regional Studies (U.S., Canadian, Foreign)                            │  1.0000000000000002 │          490 │
│ 213589 │ 47.0604 │ Automobile/Automotive Mechanics Technology/Technician                 │                 1.0 │          132 │
│ 448257 │ 12.0409 │ Aesthetician/Esthetician and Skin Care Specialist                     │                 1.0 │            1 │
│ 455062 │ 12.0409 │ Aesthetician/Esthetician and Skin Care Specialist                    

In [62]:
# Materialise the program_mens table as a pandas DataFrame.
# Read it from the connection instead of calling .to_df() on the variable that
# is being rebound: once `program_mens` is a DataFrame it has no .to_df(), so
# the original form failed when this cell was run a second time.
program_mens = conn.table('program_mens').to_df()

In [None]:
%%quack

-- Programs at UNITID 145813, ranked by share of awards, keeping those whose
-- running share stays within the first 50%.
WITH raw AS (
  -- program_mens (created by the CREATE TABLE cell earlier in this notebook)
  -- exposes total_awards; it has no ctotalm column, so the counts are read
  -- from total_awards here.
  SELECT
      UNITID,
      cipcode,
      cip_codes.cip_name,
      total_awards,
      total_awards / sum(total_awards) OVER (PARTITION BY UNITID) as pct_graduates,
  FROM program_mens
  LEFT JOIN cip_codes ON program_mens.CIPCODE = cip_codes.cip_id
  WHERE UNITID = 145813
  ORDER BY 3 DESC
), percentages AS (
  SELECT
    unitid,
    cipcode,
    cip_name,
    SUM(total_awards) as total_graduates,
    SUM(pct_graduates) as pct_graduates
  FROM raw
  GROUP BY 1, 2, 3
  ORDER BY pct_graduates DESC
), ranks AS (
-- Running share and position, both ordered from the largest program down.
SELECT
  *,
  SUM(pct_graduates) OVER (ORDER BY pct_graduates DESC) as cumulative_pct,
  ROW_NUMBER() OVER (ORDER BY pct_graduates DESC) as rank
FROM percentages
)
SELECT
  *
FROM ranks
WHERE cumulative_pct <= 0.5

┌────────┬─────────┬──────────────────────────────────────────────────────────────────────────┬─────────────────┬──────────────────────┬─────────────────────┬───────┐
│ UNITID │ CIPCODE │                                 cip_name                                 │ total_graduates │    pct_graduates     │   cumulative_pct    │ rank  │
│ int64  │ double  │                                 varchar                                  │     int128      │        double        │       double        │ int64 │
├────────┼─────────┼──────────────────────────────────────────────────────────────────────────┼─────────────────┼──────────────────────┼─────────────────────┼───────┤
│ 145813 │ 52.0201 │ Business Administration and Management, General                          │             150 │  0.06521739130434782 │ 0.06521739130434782 │     1 │
│ 145813 │ 52.0801 │ Finance, General                                                         │             141 │  0.06130434782608696 │  0.1265217391304348 │     2 

In [75]:
%%quack

-- One row for UNITID 145813 with its five largest programs spread across the
-- columns cip_rank_1 .. cip_rank_5.
WITH raw AS (
  -- program_mens (created by the CREATE TABLE cell earlier in this notebook)
  -- exposes total_awards; it has no ctotalm column, so the counts are read
  -- from total_awards here.
  SELECT
      UNITID,
      cipcode,
      cip_codes.cip_name,
      total_awards,
      total_awards / sum(total_awards) OVER (PARTITION BY UNITID) as pct_graduates,
  FROM program_mens
  LEFT JOIN cip_codes ON program_mens.CIPCODE = cip_codes.cip_id
  WHERE UNITID = 145813
  ORDER BY 3 DESC
), percentages AS (
  SELECT
    unitid,
    cipcode,
    cip_name,
    SUM(total_awards) as total_graduates,
    SUM(pct_graduates) as pct_graduates
  FROM raw
  GROUP BY 1, 2, 3
  ORDER BY pct_graduates DESC
), ranks AS (
-- Running share and position, both ordered from the largest program down.
SELECT
  *,
  SUM(pct_graduates) OVER (ORDER BY pct_graduates DESC) as cumulative_pct,
  ROW_NUMBER() OVER (ORDER BY pct_graduates DESC) as rank
FROM percentages
)
-- Only ranks 1-5 become columns; each cell holds the CIP code at that rank.
PIVOT ranks
ON 'cip_rank_' || rank IN ('cip_rank_1', 'cip_rank_2', 'cip_rank_3', 'cip_rank_4', 'cip_rank_5')
USING min(cipcode)
GROUP BY unitid

┌────────┬────────────┬────────────┬────────────┬────────────┬────────────┐
│ UNITID │ cip_rank_1 │ cip_rank_2 │ cip_rank_3 │ cip_rank_4 │ cip_rank_5 │
│ int64  │   double   │   double   │   double   │   double   │   double   │
├────────┼────────────┼────────────┼────────────┼────────────┼────────────┤
│ 145813 │    52.0201 │    52.0801 │    52.1401 │    52.0301 │    43.0104 │
└────────┴────────────┴────────────┴────────────┴────────────┴────────────┘

In [101]:
%%quack --output majors

-- For every college: the list of its largest programs whose running share of
-- awards stays within the first 50%, as an array of structs (top_cips).
WITH raw AS (
  -- Share of each program within its college.
  SELECT
      UNITID,
      cipcode,
      cip_codes.cip_name,
      total_awards,
      total_awards / sum(total_awards) OVER (PARTITION BY UNITID) as pct_awards,
  FROM program_mens
  LEFT JOIN cip_codes ON program_mens.CIPCODE = cip_codes.cip_id
  --WHERE UNITID = 145813
  ORDER BY 1, 5 DESC
), percentages AS (
  SELECT
    unitid,
    cipcode,
    cip_name,
    SUM(total_awards) as total_graduates,
    SUM(pct_awards) as pct_awards
  FROM raw
  GROUP BY 1, 2, 3
  ORDER BY pct_awards DESC
), ranks AS (
-- Running share and position per college, largest program first.
SELECT
  *,
  SUM(pct_awards) OVER (PARTITION BY unitid ORDER BY pct_awards DESC) as cumulative_pct,
  ROW_NUMBER() OVER (PARTITION BY unitid ORDER BY pct_awards DESC) as rank
FROM percentages
)
SELECT
    unitid as college_id,
    -- ORDER BY inside the aggregate: without it the order of the list
    -- elements is not guaranteed, so top_cips could come back shuffled.
    array_agg(
        {
            rank: rank,
            cipcode: cipcode,
            cip_name: cip_name,
            total_graduates: total_graduates,
            pct_awards: round(pct_awards, 2),
        }
        ORDER BY rank
    ) as top_cips
FROM ranks
-- NOTE(review): a college whose rank-1 program alone exceeds 50% has no row
-- passing this filter and is therefore absent from the result — confirm that
-- is intended.
WHERE cumulative_pct <= .5
GROUP BY unitid

┌────────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [103]:
# Convert the `majors` query result to a DataFrame, then write it out as
# JSON Lines: one record per row, one row per line.
majors_df = majors.to_df()
majors_df.to_json(path_or_buf='majors.jsonl', lines=True, orient='records')