## Setup

In [1]:
import duckdb

conn = duckdb.connect('data/duck.db')
conn.sql("""\
INSTALL spatial;
LOAD spatial;
         
INSTALL vss;
LOAD vss;
         
SET hnsw_enable_experimental_persistence = true;
""")


from IPython.core.magic import (register_line_magic, register_cell_magic, register_line_cell_magic, needs_local_scope)
from IPython.core.magic_arguments import (argument, magic_arguments, parse_argstring)

@magic_arguments()
@argument('query', nargs='*', help="The SQL query to execute")
@argument('--db', '-d', default=None, help="Which connection to use")
@argument('--output', '-o', default=None, help="Name of the variable to store output DataFrame in local scope")
@register_line_cell_magic
@needs_local_scope
def quack(line, cell=None, local_ns=None):
    "Magic that works both as %lcmagic and as %%lcmagic"

    args = parse_argstring(quack, line)
    query = cell if cell else ' '.join(args.query)

    if args.db:
        conn = local_ns[args.db]
    elif 'conn' in local_ns:
        conn = local_ns['conn']
    else:
        
        conn = duckdb.connect(':memory:')
        local_ns['conn'] = conn

    result = conn.sql(query)
    
    if args.output:
        local_ns[args.output] = result
    return result

In [16]:
import ollama
from typing import Optional

def embeddings(prompt:str) -> list[float]:
    result = ollama.embeddings(model='mxbai-embed-large', prompt=prompt)
    return result.embedding

embeds = embeddings("The quick brown fox jumps over the lazy dog.")
print('Embedding Size: ', len(embeds))

Embedding Size:  1024


In [17]:
try:
    conn.remove_function("embeddings")
except:
    # probably didn't exist
    pass

conn.create_function("embeddings", embeddings)

conn.sql(r"""
drop table if exists foo;
create table foo as
    SELECT embeddings('Arizona State University') as embedding;
""")
conn.sql("select * from foo")


┌───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [20]:
import pandas as pd
import json

def utr_data():
    data = json.loads(open('data/utr-mens.json').read())
    hits = pd.json_normalize(data['hits'])
    utr = pd.DataFrame()

    fields = [
        'id',
        'source.school.name',
        'source.school.shortName',
        'source.school.power6',
        'source.school.power6High',
        'source.school.power6Low',
        'source.school.conference.division.divisionName',
        'source.location.latLng',
        'source.location.cityName',
        'source.location.stateAbbr',
        'source.url',
        'source.memberCount',
    ]
    for field in fields:
        last_field = field.split('.')[-1]
        utr[last_field] = hits[field]

    return utr

utr = utr_data()

conn.sql(r"""
DROP TABLE IF EXISTS utr_vec;
CREATE TABLE utr_vec AS
SELECT
    utr.*,
    embeddings(concat(utr.name, (case when utr.shortName is not null then concat(' (', utr.shortName, ') ') else '' end), ', location: ', utr.cityName, ', ', utr.stateAbbr, ', ', utr.url))::float[1024] AS vectors
FROM utr
""")
conn.sql(r"""
SELECT COUNT(*) AS "Total utr_vec Records" FROM utr_vec
""")

┌───────────────────────┐
│ Total utr_vec Records │
│         int64         │
├───────────────────────┤
│                   956 │
└───────────────────────┘

In [None]:
schools = pd.read_csv('data/hd2023.csv', encoding='latin1')
charges = pd.read_csv('data/ic2023_ay.csv', encoding='latin1')

costs = conn.sql(r"""
SELECT
    schools.unitid as college_id,
    trim(INSTNM) AS college_name,
    trim(IALIAS) AS short_name,
    regexp_replace(regexp_replace(college_name, '(-.+?ampus$)|(^The )', '', 'g'), '\sat\W(.*)$', ' (\1)') as join_name,
    city AS city,
    stabbr AS state,
    LATITUDE::float as latitude,
    LONGITUD::float as longitude,
    WEBADDR AS url,
    try_cast(CHG3AY3 AS DECIMAL(10,2)) AS total_cost
FROM schools
JOIN charges ON schools.UNITID = charges.UNITID
""")

costs.to_df().to_csv(path_or_buf='data/school_costs.csv', index=False)

conn.sql(r"""
DROP TABLE IF EXISTS costs_vec;
CREATE TABLE costs_vec AS
SELECT
    costs.*,
    embeddings(concat(college_name, (case when short_name is not null then concat(' (', short_name, ') ') else '' end), ', location: ', city, ', ', state, ', ', url))::float[1024] AS vectors,
    null::int as utr_id
FROM costs
""")
conn.sql(r"""
SELECT COUNT(*) as "Total costs_vec Records" FROM costs_vec
""")

┌─────────────────────────┐
│ Total costs_vec Records │
│          int64          │
├─────────────────────────┤
│                    3825 │
└─────────────────────────┘

In [21]:
%%quack
select * from utr_vec where vectors is null;


┌─────────┬─────────┬───────────┬────────┬────────────┬───────────┬──────────────┬──────────┬──────────┬───────────┬─────────┬─────────────┬─────────────┐
│   id    │  name   │ shortName │ power6 │ power6High │ power6Low │ divisionName │  latLng  │ cityName │ stateAbbr │   url   │ memberCount │   vectors   │
│ varchar │ varchar │  varchar  │ double │   double   │  double   │   varchar    │ double[] │ varchar  │  varchar  │ varchar │    int64    │ float[1024] │
├─────────┴─────────┴───────────┴────────┴────────────┴───────────┴──────────────┴──────────┴──────────┴───────────┴─────────┴─────────────┴─────────────┤
│                                                                         0 rows                                                                         │
└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘

In [23]:
%%quack
create index cost_vec_idx on costs_vec using hnsw (vectors);
create index utr_vec_idx on utr_vec using hnsw (vectors);

In [261]:
%%quack
-- reset all the UTR assignments
UPDATE costs_vec SET utr_id = null;

-- Set Exact Name Matches
WITH
unassigned_utr AS (
    SELECT
        utr_vec.*
    FROM utr_vec
    LEFT JOIN costs_vec on costs_vec.utr_id = utr_vec.id
    WHERE costs_vec.utr_id IS NULL
),
matches AS (
    SELECT
        DISTINCT ON (college_id)
        costs_vec.* exclude (vectors, utr_id),
        utr.* exclude (vectors)
    FROM costs_vec
    JOIN unassigned_utr AS utr on college_name = utr.name AND stateAbbr = state
    WHERE costs_vec.utr_id IS NULL
    ORDER BY college_id
),
counts as (
    select
        count(college_id) over (partition by name) as name_count,
        name as utr_name,
        id as utr_id,
        * exclude(short_name, join_name, name, id, utr_id),
    from matches
    order by 1 desc, 2 asc
)
UPDATE costs_vec SET
    utr_id = matches.id
FROM matches
WHERE costs_vec.college_id = matches.college_id
;


In [None]:
%%quack

-- Set tricky matches
update costs_vec set utr_id = 1345 where college_id = 110680;
update costs_vec set utr_id = 2944 where college_id = 159391;
update costs_vec set utr_id = 921 where college_id = 207500;
update costs_vec set utr_id = 1806 where college_id = 139764;
update costs_vec set utr_id = 1069 where college_id = 221740;
update costs_vec set utr_id = 1724 where college_id = 240444;
update costs_vec set utr_id = 1815 where college_id = 123165;
update costs_vec set utr_id = 1630 where college_id = 159568;
update costs_vec set utr_id = 1092 where college_id = 187985;
update costs_vec set utr_id = 2593  where college_id =  139861;
update costs_vec set utr_id = 1718  where college_id =  216366;
update costs_vec set utr_id = 2081   where college_id =  121345;
update costs_vec set utr_id = 911    where college_id =  221759;
update costs_vec set utr_id = 132    where college_id =  204796;
update costs_vec set utr_id = 1159    where college_id =  214777;
update costs_vec set utr_id = 919    where college_id =  204796;
update costs_vec set utr_id = 1728    where college_id =  247834;
update costs_vec set utr_id = 2623    where college_id =  220701;
update costs_vec set utr_id = 2657    where college_id =  177968;
update costs_vec set utr_id = 2493    where college_id =  177968;
update costs_vec set utr_id = 999    where college_id =  145637;
update costs_vec set utr_id = 1878    where college_id =  180984;
update costs_vec set utr_id = 955    where college_id =  139755;
update costs_vec set utr_id = 951    where college_id =  170976;
update costs_vec set utr_id = 1031    where college_id =  218663;
update costs_vec set utr_id = 1366023    where college_id =  101897;






delete from utr_vec where id in (
    2994, -- defunct: https://www.nysed.gov/asacollege
    2703, -- defunct: https://www.insidehighered.com/news/2019/01/24/green-mountain-latest-small-college-close
    2301, -- https://en.wikipedia.org/wiki/Newbury_College_(United_States)
    2360, -- 
    2766, -- https://nces.ed.gov/collegenavigator/?q=Holy+Names+University&s=all
    1098923, -- duplicate of 2623
    2754, -- not found in NCES
    2372, -- not found in NCES
    2657, -- duplicate of MO school, 2493
    2383, -- not found in NCES
    2683, -- not found in NCES
    3062, -- duplicate of 1878
    2926, -- not found in NCES
    2483, -- not found in NCES
    1855, -- not in USA,
    1152283, -- not found in NCES
)
/*
│          1 │ Coastal Alabama Community College North │ 1366023 │     101897 │ Northeast Alabama Community College                 │ Rainsville       │ AL      │  34.54547 │   -85.91167 │ https://www.nacc.edu/       │       8790.00 │      0.26883435 │ Coastal Alabama Community College North │  31.84 │       7.25 │      3.18 │ NJCAA              │ [31.4892088, -87.32567499999999] │ Monroeville     │ AL        │ NULL                                                                 │           9 │

│          2 │ University of South Carolina            │ 1031    │     218663 │ University of South Carolina-Columbia                      │ Columbia         │ SC      │ 33.996788 │   -81.02693 │ www.sc.edu/                                            │      34934.00 │      0.23851758 │ South Carolina                          │  78.04 │      13.18 │     12.83 │ NCAA - Division I  │ [34.0008322, -81.035147]         │ Columbia        │ SC        │ http://www.gamecocksonline.com/sports/m-tennis/scar-m-tennis-body.html │         149 │

│          2 │ University of Michigan                  │ 951     │     170976 │ University of Michigan-Ann Arbor                           │ Ann Arbor        │ MI      │ 42.278374 │   -83.73481 │ https://umich.edu/                                     │      58072.00 │      0.25100017 │ Michigan                                │  77.56 │      13.54 │     12.49 │ NCAA - Division I  │ [42.2808256, -83.7430378]        │ Ann Arbor       │ MI        │ http://www.mgoblue.com/sports/m-tennis/mich-m-tennis-body.html         │         185 │

│          2 │ Georgia Tech                            │ 955     │     139755 │ Georgia Institute of Technology-Main Campus                │ Atlanta          │ GA      │  33.77242 │   -84.39483 │ www.gatech.edu/                                        │      32876.00 │        0.257639 │ Georgia Tech                            │  76.24 │       12.9 │     12.58 │ NCAA - Division I  │ [33.7501275, -84.3885209]        │ Atlanta         │ GA        │ http://www.ramblinwreck.com/sports/m-tennis/geot-m-tennis-body.html    │         145 │

│          3 │ University of Illinois                  │ 999     │     145637 │ University of Illinois Urbana-Champaign                    │ Champaign        │ IL      │ 40.104717 │   -88.22911 │ www.illinois.edu/                                      │      34501.00 │      0.25091988 │ Illinois                                │  78.77 │      13.87 │     12.57 │ NCAA - Division I  │ [40.1164204, -88.2433829]        │ Champaign       │ IL        │ http://fightingillini.com/index.aspx?path=mten&                        │          89 │


2360    │     168430
│          2 │ Newbury College              │ 2301    │     165033 │ Bristol Community College                │ Fall River   │ MA      │ 41.721992 │   -71.11913 │ www.bristolcc.edu/         │      10356.00 │      0.24932134 │ NULL           │    0.0 │        0.0 │       0.0 │ NCAA - Division III │ [42.33176419999999, -71.1211635] │ Brookline    │ MA        │ NULL                                                                   │           3 │

│          4 │ Penn State University        │ 1159    │     214777 │ Pennsylvania State University-Main Campus                  │ University Park   │ PA      │   40.7965 │   -77.86285 │ https://www.psu.edu/              │      40188.00 │      0.23686433 │ Penn State     │  72.14 │      12.78 │     11.29 │ NCAA - Division I   │ [40.7933949, -77.8600012]        │ State College │ PA        │ http://www.gopsusports.com/sports/m-tennis/psu-m-tennis-body.html      │         107 │

│         17 │ ASA College - New York                │ 2994    │     192192 │ Keuka College                                       │ Keuka Park   │ NY      │ 42.615253 │   -77.09056 │ https://www.keuka.edu/ │      38000.00 │       0.2842319 │ NULL             │    0.0 │        0.0 │       0.0 │ NJCAA               │ [40.7127753, -74.0059728]        │ New York         │ NY        │ NULL                                                                 │           6 │

-- 110680 │ University of California-San Diego   │ La Jolla       │ CA      │  32.87775 │ -117.23586 │ www.ucsd.edu/                 │      46042.00 │   NULL │      0.12578559 │ 1345    │ UC San Diego                      │ NULL          │  70.13 │      12.34 │     11.26 │ NCAA - Division I   │ [32.715738, -117.1610838]               │ San Diego      │ CA        │ NULL                                                │         125 │     NULL │          3 │
--│          3 │ Louisiana State University             │     159391 │ Louisiana State University and Agricultural & Mechanical College │ Baton Rouge     │ LA      │ 30.414986 │ -91.178925 │ www.lsu.edu/              │      28631.00 │   NULL │      0.15846908 │ 2944    │ LSU            │  76.86 │       13.1 │     12.54 │ NCAA - Division I   │ [30.4514677, -91.18714659999999]        │ Baton Rouge     │ LA        │ http://www.lsusports.net/SportSelect.dbml?&&DB_OEM_ID=5200&SPID=2162&SPSID=27802 │         183 │     NULL │
-- │          3 │ University of Oklahoma                 │ 921     │     207500 │ University of Oklahoma-Norman Campus                    │ Norman          │ OK      │ 35.209408 │  -97.444214 │ www.ou.edu/                       │      26665.00 │      0.15535092 │ Oklahoma         │  79.19 │       13.6 │     12.93 │ NCAA - Division I   │ [35.2225668, -97.4394777]        │ Norman          │ OK        │ http://www.soonersports.com/SportSelect.dbml?SPID=127253                                       │         435 │
--│          2 │ Georgia Southwestern State             │ 1806    │     139764 │ Georgia Southwestern State University                   │ Americus        │ GA      │  32.05466 │  -84.217735 │ https://gsw.edu/                  │      15702.00 │      0.15730268 │ NULL             │    0.0 │        0.0 │       0.0 │ NCAA - Division II  │ [32.0723861, -84.23268759999999] │ Americus        │ GA        │ NULL                                                                                           │           7 │
-- │          2 │ University of Tennessee at Chattanooga │ 1069    │     221740 │ The University of Tennessee-Chattanooga                 │ Chattanooga     │ TN      │  35.04752 │   -85.30025 │ utc.edu/                          │      18208.00 │      0.15560246 │ Chattanooga      │  70.11 │      12.25 │     11.32 │ NCAA - Division I   │ [35.0457984, -85.3093995]        │ Chattanooga     │ TN        │ http://www.gomocs.com/index.aspx?path=mten&&                                                   │          92 │
-- │          2 │ University of Wisconsin, Madison       │ 1724    │     240444 │ University of Wisconsin-Madison                         │ Madison         │ WI      │  43.07541 │    -89.4041 │ www.wisc.edu/             │      40603.00 │      0.16415614 │ Wisconsin        │  73.36 │      12.64 │      12.0 │ NCAA - Division I   │ [43.0721661, -89.4007501]        │ Madison         │ WI        │ http://www.uwbadgers.com/index.aspx?path=mten                                                  │          66 │

-- │          2 │ Claremont-Mudd-Scripps Colleges        │ 1815    │     123165 │ Scripps College                                         │ Claremont       │ CA      │  34.10415 │ -117.711296 │ www.scrippscollege.edu/        │      63434.00 │      0.20048791 │ NULL             │  69.27 │      12.23 │     11.14 │ NCAA - Division III │ [34.0966764, -117.7197785]       │ Claremont       │ CA        │ http://www.cmsathletics.org/sports/mten/index                                                  │         170 │
-- │          2 │ Louisiana College                      │ 1630    │     159568 │ Louisiana Christian University                          │ Pineville       │ LA      │ 31.324528 │   -92.42597 │ www.lcuniversity.edu/          │      19740.00 │      0.20771849 │ NULL             │    0.0 │        0.0 │       0.0 │ NCAA - Division III │ [30.5190775, -91.5208624]        │ NULL            │ LA        │ NULL                                                                                           │           9 │
-- │          2 │ University of New Mexico               │ 1092    │     187985 │ University of New Mexico-Main Campus                    │ Albuquerque     │ NM      │ 35.083866 │ -106.620155 │ www.unm.edu/                   │      26450.00 │      0.15649223 │ New Mexico       │  72.68 │      12.38 │     11.79 │ NCAA - Division I   │ [35.0843859, -106.650422]        │ Albuquerque     │ NM        │ http://www.golobos.com/index.aspx?path=mten                                                    │          34 │

--│          2 │ Georgia College                        │ 2593    │     139861 │ Georgia College & State University                  │ Milledgeville   │ GA      │ 33.082787 │  -83.231964 │ www.gcsu.edu/                  │      28178.00 │      0.21582055 │ NULL             │  62.25 │      11.25 │      9.42 │ NCAA - Division II  │ [33.0801429, -83.2320991]        │ Milledgeville   │ GA        │ https://www.gcbobcats.com/sports/mten/index                                                    │          35 │
--│          2 │ Jefferson University                   │ 1718    │     216366 │ Thomas Jefferson University                         │ Philadelphia    │ PA      │ 39.948273 │  -75.158264 │ www.jefferson.edu/             │      45683.00 │       0.1512199 │ NULL             │  63.53 │      11.19 │     10.08 │ NCAA - Division II  │ [39.9525839, -75.1652215]        │ Philadelphia    │ PA        │ NULL                                                                                           │          20 │
--│          2 │ Pomona-Pitzer Colleges                 │ 2081    │     121345 │ Pomona College                                      │ Claremont       │ CA      │ 34.099026 │ -117.714554 │ www.pomona.edu/                │      62326.00 │      0.21265012 │ NULL             │  61.67 │      10.51 │     10.04 │ NCAA - Division III │ [34.0966764, -117.7197785]       │ Claremont       │ CA        │ https://www.sagehens.com/sports/mten/index                                                     │         401 │
--│          2 │ University of Tennessee                │ 911     │     221759 │ The University of Tennessee-Knoxville               │ Knoxville       │ TN      │ 35.952084 │   -83.92585 │ www.utk.edu/                   │      31974.00 │      0.21157789 │ Tennessee        │  77.62 │      13.47 │     12.51 │ NCAA - Division I   │ [35.9606384, -83.9207392]        │ Knoxville       │ TN        │ http://www.utsports.com/sports/m-tennis/tenn-m-tennis-body.html                                │         109 │
*/

## Test out `array_cosine_distance` to find similar vectors. 0 = no difference, 1 = very different

In [108]:
%%quack

select
    utr_vec.* exclude (vectors),
    '|' as "|",
    array_cosine_distance(utr_vec.vectors, costs_vec.vectors)::float as cosine_distance,
    costs_vec.* exclude (vectors)
from utr_vec
full join costs_vec on stateAbbr = state
where id in (2803, 2383)
  and array_cosine_distance(utr_vec.vectors, costs_vec.vectors) < 0.3
  and utr_id is null
order by name, array_cosine_distance(utr_vec.vectors, costs_vec.vectors)::float asc
;


┌─────────┬─────────────────────────────────┬─────────────┬────────┬────────────┬───────────┬─────────────────────┬──────────────────────────────────┬──────────────┬───────────┬─────────┬─────────────┬────────┬─────────┬─────────────────┬────────────┬─────────────────────────────────┬────────────┬─────────────────────────────────┬───────────────────┬─────────┬───────────┬────────────┬─────────────────────────────────────┬───────────────┐
│   id    │              name               │  shortName  │ power6 │ power6High │ power6Low │    divisionName     │              latLng              │   cityName   │ stateAbbr │   url   │ memberCount │ utr_id │    |    │ cosine_distance │ college_id │          college_name           │ short_name │            join_name            │       city        │  state  │ latitude  │ longitude  │                 url                 │  total_cost   │
│ varchar │             varchar             │   varchar   │ double │   double   │  double   │       varchar       │ 

In [264]:
%%quack

WITH
unassigned_utr AS (
    SELECT
        utr_vec.*
    FROM utr_vec
    LEFT JOIN costs_vec on costs_vec.utr_id = utr_vec.id
    WHERE costs_vec.utr_id IS NULL
),
matches AS (
    SELECT
        DISTINCT ON (college_id)
        costs_vec.* exclude (vectors),
        array_cosine_distance(utr.vectors, costs_vec.vectors)::float as cosine_distance,
        utr.* exclude (vectors)
    FROM costs_vec
    LEFT JOIN unassigned_utr AS utr on stateAbbr = state
    WHERE  array_cosine_distance(utr.vectors, costs_vec.vectors) < 0.07
      AND costs_vec.utr_id IS NULL
    ORDER BY college_id, array_cosine_distance(utr.vectors, costs_vec.vectors)::float asc
),
counts as (
    select
        name,
        count(1)
    from matches
    group by 1
    having count(1) > 1
)
UPDATE costs_vec SET
    utr_id = matches.id
FROM matches
WHERE costs_vec.college_id = matches.college_id
;


In [265]:
%%quack

WITH
unassigned_utr AS (
    SELECT
        utr_vec.*
    FROM utr_vec
    LEFT JOIN costs_vec on costs_vec.utr_id = utr_vec.id
    WHERE costs_vec.utr_id IS NULL
),
matches AS (
    SELECT
        DISTINCT ON (college_id)
        costs_vec.* exclude (vectors),
        array_cosine_distance(utr.vectors, costs_vec.vectors)::float as cosine_distance,
        utr.* exclude (vectors)
    FROM costs_vec
    LEFT JOIN unassigned_utr AS utr on stateAbbr = state
    WHERE  array_cosine_distance(utr.vectors, costs_vec.vectors) < 0.09
      AND costs_vec.utr_id IS NULL
    ORDER BY college_id, array_cosine_distance(utr.vectors, costs_vec.vectors)::float asc
),
counts as (
    select
        name,
        count(1)
    from matches
    group by 1
    having count(1) > 1
)
UPDATE costs_vec SET
    utr_id = matches.id
FROM matches
WHERE costs_vec.college_id = matches.college_id
;


In [266]:
%%quack

WITH
unassigned_utr AS (
    SELECT
        utr_vec.*
    FROM utr_vec
    LEFT JOIN costs_vec on costs_vec.utr_id = utr_vec.id
    WHERE costs_vec.utr_id IS NULL
),
matches AS (
    SELECT
        DISTINCT ON (college_id)
        costs_vec.* exclude (vectors),
        array_cosine_distance(utr.vectors, costs_vec.vectors)::float as cosine_distance,
        utr.* exclude (vectors)
    FROM costs_vec
    LEFT JOIN unassigned_utr AS utr on stateAbbr = state
    WHERE  array_cosine_distance(utr.vectors, costs_vec.vectors) < 0.125
      AND costs_vec.utr_id IS NULL
    ORDER BY college_id, array_cosine_distance(utr.vectors, costs_vec.vectors)::float asc
),
counts as (
    select
        * exclude(short_name, join_name),
        count(college_id) over (partition by name) as name_count
    from matches
)
UPDATE costs_vec SET
    utr_id = matches.id
FROM matches
WHERE costs_vec.college_id = matches.college_id
;


In [267]:
%%quack

WITH
unassigned_utr AS (
    SELECT
        utr_vec.*
    FROM utr_vec
    LEFT JOIN costs_vec on costs_vec.utr_id = utr_vec.id
    WHERE costs_vec.utr_id IS NULL
),
matches AS (
    SELECT
        DISTINCT ON (college_id)
        costs_vec.* exclude (vectors),
        array_cosine_distance(utr.vectors, costs_vec.vectors)::float as cosine_distance,
        utr.* exclude (vectors)
    FROM costs_vec
    LEFT JOIN unassigned_utr AS utr on stateAbbr = state
    WHERE  array_cosine_distance(utr.vectors, costs_vec.vectors) < 0.15
      AND costs_vec.utr_id IS NULL
    ORDER BY college_id, array_cosine_distance(utr.vectors, costs_vec.vectors)::float asc
),
counts as (
    select
        count(college_id) over (partition by name) as name_count,
        name as utr_name,
        * exclude(short_name, join_name, name),
    from matches
)
UPDATE costs_vec SET
    utr_id = matches.id
FROM matches
WHERE costs_vec.college_id = matches.college_id
;


In [312]:
%%quack

select
    sum(case when utr_id is null then 1 end) as unassigned_utr,
    sum(case when utr_id is not null then 1 end) as assigned_utr
from costs_vec;

┌────────────────┬──────────────┐
│ unassigned_utr │ assigned_utr │
│     int128     │    int128    │
├────────────────┼──────────────┤
│           2908 │          917 │
└────────────────┴──────────────┘

In [281]:
%%quack

WITH
unassigned_utr AS (
    SELECT
        utr_vec.*
    FROM utr_vec
    LEFT JOIN costs_vec on costs_vec.utr_id = utr_vec.id
    WHERE costs_vec.utr_id IS NULL
),
matches AS (
    SELECT
        DISTINCT ON (college_id)
        costs_vec.* exclude (vectors, utr_id),
        array_cosine_distance(utr.vectors, costs_vec.vectors)::float as cosine_distance,
        utr.* exclude (vectors)
    FROM costs_vec
    LEFT JOIN unassigned_utr AS utr on stateAbbr = state
    WHERE  array_cosine_distance(utr.vectors, costs_vec.vectors) < .2
      AND costs_vec.utr_id IS NULL
    ORDER BY college_id, array_cosine_distance(utr.vectors, costs_vec.vectors)::float asc
),
counts as (
    select
        count(college_id) over (partition by name) as name_count,
        name as utr_name,
        id as utr_id,
        * exclude(short_name, join_name, name, id, utr_id),
    from matches
    order by 1 desc, 2 asc
)
UPDATE costs_vec SET
    utr_id = matches.id
FROM matches
WHERE costs_vec.college_id = matches.college_id
;


In [294]:
%%quack

WITH
unassigned_utr AS (
    SELECT
        utr_vec.*
    FROM utr_vec
    LEFT JOIN costs_vec on costs_vec.utr_id = utr_vec.id
    WHERE costs_vec.utr_id IS NULL
),
matches AS (
    SELECT
        DISTINCT ON (college_id)
        costs_vec.* exclude (vectors, utr_id),
        array_cosine_distance(utr.vectors, costs_vec.vectors)::float as cosine_distance,
        utr.* exclude (vectors)
    FROM costs_vec
    LEFT JOIN unassigned_utr AS utr on stateAbbr = state
    WHERE  array_cosine_distance(utr.vectors, costs_vec.vectors) < .22
      AND costs_vec.utr_id IS NULL
    ORDER BY college_id, array_cosine_distance(utr.vectors, costs_vec.vectors)::float asc
),
counts as (
    select
        count(college_id) over (partition by name) as name_count,
        name as utr_name,
        id as utr_id,
        * exclude(short_name, join_name, name, id, utr_id),
    from matches
    order by 1 desc, 2 asc
)
UPDATE costs_vec SET
    utr_id = matches.id
FROM matches
WHERE costs_vec.college_id = matches.college_id


In [270]:
%%quack
select * exclude(vectors) from costs_vec where college_name = 'University of Chicago';

┌────────────┬───────────────────────┬────────────┬───────────────────────┬─────────┬─────────┬───────────┬───────────┬───────────────────┬───────────────┬────────┐
│ college_id │     college_name      │ short_name │       join_name       │  city   │  state  │ latitude  │ longitude │        url        │  total_cost   │ utr_id │
│   int64    │        varchar        │  varchar   │        varchar        │ varchar │ varchar │   float   │   float   │      varchar      │ decimal(10,2) │ int32  │
├────────────┼───────────────────────┼────────────┼───────────────────────┼─────────┼─────────┼───────────┼───────────┼───────────────────┼───────────────┼────────┤
│     144050 │ University of Chicago │            │ University of Chicago │ Chicago │ IL      │ 41.787994 │ -87.59954 │ www.uchicago.edu/ │      66939.00 │   1433 │
└────────────┴───────────────────────┴────────────┴───────────────────────┴─────────┴─────────┴───────────┴───────────┴───────────────────┴───────────────┴────────┘

In [397]:
%%quack

WITH
unassigned_utr AS (
    SELECT
        utr_vec.*
    FROM utr_vec
    LEFT JOIN costs_vec on costs_vec.utr_id = utr_vec.id
    WHERE costs_vec.utr_id IS NULL
),
matches AS (
    SELECT
        DISTINCT ON (college_id)
        costs_vec.* exclude (vectors, utr_id),
        array_cosine_distance(utr.vectors, costs_vec.vectors)::float as cosine_distance,
        utr.* exclude (vectors)
    FROM costs_vec
    LEFT JOIN unassigned_utr AS utr on stateAbbr = state
    WHERE  array_cosine_distance(utr.vectors, costs_vec.vectors) < .3
      AND costs_vec.utr_id IS NULL
    ORDER BY college_id, array_cosine_distance(utr.vectors, costs_vec.vectors)::float asc
),
counts as (
    select
        count(college_id) over (partition by name) as name_count,
        name as utr_name,
        id as utr_id,
        * exclude(short_name, join_name, name, id, utr_id),
    from matches
    order by 1 desc, utr_name, cosine_distance
)
select
    *
from counts


┌────────────┬─────────────────────────────┬─────────┬────────────┬─────────────────────────────────────────────────────┬──────────────────┬─────────┬───────────┬─────────────┬─────────────────────────────┬───────────────┬─────────────────┬────────────────┬────────┬────────────┬───────────┬────────────────────┬──────────────────────────────────┬─────────────────┬───────────┬──────────────────────────────────────────────────────────────────────┬─────────────┐
│ name_count │          utr_name           │ utr_id  │ college_id │                    college_name                     │       city       │  state  │ latitude  │  longitude  │             url             │  total_cost   │ cosine_distance │   shortName    │ power6 │ power6High │ power6Low │    divisionName    │              latLng              │    cityName     │ stateAbbr │                                url_1                                 │ memberCount │
│   int64    │           varchar           │ varchar │   int64    │       

In [329]:
import io

result = schools.applymap(lambda x: 'University of Michigan' in str(x)).any(axis=1)
matching_rows = schools[result]
csv_buffer = io.StringIO()
matching_rows.to_csv(csv_buffer, index=False)
print(csv_buffer.getvalue())

UNITID,INSTNM,IALIAS,ADDR,CITY,STABBR,ZIP,FIPS,OBEREG,CHFNM,CHFTITLE,GENTELE,EIN,UEIS,OPEID,OPEFLAG,WEBADDR,ADMINURL,FAIDURL,APPLURL,NPRICURL,VETURL,ATHURL,DISAURL,SECTOR,ICLEVEL,CONTROL,HLOFFER,UGOFFER,GROFFER,HDEGOFR1,DEGGRANT,HBCU,HOSPITAL,MEDICAL,TRIBAL,LOCALE,OPENPUBL,ACT,NEWID,DEATHYR,CLOSEDAT,CYACTIVE,POSTSEC,PSEFLAG,PSET4FLG,RPTMTH,INSTCAT,C21BASIC,C21IPUG,C21IPGRD,C21UGPRF,C21ENPRF,C21SZSET,C18BASIC,C15BASIC,CCBASIC,CARNEGIE,LANDGRNT,INSTSIZE,F1SYSTYP,F1SYSNAM,F1SYSCOD,CBSA,CBSATYPE,CSA,COUNTYCD,COUNTYNM,CNGDSTCD,LONGITUD,LATITUDE,DFRCGID,DFRCUSCG
170976,University of Michigan-Ann Arbor,U of Michigan|U of M|Univ of Michigan|U Michigan Ann Arbor|University of Michigan Ann Arbor,1109 Geddes Avenue,Ann Arbor,MI,48109,26,3,Santa J. Ono,President,7347641817,386006309,GNJ7BBP73WE9,232500,1,https://umich.edu/,https://admissions.umich.edu/,https://finaid.umich.edu/,https://admissions.umich.edu/apply,https://npc.collegeboard.org/app/umich,https://vets.umich.edu/,https://mgoblue.com/new

  result = schools.applymap(lambda x: 'University of Michigan' in str(x)).any(axis=1)


In [None]:
import os
import requests
from textwrap import dedent

LITELLM_API_KEY = os.getenv('LITELLM_API_KEY', None)
if not LITELLM_API_KEY:
    raise Exception('Please set the LITELLM_API_KEY environment variable')

def reply(messages, model='llama3.2:latest'):
    url = "https://litellm.dataturd.com/v1/chat/completions"
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {LITELLM_API_KEY}'
    }
    data = {
        "model": model,
        "messages": messages
    }
    response = requests.post(url, headers=headers, json=data)
    if response.status_code != 200:
        raise Exception(f"Request failed with status {response.status_code}: {response.text}")
    
    return response.json()['choices'][0]['message']['content']

assert schools, "Schools data not loaded! Make sure to run the cell above."

def match_ipeds_record(record:str, model='qwen2.5:14b') -> str:

    messages = [
        {
            "role": "system", "content": dedent(f"""
            Act as a data quality expert and researcher.
            You are working with this source data record:
            ---
            {json.dumps(record, indent=2)}
            """)
        },
        {
            "role": "user", "content": dedent(f"""
            I found these potential matches from the IPEDSIntegrated Postsecondary Education Data System database:
            ```csv

            ```
            Which record should I match to the source record?
            How confident are you in the match?
            Return in the following JSON format:
            ```json
            {{
                "match": $UNITID,
                "confidence": $0.8
            }}
            ```
            """)
        },
    ]
    text = reply(messages=messages, model=model)
    fenceless = '\n'.join([line for line in text.split('\n') if not line.startswith('```')])
    return fenceless

college_id = 951
row = conn.sql(f"""SELECT * FROM utr WHERE id = {college_id}""").fetchdf().to_dict(orient='records')[0]

match_ipeds_record(row)