In [None]:
import re, torch, json, pickle
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from sentence_transformers import SentenceTransformer
from scipy.sparse import lil_matrix
from sklearn.cluster import AgglomerativeClustering
from tqdm import tqdm
from sklearn.neighbors import NearestNeighbors
from collections import defaultdict
import warnings

warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

In [2]:
# Read the JSON mapping whose keys are later used as bill ids and whose
# values are looked up in `subject_originals`.
with open('bill_subjects.json', 'r') as subjects_fh:
    bill_subjects = json.load(subjects_fh)

In [3]:
# Load the subject lookup table. A context manager is used so the file handle
# is closed deterministically instead of being leaked by a bare open().
# NOTE(review): pickle can execute arbitrary code on load - only use trusted files.
with open('subjects_original.pkl', 'rb') as f:
    subject_originals = pickle.load(f)

In [4]:
# Resolve each bill's subject reference through `subject_originals`,
# skipping bills whose reference is not present in that table.
so = {}
for bill_key, subj_ref in bill_subjects.items():
    if subj_ref in subject_originals:
        so[bill_key] = subject_originals[subj_ref]

In [5]:
# One row per bill: the dict keys become the `bill_id` column and the dict
# values become `subject`; rows without a subject are dropped.
subs = (
    pd.DataFrame.from_dict(so, orient='index', columns=['subject'])
    .reset_index()
    .rename(columns={'index': 'bill_id'})
)
subs = subs[subs['subject'].notna()]

In [6]:
# Load the precomputed digest embeddings. map_location='cpu' lets the file be
# read on machines without a GPU even if the tensors were saved from CUDA;
# the tensors are only converted to NumPy afterwards, so CPU placement is enough.
digests = torch.load('digests.pt', map_location='cpu')

In [7]:
# Convert every embedding to a NumPy array. Values that are already arrays
# are passed through unchanged so that re-running this cell (which rebinds
# `digests` to its own output) does not fail on `.cpu()`. `.detach()` keeps
# the conversion working for tensors that still track gradients.
digests = {
    k: (v.detach().cpu().numpy() if torch.is_tensor(v) else v)
    for k, v in digests.items()
}

In [8]:
# Raw digest table; it carries a `bill_id` column that is mapped to `bill_ID`
# in the next step.
digest_df = pd.read_csv(
    'ca_leg/legislation_data/digest.csv'
)

In [9]:
# Load the id mapping applied to `digest_df['bill_id']` below. The context
# manager closes the file handle instead of leaking it via a bare open().
# NOTE(review): pickle can execute arbitrary code on load - only use trusted files.
with open('bill_id_mapping.pkl', 'rb') as f:
    bv2b = pickle.load(f)

# Ids missing from the mapping become NaN in `bill_ID`.
digest_df['bill_ID'] = digest_df['bill_id'].map(bv2b)

In [10]:
# Load the precomputed subject embeddings and convert them to NumPy arrays.
# map_location='cpu' makes the load work on machines without a GPU even if the
# tensors were saved from CUDA; `.detach()` keeps `.numpy()` working for
# tensors that still track gradients.
subject_embeddings = torch.load('subject_embeddings.pt', map_location='cpu')
subject_embeddings = {
    k: v.detach().cpu().numpy() for k, v in subject_embeddings.items()
}

In [11]:
# Right-join so every bill in `subs` is kept even when it has no digest row,
# then discard the duplicate key column contributed by `subs`.
full_table = (
    digest_df
    .merge(subs, left_on='bill_ID', right_on='bill_id', how='right')
    .drop(columns=['bill_id_y'])
)
# Positional rename of the four remaining columns.
full_table.columns = ['bill_version', 'digest', 'bill_ID', 'subject']

In [12]:
# Invert `subject_originals` (value -> key) so each subject string can be
# traced back to its key, then look that key up in `subject_embeddings`.
original_subjects = dict((text, key) for key, text in subject_originals.items())
_subject_ids = full_table['subject'].map(original_subjects)
full_table['subject_embedding'] = _subject_ids.map(subject_embeddings)

In [13]:
def text_clean(title):
    """Normalise a piece of text for lookup.

    Non-string input yields ''. For strings: parenthesised spans are removed,
    every character that is not an ASCII letter, digit or whitespace becomes a
    space, whitespace runs collapse to a single space, the result is stripped
    and lower-cased.
    """
    if isinstance(title, str):
        without_parens = re.sub(r'\(.*?\)', '', title)
        alnum_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', without_parens)
        return re.sub(r'\s+', ' ', alnum_only).strip().lower()
    return ''

# Clean the digest text, then use the cleaned text as the lookup key into the
# `digests` embedding dict.
full_table['digest'] = full_table['digest'].apply(text_clean)
full_table['digest_emb'] = full_table['digest'].map(digests)

# Keep only rows that received an embedding. The explicit .copy() makes the
# result an independent frame, so later column assignments on `full_table`
# are unambiguous and do not trigger SettingWithCopyWarning.
full_table = full_table.loc[full_table['digest_emb'].notna()].copy()

In [1]:
# Topic label -> short description of that policy area.
# The descriptions are built from adjacent string literals, so every fragment
# must end with its own separator (", " or " "); otherwise two words are fused
# in the text that is embedded.
ANCHOR_TEXTS = {
  "Urban Development, Zoning & Land Use": (
    "Urban development and land use policy including zoning and rezoning, general plans and local planning, "
    "housing element requirements, development approvals, growth management, infill development, "
    "regional planning, CEQA-related planning issues, and land use governance."
  ),

  "Housing Supply, Affordability & Affordable Housing": (
    "Housing supply and affordability policy including residential development incentives, housing production, "
    "affordable housing finance, subsidies and tax credits, housing authorities, homelessness-related housing supply tools, "
    "mortgages and housing finance programs, housing affordability programs, and statewide housing initiatives."
  ),

  "Residential Permitting, Building Standards & Housing Quality": (
    "Residential permitting and building standards policy including housing standards and habitability, "
    "residential building codes, inspection and enforcement, permitting and approvals processes, "
    "accessory dwelling units and residential construction rules, seismic and safety upgrades, "
    "and policies affecting residential construction quality and compliance."
  ),

  "Tenant Rights, Renting & Evictions": (
    "Tenant protections and rental housing policy including rent stabilization and rent caps, "
    "evictions and unlawful detainer process, just-cause protections, security deposits, "
    "rental fees and screening practices, tenant remedies and enforcement, "
    "landlord obligations, and rental housing dispute resolution."
  ),

  "K–12 Education": (
    "K-12 education policy including public schools, charter schools, school funding, "
    "curriculum and standards, testing and accountability, student discipline, teacher "
    "workforce, special education, school facilities, school safety, and student services."
  ),

  "Education Equity & Student Rights": (
    "Education equity and student rights policy including equal access to education, anti-discrimination protections, "
    "student civil rights, inclusive curriculum, language access, disability accommodations, "
    "discipline equity, foster and homeless youth education supports, and educational opportunity gaps."
  ),

  "Higher Education (UC/CSU/Community Colleges)": (
    "Higher education policy covering public universities and community colleges, tuition and fees, "
    "financial aid, admissions, campus operations, academic programs, research funding, "
    "student housing, governance, and workforce training partnerships."
  ),

  "Healthcare": (
    "Healthcare policy including access to care, Medi-Cal, public health programs, hospitals and clinics, "
    "health insurance coverage, costs and billing, health workforce, patient protections, "
    "disease prevention, and health system regulation."
  ),

  "Public Health & Disease Control": (
    "Public health and disease control policy including communicable disease surveillance, vaccination policy, "
    "public health emergency powers, outbreak response, testing and reporting requirements, "
    "health officer authority, quarantine and isolation rules, and prevention programs for population health."
  ),

  "Mental Health": (
    "Mental health policy including behavioral health services, crisis response, community mental health programs, "
    "psychiatric care, parity and insurance coverage, treatment access, "
    "and supports for severe mental illness."
  ),

  "Substance Use Treatment & Recovery": (
    "Substance use disorder policy including addiction treatment, rehabilitation programs, harm reduction, "
    "recovery services, overdose prevention, treatment access and funding, "
    "and regulation of treatment facilities."
  ),

  "Energy & Utilities": (
    "Energy and public utilities policy including electricity and gas service, investor-owned and public utilities, "
    "rates and billing, grid reliability, renewable energy deployment, transmission, "
    "distributed energy resources, utility regulation, and energy affordability programs."
  ),

  "Transportation": (
    "Transportation policy including roads and highways, transit systems, rail, aviation, ports, "
    "active transportation, freight and logistics, transportation funding, "
    "traffic safety, and infrastructure maintenance."
  ),

  "Vehicles & Driver Regulation": (
    "Vehicle policy including vehicle registration and titling, driver licensing, traffic laws and enforcement, "
    "vehicle safety standards, emissions and smog rules, electric vehicles and charging requirements, "
    "commercial vehicles, towing and impound rules, and DMV administration."
  ),

  "Infrastructure, Construction & Public Works": (
    "Infrastructure and public works policy including public construction projects, capital improvements, "
    "public contracting and procurement, prevailing wage and construction labor rules, "
    "infrastructure funding and delivery, project permitting and timelines, "
    "and maintenance of public facilities and state/local assets."
  ),

  "State Workforce & Labor Relations": (
    "State workforce policy including public employee pay and benefits, pensions, collective bargaining, "
    "civil service rules, recruitment and hiring, workplace conditions, labor relations, "
    "state departments' staffing, and employee discipline or grievances."
  ),

  "Labor & Workplace Standards": (
    "Labor and workplace policy including wages and hours, worker protections, workplace safety, "
    "collective bargaining, paid leave, discrimination in employment, "
    "gig and contractor classification, and labor enforcement."
  ),

  "State Budget & Appropriations": (
    "State budget and fiscal policy including appropriations, budget process, state spending plans, "
    "budget control, fiscal forecasting, funding allocations across programs, "
    "bond measures, and budget trailer bills."
  ),

  "Local Government": (
    "Local government policy including cities, counties, special districts, local governance structures, "
    "local fiscal powers, local program implementation, intergovernmental coordination, "
    "and local administrative authorities."
  ),

  "Government Operations & Administration": (
    "Government operations and administrative policy including state agencies and departments, "
    "administrative law and rulemaking, audits and oversight, program implementation, "
    "state service delivery, public records and transparency, procurement administration, "
    "and modernization of government operations."
  ),

  "Elections, Campaigns & Government Ethics": (
    "Political process policy including legislative rules and procedure, ethics rules, campaign finance, "
    "political disclosures, lobbying rules, election administration, ballot measures, "
    "and institutional governance of elections."
  ),

  "Voting Rights & Redistricting": (
    "Voting rights and representation policy including voter access, voter registration, vote-by-mail, "
    "election protections, language assistance, redistricting processes, "
    "district maps, and representation fairness."
  ),

  "Civil Rights & Anti-Discrimination": (
    "Civil rights policy including anti-discrimination protections in housing, employment, education, "
    "public accommodations, hate crime-related protections, equal access, "
    "and enforcement of civil rights laws."
  ),

  "Immigration Policy and Resources": (
    "Immigration-related state policy including services for immigrants, language access, "
    "state and local cooperation rules, employment and licensing access for immigrants, "
    "legal services, education and health access policies affecting immigrant communities."
  ),

  "Tribal Affairs": (
    "Tribal affairs policy including state-tribal relations, tribal sovereignty, tribal lands, "
    "compacts and agreements, cultural resource protections, tribal services, "
    "and consultation requirements."
  ),

  "Public Safety & Policing": (
    "Public safety and policing policy including law enforcement standards, use of force rules, "
    "police accountability, public safety funding, emergency response, "
    "community safety programs, and oversight mechanisms."
  ),

  "Firearms Policy": (
    "Firearms policy including gun sales and licensing, background checks, restrictions on possession, "
    "safe storage requirements, violence prevention, enforcement, "
    "and regulation of firearms and ammunition."
  ),

  "Criminal Law & Criminal Courts": (
    "Criminal law policy including crimes and penalties, charging and sentencing rules, criminal procedure, "
    "court processes, prosecutorial authority, defense rights, evidence standards, "
    "and court administration for criminal cases."
  ),

  "Incarceration & Pretrial Detention": (
    "Incarceration policy including jail and prison operations, custody standards, bail and pretrial detention, "
    "conditions of confinement, incarceration costs, capacity and overcrowding, "
    "and detention oversight."
  ),

  "Reentry, Probation & Parole": (
    "Corrections and reentry policy including probation and parole supervision, reentry services, "
    "rehabilitation programs, community supervision rules, expungement and record relief, "
    "and support for successful reintegration."
  ),

  "Courts & Judicial Administration": (
    "Court system and legal procedure policy including court administration, judicial procedures, "
    "rules of evidence, filings and fees, access to courts, legal aid, "
    "case management, and judicial governance."
  ),

  "Civil Law & Liability": (
    "Civil law policy including statutes and codes, civil procedure, torts and liability, damages, "
    "contracts, property and landlord-tenant civil rules, consumer civil remedies, "
    "and civil court operations."
  ),

  "Children & Family Courts / Child Welfare": (
    "Children and family court policy including juvenile justice, dependency court, family law proceedings, "
    "child welfare, foster care, adoption, guardianship, youth services, "
    "and child protective systems."
  ),

  "Violence Prevention & Victim Services": (
    "Violence and abuse policy including domestic violence, sexual violence, intimate partner violence, "
    "child abuse, restraining orders, victim services, prevention programs, "
    "and enforcement and reporting requirements."
  ),

  "Homelessness & Social Services": (
    "Homelessness and social services policy including homelessness programs, shelters and supportive housing, "
    "social safety net benefits, cash assistance, community services, "
    "case management, and service delivery coordination."
  ),

  "Disability Rights & Services": (
    "Disability policy including accessibility standards, disability rights, services and supports, "
    "community living programs, special education interfaces, employment supports, "
    "and protections under disability law."
  ),

  "Aging & Long-Term Care": (
    "Aging and elder care policy including long-term services and supports, nursing facilities, "
    "home- and community-based care, elder abuse prevention, caregiving supports, "
    "and programs for older adults."
  ),

  "Childcare & Early Childhood": (
    "Early childhood and childcare policy including childcare access and affordability, "
    "early learning programs, preschool, provider licensing and quality standards, "
    "childcare workforce, and family support services."
  ),

  "Professional Licensing & Workforce Development": (
    "Workforce development policy including professional licensing boards, credentialing, apprenticeships, "
    "job training programs, workforce pipelines, skills certification, "
    "and regulation of licensed professions."
  ),

  "Medical Practice & Clinical Research": (
    "Medical practice and clinical research policy including biomedical research, clinical trials, "
    "medical ethics, patient consent, research oversight, hospital clinical standards, "
    "and regulation of medical research institutions."
  ),

  "Consumer Protection": (
    "Consumer protection policy including unfair and deceptive practices, product safety, "
    "fraud prevention, consumer financial protections, debt collection, "
    "warranties, and enforcement by consumer agencies."
  ),

  "Banking & Financial Services": (
    "Banking and financial services policy including banks and credit unions, consumer finance regulation, "
    "lending and underwriting rules, fintech regulation, payments, money transmission, "
    "fraud protections, and financial oversight."
  ),

  "Insurance": (
    "Insurance policy including property and casualty insurance, health insurance regulation, "
    "claims and consumer protections, underwriting rules, disaster insurance markets, "
    "rates, and oversight of insurers."
  ),

  "Privacy, Data & Technology": (
    "Privacy and technology policy including data privacy rights, data security, cybersecurity, "
    "consumer data collection and sharing, AI and automated decision systems, "
    "digital identity, and technology regulation."
  ),

  "Telecommunications & Broadband": (
    "Telecommunications policy including broadband deployment, internet access affordability, "
    "telecom infrastructure, 911 and emergency communications, wireless and wireline regulation, "
    "and oversight of telecom providers."
  ),

  "Corporate Tax & Business Finance": (
    "Business and corporate finance policy including corporate taxation, business fees, financial reporting, "
    "corporate governance, state revenue measures affecting businesses, incentives, "
    "credits, and compliance requirements."
  ),

  "Corporate Regulation & Business Law": (
    "Corporate regulation and business law including corporate governance standards, mergers and acquisitions, "
    "securities and disclosures, business entity formation and compliance, fiduciary duties, "
    "antitrust and competition issues, and regulation of business practices and commercial conduct."
  ),

  "Personal Income Tax": (
    "Personal taxation policy including income tax, credits and deductions, filing rules, enforcement, "
    "taxpayer services, withholding, and state revenue measures affecting individuals and households."
  ),

  "Sales & Use Tax": (
    "Sales and use tax policy including tax base changes, exemptions, rates, administration, "
    "compliance and enforcement, and revenue impacts related to sales taxation."
  ),

  "Small Business & Entrepreneurship": (
    "Small business and entrepreneurship policy including business formation, licensing and permitting, "
    "access to capital, procurement opportunities, small business assistance programs, "
    "innovation policy, and regulatory burdens on small firms."
  ),

  "Parks, Wildlife & Recreation": (
    "Parks and wildlife policy including state parks, recreation areas, wildlife management, habitat protection, "
    "hunting and fishing regulation, conservation programs, endangered species stewardship, "
    "and outdoor access and recreation planning."
  ),

  "Natural Resources & Conservation": (
    "Resource conservation policy including conservation of water, land, forests, and natural resources, "
    "resource management programs, sustainability initiatives, and stewardship of public lands."
  ),

  "Environment & Climate Policy": (
    "Environmental and climate policy including greenhouse gas reduction, air quality, emissions standards, "
    "climate adaptation, environmental permitting, pollution control, "
    "environmental justice, and regulatory compliance."
  ),

  "Coastal & Ocean Policy": (
    "Coastal and ocean policy including coastal zone management, ocean resources, fisheries and marine habitats, "
    "sea level rise adaptation, coastal development permitting, "
    "and protection of beaches and marine ecosystems."
  ),

  "Agriculture & Food Systems": (
    "Agriculture and food systems policy including farming and ranching, farm labor, pesticides and inputs, "
    "food safety, agricultural water use, food supply chains, nutrition programs, "
    "and agricultural markets and subsidies."
  ),

  "Water Policy": (
    "Water policy including water rights and allocation, groundwater management, water quality, drought response, "
    "water infrastructure, storage and conveyance, drinking water access, "
    "and water agency governance."
  ),

  "Wildfire Prevention & Mitigation": (
    "Wildfire and fire prevention policy including fire risk reduction, forest management, defensible space, "
    "vegetation management, fire-safe building standards, utility wildfire mitigation, "
    "and community wildfire preparedness."
  ),

  "Emergency Management & Disaster Response": (
    "Emergency management policy including disaster preparedness, emergency response coordination, "
    "mutual aid, state emergency powers, disaster recovery programs, emergency communications, "
    "public alerts, and resilience planning for earthquakes, fires, floods, and storms."
  ),

  "Alcohol Regulation": (
    "Alcohol policy including licensing and regulation of alcohol sales, retail and on-premise service rules, "
    "public safety and DUI measures, underage access, taxation and fees, "
    "and enforcement by alcohol regulatory agencies."
  ),

  "Cannabis Regulation": (
    "Cannabis policy including licensing and regulation of cultivation, manufacturing, distribution, and retail, "
    "public health rules, taxation, local control, enforcement, product standards, "
    "and equity programs."
  ),

  "Gaming & Gambling": (
    "Gaming and gambling policy including casinos, tribal gaming, card rooms, sports betting, "
    "lottery operations, gambling regulation, consumer protections, "
    "and enforcement of gaming laws."
  ),

  "Animal Welfare & Veterinary Policy": (
    "Animal policy including animal welfare, veterinary regulation, livestock and agricultural animals, "
    "pets and shelters, animal health, wildlife-human conflict, "
    "and enforcement of animal protection standards."
  ),

  "Arts, Culture & Historic Preservation": (
    "Arts, culture, and history policy including cultural institutions, museums and libraries, "
    "historic preservation, cultural grants, public arts programs, "
    "and protection of cultural resources."
  ),

  "Entertainment & Media": (
    "Entertainment and media policy including film and television production, digital media, "
    "broadcast and streaming-related regulation, creative industries workforce, "
    "public media programs, and incentives supporting arts and entertainment sectors."
  ),

  "Tourism & Hospitality": (
    "Tourism and hospitality policy including visitor economy programs, hotels and lodging rules, "
    "short-term rentals, tourism promotion, restaurant and hospitality regulation, "
    "and local economic development tied to tourism."
  ),

  "Holidays & Commemorations": (
    # The first fragment ends with ", " so "anniversaries" and "memorials"
    # stay separate words after implicit literal concatenation.
    "Symbolic and commemorative policy including state holidays, awareness days and months, anniversaries, "
    "memorials and commemorations, honorary designations, and ceremonial recognitions."
  ),
}


In [31]:
def to_arr(x):
    """Coerce an embedding cell to a NumPy vector, or None if it is unusable.

    Parameters
    ----------
    x : object
        Cell value: an ndarray, an array-like of numbers, None, or a missing
        marker such as float NaN (which is what ``Series.map`` produces for a
        key that is absent from the lookup dict).

    Returns
    -------
    numpy.ndarray or None
        ``x`` itself when it is already an ndarray; a float32 array for other
        array-likes; None for None, for values that cannot be converted, and
        for scalars. A scalar NaN would otherwise become a 0-d array that
        passes ``isinstance(v, np.ndarray)`` checks and poisons any mean or
        blend it takes part in.
    """
    if x is None:
        return None
    if isinstance(x, np.ndarray):
        return x
    try:
        arr = np.array(x, dtype=np.float32)
    except Exception:
        # Best-effort conversion: anything NumPy cannot coerce counts as missing.
        return None
    if arr.ndim == 0:
        # Scalars (including NaN placeholders) are not embedding vectors.
        return None
    return arr

# Run both embedding columns through `to_arr`.
for _emb_col in ('digest_emb', 'subject_embedding'):
    full_table[_emb_col] = full_table[_emb_col].map(to_arr)

In [32]:
alpha = 0.75  # digest weight


def _mean_embedding(values):
    """Average the ndarray entries of `values`; None when there are none."""
    arrays = [v for v in values if isinstance(v, np.ndarray)]
    if not arrays:
        return None
    return np.stack(arrays).mean(axis=0)


g = full_table.groupby('bill_ID', sort=False)

# One (bill_ID, vector) pair per bill: a weighted blend of the mean digest
# embedding and the mean subject embedding, falling back to whichever of the
# two exists. Bills with neither are left out.
bill_rows = []
for bill_id, sub in g:
    d_mean = _mean_embedding(sub['digest_emb'].values)
    s_mean = _mean_embedding(sub['subject_embedding'].values)

    if d_mean is not None and s_mean is not None:
        bill_vec = alpha * d_mean + (1 - alpha) * s_mean
    elif d_mean is not None:
        bill_vec = d_mean
    elif s_mean is not None:
        bill_vec = s_mean
    else:
        continue

    bill_rows.append((bill_id, bill_vec.astype(np.float32)))

bill_level = pd.DataFrame(bill_rows, columns=['bill_ID', 'bill_emb'])
bill_level.shape

(46147, 2)

In [33]:
def join_unique(series, max_items=8):
    """Join the distinct non-null values of `series` with " | ".

    Values keep their order of first appearance, are converted with str(),
    and only the first `max_items` of them are kept.
    """
    distinct = series.dropna().unique().tolist()
    labels = list(map(str, distinct))
    kept = labels[:max_items] if len(labels) > max_items else labels
    return " | ".join(kept)

# Roll the per-row digest and subject text up to one string per bill, then
# attach both strings to the bill-level table.
_by_bill = full_table.groupby('bill_ID', sort=False)
digest_roll = _by_bill['digest'].apply(join_unique).reset_index()
subject_roll = _by_bill['subject'].apply(join_unique).reset_index()

bill_level = bill_level.merge(digest_roll, on='bill_ID', how='left')
bill_level = bill_level.merge(subject_roll, on='bill_ID', how='left')
bill_level.head()

Unnamed: 0,bill_ID,bill_emb,digest,subject
0,202120220SCR28,"[-0.020102115, 0.07975301, 0.025029115, 0.0082...",this measure would designate a specified porti...,Korean War Veterans Memorial Highway.
1,201720180AB800,"[-0.028764162, 0.040580273, -0.03576208, 0.042...",existing law defines hate crime as a criminal ...,Hate crimes: hotline.
2,200920100SB766,"[-0.06532735, 0.013930081, 0.016930588, 0.0088...",existing law until january 1 2011 authorizes r...,Horse racing.
3,200120020SB937,"[-0.116569154, -0.012979306, 0.01835078, 0.010...",the subletting and subcontracting fair practic...,Public contracts: bids and disputes.
4,201720180AB2213,"[-0.0076826895, 0.06388208, -0.009036416, -0.0...",the bill would exempt from the above described...,Firearms: ammunition sales.


In [34]:
# Stack the per-bill vectors into one float32 matrix (one row per bill, in
# `bill_level` row order) and pass it through sklearn's `normalize`.
_bill_matrix = np.stack(bill_level['bill_emb'].values)
X = normalize(_bill_matrix.astype(np.float32))
X.shape

(46147, 384)

In [35]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Labels and descriptions in matching (dict insertion) order.
anchor_labels = list(ANCHOR_TEXTS)
anchor_texts = list(ANCHOR_TEXTS.values())

# Encode the anchor descriptions and hold them as a float32 matrix.
A = np.asarray(
    embedder.encode(anchor_texts, normalize_embeddings=True, show_progress_bar=True),
    dtype=np.float32,
)

len(anchor_labels), A.shape

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

(66, (66, 384))

In [36]:
# Bill-by-anchor similarity matrix: entry (i, j) is the dot product of bill
# row i of X with anchor row j of A.
S = np.matmul(X, A.T)
S.shape

(46147, 66)

In [37]:
top_anchor_k = 2
anchor_gate_thresh = 0.26

# Anchor indices per bill, sorted from highest to lowest similarity.
anchor_order = np.argsort(-S, axis=1)

# For each bill keep the top-k anchors whose similarity clears the gate;
# a bill may end up with an empty set.
allowed_anchors = [
    set(j for j in ranked[:top_anchor_k] if scores[j] >= anchor_gate_thresh)
    for scores, ranked in zip(S, anchor_order)
]

bill_level['allowed_anchor_ids'] = allowed_anchors

In [38]:
# Invert the bill -> anchors assignment: anchor id -> row positions of the
# bills that were allowed under that anchor.
anchor_to_bills = defaultdict(list)
for row_pos, anchor_ids in enumerate(bill_level['allowed_anchor_ids']):
    for anchor in anchor_ids:
        anchor_to_bills[anchor].append(row_pos)

# Empty n x n sparse matrix, filled in by the neighbour search that follows.
n = X.shape[0]
connectivity = lil_matrix((n, n), dtype=np.int8)

In [39]:
# NOTE(review): sim_thresh is defined here but never applied in this cell -
# edges are admitted purely by mutual k-nearest-neighbour membership. Confirm
# whether a minimum-similarity filter was intended before relying on it.
sim_thresh = 0.35
k_neighbors = 40
neighbor_sets = {}

# Build the connectivity graph one anchor at a time: within each anchor's
# bills, connect two bills when each appears in the other's k-NN list
# (cosine metric).
for anchor_id, idxs in tqdm(anchor_to_bills.items(), desc="Anchors"):
    if len(idxs) < 2:
        continue  # a single bill has no neighbours to connect to

    Xa = X[idxs]

    nn = NearestNeighbors(
        n_neighbors=min(k_neighbors, len(idxs)),
        metric='cosine',
        algorithm='auto'
    )
    nn.fit(Xa)
    _, nbrs = nn.kneighbors(Xa)

    # Translate positions local to Xa back to global row positions and drop
    # each bill's own position from its neighbour set.
    nbr_map = {
        idxs[i]: set(idxs[j] for j in nbrs[i] if j != i)
        for i in range(len(idxs))
    }
    neighbor_sets[anchor_id] = nbr_map

    # Write edges for THIS anchor only. Earlier anchors were already written
    # in their own iteration, so re-walking all of `neighbor_sets` on every
    # pass would repeat the same assignments and make the loop quadratic in
    # the number of anchors without changing the resulting matrix.
    for i, nbrs_i in nbr_map.items():
        for j in nbrs_i:
            if i in nbr_map.get(j, ()):
                connectivity[i, j] = 1
                connectivity[j, i] = 1


Anchors: 100%|██████████| 66/66 [02:03<00:00,  1.87s/it]


In [40]:
# Stored entries in the connectivity matrix, and the average per row.
nnz, avg_degree = connectivity.nnz, connectivity.nnz / n
(nnz, avg_degree)

(1288128, 27.913580514442977)

In [41]:
target_cluster_size = 300  # desired average number of bills per cluster
# Floor division gives the same value as int(n / target_cluster_size) for
# these non-negative integers; max(1, ...) prevents asking the clusterer for
# zero clusters when there are fewer than target_cluster_size bills.
n_clusters = max(1, n // target_cluster_size)

In [42]:
# Ward-linkage agglomerative clustering with merges restricted by the
# `connectivity` graph built earlier.
clust = AgglomerativeClustering(
    n_clusters=n_clusters,
    connectivity=connectivity,
    linkage='ward',
)

# Fit, then read the assigned labels off the fitted estimator.
labels = clust.fit(X).labels_
bill_level['cluster_id'] = labels

  connectivity, n_connected_components = _fix_connectivity(


In [48]:
def _anchor_names(anchor_ids):
    """Translate a collection of anchor indices into their label strings."""
    return [anchor_labels[idx] for idx in anchor_ids]


bill_level['anchors'] = bill_level['allowed_anchor_ids'].apply(_anchor_names)

In [62]:
def _top_anchor_label(row_pos, anchor_ids):
    """Return the label of the highest-similarity allowed anchor for one bill.

    `anchor_ids` is a set, so its iteration order is hash order rather than
    similarity rank; taking "the first element" can therefore return the
    weaker of two anchors. The winner is chosen explicitly from the
    similarity matrix `S`. Bills with no allowed anchor get 'Other'.
    """
    if not anchor_ids:
        return 'Other'
    best = max(anchor_ids, key=lambda a: S[row_pos, a])
    return anchor_labels[best]


# Rows of `S` follow the row order of `bill_level` (X was stacked from it),
# so the positional index from enumerate() addresses the right row of S.
assert len(bill_level) == S.shape[0], "bill_level and S are out of sync"
bill_level['topic_clustered'] = [
    _top_anchor_label(pos, ids)
    for pos, ids in enumerate(bill_level['allowed_anchor_ids'])
]

In [65]:
# Integer code for each topic label, taken from a categorical view of the column.
_topic_categories = bill_level['topic_clustered'].astype('category')
bill_level['tc'] = _topic_categories.cat.codes

In [67]:
# bill id -> assigned topic label. Built with dict(zip(...)) instead of a
# module-level loop over a variable named `id`, which would leave the builtin
# id() shadowed for the rest of the kernel session.
bill_labels = dict(zip(bill_level['bill_ID'], bill_level['topic_clustered']))

with open('bill_labels.json', 'w') as f:
    json.dump(bill_labels, f)

In [68]:
# Lookup table from integer topic code back to topic label, one entry per
# distinct (label, code) pair, written out as JSON.
subj = bill_level[['topic_clustered', 'tc']].drop_duplicates()
subject_key = {
    int(code): topic
    for topic, code in zip(subj['topic_clustered'], subj['tc'])
}

with open('subject_key.json', 'w') as f:
    json.dump(subject_key, f)