In [60]:
import re, torch, json, pickle, itertools
import numpy as np
import pandas as pd
from tqdm import tqdm
import networkx as nx
from community import community_louvain
from sklearn.mixture import GaussianMixture
from sentence_transformers import SentenceTransformer, util
from collections import Counter, defaultdict
import warnings
from rapidfuzz import process, fuzz

# Silence pandas' SettingWithCopyWarning for the rest of the session.
# NOTE(review): this hides the warning rather than its cause; cells that assign
# columns on a DataFrame slice would be clearer with an explicit .copy() — confirm
# before removing this filter.
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

In [61]:
# Load the mapping whose keys become bill ids and whose values are looked up in
# `subject_originals` further down. The encoding is pinned to UTF-8 so the result
# does not depend on the platform's default text encoding.
with open('bill_subjects.json', 'r', encoding='utf-8') as f:
    bill_subjects = json.load(f)

In [62]:
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
# The context manager closes the file handle, which the bare open() call leaked.
with open('subjects_original.pkl', 'rb') as f:
    subject_originals = pickle.load(f)

In [63]:
# For every entry of bill_subjects, look its value up in subject_originals;
# entries whose value is not a key of subject_originals are skipped.
so = dict(
    (bill_id, subject_originals[subject_key])
    for bill_id, subject_key in bill_subjects.items()
    if subject_key in subject_originals
)

In [64]:
# One row per entry of `so`: the dict key becomes `bill_id`, the value `subject`.
subs = (
    pd.DataFrame.from_dict(so, orient='index', columns=['subject'])
    .reset_index()
    .rename(columns={'index': 'bill_id'})
)
# Drop rows without a subject. The explicit .copy() makes `subs` an independent
# frame, so adding columns to it later is not an assignment on a slice.
subs = subs.loc[subs['subject'].notna()].copy()

def top_subj(subj):
    """Return the part of `subj` before the first ':' with surrounding whitespace removed.

    Strings without a ':' are returned stripped in full. Anything that is not a
    `str` yields None.
    """
    if not isinstance(subj, str):
        return None
    # partition() returns the whole string as the head when ':' is absent.
    head, _sep, _rest = subj.partition(':')
    return head.strip()

# Derive the leading (pre-colon) subject for every row.
subs['top_subject'] = subs['subject'].map(top_subj)

In [65]:
# Boilerplate words stripped by canonical(). The word boundaries keep the pattern
# from deleting letters inside longer words (e.g. "act" inside "contracts",
# "state" inside "estate"); the optional trailing "s" also covers simple plurals.
_CANONICAL_STOPWORDS = re.compile(
    r'\b(?:california|state|bill|law|act|amendment|proposition|measure|initiative'
    r'|program|code|section|chapter|month|awareness|prevention)s?\b'
)


def canonical(sub):
    """Return a normalised key for the part of `sub` before the first ':'.

    The text is lower-cased, every character other than a-z and whitespace is
    replaced by a space, whole boilerplate words (see `_CANONICAL_STOPWORDS`) are
    removed, and runs of whitespace are collapsed.

    Parameters
    ----------
    sub : str
        Subject string, optionally of the form "top: detail".

    Returns
    -------
    str
        The normalised text; may be empty when only boilerplate words remain.
    """
    txt = sub.split(':')[0].lower()
    txt = re.sub(r'[^a-z\s]', ' ', txt)
    txt = _CANONICAL_STOPWORDS.sub('', txt)
    return re.sub(r'\s+', ' ', txt).strip()

def embed_subjects(subjs):
    """Embed cleaned subject strings with the 'all-MiniLM-L6-v2' sentence-transformer.

    Each subject is lower-cased, stripped of a fixed list of boilerplate
    substrings, reduced to letters and single spaces, and dropped if nothing is
    left. Because empty results are dropped, the returned array has one row per
    *surviving* subject (in input order), not one row per input; callers that need
    the matching subject strings must apply the same filter.

    Parameters
    ----------
    subjs : iterable of str

    Returns
    -------
    np.ndarray
        Output of `model.encode(..., normalize_embeddings=True)` as an array.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')

    def _normalize(text):
        # The boilerplate pattern matches substrings anywhere, including inside words.
        lowered = text.lower().strip()
        stripped = re.sub(r'(?:california|state|bill|law|act|amendment|proposition|measure|initiative|program|act|code|section|chapter|month|awareness|prevention)', '', lowered)
        letters_only = re.sub(r'[^a-z\s]', ' ', stripped)
        return re.sub(r'\s+', ' ', letters_only).strip()

    # Clean first, then discard subjects that became empty.
    cleaned = [c for c in map(_normalize, subjs) if c]
    vectors = encoder.encode(cleaned, normalize_embeddings=True, batch_size=128)
    return np.asarray(vectors)


In [66]:
# Embed each distinct, non-null top-level subject once.
unique_top_subjects = subs['top_subject'].dropna().unique().tolist()
embs = embed_subjects(unique_top_subjects)

In [67]:
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage

# Average-linkage clustering of the subject embeddings under cosine distance.
# The distance threshold starts at 85% of the largest jump in merge distances
# (ignoring the first 10% of merges) and is shrunk by 10% at a time until at
# least `min_clusters` clusters result or the threshold becomes negligible.
X = np.asarray(embs, dtype=np.float32)


def _cluster_at(threshold):
    """Fit average-linkage cosine clustering on X cut at `threshold`; return (model, labels)."""
    fitted = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=threshold,
        metric='cosine',
        linkage='average',
        compute_full_tree=True,
    )
    return fitted, fitted.fit_predict(X)


if X.ndim != 2 or X.shape[0] < 2:
    # linkage() rejects an empty distance vector and the clusterer needs at least
    # two samples, so with fewer than two rows every row is its own cluster.
    labels = np.arange(X.shape[0])
else:
    # Re-normalise rows to unit length; all-zero rows are left as they are.
    n = np.linalg.norm(X, axis=1, keepdims=True); n[n == 0] = 1.0
    X = X / n

    dist_vec = pdist(X, metric='cosine')
    Z = linkage(dist_vec, method='average')
    d = Z[:, 2]  # merge distances, one per merge

    jumps = np.diff(d)
    if len(jumps):
        start = int(0.1 * len(jumps))
        idx = start + int(np.argmax(jumps[start:]))
        cut = 0.5 * (d[idx] + d[idx + 1])  # midpoint of the largest jump
    else:
        # Exactly two rows: a single merge, so there is no jump to locate.
        idx = 0
        cut = d[idx]

    min_clusters = max(20, X.shape[0] // 40)
    t = cut * 0.85

    # Cutting the tree at t leaves one cluster more than the number of merges at
    # distance >= t, so the threshold can be shrunk using the merge distances
    # alone instead of refitting the clusterer after every 10% reduction.
    while np.count_nonzero(d >= t) + 1 < min_clusters and t > 1e-9:
        t *= 0.9

    model, labels = _cluster_at(t)

    # Safety net: if the fitted model still reports too few clusters (e.g. a
    # floating-point tie right at the threshold), keep shrinking and refitting.
    while labels.max() + 1 < min_clusters and t > 1e-9:
        t *= 0.9
        model, labels = _cluster_at(t)

print("clusters:", labels.max() + 1 if labels.size else 0)

KeyboardInterrupt: 

In [None]:
def sub_clean(s):
    """Lower-case `s`, remove boilerplate substrings, and keep only letters and single spaces.

    The boilerplate pattern has no word boundaries, so it also removes matches
    that occur inside longer words. The steps are the same as the cleaning helper
    nested in `embed_subjects`; the two must stay in step for the subject list
    filtered with this function to line up with the embedding rows.
    """
    lowered = s.lower().strip()
    without_boilerplate = re.sub(r'(?:california|state|bill|law|act|amendment|proposition|measure|initiative|program|act|code|section|chapter|month|awareness|prevention)', '', lowered)
    letters_only = re.sub(r'[^a-z\s]', ' ', without_boilerplate)
    return re.sub(r'\s+', ' ', letters_only).strip()

# Keep the distinct top-level subjects that are still non-empty after cleaning.
candidate_subjects = subs['top_subject'].dropna().unique().tolist()
subjs = list(filter(sub_clean, candidate_subjects))

In [None]:
# Pair every retained subject with its cluster id; both sequences must have the same length.
subj_df = pd.DataFrame(dict(subject=subjs, cluster=labels))

In [None]:
# Rows whose top-level subject is a string that survives cleaning.
has_clean_subject = subs['top_subject'].apply(lambda x: isinstance(x, str) and sub_clean(x) != '')
# Attach the cluster id by matching subs.top_subject against subj_df.subject.
sbs = subs.loc[has_clean_subject].merge(
    subj_df,
    left_on='top_subject',
    right_on='subject',
    how='left',
)

In [68]:
# Load the mapping that is later applied to `bill_id` to obtain a cluster per bill.
# The encoding is pinned to UTF-8 so the result does not depend on the platform's
# default text encoding.
with open('bill_labels.json', 'r', encoding='utf-8') as f:
    bill_labels = json.load(f)

In [89]:
# Work on an independent copy so adding the `cluster` column is not an assignment
# on a slice of `subs`.
clusters = subs[['bill_id', 'top_subject']].copy()
clusters['cluster'] = clusters['bill_id'].map(bill_labels)
# Keep only bills that have an entry in bill_labels, then store the ids as ints.
clusters = clusters.loc[clusters['cluster'].notna()].copy()
clusters['cluster'] = clusters['cluster'].astype(int)

# For each cluster, the (up to) ten most frequent top-level subjects, most common first.
clstrs = {
    int(cluster_id): [subject for subject, _count in Counter(group).most_common(10)]
    for cluster_id, group in clusters.groupby('cluster')['top_subject']
}

In [None]:
from transformers import pipeline

# NOTE(review): this rebinds `labels`, which an earlier cell uses for the array of
# cluster ids returned by fit_predict; re-running that earlier DataFrame cell after
# this one would pick up this dict instead.
labels = {}

# Use Apple's Metal backend when it is available, otherwise fall back to the CPU,
# so the cell also runs on machines without MPS.
summarizer_device = 'mps' if torch.backends.mps.is_available() else 'cpu'
summarizer = pipeline("text2text-generation", model="google-t5/t5-small", device=summarizer_device)

# Instruction prepended to every list of subjects.
q = "What concise phrase encompasses all topics in this list of phrases"

def funnel_text(v):
    """Ask the text2text pipeline for a phrase covering the subjects in `v`.

    Parameters
    ----------
    v : iterable
        Subject strings; entries that are not `str` are skipped.

    Returns
    -------
    str
        The 'generated_text' of the pipeline's first result.
    """
    # Built outside the f-string: reusing the enclosing quote character inside a
    # replacement field is a SyntaxError before Python 3.12. Non-string entries
    # are skipped because str.join would raise TypeError on them.
    topics = "; ".join(s for s in v if isinstance(s, str))
    prompt = f"{q} ?  {topics}"
    result = summarizer(prompt, do_sample=False)
    return result[0]['generated_text']

# Generate one label per cluster, with a progress bar over the clusters.
for cluster_id, top_subjects in tqdm(clstrs.items()):
    labels[cluster_id] = funnel_text(top_subjects)

Device set to use mps
100%|██████████| 437/437 [11:52<00:00,  1.63s/it]


In [118]:
# Display the cluster id -> generated label mapping built by the labelling loop.
labels

{0: 'Hypodermic needles and syringes.; Bone Marrow Donation',
 1: 'Notice of levy.; Dropout acknowledgment form.; Notice to employees; Official notice',
 2: 'Question',
 3: 'Question',
 4: 'Motorcycle Awareness Month.; Electric bicycles; Motorcycles; Powered wheelchairs; Bicycles',
 5: 'True',
 6: 'Food facilities; Food stamps; School Breakfast Week.; School meals; Food; Milk.;',
 7: 'Unmanned aircraft systems.; Unmanned aircraft systems; Bar pilots; Aviation Awareness Month.;',
 8: 'Oil and gas; Natural gas; Oil spills; Oil spill prevention and response.; Gaso',
 9: 'Transportation funding; California Transportation Commission; High-speed rail; High-Speed Rail Authority; Transportation projects',
 10: 'Telecommunications; Communications; Telephone corporations; Internet; Broadband infrastructure; Interception of electronic communications',
 11: 'Lymphedema Awareness Day.; Multiple Myeloma Awareness Month.;',
 12: 'Inedible kitchen grease.; Topical flouride application.; Grease.; Corn'