In [1]:
import re, torch, json, pickle, itertools
import numpy as np
import pandas as pd
from tqdm import tqdm
import networkx as nx
from community import community_louvain
from sklearn.mixture import GaussianMixture
from sentence_transformers import SentenceTransformer, util
from collections import Counter, defaultdict
from nltk.corpus import stopwords
import warnings

warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
stopwords = set(stopwords.words('english'))

In [2]:
with open('bill_subjects.json', 'r') as f:
    bill_subjects = json.load(f)

In [3]:
subject_originals = pickle.load(open('subjects_original.pkl', 'rb'))

In [4]:
so = {k: subject_originals[v] for k, v in bill_subjects.items() if v in subject_originals}

In [None]:
def canonical(sub):
    txt = sub.lower()
    txt = re.sub(r'[^a-z\s]', ' ', txt)
    txt = re.sub(r'(?:california|state|bill|law|act|amendment|proposition|measure|initiative|program|act|code|section|chapter|month|awareness|prevention)', '', txt)
    txt = re.sub(r'\s+', ' ', txt).strip()
    txt = ' '.join([w for w in txt.split() if w not in stopwords])
    return txt.strip()

canonical_subjects = {k: canonical(v) for k, v in so.items()}
canonical_set = set(canonical_subjects.values())

model = SentenceTransformer("google/embeddinggemma-300m-qat-q4_0-unquantized")
subs = list(canonical_set)
embs = model.encode(subs, show_progress_bar=True, batch_size=256)

In [14]:
import igraph as ig, leidenalg as la
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score

X = normalize(embs.astype(np.float32))
nbrs = NearestNeighbors(n_neighbors=min(22, max(10, int(np.sqrt(len(X))))), metric="cosine").fit(X)
dist, idx = nbrs.kneighbors(X)
sim = 1.0 - dist
rows = np.repeat(np.arange(len(X)), idx.shape[1])
cols = idx.ravel()
weights = sim.ravel()
m = np.vstack([rows, cols, weights]).T
m = m[rows != cols]
G = ig.Graph(n=len(X), edges=list(map(tuple, m[:, :2].astype(int))), directed=False)
G.es["weight"] = m[:, 2].astype(float)
res_grid = np.linspace(0.2, 6.0, 24)

In [15]:
parts = []
scores = []
for r in res_grid:
    part = la.find_partition(G, la.RBConfigurationVertexPartition, weights="weight", resolution_parameter=float(r), seed=42)
    labels = np.array(part.membership)
    if len(np.unique(labels)) > 1:
        try:
            s = silhouette_score(X, labels, metric="cosine")
        except Exception:
            s = -1.0
    else:
        s = -1.0
    parts.append(labels)
    scores.append(s)
best = int(np.argmax(scores))
labels = parts[best]
k = len(np.unique(labels))
centroids = []
for c in range(k):
    msk = labels==c
    v = X[msk].mean(axis=0)
    v = v/np.linalg.norm(v) if np.linalg.norm(v)>0 else v
    centroids.append(v)
centroids = np.stack(centroids)

In [16]:
sims = X @ centroids.T
cluster_sim = sims[np.arange(len(X)), labels]
thr = np.percentile(cluster_sim, 10)
noise = cluster_sim < max(thr, 0.1)
labels_noise = labels.copy()
labels_noise[noise] = -1
clusters = {}
for cid in sorted(set(labels_noise) - {-1}):
    idx_c = np.where(labels_noise==cid)[0]
    cvec = centroids[cid]
    ex_i = idx_c[np.argmax(X[idx_c] @ cvec)]
    clusters[int(cid)] = {
        "indices": idx_c.tolist(),
        "subjects": [subs[i] for i in idx_c],
        "centroid": cvec.tolist(),
        "exemplar_index": int(ex_i),
        "exemplar_subject": subs[ex_i]
    }
result = {
    "labels": labels_noise.tolist(),
    "clusters": clusters,
    "noise_indices": np.where(labels_noise==-1)[0].tolist(),
    "resolution": float(res_grid[best]),
    "silhouette": float(scores[best])
}

In [19]:
results = [(c['exemplar_subject'], c['subjects'], len(c['subjects'])) for c in result['clusters'].values()]

In [23]:
labels = pd.DataFrame({
    'bill_id': list(so.keys()),
    'subject': list(so.values())
})

In [28]:
clust_reverse = {sub: cid for cid, c in clusters.items() for sub in c['subjects']}

In [31]:
labels['clean_subject'] = labels['bill_id'].map(canonical_subjects)
labels['cluster'] = labels['clean_subject'].map(clust_reverse)

In [18]:
with open('bill_ids.txt', 'r') as f:
    bill_ids = f.read().splitlines()

In [19]:
with open('missed_bills.txt', 'r') as f:
    missed_bills = f.read().splitlines()

In [20]:
bill_id_mapping = pickle.load(open('bill_id_mapping.pkl', 'rb'))

In [21]:
missing_variations = [k for k, v in bill_id_mapping.items() if v in missed_bills]

In [22]:
bill_labels = {}
for _, row in labels[['bill_id', 'cluster']].drop_duplicates().iterrows():
    bill_labels[row['bill_id']] = row['cluster']

In [23]:
with open('bill_labels.json', 'w') as f:
    json.dump(bill_labels, f)

In [24]:
digests = pd.read_csv('ca_leg/legislation_data/digest.csv')

In [25]:
digest_embeddings = torch.load('digests.pt', weights_only=False)

In [26]:
repairs = digests.loc[digests['bill_id'].isin(missing_variations)]
repairs['bill'] = repairs['bill_id'].map(bill_id_mapping)
repairs['version'] = repairs['bill_id'].apply(lambda x: x[-5:-3]).astype(int)
repairs = repairs.sort_values('version', ascending=False).groupby('bill').head(1)

In [27]:
de = {k: v.cpu().numpy() for k, v in digest_embeddings.items() if k in repairs['DigestText'].values}

In [28]:
repairs['digest_embedding'] = repairs['DigestText'].map(de)

In [29]:
sample_weights = labels['cluster'].value_counts().to_dict()

def sample_weighted(labs, sample_weights, n):
    lab = labs.copy().sample(frac=1).reset_index(drop=True)
    weights = labs['cluster'].map(sample_weights)
    return lab.sample(n, weights=weights, replace=True)

training_sample = sample_weighted(labels.loc[~labels['bill_id'].isin(missed_bills)], sample_weights, 2000)

In [31]:
t_vars = [k for k, v in bill_id_mapping.items() if v in training_sample['bill_id'].values]
t = digests.loc[digests['bill_id'].isin(t_vars)]
t['bill'] = t['bill_id'].map(bill_id_mapping)
t['version'] = t['bill_id'].apply(lambda x: re.sub(r'\D+', '', x)[-2:]).astype(int)
t = t.sort_values('version', ascending=False).groupby('bill').head(1)

dee = {k: v.cpu().numpy() for k, v in digest_embeddings.items() if k in t['DigestText'].values}
t['digest_embedding'] = t['DigestText'].map(dee)
t = t.loc[t['digest_embedding'].notna()]
t = t.merge(training_sample[['bill_id', 'cluster']], right_on='bill_id', left_on='bill', how='inner')

In [33]:
X = np.stack(t['digest_embedding'].values)
y = t['cluster'].values

In [34]:
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'criterion': ['gini'],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2],
    'min_samples_leaf': [2],
    'max_features': ['sqrt', 'log2']
}
rf = RandomForestClassifier(random_state=42, n_jobs=1)
grid = ParameterGrid(param_grid)
best_score, best_params = 0, None
for params in tqdm(grid):
    rf.set_params(**params)
    rf.fit(X, y)
    score = rf.score(X, y)
    if score > best_score:
        best_score = score
        best_params = params
print(f"Best Score: {best_score:.4f}")

100%|██████████| 6/6 [00:47<00:00,  7.88s/it]

Best Score: 0.9865





In [36]:
repairs['label_pred'] = rf.predict(np.vstack(repairs['digest_embedding']))
reps = {k: v for k, v in repairs[['bill', 'label_pred']].values if v is not None}
bbb = bill_labels.copy()
for k, v in reps.items():
    bbb[k] = v

with open('bill_labels_updated.json', 'w') as f:
    json.dump(bbb, f)

In [5]:
with open('bill_labels_updated.json', 'r') as f:
    updated_labels = json.load(f)

In [None]:
labels = pd.read_csv('sampled_labels - sampled_labels.csv')
so = {row['cluster']: row['Label'] for _, row in labels.iterrows()}

In [38]:
corrections = {}

for l in labels.groupby('Label')['cluster'].count().sort_values(ascending=False).loc[lambda x: x > 1].index:
    correction = [k for k, v in so.items() if v == l]
    m = min(correction)
    for c in correction:
        if c != m:
            corrections[c] = m

In [41]:
updated_labels2 = {}

for k, v in updated_labels.items():
    if v in corrections:
        updated_labels2[k] = corrections[v]
    else:
        updated_labels2[k] = v

In [42]:
with open('bill_labels_updated.json', 'w') as f:
    json.dump(updated_labels2, f)

In [12]:
labels = pd.DataFrame.from_dict(updated_labels, orient='index', columns=['cluster']).reset_index(names='bill_id')

In [17]:
text = labels['bill_id'].map(bill_subjects)
labels['subject'] = text

Unnamed: 0,bill_id,cluster,subject
0,200320040SB73,82,national guard
1,200520060AB1484,5,sexually violent predators definition
2,200920100AB2531,32,redevelopment economic development
3,200520060SB1576,102,foster care transitional housing
4,200920100AB383,34,criminal procedure dna evidence
...,...,...,...
46095,201720180ACR25,115,
46096,200920100SJR35,115,
46097,201720180AB291,34,
46098,201720180ACR6,56,


In [21]:
sample = labels.groupby('cluster').sample(50).reset_index()
sample['count'] = sample.groupby('cluster').cumcount()

In [23]:
sample.pivot(index='cluster', columns='count', values='subject').to_csv('sampled_labels.csv', index=True)

In [20]:
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import CountVectorizer

In [18]:
labels.groupby('cluster').sample(50, replace=True).sort_values('cluster').to_csv('sampled_labels.csv', index=False)

In [133]:
def _clean(text):
    if not isinstance(text, str):
        return ""
    text = text.lower().strip()
    text = re.sub(r"\b(state|bill|act|law|code|section|chapter|california|month)\b", " ", text, flags=re.I)
    text = re.sub(r"\s+", " ", text).lower()
    return text

def text_cluster(label, ngram_range=(2, 3), max_features=125):
    section = labels.loc[labels['cluster'] == label, 'subject'].values
    cleaned_texts = [_clean(text) for text in section if text and isinstance(text, str)]

    if not cleaned_texts:
        return []

    vectorizer = CountVectorizer(
        ngram_range=ngram_range,
        max_features=max_features,
        stop_words='english',
        min_df=5,
        lowercase=True
    )

    try:
        count_matrix = vectorizer.fit_transform(cleaned_texts)
        feature_names = vectorizer.get_feature_names_out()

        phrase_counts = count_matrix.sum(axis=0).A1

        phrase_count_pairs = list(zip(feature_names, phrase_counts))
        phrase_count_pairs.sort(key=lambda x: x[1], reverse=True)

        return phrase_count_pairs[:3]

    except ValueError:
        return []

In [134]:
cluster_phrases = {}
for label in labels['cluster'].unique():
    cluster_phrases[label] = text_cluster(label)

In [135]:
cp = pd.DataFrame.from_dict(cluster_phrases, orient='index', columns=['phrase1', 'phrase2', 'phrase3']).reset_index(names='cluster').sort_values('cluster')