In [6]:
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, silhouette_samples
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from tqdm import tqdm
import re, torch
import json

In [7]:
# Load the bill -> subject mapping from its JSON file.
with open('bill_subjects.json') as subjects_file:
    bill_subjects = json.load(subjects_file)

In [8]:
# One bill id per line; splitlines() drops the line terminators.
with open('bill_ids.txt') as ids_file:
    bill_ids = ids_file.read().splitlines()

In [9]:
import requests
from bs4 import BeautifulSoup

def get_bill_subject(bill_id, timeout=30):
    """Fetch the subject line shown on a bill's status page.

    Args:
        bill_id: Identifier sent as the ``bill_id`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the first element matching ``.statusCellData #subject``.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status.
        IndexError: If the page contains no matching subject element.
    """
    url = 'https://leginfo.legislature.ca.gov/faces/billStatusClient.xhtml'
    # Let requests build the query string so the id is URL-encoded, and bound
    # the wait so one stalled connection cannot hang a long scraping loop.
    response = requests.get(url, params={'bill_id': bill_id}, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    matches = soup.select('.statusCellData #subject')
    if not matches:
        raise IndexError(f'no subject element found for bill_id={bill_id!r}')
    return matches[0].text

In [None]:
# Scrape the subject line for every bill id (one HTTP request per bill).
subs2 = []
failed_bills = []  # (bill_id, error) pairs for pages that could not be fetched or parsed
for bill_id in tqdm(bill_ids):
    try:
        subs2.append(get_bill_subject(bill_id))
    except Exception as exc:
        # Best-effort scrape: skip the bill but keep a record of why it failed.
        # Catching Exception rather than using a bare `except` lets
        # KeyboardInterrupt stop this long-running loop.
        failed_bills.append((bill_id, repr(exc)))

  5%|▍         | 1230/26608 [03:50<3:14:11,  2.18it/s]

In [None]:
# Prefer Apple's MPS backend (the original setting) and fall back to CUDA or
# CPU so the notebook also runs on machines where MPS is unavailable.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [None]:
def text_clean(title):
    """Normalise a title to lowercase alphanumeric words separated by single spaces.

    Parenthesised segments are removed, every character that is not an ASCII
    letter, a digit, or whitespace becomes a space, and whitespace runs are
    collapsed. Anything that is not a ``str`` yields ``''``.
    """
    if isinstance(title, str):
        without_parens = re.sub(r'\(.*?\)', '', title)
        alnum_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', without_parens)
        collapsed = re.sub(r'\s+', ' ', alnum_only)
        return collapsed.strip().lower()
    return ''

def batched_embeddings(values, output_dims=384):
    """Embed the distinct cleaned strings found in ``values``.

    Args:
        values: Iterable of raw strings. Entries that are not strings, or that
            are empty after ``text_clean``, are ignored.
        output_dims: Maximum embedding width. Vectors wider than this are cut
            to their first ``output_dims`` components and re-normalised;
            ``None`` keeps the full width.

    Returns:
        dict mapping each distinct cleaned string to its embedding tensor, in
        first-seen order. Empty when nothing is left to embed.
    """
    cleaned = (text_clean(v) for v in values)
    # dict.fromkeys de-duplicates while keeping first-seen order. The previous
    # list(set(...)) produced an order that varies between interpreter runs
    # (string hashing is randomised), so downstream row order was irreproducible.
    vals = list(dict.fromkeys(c for c in cleaned if c != ''))
    if not vals:
        return {}
    # `num_workers` and `output_dims` are not documented keywords of
    # SentenceTransformer.encode, so they are no longer forwarded; the width
    # limit is applied explicitly below instead.
    embeddings = model.encode(
        vals,
        batch_size=64,
        show_progress_bar=True,
        convert_to_tensor=True,
        normalize_embeddings=True,
    )
    if output_dims is not None and embeddings.shape[1] > output_dims:
        # Truncation breaks unit length, so normalise the shortened vectors again.
        embeddings = torch.nn.functional.normalize(embeddings[:, :output_dims], p=2, dim=1)
    return dict(zip(vals, embeddings))

# Embed every scraped subject; the result maps cleaned subject text to its embedding.
# NOTE(review): `subjects2` is not referenced again in the cells shown here —
# presumably it is persisted in a cell that is not shown; confirm.
subjects2 = batched_embeddings(subs2)

In [189]:
# Load the saved subject -> embedding mapping.
# map_location='cpu' lets a file saved from an accelerator (e.g. MPS) load on
# any machine; the next step moves every tensor to the CPU anyway.
# NOTE(security): torch.load unpickles the file — only load files you trust.
subjects = torch.load('subject_embeddings.pt', map_location='cpu')

In [91]:
# Stack the embeddings into one matrix, one row per subject in dict order.
X = np.stack([embedding.cpu().numpy() for embedding in subjects.values()])

In [92]:
# Fit Ward-linkage agglomerative clustering for each candidate cluster count
# and keep the labels together with the overall silhouette score.
clustering = {}
for n in tqdm([375]):
    clusterer = AgglomerativeClustering(linkage='ward', n_clusters=n)
    clusters = clusterer.fit(X).labels_
    silhouette = silhouette_score(X, clusters)
    clustering[n] = dict(clusters=clusters, silhouette=silhouette)
# Show the candidates, best silhouette first.
pd.DataFrame.from_dict(clustering, orient='index').sort_values(by='silhouette', ascending=False)

100%|██████████| 1/1 [00:58<00:00, 58.64s/it]


Unnamed: 0,clusters,silhouette
375,"[372, 79, 4, 319, 13, 361, 128, 282, 219, 195,...",0.044097


In [93]:
# Use the label array from the 375-cluster run for the rest of the analysis.
clusters = clustering[375]['clusters']

In [94]:
# Per-row silhouette coefficient for the chosen labelling (one value per row of X).
silhouettes = silhouette_samples(X, clusters)

In [95]:
# One row per subject with its cluster label and silhouette value.
# Rows line up with X because X was built by iterating `subjects` in the same key order.
subj = pd.DataFrame({'subject': subjects.keys(), 'label': clusters, 'silhouette': silhouettes})

In [96]:
# For every cluster, pick the subject with the highest silhouette value as its
# representative: sort best-first, then keep the first row of each label.
best_per_label = (
    subj.sort_values('silhouette', ascending=False)
    .groupby('label')
    .head(1)
    .reset_index(drop=True)
)
labels = {k: v.values[0][0] for k, v in best_per_label.groupby('label')[['subject']]}

In [97]:
from scipy.cluster.hierarchy import fcluster, linkage

In [98]:
# Mean embedding of every leaf cluster, then Ward linkage over those centroids.
leaf_ids = np.sort(subj['label'].unique())
leaf_of_row = subj['label'].to_numpy()
centroids = np.vstack([X[leaf_of_row == leaf_id].mean(axis=0) for leaf_id in leaf_ids])
Z = linkage(centroids, method='ward')

In [99]:
# Cut the centroid dendrogram at several coarser levels and record, for every
# subject, the group its leaf cluster falls into at each level.
targets = [150, 75, 40, 15]
leaf_of_subject = subj['label']
hierarchy = {}
for k in targets:
    # fcluster numbers groups from 1; shift to 0-based ids.
    s_labels = fcluster(Z, t=k, criterion='maxclust') - 1
    map_leaf = {leaf: group for leaf, group in zip(leaf_ids, s_labels)}
    hierarchy[k] = leaf_of_subject.map(map_leaf).to_numpy()
# Columns are the integer targets; the subject index becomes a 'subject' column.
hierarchy = pd.DataFrame(hierarchy, index=subj['subject']).reset_index()

In [100]:
# Attach the coarser group ids to each subject and give the integer-named
# level columns readable names.
group_columns = {150: 'group_150', 75: 'group_75', 40: 'group_40', 15: 'group_15'}
groupings = subj.merge(hierarchy, how='left', on='subject').rename(columns=group_columns)

In [101]:
# Persist the distinct (subject, leaf label) pairs.
groupings[['subject', 'label']].drop_duplicates().to_csv('subject_labels.csv', index=False)

In [114]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sentence_transformers import SentenceTransformer, util

def _top_terms(vectorizer, docs, top_k=10):
    """Return up to ``top_k`` feature names that best characterise ``docs``.

    With several documents the terms are ranked by their weight in the first
    TruncatedSVD component; with a single document they are ranked by that
    document's own weights. An empty array is returned when the vectorizer
    finds no usable vocabulary.
    """
    try:
        matrix = vectorizer.fit_transform(docs)
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary, e.g. when the
        # documents contain only stop words or no n-gram of the requested length.
        return np.array([], dtype=object)
    if matrix.shape[0] > 1:
        weights = TruncatedSVD(n_components=1, random_state=0).fit(matrix).components_[0]
    else:
        weights = matrix.toarray()[0]
    return vectorizer.get_feature_names_out()[weights.argsort()[::-1][:top_k]]


grouped = groupings.groupby('label')['subject'].apply(list)

clear_cluster_names = {}

# The loop variable is deliberately not called `subjects`: that name holds the
# subject -> embedding dict used by earlier cells and must not be rebound.
for label, cluster_subjects in grouped.items():
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 5), max_features=200)
    c_vectorizer = CountVectorizer(stop_words='english', ngram_range=(2, 4), max_features=200)

    # Both entries are computed for every cluster, including single-subject
    # ones, so no cluster inherits the terms of the previously processed cluster.
    clear_cluster_names[label] = {
        'tf_top': _top_terms(vectorizer, cluster_subjects),
        'c_top': _top_terms(c_vectorizer, cluster_subjects),
    }

In [103]:
# Write one row per cluster with columns cluster, tf_top and c_top.
# NOTE(review): the cell that reads this file back assigns four column names
# (adding 'summary_phrase'), while only three columns are written here —
# presumably that column is added to the CSV outside the notebook, in which
# case re-running this cell would overwrite it. Confirm.
pd.DataFrame.from_dict(clear_cluster_names, orient='index').reset_index(names='cluster').to_csv('cluster_names.csv', index=False)

In [143]:
# Read the per-cluster naming table back from disk.
cluster_names = pd.read_csv('cluster_names.csv')

In [144]:
# Name the four columns; this raises if the file does not have exactly four.
# NOTE(review): 'summary_phrase' is not produced by any cell shown here —
# presumably it was added to cluster_names.csv outside the notebook; confirm.
cluster_names.columns = ['cluster', 'tf_top', 'c_top', 'summary_phrase']

In [None]:
# Look up each leaf cluster's summary phrase and attach it to every subject row.
cluster_labels = dict(zip(cluster_names['cluster'], cluster_names['summary_phrase']))
groupings['label_name'] = groupings['label'].map(cluster_labels)

In [None]:
# Persist the full subject table (labels, group levels, label names).
groupings.to_csv('groupings.csv', index=False)

In [170]:
# Map every bill to the leaf-cluster label of its subject; -1 marks bills whose
# subject has no entry in `subj`.
# NOTE(review): the lookup assumes the values in `bill_subjects` are spelled
# exactly like subj['subject'] — confirm they went through the same cleaning.
subj_labels = dict(zip(subj['subject'], subj['label'].tolist()))

bill_subjects_clean = {
    bill: subj_labels.get(subject, -1)
    for bill, subject in bill_subjects.items()
}

In [172]:
# Save the bill -> cluster label mapping as JSON.
with open('bill_labels.json', 'w') as labels_file:
    json.dump(bill_subjects_clean, labels_file)

In [2]:
# Reload the subject table from groupings.csv; this rebinds `groupings` to the
# CSV round-tripped copy.
groupings = pd.read_csv('groupings.csv')

In [47]:
# One label name per leaf cluster, ordered like `leaf_ids` (and so like `centroids`).
label_name_by_leaf = groupings.groupby('label')['label_name'].first()
leaf_labels = label_name_by_leaf.loc[leaf_ids].tolist()

# Re-create the sentence encoder and embed the label names as unit vectors.
model = SentenceTransformer('all-MiniLM-L6-v2', device='mps')
label_vecs = model.encode(leaf_labels, normalize_embeddings=True)

In [None]:
# Blend cluster geometry with label text: weight a for the L2-normalised
# centroids, 1 - a for the label-name embeddings, joined column-wise.
a = 0.75
centroids_norm = normalize(centroids, norm='l2')
weighted_centroids = a * centroids_norm
weighted_labels = (1 - a) * label_vecs
hybrid = np.hstack([weighted_centroids, weighted_labels])

In [21]:
# Distinct combinations of the four group levels and the leaf label name.
gr = groupings[['group_150', 'group_75', 'group_40', 'group_15', 'label_name']].drop_duplicates()

In [23]:
# Persist the de-duplicated group/label table.
gr.to_csv('groupings_clean.csv', index=False)

In [4]:
from transformers import pipeline

In [18]:
def cluster_group_doc(level):
    """Build one text document per group at the given hierarchy level.

    ``level`` is the name of a group column in ``groupings``. The distinct
    label names within each group are joined with ``'. '``; the result is a
    Series indexed by group id in sorted order.
    """
    def _join(names):
        return '. '.join(names)

    pairs = groupings[[level, 'label_name']].drop_duplicates()
    return pairs.groupby(level, sort=True)['label_name'].apply(_join)

# Question-answering pipeline used by compress_to_phrase to pick a phrase out
# of each group's document. device=0 requests the first accelerator; the
# recorded output shows it resolved to mps:0 on the machine that ran this.
qa = pipeline('question-answering', model='deepset/tinyroberta-squad2', device=0)
def compress_to_phrase(summary):
    """Ask the QA pipeline for a phrase that connects the label names in ``summary``.

    ``summary`` is passed as the context; the ``'answer'`` field of the
    pipeline result is returned unchanged.
    """
    question = "In a single, clear phrase, summarize the subject/s that connect the following phrases: "
    # NOTE(review): max_length / min_length look like text-generation settings;
    # confirm the question-answering pipeline actually honours them.
    result = qa(question=question, context=summary, max_length=150, min_length=25)
    return result['answer']

def compress_group_doc(level):
    """Summarise every group at ``level`` into a short phrase.

    Returns a dict mapping each group id to the whitespace-stripped phrase
    produced by ``compress_to_phrase`` for that group's document.
    """
    docs = cluster_group_doc(level)
    grouping = dict.fromkeys(docs.index, '')
    for group_id, doc in tqdm(zip(docs.index, docs), total=len(docs)):
        grouping[group_id] = compress_to_phrase(doc).strip()
    return grouping

Device set to use mps:0


In [19]:
# Phrase for each of the group_150 groups, keyed by group id.
# NOTE(review): this rebinds `a`, which an earlier cell uses as the 0.75 blend
# weight; re-running that cell's dependents after this one would pick up a dict.
a = compress_group_doc('group_150')

100%|██████████| 150/150 [00:07<00:00, 20.73it/s]
