In [3]:
from random import sample
from re import search

import numpy as np
from numba.cuda import mapped

from src.pipeline import umap_clustering as uc
from src.database.db_writer import Database
from sklearn.model_selection import ParameterGrid

In [4]:
def ingest_data(db_name='maine_legislation_and_testimony'):
    """Load the organization name of every testimony record.

    The query has no GROUP BY or DISTINCT, so the result has one row per
    row of TESTIMONY_HEADER. Each row carries the organization name and,
    through a window function, the total number of rows sharing that
    organization.

    Args:
        db_name: Name handed to ``Database``. Defaults to the Maine
            legislation/testimony database so existing no-argument calls
            behave exactly as before.

    Returns:
        Whatever ``Database.return_query_as_df`` returns for the query
        (presumably a pandas DataFrame with columns ``organization`` and
        ``count`` -- confirm against ``Database``).
    """
    query = '''
    SELECT 
        Organization AS organization,
        COUNT(*) OVER (PARTITION BY Organization) AS count
    FROM TESTIMONY_HEADER th
    '''
    return Database(db_name).return_query_as_df(query)

In [5]:
# Fraction of the rows to subsample; forwarded to the hyperparameter search as `sample_frac`.
sample_frac = 0.2
# Full result of the organization query; this is the input to the search and the final fit.
X = ingest_data()

In [6]:
# Hyperparameter search space: 3 * 4 * 4 * 3 = 144 combinations.
# Keys look like sklearn-style `<step>__<parameter>` names.
param_grid = dict(
    # 20, 35, 50
    umap__n_neighbors=np.linspace(20, 50, 3, dtype=int),
    # 0.1, 0.2, 0.3, 0.4 (up to float rounding)
    umap__min_dist=np.linspace(0.1, 0.4, 4),
    # 2, 11, 20, 30 (fractional grid points are truncated by dtype=int)
    umap__n_components=np.linspace(2, 30, 4, dtype=int),
    # 5, 15, 25
    hdbscan__min_cluster_size=np.linspace(5, 25, 3, dtype=int),
)

## Best params from experiment 1

- Best score: 0.7666997880285907
- Sample: 10%
```
{
    'umap__n_neighbors': np.int64(10), 
    'umap__n_components': np.int64(34), 
    'umap__min_dist': np.float64(0.25), 
    'hdbscan__min_samples': np.int64(5), 
    'hdbscan__min_cluster_size': np.int64(10), 
    'hdbscan__cluster_selection_epsilon': 0.0
}
```
## Best params from experiment 2

Sample: 30%
```
{
    'umap__n_neighbors': 30, 
    'umap__n_components': 20, 
    'umap__min_dist': 0.25, 
    'hdbscan__min_samples': 5, 
    'hdbscan__min_cluster_size': 5, 
    'hdbscan__cluster_selection_epsilon': 0
}
```

In [7]:
# Re-import the clustering module so that edits to its source file take effect
# without restarting the kernel. The reloaded module object is the cell's output.
from importlib import reload
reload(uc)

<module 'src.pipeline.umap_clustering' from '/Users/Darren/git-clones/ds5500-capstone-project/src/pipeline/umap_clustering.py'>

In [8]:
# Number of configurations to evaluate in the random search.
n_iter = 30
pipeline = uc.create_clustering_pipeline()

# Keyword options forwarded unchanged to the search helper.
search_params = {
    'n_iter': n_iter,
    'sample_frac': sample_frac,
    'cluster_col': 'organization',
    'search_type': 'random',
}

search_result = uc.clustering_hyperparameter_search(
    pipeline,
    param_grid,
    X,
    **search_params,
)

Sampling 20.0 percent of 128437 observations
Evaluating 30 of 144 hyperparameter configurations over 25687 observations


Hyperparameter Search:   0%|          | 0/30 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
Hyperparameter Search:  53%|█████▎    | 16/30 [03:31<03:04, 13.21s/it][Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  4.1min
Hyperparameter Search: 100%|██████████| 30/30 [07:46<00:00, 15.56s/it]
[Parallel(n_jobs=-1)]: Done  22 out of  30 | elapsed: 15.1min remaining:  5.5min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 17.7min finished


Best parameters: {'umap__n_neighbors': np.int64(20), 'umap__n_components': np.int64(30), 'umap__min_dist': np.float64(0.1), 'hdbscan__min_cluster_size': np.int64(5)}
Best score: 0.7909287726144892


In [9]:
# Show the first ten rows of the search results (the recorded output is ordered by descending score).
search_result.head(10)

Unnamed: 0,params,score,embeddings,labels,model,labeled_df
26,"{'umap__n_neighbors': 20, 'umap__n_components'...",0.790929,"[[3.8620307445526123, 7.552438735961914, 5.873...","[121, 898, -1, 57, 312, 811, 72, 64, 67, 210, ...",(SentenceTransformerEncoder(model=SentenceTran...,ORIGI...
5,"{'umap__n_neighbors': 20, 'umap__n_components'...",0.772761,"[[0.3788496255874634, -18.490711212158203], [3...","[45, 870, -1, 220, 245, -1, 10, 72, 148, 35, 6...",(SentenceTransformerEncoder(model=SentenceTran...,ORIGI...
16,"{'umap__n_neighbors': 20, 'umap__n_components'...",0.76366,"[[8.463165283203125, 1.5468413829803467, 11.63...","[102, 809, -1, 116, 226, 675, 26, 87, 90, 41, ...",(SentenceTransformerEncoder(model=SentenceTran...,ORIGI...
10,"{'umap__n_neighbors': 20, 'umap__n_components'...",0.753386,"[[8.488434791564941, 8.991987228393555, 11.576...","[136, 414, -1, 80, 174, 287, 7, 4, 65, 93, 356...",(SentenceTransformerEncoder(model=SentenceTran...,ORIGI...
9,"{'umap__n_neighbors': 20, 'umap__n_components'...",0.752528,"[[4.763209342956543, 7.368696689605713, 9.7110...","[81, 689, -1, 84, 303, 613, 54, 0, 164, 205, 5...",(SentenceTransformerEncoder(model=SentenceTran...,ORIGI...
24,"{'umap__n_neighbors': 35, 'umap__n_components'...",0.750688,"[[8.154902458190918, 6.0413079261779785, 11.09...","[32, 443, -1, 43, 182, 395, 4, 13, 153, 11, 40...",(SentenceTransformerEncoder(model=SentenceTran...,ORIGI...
8,"{'umap__n_neighbors': 20, 'umap__n_components'...",0.746741,"[[4.579568386077881, 7.829644203186035, 11.555...","[137, 455, -1, 9, 198, 350, 6, 51, 61, 191, 38...",(SentenceTransformerEncoder(model=SentenceTran...,ORIGI...
25,"{'umap__n_neighbors': 20, 'umap__n_components'...",0.743893,"[[2.9779036045074463, 5.146974086761475, 6.168...","[145, 697, -1, 121, 233, 382, 77, 308, 26, 414...",(SentenceTransformerEncoder(model=SentenceTran...,ORIGI...
17,"{'umap__n_neighbors': 20, 'umap__n_components'...",0.742839,"[[2.267648458480835, 4.615531921386719, -0.113...","[10, 429, 441, 76, 159, 441, 46, 56, 226, 275,...",(SentenceTransformerEncoder(model=SentenceTran...,ORIGI...
27,"{'umap__n_neighbors': 35, 'umap__n_components'...",0.739948,"[[5.110751152038574, -0.07681722193956375, -6....","[70, 906, -1, 4, 367, -1, 24, 18, 72, 91, 737,...",(SentenceTransformerEncoder(model=SentenceTran...,ORIGI...


In [10]:
# Parameter dict stored in the first row of the search results.
# Presumably rows are ordered best-first (the recorded head() output is) -- confirm.
best_params = search_result['params'].iloc[0]
best_params

In [13]:
# Configure the pipeline with the selected parameters and refit it.
# Note: this fits on the whole `organization` column of X, not on the
# `sample_frac` subsample that the search evaluated.
pipeline.set_params(**best_params)
pipeline.fit(X['organization'])

# Read the fitted results off the 'umap' and 'hdbscan' steps.
# NOTE(review): final_embeddings is not used by any visible cell below.
final_embeddings = pipeline.named_steps['umap'].embedding_
final_labels = pipeline.named_steps['hdbscan'].labels_

# Build a mapping of cluster id -> representative label (iterated with .items() below).
cluster_representatives = uc.assign_representative_labels(X['organization'], final_labels)

# Print one line per cluster.
print("Cluster Representatives:")
for cluster_id, representative_label in cluster_representatives.items():
    print(f"Cluster {cluster_id}: {representative_label}")
    

In [14]:
import duckdb

# List the distinct (organization, cluster label) pairs whose organization name contains "sierra".
# NOTE(review): `mapped_df` is not defined in any visible cell (execution counts
# skip 11-12, so it presumably came from a removed cell). Unless it is defined
# elsewhere, this cell will not survive Restart & Run All -- confirm and restore.
duckdb.query('''
    SELECT DISTINCT ORG_NAME, CLUSTER_LABEL
    FROM mapped_df
    WHERE LOWER(ORG_NAME) LIKE '%sierra%'
''').df()

In [17]:
# Show the first five rows of the search results again before inspecting each result's labeled frame.
search_result.head()

In [18]:
# For every search result, list the organizations that were grouped under the
# 'sierra club maine' cluster label in THAT result's labeled frame.
for idx, r in search_result.iterrows():
    # Use the current row's frame. The previous code bound the entire
    # 'labeled_df' column here and then queried an unrelated `mapped_df`,
    # so every iteration displayed the same table.
    df = r['labeled_df']
    # DuckDB resolves `df` in the SQL to the Python variable of that name.
    # NOTE(review): assumes each labeled_df has ORG_NAME and CLUSTER_LABEL
    # columns, as the earlier `mapped_df` query does -- confirm.
    cluster_df = duckdb.query('''
        SELECT DISTINCT ORG_NAME, CLUSTER_LABEL
        FROM df
        WHERE LOWER(CLUSTER_LABEL) = 'sierra club maine'
    ''').df()
    # Identify which search result the table below belongs to.
    print(f"Search result {idx} (score={r['score']})")
    display(cluster_df)