In [1]:
import duckdb
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

from pathlib import Path
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN, HDBSCAN
from sklearn.preprocessing import normalize
from itertools import product

from src.database.db_writer import Database

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Params
model_dir = str(Path(os.getcwd()).parents[1] / 'models' / 'all-minilm-l6')

## Helper functions

In [28]:
def ingest_data():
    db_name = 'maine_legislation_and_testimony'
    db = Database(db_name)
    query = '''
    SELECT 
        Organization AS organization,
        COUNT(*) OVER (PARTITION BY Organization) AS count
    FROM TESTIMONY_HEADER th
    '''
    org_df = db.return_query_as_df(query)
    return org_df

def add_embeddings(idf, string_col, model_dir):
    
    # Load sentence transformer model
    model = SentenceTransformer(model_dir)

    # Generate embeddings for unique organization names
    org_names = idf[string_col].unique()
    embeddings = np.array([model.encode(name) for name in org_names])
    normalized_embeddings = normalize(embeddings, norm='l2', axis=1)
    
    # Create a mapping of organization names to embeddings
    name_to_embedding = dict(zip(org_names, normalized_embeddings))

    # Generate weighted embeddings based on counts
    idf['embedding'] = idf[string_col].map(name_to_embedding)
    return idf

def dbscan_clustering(idf, column_to_cluster, eps=0.3, min_samples=5, metric='euclidean', n_jobs=-1):
    """
    Cluster organization names using DBSCAN on sentence embeddings
    
    Parameters:
    df: DataFrame with columns ['organization', 'count']
    eps: DBSCAN epsilon parameter (distance threshold)
    min_samples: DBSCAN min_samples parameter
    
    Returns:
    DataFrame with original data plus cluster labels and representative names
    """
    # Perform DBSCAN clustering
    cdf = idf.copy()
    
    embeddings = np.vstack(cdf['embedding'].values)
    
    clusterer = DBSCAN(
        eps=eps,
        min_samples=min_samples,
        metric=metric,
        n_jobs=n_jobs
    )
    
    clustering = clusterer.fit(embeddings)

    # Map DBSCAN labels back to the original dataframe
    cdf['cluster'] = clustering.labels_

    cluster_representatives = {}
    for cluster_id in set(clustering.labels_):
        if cluster_id != -1:  # Skip noise points
            cluster_mask = cdf['cluster'] == cluster_id
            cluster_df = cdf[cluster_mask]
            representative = cluster_df.loc[cluster_df['count'].idxmax(), 'organization']
            cluster_representatives[cluster_id] = representative

    # Map cluster labels to representative names
    cdf['grouped_name'] = cdf['cluster'].map(cluster_representatives)
    # Fill in noise points with original org value
    cdf.loc[cdf['cluster'] == -1, 'grouped_name'] = cdf.loc[cdf['cluster'] == -1, column_to_cluster]
    return cdf

def hdbscan_clustering(idf, column_to_cluster, min_cluster_size=10, min_samples=None, n_jobs=-1):
    """
    Cluster organization names using HDDBSCAN on sentence embeddings
    
    Parameters:
    df: DataFrame with embeddings as 1D numpy array
    min_cluster_size: cutoff for clustering
    min_samples: matches min_cluster when not specified
    n_jobs: number of cores to use, -1 is all cores
    
    Returns:
    DataFrame with original data plus cluster labels and representative names
    """
    # Perform DBSCAN clustering
    cdf = idf.copy()
    
    embeddings = np.vstack(cdf['embedding'].values)
    
    clusterer = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        n_jobs=n_jobs
    )
    
    clustering = clusterer.fit(embeddings)

    # Map labels back to the original dataframe
    cdf['cluster'] = clustering.labels_

    cluster_representatives = {}
    for cluster_id in set(clustering.labels_):
        if cluster_id != -1:  # Skip noise points
            cluster_mask = cdf['cluster'] == cluster_id
            cluster_df = cdf[cluster_mask]
            representative = cluster_df.loc[cluster_df['count'].idxmax(), 'organization']
            cluster_representatives[cluster_id] = representative

    # Map cluster labels to representative names
    cdf['grouped_name'] = cdf['cluster'].map(cluster_representatives)
    # Fill in noise points with original org value
    cdf.loc[cdf['cluster'] == -1, 'grouped_name'] = cdf.loc[cdf['cluster'] == -1, column_to_cluster]
    return cdf

def plot_with_pca(idf):
    # Reduce embeddings to 2D using PCA
    pca = PCA(n_components=2)
    
    embeddings = np.vstack(idf['embedding'].values)
    reduced_embeddings = pca.fit_transform(embeddings)
    
    # Visualize the clusters
    plt.figure(figsize=(10, 8))
    
    # Plot each point with color based on cluster label
    scatter = plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=idf['cluster'], cmap='viridis', marker='o')
    
    # Customize plot
    plt.title("PCA Projection of Organization Names Clusters")
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.colorbar(scatter, label='Cluster Label')
    plt.show()
    
def grid_search_list(param_dict):
    '''
    :param param_dict: Dictionary with key for each
    hyperparameter and list of params to search. 
    :return: 
    '''
    grid_search_params = [
        dict(zip(param_dict.keys(), values))
        for values in product(*param_dict.values())
    ]
    return grid_search_params

In [34]:
df = ingest_data()

sdf = duckdb.query('''
    SELECT * FROM df WHERE LOWER(organization) LIKE '%sierra%' OR LOWER(organization) LIKE '%legislature%' 
''').df()
sdf.head()

sdf = add_embeddings(sdf, 'organization', model_dir)

## Set hyperparameters for grid search

In [49]:
len(df.embedding[0])

384

In [None]:
param_grid = {
    'eps': np.linspace(0.3, 0.9, 5),
    'min_samples': np.linspace(100, 300, 2, dtype=int)
}
search_list = grid_search_list(param_grid)

In [20]:
param_grid = {
    'min_samples': np.linspace(50, 500, 4, dtype=int),
    'min_cluster_size': np.linspace(50, 500, 4, dtype=int)
}
search_list = grid_search_list(param_grid)

results = []
for hyperparams in search_list:
    result = {}
    result = {**hyperparams}
    print(f'Searching over {hyperparams}')
    result['df'] = hdbscan_clustering(sdf, column_to_cluster='organization', **hyperparams)
    clusters_found = len(result['df']['cluster'].unique())
    print(f'Found {clusters_found} clusters')
    results.append(result)

Searching over {'min_samples': np.int64(50), 'min_cluster_size': np.int64(50)}
Found 5 clusters
Searching over {'min_samples': np.int64(50), 'min_cluster_size': np.int64(200)}
Found 3 clusters
Searching over {'min_samples': np.int64(50), 'min_cluster_size': np.int64(350)}


 'Sierra Club' 'Sierra Club' 'Maine Legislature (Portland, HD 116)']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  cdf.loc[cdf['cluster'] == -1, 'grouped_name'] = cdf.loc[cdf['cluster'] == -1, column_to_cluster]


Found 1 clusters
Searching over {'min_samples': np.int64(50), 'min_cluster_size': np.int64(500)}


 'Sierra Club' 'Sierra Club' 'Maine Legislature (Portland, HD 116)']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  cdf.loc[cdf['cluster'] == -1, 'grouped_name'] = cdf.loc[cdf['cluster'] == -1, column_to_cluster]


Found 1 clusters
Searching over {'min_samples': np.int64(200), 'min_cluster_size': np.int64(50)}
Found 3 clusters
Searching over {'min_samples': np.int64(200), 'min_cluster_size': np.int64(200)}
Found 3 clusters
Searching over {'min_samples': np.int64(200), 'min_cluster_size': np.int64(350)}


 'Sierra Club' 'Sierra Club' 'Maine Legislature (Portland, HD 116)']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  cdf.loc[cdf['cluster'] == -1, 'grouped_name'] = cdf.loc[cdf['cluster'] == -1, column_to_cluster]


Found 1 clusters
Searching over {'min_samples': np.int64(200), 'min_cluster_size': np.int64(500)}


 'Sierra Club' 'Sierra Club' 'Maine Legislature (Portland, HD 116)']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  cdf.loc[cdf['cluster'] == -1, 'grouped_name'] = cdf.loc[cdf['cluster'] == -1, column_to_cluster]


Found 1 clusters
Searching over {'min_samples': np.int64(350), 'min_cluster_size': np.int64(50)}


 'Sierra Club' 'Sierra Club' 'Maine Legislature (Portland, HD 116)']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  cdf.loc[cdf['cluster'] == -1, 'grouped_name'] = cdf.loc[cdf['cluster'] == -1, column_to_cluster]


Found 1 clusters
Searching over {'min_samples': np.int64(350), 'min_cluster_size': np.int64(200)}


 'Sierra Club' 'Sierra Club' 'Maine Legislature (Portland, HD 116)']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  cdf.loc[cdf['cluster'] == -1, 'grouped_name'] = cdf.loc[cdf['cluster'] == -1, column_to_cluster]


Found 1 clusters
Searching over {'min_samples': np.int64(350), 'min_cluster_size': np.int64(350)}


 'Sierra Club' 'Sierra Club' 'Maine Legislature (Portland, HD 116)']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  cdf.loc[cdf['cluster'] == -1, 'grouped_name'] = cdf.loc[cdf['cluster'] == -1, column_to_cluster]


Found 1 clusters
Searching over {'min_samples': np.int64(350), 'min_cluster_size': np.int64(500)}


 'Sierra Club' 'Sierra Club' 'Maine Legislature (Portland, HD 116)']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  cdf.loc[cdf['cluster'] == -1, 'grouped_name'] = cdf.loc[cdf['cluster'] == -1, column_to_cluster]


Found 1 clusters
Searching over {'min_samples': np.int64(500), 'min_cluster_size': np.int64(50)}


 'Sierra Club' 'Sierra Club' 'Maine Legislature (Portland, HD 116)']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  cdf.loc[cdf['cluster'] == -1, 'grouped_name'] = cdf.loc[cdf['cluster'] == -1, column_to_cluster]


Found 1 clusters
Searching over {'min_samples': np.int64(500), 'min_cluster_size': np.int64(200)}


 'Sierra Club' 'Sierra Club' 'Maine Legislature (Portland, HD 116)']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  cdf.loc[cdf['cluster'] == -1, 'grouped_name'] = cdf.loc[cdf['cluster'] == -1, column_to_cluster]


Found 1 clusters
Searching over {'min_samples': np.int64(500), 'min_cluster_size': np.int64(350)}


 'Sierra Club' 'Sierra Club' 'Maine Legislature (Portland, HD 116)']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  cdf.loc[cdf['cluster'] == -1, 'grouped_name'] = cdf.loc[cdf['cluster'] == -1, column_to_cluster]


Found 1 clusters
Searching over {'min_samples': np.int64(500), 'min_cluster_size': np.int64(500)}
Found 1 clusters


 'Sierra Club' 'Sierra Club' 'Maine Legislature (Portland, HD 116)']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  cdf.loc[cdf['cluster'] == -1, 'grouped_name'] = cdf.loc[cdf['cluster'] == -1, column_to_cluster]


In [21]:
for group_name in ['Maine State Legislature', 'Sierra Club Maine']:
    query = f'''
    SELECT DISTINCT
        organization,
        grouped_name
    FROM result
    WHERE grouped_name LIKE '{group_name}'
    '''
    display(duckdb.query(query).df())

InvalidInputException: Invalid Input Error: Python Object "result" of type "dict" found on line "/var/folders/b8/frpyqjx134d8nmnpx1t2ckvc0000gp/T/ipykernel_22553/885513849.py:9" not suitable for replacement scans.
Make sure that "result" is either a pandas.DataFrame, duckdb.DuckDBPyRelation, pyarrow Table, Dataset, RecordBatchReader, Scanner, or NumPy ndarrays with supported format

In [24]:
results[0].keys()

dict_keys(['min_samples', 'min_cluster_size', 'df'])

In [None]:
results = []
for hyperparams in search_list:
    result = {}
    result = {**hyperparams}
    print(f'Searching over {hyperparams}')
    result['df'] = dbscan_clustering(df, column_to_cluster='organization', **hyperparams)
    clusters_found = len(result['df']['cluster'].unique())
    print(f'Found {clusters_found} clusters')
    results.append(result)

In [26]:
cardinality = len(df['organization'].unique())
for r in results:
    df = r['df']
    print(
    f'''Min cluster size: {r.get('min_cluster_size')}
Min Samples: {r.get('min_samples')}
Original cardinality: {cardinality}
Grouped cardinality: {len(r.get('df')['grouped_name'].unique())}
'''
    )
    for group_name in ['Maine State Legislature', 'Sierra Club Maine']:
        query = f'''
        SELECT DISTINCT
            organization,
            grouped_name
        FROM df
        WHERE grouped_name = '{group_name}'
        '''
        display(duckdb.query(query).df())

Min cluster size: 50
Min Samples: 50
Original cardinality: 32
Grouped cardinality: 24



Unnamed: 0,organization,grouped_name
0,State of Maine Legislature,Maine State Legislature
1,Maine State Legislature,Maine State Legislature
2,MAINE STATE LEGISLATURE,Maine State Legislature
3,Maine State Legislature,Maine State Legislature


Unnamed: 0,organization,grouped_name
0,Sierra Club Maine,Sierra Club Maine
1,Sierra Club Maine chapter,Sierra Club Maine
2,Sierra Club of Maine,Sierra Club Maine
3,"Sierra Club, Maine",Sierra Club Maine
4,Sierra Club Maine Chapter,Sierra Club Maine


Min cluster size: 200
Min Samples: 50
Original cardinality: 32
Grouped cardinality: 3



Unnamed: 0,organization,grouped_name
0,Legislature,Maine State Legislature
1,Maine Legislature,Maine State Legislature
2,State Legislature,Maine State Legislature
3,State of Maine Legislature,Maine State Legislature
4,Montana State Legislature,Maine State Legislature
5,Maine State Legislature,Maine State Legislature
6,MAINE STATE LEGISLATURE,Maine State Legislature
7,Maine State Legislature/Houlton Band of Maliseets,Maine State Legislature
8,Maine Legislature,Maine State Legislature
9,Legislature and Policy Counsel Office Tax Policy,Maine State Legislature


Unnamed: 0,organization,grouped_name
0,Sierra Club Maine,Sierra Club Maine
1,Sierra Club Maine chapter,Sierra Club Maine
2,Sierra Club of Maine,Sierra Club Maine
3,Sierra Club Energy Team,Sierra Club Maine
4,Sierra Club,Sierra Club Maine
5,"Sierra Club, Maine",Sierra Club Maine
6,SierraClub,Sierra Club Maine
7,Maine Rail Transit Coalition and Sierra Club M...,Sierra Club Maine
8,Sierra Club Maine Energy Team,Sierra Club Maine
9,Sierra Club Maine Chapter,Sierra Club Maine


Min cluster size: 350
Min Samples: 50
Original cardinality: 32
Grouped cardinality: 32



Unnamed: 0,organization,grouped_name
0,Maine State Legislature,Maine State Legislature


Unnamed: 0,organization,grouped_name
0,Sierra Club Maine,Sierra Club Maine


Min cluster size: 500
Min Samples: 50
Original cardinality: 32
Grouped cardinality: 32



Unnamed: 0,organization,grouped_name
0,Maine State Legislature,Maine State Legislature


Unnamed: 0,organization,grouped_name
0,Sierra Club Maine,Sierra Club Maine


Min cluster size: 50
Min Samples: 200
Original cardinality: 32
Grouped cardinality: 3



Unnamed: 0,organization,grouped_name
0,Montana State Legislature,Maine State Legislature
1,Maine State Legislature,Maine State Legislature
2,MAINE STATE LEGISLATURE,Maine State Legislature
3,Maine State Legislature,Maine State Legislature
4,"Maine State Legislature, HD 53",Maine State Legislature
5,Legislature test,Maine State Legislature
6,Maine Legislature HD73,Maine State Legislature
7,Maine Legislature (HD 18 - part of Sanford),Maine State Legislature
8,Maine State Legislature/Houlton Band of Maliseets,Maine State Legislature
9,Maine Legislature,Maine State Legislature


Unnamed: 0,organization,grouped_name
0,Sierra Club Maine,Sierra Club Maine
1,Sierra Club Maine chapter,Sierra Club Maine
2,Sierra Club of Maine,Sierra Club Maine
3,SierraClub,Sierra Club Maine
4,"Sierra Club, Maine",Sierra Club Maine
5,Maine Rail Transit Coalition and Sierra Club M...,Sierra Club Maine
6,Sierra Club Maine Energy Team,Sierra Club Maine
7,Sierra Club Maine Chapter,Sierra Club Maine
8,Maine Lobstering Union/Sierra Club,Sierra Club Maine
9,Sierra Club Energy Team,Sierra Club Maine


Min cluster size: 200
Min Samples: 200
Original cardinality: 32
Grouped cardinality: 3



Unnamed: 0,organization,grouped_name
0,Maine Legislature (HD 18 - part of Sanford),Maine State Legislature
1,Montana State Legislature,Maine State Legislature
2,Maine State Legislature,Maine State Legislature
3,MAINE STATE LEGISLATURE,Maine State Legislature
4,Legislature,Maine State Legislature
5,Maine Legislature,Maine State Legislature
6,State Legislature,Maine State Legislature
7,State of Maine Legislature,Maine State Legislature
8,Maine State Legislature,Maine State Legislature
9,"Maine State Legislature, HD 53",Maine State Legislature


Unnamed: 0,organization,grouped_name
0,"Sierra Club, Maine",Sierra Club Maine
1,Sierra Club Maine,Sierra Club Maine
2,Sierra Club Maine chapter,Sierra Club Maine
3,Sierra Club of Maine,Sierra Club Maine
4,Maine Rail Transit Coalition and Sierra Club M...,Sierra Club Maine
5,Sierra Club Maine Energy Team,Sierra Club Maine
6,Sierra Club Maine Chapter,Sierra Club Maine
7,Maine Lobstering Union/Sierra Club,Sierra Club Maine
8,SierraClub,Sierra Club Maine
9,Sierra Club Energy Team,Sierra Club Maine


Min cluster size: 350
Min Samples: 200
Original cardinality: 32
Grouped cardinality: 32



Unnamed: 0,organization,grouped_name
0,Maine State Legislature,Maine State Legislature


Unnamed: 0,organization,grouped_name
0,Sierra Club Maine,Sierra Club Maine


Min cluster size: 500
Min Samples: 200
Original cardinality: 32
Grouped cardinality: 32



Unnamed: 0,organization,grouped_name
0,Maine State Legislature,Maine State Legislature


Unnamed: 0,organization,grouped_name
0,Sierra Club Maine,Sierra Club Maine


Min cluster size: 50
Min Samples: 350
Original cardinality: 32
Grouped cardinality: 32



Unnamed: 0,organization,grouped_name
0,Maine State Legislature,Maine State Legislature


Unnamed: 0,organization,grouped_name
0,Sierra Club Maine,Sierra Club Maine


Min cluster size: 200
Min Samples: 350
Original cardinality: 32
Grouped cardinality: 32



Unnamed: 0,organization,grouped_name
0,Maine State Legislature,Maine State Legislature


Unnamed: 0,organization,grouped_name
0,Sierra Club Maine,Sierra Club Maine


Min cluster size: 350
Min Samples: 350
Original cardinality: 32
Grouped cardinality: 32



Unnamed: 0,organization,grouped_name
0,Maine State Legislature,Maine State Legislature


Unnamed: 0,organization,grouped_name
0,Sierra Club Maine,Sierra Club Maine


Min cluster size: 500
Min Samples: 350
Original cardinality: 32
Grouped cardinality: 32



Unnamed: 0,organization,grouped_name
0,Maine State Legislature,Maine State Legislature


Unnamed: 0,organization,grouped_name
0,Sierra Club Maine,Sierra Club Maine


Min cluster size: 50
Min Samples: 500
Original cardinality: 32
Grouped cardinality: 32



Unnamed: 0,organization,grouped_name
0,Maine State Legislature,Maine State Legislature


Unnamed: 0,organization,grouped_name
0,Sierra Club Maine,Sierra Club Maine


Min cluster size: 200
Min Samples: 500
Original cardinality: 32
Grouped cardinality: 32



Unnamed: 0,organization,grouped_name
0,Maine State Legislature,Maine State Legislature


Unnamed: 0,organization,grouped_name
0,Sierra Club Maine,Sierra Club Maine


Min cluster size: 350
Min Samples: 500
Original cardinality: 32
Grouped cardinality: 32



Unnamed: 0,organization,grouped_name
0,Maine State Legislature,Maine State Legislature


Unnamed: 0,organization,grouped_name
0,Sierra Club Maine,Sierra Club Maine


Min cluster size: 500
Min Samples: 500
Original cardinality: 32
Grouped cardinality: 32



Unnamed: 0,organization,grouped_name
0,Maine State Legislature,Maine State Legislature


Unnamed: 0,organization,grouped_name
0,Sierra Club Maine,Sierra Club Maine


In [None]:
opt_df = results[5]

In [None]:
for r in results:
    df = r['df']
    print(f'EPS: {r["eps"]} | min_samples: {r["min_samples"]}')
    print(f'Orgs: {len(df["organization"].unique())} '
          f'| Grouped cardinality: {len(df["grouped_name"].unique())}')
    display(
        duckdb.query('''
            SELECT grouped_name,
            GROUP_CONCAT(DISTINCT organization) AS ORG_NAMES,
            COUNT(*) AS INSTANCES
            FROM df
            GROUP BY grouped_name
            ORDER BY COUNT(*) DESC
            LIMIT 7
        ''').df()
    )

In [None]:
for r in results:
    cluster_count = len(r['df']['grouped_name'].unique())
    print(f'EPS {r.get("eps")} | min_samples: {r.get("min_samples")}: {cluster_count}')

In [None]:
for r in results:
    plot_with_pca(r['df'])

## Evaluate HDBSCAN with 200

Found decent performance across only Sierra Club and Maine Legislature strings. Now evaluating on full data set. 

In [37]:
string_col = 'organization'

In [35]:
hdb_200_df = ingest_data()
hdb_200_df = add_embeddings(df, string_col, model_dir)

In [38]:
hdb_200_df = hdbscan_clustering(df, column_to_cluster=string_col, min_cluster_size=200)

In [39]:
for group_name in ['Maine State Legislature', 'Sierra Club Maine']:
    query = f'''
    SELECT DISTINCT
        organization,
        grouped_name
    FROM hdb_200_df
    WHERE grouped_name = '{group_name}'
    '''
    display(duckdb.query(query).df())

Unnamed: 0,organization,grouped_name
0,MAINE STATE LEGISLATURE,Maine State Legislature
1,Maine State Legislature,Maine State Legislature
2,"Maine State Legislature, HD 53",Maine State Legislature
3,Maine State Legislature,Maine State Legislature
4,"Maine Legislature (Portland, HD 116)",Maine State Legislature
5,Maine State Representative,Maine State Legislature
6,Maine Legislature,Maine State Legislature
7,State of Maine Legislature,Maine State Legislature
8,State of Maine,Maine State Legislature
9,Maine Legislature,Maine State Legislature


Unnamed: 0,organization,grouped_name
0,Sierra Club Maine,Sierra Club Maine
1,Sierra Club of Maine,Sierra Club Maine
2,Sierra Club Maine chapter,Sierra Club Maine
3,Sierra Club Maine Chapter,Sierra Club Maine
4,Sierra Club,Sierra Club Maine
5,"Sierra Club, Maine",Sierra Club Maine
6,Maine Appalachian Trail Club,Sierra Club Maine


In [45]:
duckdb.query('''
    SELECT DISTINCT organization, grouped_name
    FROM hdb_200_df
    WHERE LOWER(organization) LIKE '%sierra%'
''').df()

Unnamed: 0,organization,grouped_name
0,"Sierra Club, Maine",Sierra Club Maine
1,Sierra Club Maine Chapter,Sierra Club Maine
2,Sierra Club Maine Energy Team,Sierra Club Maine Energy Team
3,Maine Lobstering Union/Sierra Club,Maine Lobstering Union/Sierra Club
4,Sierra Club,Sierra Club Maine
5,Sierra Club Maine,Sierra Club Maine
6,Sierra Club Energy Team,Sierra Club Energy Team
7,Sierra Club of Maine,Sierra Club Maine
8,Sierra Club Maine chapter,Sierra Club Maine
9,Maine Rail Transit Coalition and Sierra Club M...,Maine Rail Transit Coalition and Sierra Club M...


## Evaluation of different encoder

In [47]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/distilbert-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/distilbert-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "Sierra Club Maine, Sierra Club Energy Team Maine, SierraClub Maine"

ner_results = nlp(example)
print(ner_results)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity': 'B-ORG', 'score': np.float32(0.9950708), 'index': 1, 'word': 'Sierra', 'start': 0, 'end': 6}, {'entity': 'I-ORG', 'score': np.float32(0.9916831), 'index': 2, 'word': 'Club', 'start': 7, 'end': 11}, {'entity': 'I-ORG', 'score': np.float32(0.99234676), 'index': 3, 'word': 'Maine', 'start': 12, 'end': 17}, {'entity': 'B-ORG', 'score': np.float32(0.99775887), 'index': 5, 'word': 'Sierra', 'start': 19, 'end': 25}, {'entity': 'I-ORG', 'score': np.float32(0.99690276), 'index': 6, 'word': 'Club', 'start': 26, 'end': 30}, {'entity': 'I-ORG', 'score': np.float32(0.9955954), 'index': 7, 'word': 'Energy', 'start': 31, 'end': 37}, {'entity': 'I-ORG', 'score': np.float32(0.99343973), 'index': 8, 'word': 'Team', 'start': 38, 'end': 42}, {'entity': 'I-ORG', 'score': np.float32(0.99458534), 'index': 9, 'word': 'Maine', 'start': 43, 'end': 48}, {'entity': 'B-ORG', 'score': np.float32(0.9954209), 'index': 11, 'word': 'Sierra', 'start': 50, 'end': 56}, {'entity': 'B-ORG', 'score': np.float32(0