# Setup

In [None]:
from ebirdtools import EBirdTools
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import plotly.express as px

In [None]:
def load_and_label(path, random_state=None):
    """
    Reads a data file into a df, rescales elevation, and labels each row
    with a new column specifying which k-means cluster it belongs to.

    Clustering runs on the 'lat', 'lng' and 'ele' columns. The number of
    clusters is the cube root of the row count, rounded.

    Args:
        path: Location of the data file, passed to EBirdTools.load_data.
        random_state: Optional seed forwarded to KMeans so cluster ids can
            be reproduced between runs. The default (None) keeps the
            original unseeded behavior.

    Returns:
        The loaded df with two added columns: 'ele' ('elevation_meters'
        divided by 1000) and 'cluster' (the k-means label of the row).
    """
    df = EBirdTools.load_data(path)
    # Vectorized division; avoids calling a Python lambda once per row.
    # NOTE(review): presumably the /1000 brings elevation closer to the
    # numeric range of lat/lng before clustering - confirm.
    df['ele'] = df['elevation_meters'] / 1000

    n_clusters = round(len(df) ** (1/3))
    clustering = KMeans(n_clusters=n_clusters, random_state=random_state)
    clustering.fit(df[['lat', 'lng', 'ele']])
    # labels_ follows the row order of the fitted frame, so reuse df's index.
    df['cluster'] = pd.Series(clustering.labels_, index=df.index)

    return df

In [None]:
# One clustered df per region. The file names appear to encode the export
# timestamp followed by the region's lat/lng (TODO confirm with the exporter).
sanjose = load_and_label('data/ebd_alt_2025-08-20-20-26-00_37.33_-121.86_30_50.csv')
losangeles = load_and_label('data/ebd_alt_2025-08-20-20-40-54_34.05_-118.24_30_50.csv')
seattle = load_and_label('data/ebd_alt_2025-08-20-20-44-54_47.61_-122.33_30_50.csv')
chicago = load_and_label('data/ebd_alt_2025-08-20-20-35-54_41.88_-87.63_30_50.csv')
everglades = load_and_label('data/ebd_alt_2025-08-20-20-31-45_25.75_-80.56_30_50.csv')

In [None]:
def plot_map(df, frac=0.1, cluster=None):
    """
    Shows interactive map of a random sample of sightings. Has option
    to show only points from the specified cluster.

    Args:
        df: Sightings with 'lat', 'lng', 'cluster' and 'comName' columns.
        frac: Fraction of the (optionally filtered) rows to plot.
        cluster: If not None, only rows with this cluster label are used.
    """
    if cluster is not None:
        df = df[df['cluster'] == cluster]
    # Boolean filtering and sample() both return new frames, so the
    # caller's df is never modified and no explicit copy is needed.
    sampled = df.sample(frac=frac)  # plotting fewer points reduces memory load
    fig = px.scatter_mapbox(sampled, lat='lat', lon='lng', color='cluster', hover_name='comName')
    fig.update_mapboxes(style='open-street-map')
    fig.show()
    
def cluster_hist(df):
    """
    Plots a histogram of how many rows fall into each cluster.
    """
    cluster_sizes = df['cluster'].value_counts()
    cluster_sizes.plot.hist()

def get_exclusive_species(df_src, df_tgt):
    """
    Collects the species names ('comName') that occur in the source df
    but never in the target df, and returns them as a set.
    """
    names_src, names_tgt = (
        set(frame['comName'].values) for frame in (df_src, df_tgt)
    )
    return names_src - names_tgt

def get_intersecting_species(df_src, df_tgt):
    """
    Collects the species names ('comName') that occur in both dfs
    and returns them as a set.
    """
    names_src, names_tgt = (
        set(frame['comName'].values) for frame in (df_src, df_tgt)
    )
    return names_src & names_tgt

def species_per_cluster(df, by='comName'):
    """
    Takes a df and returns a dict:
        Key: Cluster number
        Val: set of names found in the cluster

    The `by` argument selects the column the names are read from.
    Keys are in order of first appearance in df. Rows whose cluster
    label is missing (NaN) are left out.
    """
    # One groupby pass instead of re-filtering the whole df per cluster.
    # sort=False keeps the clusters in first-appearance order.
    grouped = df.groupby('cluster', sort=False)[by]
    return {label: set(names) for label, names in grouped}

def get_similarity(df_a, df_b, hist=False):
    """
    Gets cluster similarity by calculating the set similarity
    (intersection over union) of the species names found in
    each cluster.

    Args:
        df_a, df_b: dfs with 'cluster' and 'comName' columns.
        hist: If True, also plot a histogram of the IoU scores.

    Returns:
        A df with one row per (cluster of df_a, cluster of df_b) pair and
        the columns 'A', 'B' and 'iou', sorted by ascending 'iou'. If
        either input has no clusters the result is empty but still has
        those three columns.
    """
    sp_a = species_per_cluster(df_a)
    sp_b = species_per_cluster(df_b)
    rows = [
        {'A': k_a, 'B': k_b, 'iou': len(v_a & v_b) / len(v_a | v_b)}
        for k_a, v_a in sp_a.items()
        for k_b, v_b in sp_b.items()
    ]

    # Naming the columns up front keeps sort_values('iou') from raising
    # KeyError when there are no cluster pairs to compare.
    output = pd.DataFrame(rows, columns=['A', 'B', 'iou']).sort_values('iou')
    if hist and not output.empty:
        output['iou'].plot.hist()

    return output

In [None]:
def _sighting_count_z(df, cluster):
    """
    Returns the z-score of one cluster's row count relative to the
    row counts of all clusters in df.
    """
    counts = df['cluster'].value_counts()
    return (counts.loc[cluster] - counts.mean()) / counts.std()

def analyze(name, df_src, df_tgt):
    """
    Prints how well the source clusters containing a species match the
    clusters of a target region.

    The source df is restricted to the clusters in which `name` occurs,
    every remaining source cluster is compared with every target cluster
    via get_similarity, and the best-matching pair is reported together
    with z-scores for its IoU and for the sighting counts of both clusters.

    Args:
        name: Species name to look up in the 'comName' column.
        df_src: Clustered df of the region the species is known from.
        df_tgt: Clustered df of the region to compare against.

    Raises:
        KeyError: If name does not occur in df_src['comName'].
    """
    if name not in df_src['comName'].values:
        raise KeyError('Unable to find species in the source region.')
    print('Is bird already found in the target region?', name in df_tgt['comName'].values)

    # find which source clusters the named species occurs in
    src_clusters = np.unique(df_src[df_src['comName'] == name]['cluster'])
    # drop the clusters that the named species doesn't occur in
    df_srcdrop = df_src[df_src['cluster'].isin(src_clusters)]
    # compare species sets
    sims = get_similarity(df_srcdrop, df_tgt)

    s_mean = sims['iou'].mean()
    s_std = sims['iou'].std()
    s_max = sims['iou'].max()
    s_z = (s_max - s_mean) / s_std
    # idxmax selects exactly one row even when several pairs tie for the
    # top score; filtering on == s_max and calling .item() raised
    # ValueError in that case. Ties resolve to the first row in sims.
    best = sims['iou'].idxmax()
    a = sims.at[best, 'A']
    b = sims.at[best, 'B']
    print('Sub-cluster compatability:', round(s_mean, 2))
    print('Highest similarity:', round(s_max, 2), f'(z {round(s_z, 2)})')

    # sighting counts are compared against all clusters of each region,
    # not only the ones kept in df_srcdrop
    sa_z = _sighting_count_z(df_src, a)
    print('Source cluster:', a, f'(num sightings z {round(sa_z, 2)})')
    sb_z = _sighting_count_z(df_tgt, b)
    print('Target cluster:', b, f'(num sightings z {round(sb_z, 2)})')

# Demo

In [None]:
# (source region, target region) pair used by the demo cells below.
data = (losangeles, everglades)

In [None]:
# Overview of the first region: sampled sightings map and cluster-size histogram.
plot_map(data[0])
cluster_hist(data[0])

In [None]:
# Overview of the second region: sampled sightings map and cluster-size histogram.
plot_map(data[1])
cluster_hist(data[1])

In [None]:
# IoU of species sets for every (first-region cluster, second-region cluster)
# pair, plus a histogram of the scores.
get_similarity(*data, hist=True)

In [None]:
# Species seen in the first region but not in the second. Swap in the
# commented call below to list the species shared by both regions instead.
# get_intersecting_species(*data)
get_exclusive_species(*data)

In [None]:
# Report the best-matching source/target cluster pair for this species.
analyze('American Avocet', *data)

In [None]:
# Set these to the "Source cluster" / "Target cluster" ids printed by the
# analyze() call above. Left as None, every cluster is plotted.
src_cluster = None
tgt_cluster = None
plot_map(data[0], cluster=src_cluster)
plot_map(data[1], cluster=tgt_cluster)

In [None]:
# Report the best-matching source/target cluster pair for this species.
analyze('Pygmy Nuthatch', *data)

In [None]:
# Set these to the "Source cluster" / "Target cluster" ids printed by the
# analyze() call above. Left as None, every cluster is plotted.
src_cluster = None
tgt_cluster = None
plot_map(data[0], cluster=src_cluster)
plot_map(data[1], cluster=tgt_cluster)