# UMAP and PaCMAP projections of DCT embeddings

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# reloaded before each cell executes (edits to local source files are picked
# up without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [2]:
from snakeclef.utils import get_spark
from pyspark.sql import functions as F
import umap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Obtain the Spark entry point from the project helper and render it as the
# cell output. NOTE(review): presumably a SparkSession -- it is used below via
# `spark.read.parquet`; see snakeclef.utils.get_spark for its configuration.
spark = get_spark()
display(spark)

  from .autonotebook import tqdm as notebook_tqdm
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/08 17:27:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/08 17:27:22 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
24/05/08 17:27:23 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [None]:
# Bucket root plus the two dataset locations relative to it
gcs_path = "gs://dsgt-clef-snakeclef-2024/data"
dct_emb_path = "process/training_v1/dino_dct/data"
train_path = "parquet_files/SnakeCLEF2023-train-small_size"

# Fully qualified GCS locations of the embedding and training datasets
dct_gcs_path = "/".join([gcs_path, dct_emb_path])
train_gcs_path = "/".join([gcs_path, train_path])

# Load both Parquet datasets as Spark DataFrames (embeddings first, then train)
dct_df, train_df = (
    spark.read.parquet(location) for location in (dct_gcs_path, train_gcs_path)
)

# Preview the first rows of each DataFrame, truncating long cells
for preview_df in (dct_df, train_df):
    preview_df.show(n=5, truncate=50)

## UMAP plot

In [None]:
# Transformation: count embedding rows per species, most frequent first.
# The grouping key must be "species_id": the following cell reads
# row["species_id"] from this frame to pick the top species, and that column
# would not exist if the data were grouped by "observation_id".
# The secondary sort on species_id makes the ordering deterministic when
# several species share the same count.
grouped_df = (
    dct_df.groupBy("species_id")
    .agg(F.count("species_id").alias("n"))
    .orderBy(F.col("n").desc(), F.col("species_id").asc())
)

# Action
grouped_df.show()

In [None]:
# How many of the leading rows of grouped_df to keep
num_top_species = 5

# Pull the first rows of grouped_df (already sorted by count, descending)
# to the driver and extract their species ids as plain ints
top_rows = grouped_df.limit(num_top_species).collect()
top_species = [int(top_row["species_id"]) for top_row in top_rows]
print(f"Top {num_top_species} species ids: {top_species}")

# Restrict the embeddings to those species and keep only the needed columns
is_top_species = F.col("species_id").isin(top_species)
subset_df = dct_df.filter(is_top_species).select(
    ["image_name", "species_id", "dct_embedding"]
)

# Inner-join with the training data on image_name to bring in the `species`
# column, then keep the id, the species and the embedding
joined_df = subset_df.join(train_df, "image_name", "inner")
subset_df = joined_df.select(
    [subset_df.species_id, train_df.species, subset_df.dct_embedding]
)

subset_df.show(20)
print(subset_df.count())

In [None]:
from sklearn.preprocessing import StandardScaler

# Bring the two columns needed for plotting over to pandas
pandas_df = subset_df.select("dct_embedding", "species").toPandas()

# Stack the per-row embeddings into one 2-D array and standardise each feature
emb_df = np.stack(pandas_df["dct_embedding"].values)
scaler = StandardScaler()
scaled_emb = scaler.fit_transform(emb_df)
labels = pandas_df["species"].tolist()

# UMAP reduction to two components with a fixed seed
umap_params = {
    "n_neighbors": 15,
    "n_components": 2,
    "metric": "euclidean",
    "random_state": 42,
}
reducer = umap.UMAP(**umap_params)
embedding = reducer.fit_transform(scaled_emb)  # NumPy array with shape (n_samples, 2)

In [None]:
def plot_cluster(pandas_df, embeddings, algorithm_name, num_top_species=5):
    """Scatter-plot a 2-D projection, coloured by the most frequent species.

    Parameters
    ----------
    pandas_df : pandas.DataFrame
        Must contain a "species" column holding one label per row, in the
        same row order as ``embeddings``.
    embeddings : array-like of shape (n_samples, >= 2)
        Projected coordinates; columns 0 and 1 are plotted.
    algorithm_name : str
        Name of the projection method, used in the figure title.
    num_top_species : int, default 5
        Number of most frequent species to draw. Rows belonging to any other
        species are not plotted.

    Raises
    ------
    ValueError
        If ``pandas_df`` and ``embeddings`` do not have the same number of rows.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    coords = np.asarray(embeddings)
    species_col = pandas_df["species"]
    if len(species_col) != coords.shape[0]:
        raise ValueError(
            f"pandas_df has {len(species_col)} rows but embeddings has "
            f"{coords.shape[0]}; they must be aligned row by row"
        )

    # Most frequent species first; may contain fewer than num_top_species
    # entries when the data holds fewer distinct species.
    top_species_idx = species_col.value_counts().nlargest(num_top_species).index

    # The first five entries match the original palette; the modulo below
    # cycles the palette instead of raising IndexError for larger requests.
    colors = [
        "tab:blue",
        "tab:orange",
        "tab:green",
        "tab:red",
        "tab:purple",
        "tab:brown",
        "tab:pink",
        "tab:gray",
        "tab:olive",
        "tab:cyan",
    ]
    species_to_color = {
        species: colors[i % len(colors)] for i, species in enumerate(top_species_idx)
    }

    fig, ax = plt.subplots(figsize=(6.4, 4.8), dpi=200)
    fig.suptitle(
        f"{algorithm_name} projection of top {len(top_species_idx)} snake species",
        fontsize=14,
        weight="bold",
    )

    # One scatter call per species so that each gets its own legend entry
    for species, color in species_to_color.items():
        # Positional boolean mask, independent of the DataFrame's index labels
        mask = (species_col == species).to_numpy()
        ax.scatter(
            coords[mask, 0],
            coords[mask, 1],
            color=color,
            label=species,
            s=5,
            alpha=0.7,
            linewidth=0.5,
        )

    ax.grid(color="blue", linestyle="--", linewidth=1, alpha=0.2)
    ax.legend(loc="best", title="Species Name", fontsize="small")
    for spine in ["top", "right", "bottom", "left"]:
        ax.spines[spine].set_visible(False)
    fig.tight_layout()
    plt.show()

In [None]:
# Plot the 2-D UMAP coordinates computed above, coloured by species
plot_cluster(pandas_df, embedding, algorithm_name="UMAP")

In [None]:
import pacmap

# PaCMAP reducer. random_state is fixed to 42 (the same seed the UMAP reducer
# uses) so the projection is reproducible across kernel restarts.
pacmap_embedding = pacmap.PaCMAP(
    n_components=2, n_neighbors=15, MN_ratio=0.5, FP_ratio=2.0, random_state=42
)

# Fit on the standardised embeddings, initialising the layout with PCA
pacmap_transformed = pacmap_embedding.fit_transform(scaled_emb, init="pca")

In [None]:
# Plot the 2-D PaCMAP coordinates computed above, coloured by species
plot_cluster(pandas_df, pacmap_transformed, algorithm_name="PaCMAP")