In [1]:
# reload edited project modules automatically before each cell is executed
%load_ext autoreload
%autoreload 2
# load the lab_black extension (presumably formats cells with black — confirm)
%load_ext lab_black

# embedding ideas

This notebook has some code to get started, and some ideas on how to achieve the following goals:

- Building a KNN classifier
- Annotating each track

## setup

### downloading the dataset

Make sure that you have the dataset downloaded locally.
At the root of the project, run this command:

```bash
gsutil -m rsync \
    gs://birdclef-2023/data/processed/birdclef-2023/train_embeddings/consolidated_v3_pre1/ \
    data/processed/birdclef-2023/train_embeddings/consolidated_v3_pre1/ 
```

### using spark

In [2]:
from birdclef.utils import get_spark
from pyspark.sql import functions as F

# resources given to the spark session; tune these for the machine at hand
spark_resources = dict(cores=8, memory="16g")
spark = get_spark(**spark_resources)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/14 23:26:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# parquet dataset of embeddings, located relative to this notebook
# NOTE(review): the download command in the setup section syncs
# `consolidated_v3_pre1`, while this reads `consolidated_v3` — confirm which
# directory is intended
path = "../data/processed/birdclef-2023/train_embeddings/consolidated_v3/"
df = spark.read.parquet(path)
# print the column schema, then display the total number of rows
df.printSchema()
df.count()

                                                                                

root
 |-- species: string (nullable = true)
 |-- track_stem: string (nullable = true)
 |-- track_type: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- embedding: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- prediction_vec: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- predictions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- rank: long (nullable = true)
 |    |    |-- index: long (nullable = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- mapped_label: string (nullable = true)
 |    |    |-- probability: double (nullable = true)
 |-- start_time: long (nullable = true)
 |-- energy: double (nullable = true)



                                                                                

1198860

In [4]:
# show the first 5 rows with one field per line, truncating values to 80 characters
df.show(5, vertical=True, truncate=80)

-RECORD 0------------------------------------------------------------------------------------------
 species        | wlwwar                                                                           
 track_stem     | XC475384_part003                                                                 
 track_type     | original                                                                         
 track_name     | wlwwar/XC475384_part003.mp3                                                      
 embedding      | [0.6336879134178162, 0.699510395526886, 0.43676260113716125, 1.18983256816864... 
 prediction_vec | [-10.213504791259766, -9.510478973388672, -12.938481330871582, -9.79550075531... 
 predictions    | [{0, 2005, Nucifraga caryocatactes_Eurasian Nutcracker, eurnut1, 0.0690114870... 
 start_time     | 18                                                                               
 energy         | 1.2206932306289673                                                               


In [5]:
# default tabular preview of the dataframe
df.show()

+-------+----------------+----------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+
|species|      track_stem|track_type|          track_name|           embedding|      prediction_vec|         predictions|start_time|              energy|
+-------+----------------+----------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+
| wlwwar|XC475384_part003|  original|wlwwar/XC475384_p...|[0.63368791341781...|[-10.213504791259...|[{0, 2005, Nucifr...|        18|  1.2206932306289673|
| grecor|XC629875_part015|   source1|grecor/XC629875_p...|[0.47638714313507...|[-8.6292562484741...|[{0, 291, Atlanti...|        60| 0.02938658744096756|
| grecor|XC629875_part006|   source3|grecor/XC629875_p...|[1.27789175510406...|[-13.741335868835...|[{0, 1079, Egrett...|        87|  1.1638777256011963|
| wlwwar|XC475384_part000|   source0|wlwwar/XC475384_p...|[1.04215753078460.

In [None]:
# number of rows per species, showing the 20 species with the most rows
df.groupBy("species").count().orderBy(F.desc("count")).show(20)

In [None]:
# restrict the data to three species so the rest of the notebook is quick to run
sample_filter = "species in ('woosan', 'blakit1', 'rbsrob1')"
sample = df.where(sample_filter).cache()

# counting forces evaluation of the cached sample and reports its size
sample.count()

## plot of embeddings of high confidence predictions

In [None]:
from pyspark.sql import Window, functions as F

# For each original track, find the separated channel (track_type) that holds
# the most energetic row.

# the track stem without the part suffix, i.e. the text before the first "_"
original_track_stem = F.split(F.col("track_stem"), "_").getItem(0)

# rank rows within each original track from most to least energetic
energy_rank = F.rank().over(
    Window.partitionBy("original_track_stem").orderBy(F.desc("energy"))
)

# the unseparated "original" audio is excluded before ranking
non_original = sample.withColumn("original_track_stem", original_track_stem).where(
    "track_type != 'original'"
)
ranked = non_original.withColumn("rank", energy_rank)

highest_energy_channel = (
    # keep the top ranked row(s); rank() keeps every row tied for first place
    ranked.where(F.col("rank") == 1)
    # only the identifying columns are kept, which drops the helper columns
    .select("species", "track_stem", "track_type")
    .distinct()
)

highest_energy_channel.show()

In [None]:
# Keep only the confident top predictions made on the highest energy channel.

channel_key = ["species", "track_stem", "track_type"]

# restrict the sample to the rows selected in highest_energy_channel
strongest_rows = sample.join(highest_energy_channel, on=channel_key, how="inner")

# explode the predictions array into one row per prediction and flatten the
# struct fields into columns; the rank field orders them by confidence
flattened = strongest_rows.withColumn("predictions", F.explode("predictions")).select(
    "species",
    "track_stem",
    "track_type",
    "start_time",
    "track_name",
    "embedding",
    "predictions.*",
)

# simplifying assumption: we assume the prediction with the highest confidence is the true label
exploded_embeddings = flattened.where("rank = 0 and probability > 0.5")

exploded_embeddings.drop("embedding").show()

In [None]:
# percentage of rows in the sample that survive the filters above, i.e. rows on
# the highest energy channel whose top prediction has probability > 0.5
positive = exploded_embeddings.count()
total = sample.count()
positive / total * 100

In [None]:
# collect the columns needed for plotting into a pandas dataframe
predictions = exploded_embeddings.select(
    "species", "probability", "embedding"
).toPandas()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from umap import UMAP
from sklearn.preprocessing import LabelEncoder

# get the embeddings as a 2D array with one row per clip
X = np.stack(predictions.embedding.values)

# encode the species names as integers so they can be used as plot colors
species = predictions.species.values
le = LabelEncoder()
y = le.fit_transform(species)

# fit the umap model; UMAP is stochastic, so the seed is fixed to make the
# projection reproducible across runs (this may limit UMAP's parallelism)
umap = UMAP(n_components=2, random_state=42)
X_umap = umap.fit_transform(X)

In [None]:
# plot the results, colored by the encoded species
points = plt.scatter(X_umap[:, 0], X_umap[:, 1], c=y, s=10, alpha=0.1)
# add an empty, fully opaque series per species so that each legend entry uses
# the same colormap color as the points of that species; the loop variables are
# named so they do not overwrite the `species` array
for class_index, class_name in enumerate(le.classes_):
    plt.scatter([], [], color=points.to_rgba(class_index), label=class_name)
plt.legend()
plt.title("UMAP of blakit1, rbsrob1, and woosan")
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.show()

If we wanted to find the most representative points of each of these species, we could run K-means on the embeddings of each species and keep the points closest to the cluster centers.
We would also want to take care to find all of the points that are noisy (human voices, machines, dogs, footsteps, environmental, etc.) and create a separate no-call class for them.

## labels from prediction logits

Instead of using the predictions struct, we can derive probabilities from the raw prediction logits.
This could be useful if we're looking to analyze a specific class across all the tracks.

In [None]:
from birdclef import birdnet

# directory holding the label files of the pruned model; the upstream checkout
# at "../vendor/BirdNET-Analyzer" can be substituted here instead
birdnet_dir = "../data/models/birdnet-analyzer-pruned"

labels = birdnet.load_labels(birdnet_dir)
mapped_labels = birdnet.load_mapped_labels(birdnet_dir)

# peek at the first 20 (label, mapped_label) pairs
list(zip(labels, mapped_labels))[:20]

In [None]:
# positions and names of every label that mentions "human" (case-insensitive)
[
    (position, name)
    for position, name in enumerate(labels)
    if "human" in name.lower()
]

In [None]:
# one record per label; "index" is the position of the label in the label list
label_rows = [
    dict(label=label, mapped_label=mapped_label, index=i)
    for i, (label, mapped_label) in enumerate(zip(labels, mapped_labels))
]
label_df = spark.createDataFrame(label_rows)
label_df.show(n=5)

In [None]:
# explode the prediction vector into one row per (clip, index) with its logit
logits = sample.select(
    "species",
    "track_name",
    "start_time",
    F.posexplode("prediction_vec").alias("index", "logit"),
)

human_vocals = (
    # attach the label names, in case we want to use them for anything
    logits.join(label_df, on="index", how="inner")
    # now only keep human vocals; 1450 is hard-coded and is presumably the
    # index reported by the label search in the earlier cell — confirm
    .where("index = 1450")
    # squash the logit into a probability with the sigmoid function
    .withColumn("probability", F.expr("1/(1+exp(-logit))"))
    .toPandas()
)

human_vocals

In [None]:
# histogram (100 bins) of the natural log of the human vocal probabilities
np.log(human_vocals.probability).hist(bins=100)
plt.title("log probability of human vocals")
plt.show()

In [None]:
# rows whose human vocal probability exceeds 0.2, highest probability first
human_vocals[human_vocals.probability > 0.2].sort_values("probability", ascending=False)

In [None]:
# let's listen to the audio
# import IPython.display as ipd

# train_embeddings_path = (
#     "../data/processed/birdclef-2023/train_embeddings/consolidated_v3"
# )
# track_name = "rbsrob1/XC393114_source2.mp3"
# ipd.Audio(f"{train_embeddings_path}/audio/{track_name}")

We can hear very muffled human voices in this track.

## track annotation

The code here is similar; we also need a classifier to reasonably annotate the other channels.
However, the output schema is good enough to get started.

In [None]:
join_cols = ["species", "track_name", "start_time"]

# We add a column with our "prediction", which can come from a more
# sophisticated model. Here, we're just using the simplifying assumption
# that the most confident prediction of the highest energy channel
# matches the species of the track.
predicted_labels = exploded_embeddings.select(*join_cols).withColumn(
    "label", F.col("species")
)

annotation = (
    sample.select(*join_cols)
    # the outer join keeps rows that have no prediction; their label is null
    .join(predicted_labels, on=join_cols, how="outer")
    # replace missing string values with a placeholder
    .fillna("none")
    .orderBy("species", "track_name", "start_time")
)

# TODO: note how only the track name for the source above has labels. This needs
# to be handled so every track has a label. This is probably best left until
# there's a proper model for predictions.
annotation.show()

## Add dataset with baseline model predictions

In [6]:
from pathlib import Path
import pickle

# Load the baseline model from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load model files that
# come from a trusted source.
model_path = Path("../data/models/baseline/logistic_full.pkl")
with model_path.open("rb") as model_file:
    clf = pickle.load(model_file)
print(type(clf).__name__)

LogisticRegression


In [7]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, FloatType, StructType, StructField
import warnings
import numpy as np

# NOTE(review): this silences every warning from here on — consider narrowing
# the filter to the specific warning category that is being suppressed
warnings.filterwarnings("ignore")


def predict_udf(vector):
    """Predict the label of a single embedding and the probability of that label.

    The model is evaluated once per row: the label and its probability are both
    read from the same ``predict_proba`` output instead of calling ``predict``
    and ``predict_proba`` separately.

    Parameters
    ----------
    vector : sequence of float or None
        The embedding of one row.

    Returns
    -------
    tuple
        ``(prediction, probability)`` where ``prediction`` is the most probable
        class as a string and ``probability`` is its probability as a float.
        ``(None, None)`` is returned when ``vector`` is ``None``.
    """
    # the embedding column is nullable; emit nulls instead of failing the job
    if vector is None:
        return None, None
    # Convert the list of features to a numpy array and reshape it to a 2D array
    embedding_array = np.array(vector).reshape(1, -1)
    # Class probabilities for the single row
    class_proba = clf.predict_proba(embedding_array)[0]
    # The most probable class; assumes a scikit-learn style classifier whose
    # `classes_` attribute is aligned with the columns of predict_proba
    best = int(np.argmax(class_proba))
    return str(clf.classes_[best]), float(class_proba[best])


# Struct returned by the UDF: the predicted label and its probability
schema = StructType(
    [
        StructField(field_name, field_type, nullable=True)
        for field_name, field_type in (
            ("prediction", StringType()),
            ("probability", FloatType()),
        )
    ]
)

# Wrap the python function as a spark UDF that returns the struct above
predict_proba = udf(predict_udf, returnType=schema)

# df is the consolidated_v3 dataset
# the UDF applies the unpickled model (clf) to the embedding of every row
df_with_preds_proba = df.withColumn("prediction_proba", predict_proba(df.embedding))

# Results: the row identifiers plus the struct fields flattened into columns
result_cols = [
    "track_name",
    "start_time",
    "prediction_proba.prediction",
    "prediction_proba.probability",
]
res = df_with_preds_proba.select(*result_cols)

In [9]:
# Write out res to a parquet file, using 1 or 2 partitions.
# Use the processed/birdnet-2023 folder and make a new dataset under there.
# NOTE(review): the input data is read from processed/birdclef-2023, while this
# writes to processed/birdnet-2023 — confirm the folder name is intended
# mode("overwrite") replaces any dataset already present at this location
res.repartition(2).write.mode("overwrite").parquet(
    "../data/processed/birdnet-2023/consolidated_v3_with_preds"
)

                                                                                

In [8]:
# Show results
# NOTE(review): res is not cached, so this presumably re-runs the UDF — confirm
res.show()

[Stage 6:>                                                          (0 + 1) / 1]

+--------------------+----------+----------+-----------+
|          track_name|start_time|prediction|probability|
+--------------------+----------+----------+-----------+
|wlwwar/XC475384_p...|        18|   thrnig1|  0.5938948|
|grecor/XC629875_p...|        60|   combuz1|  0.2286156|
|grecor/XC629875_p...|        87|    litegr|  0.8291227|
|wlwwar/XC475384_p...|        51|    wlwwar| 0.99999917|
|grecor/XC629875_p...|        15|    strher|  0.3981949|
|grecor/XC629875_p...|        96|   fotdro5|  0.2228314|
|grecor/XC629875_p...|        60|   combuz1|   0.570118|
|wlwwar/XC475384_p...|        66|    wlwwar|  0.6390262|
|wlwwar/XC475384_p...|        21|   combuz1| 0.32505578|
|grecor/XC629875_p...|       117|   combuz1|  0.5007306|
|wlwwar/XC475384_p...|         9|   thrnig1|  0.7924493|
|grecor/XC629875_p...|        21|    egygoo| 0.50189364|
|grecor/XC629875_p...|        84|   thrnig1|  0.4570964|
|grecor/XC629875_p...|        51|    greegr| 0.87462825|
|grecor/XC629875_p...|        7

                                                                                