In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

## Adding predictions to the dataset using model inference

In [2]:
from birdclef.utils import get_spark
from pyspark.sql import functions as F

# Obtain the Spark session through the project helper `get_spark`.
# The cores/memory values are machine-specific knobs: adjust them to fit the
# host this notebook runs on.
spark = get_spark(cores=24, memory="30g")

In [3]:
# Load the consolidated (v4) train-embeddings parquet dataset for BirdCLEF 2023.
path = "../data/processed/birdclef-2023/train_embeddings/consolidated_v4/"
df = spark.read.parquet(path)
# Print the schema as a sanity check; the `embedding` array column is the
# input used for model inference further down.
df.printSchema()
# Last expression of the cell, so the row count is displayed as the cell output.
df.count()

root
 |-- species: string (nullable = true)
 |-- track_stem: string (nullable = true)
 |-- track_type: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- embedding: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- prediction_vec: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- predictions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- rank: long (nullable = true)
 |    |    |-- index: long (nullable = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- mapped_label: string (nullable = true)
 |    |    |-- probability: double (nullable = true)
 |-- start_time: long (nullable = true)
 |-- energy: double (nullable = true)



3418610

## Add dataset with baseline model predictions

In [4]:
from pathlib import Path
import pickle

# Deserialize the baseline classifier stored on disk.
# NOTE(review): unpickling executes arbitrary code embedded in the file, so
# only load model files that come from a trusted source.
model_path = Path("../data/models/baseline/logistic_negative_new.pkl")
with model_path.open("rb") as model_file:
    clf = pickle.load(model_file)
# Sanity check: report which estimator class was loaded.
print(clf.__class__.__name__)

LogisticRegression


In [5]:
import numpy as np
import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType


# Predictions
@pandas_udf("string")
def make_prediction(embedding_series: pd.Series) -> pd.Series:
    """Predict one class label per embedding with the baseline classifier.

    Scalar pandas UDF. The UDF kind is taken from the ``pd.Series ->
    pd.Series`` type hints, so the deprecated ``PandasUDFType.SCALAR``
    argument is not passed.

    Args:
        embedding_series: One batch of rows from the ``embedding`` column;
            each element is a sequence of floats.

    Returns:
        A series of the same length holding the label returned by
        ``clf.predict`` for each row.
    """
    # np.vstack raises on an empty sequence, so short-circuit an empty batch.
    if embedding_series.empty:
        return pd.Series([], dtype="object")
    # Stack the per-row vectors into a 2D (rows x features) array.
    embedding_array = np.vstack(embedding_series.tolist())
    return pd.Series(clf.predict(embedding_array))


# Probabilities
@pandas_udf("double")
def make_prediction_proba(embedding_series: pd.Series) -> pd.Series:
    """Return the highest class probability for each embedding.

    Scalar pandas UDF. The UDF kind is taken from the ``pd.Series ->
    pd.Series`` type hints, so the deprecated ``PandasUDFType.SCALAR``
    argument is not passed.

    Args:
        embedding_series: One batch of rows from the ``embedding`` column;
            each element is a sequence of floats.

    Returns:
        A series of the same length holding, for each row, the maximum over
        classes of ``clf.predict_proba``.
    """
    # np.vstack raises on an empty sequence, so short-circuit an empty batch.
    if embedding_series.empty:
        return pd.Series([], dtype="float64")
    # Stack the per-row vectors into a 2D (rows x features) array.
    embedding_array = np.vstack(embedding_series.tolist())
    probabilities = clf.predict_proba(embedding_array)
    # Keep only the probability of the most likely class per row.
    return pd.Series(np.amax(probabilities, axis=1))


# Run both UDFs over the `embedding` column: first the predicted label, then
# the probability of the most likely class.
embedding_col = df["embedding"]
preds_df = df.withColumn("prediction", make_prediction(embedding_col))
proba_df = preds_df.withColumn("probability", make_prediction_proba(embedding_col))
# Keep only the track name, start time and the two model outputs.
output_columns = ["track_name", "start_time", "prediction", "probability"]
res = proba_df.select(*output_columns)



In [6]:
# Preview the first rows of the result (show() prints 20 rows by default).
# Spark evaluates lazily, so this is the first point where the UDFs run.
res.show()

+--------------------+----------+----------+-------------------+
|          track_name|start_time|prediction|        probability|
+--------------------+----------+----------+-------------------+
|grecor/XC629875_p...|        75|   combuz1|  0.595202838244718|
|grecor/XC629875_p...|       127|    strher|  0.519304999398772|
|grecor/XC629875_p...|       143|    hoopoe|0.39469672179585924|
|grecor/XC629875_p...|       100|    barswa|  0.522068624744396|
|grecor/XC629875_p...|        62|    egygoo| 0.8028985925544311|
|grecor/XC629875_p...|        10|   no_call| 0.8533084422827208|
|grecor/XC629875_p...|       139|    wlwwar| 0.5278238795398049|
|grecor/XC629875_p...|        25|   combuz1|0.40135901403767077|
|grecor/XC629875_p...|        83|    barswa| 0.1204648163949473|
|grecor/XC629875_p...|        53|   yertin1| 0.5402404945683114|
|grecor/XC629875_p...|         1|   thrnig1|0.39749713513198004|
|grecor/XC629875_p...|        34|   gobbun1| 0.7619074297015382|
|grecor/XC629875_p...|   

In [7]:
# Persist the predictions as a new parquet dataset under
# processed/birdclef-2023, collapsed into a single partition; any existing
# dataset at that location is overwritten.
output_path = "../data/processed/birdclef-2023/consolidated_v4_with_preds"
res.repartition(1).write.parquet(output_path, mode="overwrite")