In [1]:
# Reload imported project modules automatically before each cell executes,
# so edits to the source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Load the lab_black extension (presumably auto-formats cells with black — confirm).
%load_ext lab_black

## setup

### downloading the dataset

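Make sure that you have the dataset downloaded locally.
From the root of the project, run the command below.
Note that it syncs `consolidated_v3_pre1/`, whereas the loading cell further down reads from `consolidated_v3/`; adjust one of the two paths so that they match.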

```bash
gsutil -m rsync \
    gs://birdclef-2023/data/processed/birdclef-2023/train_embeddings/consolidated_v3_pre1/ \
    data/processed/birdclef-2023/train_embeddings/consolidated_v3_pre1/ 
```

### using spark

In [2]:
from birdclef.utils import get_spark
from pyspark.sql import functions as F

# Create the Spark session through the project helper.
# Modify cores and memory as needed for the machine running this notebook
# (presumably the core count and memory handed to Spark — confirm in birdclef.utils.get_spark).
spark = get_spark(cores=8, memory="16g")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/15 03:30:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Location of the consolidated training embeddings, relative to this notebook.
path = "../data/processed/birdclef-2023/train_embeddings/consolidated_v3/"

# Read the parquet dataset, print its column layout, and leave the row count
# as the cell's displayed value.
df = spark.read.format("parquet").load(path)
df.printSchema()
df.count()

                                                                                

root
 |-- species: string (nullable = true)
 |-- track_stem: string (nullable = true)
 |-- track_type: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- embedding: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- prediction_vec: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- predictions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- rank: long (nullable = true)
 |    |    |-- index: long (nullable = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- mapped_label: string (nullable = true)
 |    |    |-- probability: double (nullable = true)
 |-- start_time: long (nullable = true)
 |-- energy: double (nullable = true)



                                                                                

1198860

## Add dataset with baseline model predictions

In [4]:
from pathlib import Path
import pickle

# Deserialize the baseline classifier that was saved with pickle.
# NOTE(review): unpickling executes arbitrary code — only load model files you trust.
model_path = Path("../data/models/baseline/logistic_negative_new.pkl")
with model_path.open("rb") as model_file:
    clf = pickle.load(model_file)

# Show which estimator class was restored.
print(type(clf).__name__)

LogisticRegression


In [5]:
import numpy as np
import pandas as pd
from pyspark.sql.functions import pandas_udf, PandasUDFType


# Predictions
@pandas_udf("string")
def make_prediction(embedding_series: pd.Series) -> pd.Series:
    """Predict one class label per embedding with the baseline classifier.

    Registered as a scalar pandas UDF returning a string column. The scalar
    flavour is the default and is also implied by the ``pd.Series -> pd.Series``
    type hints, so the legacy ``PandasUDFType.SCALAR`` argument is not passed.

    Parameters
    ----------
    embedding_series : pd.Series
        Batch of embeddings, one sequence of floats per row.

    Returns
    -------
    pd.Series
        The output of ``clf.predict`` for the batch, one label per input row.
        ``clf`` is the notebook-level model loaded in an earlier cell.
    """
    # Stack the per-row vectors into a single 2D array, one row per embedding.
    embedding_array = np.vstack(embedding_series.tolist())
    # Make predictions for the whole batch at once.
    predictions = clf.predict(embedding_array)
    return pd.Series(predictions)


# Probabilities
@pandas_udf("double")
def make_prediction_proba(embedding_series: pd.Series) -> pd.Series:
    """Return the highest class probability for each embedding.

    Registered as a scalar pandas UDF returning a double column. The scalar
    flavour is the default and is also implied by the ``pd.Series -> pd.Series``
    type hints, so the legacy ``PandasUDFType.SCALAR`` argument is not passed.

    Parameters
    ----------
    embedding_series : pd.Series
        Batch of embeddings, one sequence of floats per row.

    Returns
    -------
    pd.Series
        For every input row, the maximum over the columns of
        ``clf.predict_proba``. ``clf`` is the notebook-level model loaded in
        an earlier cell.
    """
    # Stack the per-row vectors into a single 2D array, one row per embedding.
    embedding_array = np.vstack(embedding_series.tolist())
    # Per-class probabilities for the batch: one row per embedding.
    probabilities = clf.predict_proba(embedding_array)
    # Keep only the largest probability in each row.
    max_probabilities = np.amax(probabilities, axis=1)
    return pd.Series(max_probabilities)


# Run both UDFs over the embedding column: first attach the predicted label,
# then the probability of that prediction.
preds_df = df.withColumn("prediction", make_prediction(df.embedding))
proba_df = preds_df.withColumn("probability", make_prediction_proba(df.embedding))

# Keep only the identifying columns plus the two model outputs.
res = proba_df.select(["track_name", "start_time", "prediction", "probability"])

In [6]:
# Preview the first 20 rows, truncating long cell values (the defaults of `show`).
res.show(n=20, truncate=True)

23/05/15 03:30:29 WARN DAGScheduler: Broadcasting large task binary with size 1372.8 KiB


[Stage 4:>                                                          (0 + 1) / 1]

+--------------------+----------+----------+-------------------+
|          track_name|start_time|prediction|        probability|
+--------------------+----------+----------+-------------------+
|wlwwar/XC475384_p...|        18|    wlwwar| 0.5675699565354009|
|grecor/XC629875_p...|        60|   no_call|0.48938898129941916|
|grecor/XC629875_p...|        87|    litegr| 0.9345325524879964|
|wlwwar/XC475384_p...|        51|    wlwwar| 0.9943723441150192|
|grecor/XC629875_p...|        15|    strher| 0.4401947783710894|
|grecor/XC629875_p...|        96|   combuz1| 0.4643792294901046|
|grecor/XC629875_p...|        60|   combuz1|0.40203665348791984|
|wlwwar/XC475384_p...|        66|    wlwwar| 0.5959878192798792|
|wlwwar/XC475384_p...|        21|   combuz1|0.47864665964721836|
|grecor/XC629875_p...|       117|   combuz1|0.47142785258506426|
|wlwwar/XC475384_p...|         9|   thrnig1|0.37728004342577076|
|grecor/XC629875_p...|        21|    egygoo| 0.7234628309654835|
|grecor/XC629875_p...|   

                                                                                

In [7]:
# Persist the predictions as a parquet dataset split into 2 partitions,
# replacing any previous output at the same location.
# NOTE(review): the target folder is `birdnet-2023`, while the inputs live under
# `birdclef-2023` — confirm this is the intended directory.
(
    res.repartition(2)
    .write.format("parquet")
    .mode("overwrite")
    .save("../data/processed/birdnet-2023/consolidated_v3_with_preds")
)

23/05/15 03:30:40 WARN DAGScheduler: Broadcasting large task binary with size 1374.5 KiB


                                                                                