In [1]:
from birdclef.utils import get_spark
import pyspark.sql.functions as F

In [2]:
# Spark session from the project helper `birdclef.utils.get_spark`; used below
# to read the consolidated parquet dataset.
spark = get_spark()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/13 14:29:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the consolidated dataset and take a first look at it.
df = spark.read.parquet("../../data/processed/birdclef-2023/consolidated_v3")
df.printSchema()
# A bare `df.count()` in the middle of a cell runs a full scan and then throws
# the number away (only a cell's last expression is displayed), so print it.
print(df.count())
df.show(n=5, vertical=True)

                                                                                

root
 |-- species: string (nullable = true)
 |-- track_stem: string (nullable = true)
 |-- track_type: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- embedding: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- prediction_vec: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- predictions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- rank: long (nullable = true)
 |    |    |-- index: long (nullable = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- mapped_label: string (nullable = true)
 |    |    |-- probability: double (nullable = true)
 |-- start_time: long (nullable = true)
 |-- energy: double (nullable = true)



[Stage 4:>                                                          (0 + 1) / 1]

-RECORD 0------------------------------
 species        | grecor               
 track_stem     | XC629875_part006     
 track_type     | source0              
 track_name     | grecor/XC629875_p... 
 embedding      | [0.65114879608154... 
 prediction_vec | [-13.598222732543... 
 predictions    | [{0, 1451, Human ... 
 start_time     | 69                   
 energy         | 5.219668388366699    
-RECORD 1------------------------------
 species        | grecor               
 track_stem     | XC629875_part008     
 track_type     | source2              
 track_name     | grecor/XC629875_p... 
 embedding      | [1.43535864353179... 
 prediction_vec | [-14.157649040222... 
 predictions    | [{0, 163, Anser a... 
 start_time     | 33                   
 energy         | 2.388000249862671    
-RECORD 2------------------------------
 species        | wlwwar               
 track_stem     | XC475384_part001     
 track_type     | original             
 track_name     | wlwwar/XC475384_p... 


                                                                                

In [4]:
# Per-species row counts, defined once and reused for both rankings.
species_counts = df.groupBy("species").count()

# Most and least represented species.
species_counts.orderBy(F.desc("count")).show(20)
species_counts.orderBy(F.asc("count")).show(20)

# Number of distinct species. Counting the groups on the cluster avoids
# collecting every row of `df` to the driver just to build a Python set.
print(species_counts.count())

                                                                                

+-------+------+
|species| count|
+-------+------+
|thrnig1|104440|
| wlwwar| 57260|
|eubeat1| 52810|
| hoopoe| 42995|
|combuz1| 42650|
|cohmar1| 35210|
| barswa| 33355|
|eaywag1| 29165|
| comsan| 27180|
|combul2| 24855|
| woosan| 23710|
|rbsrob1| 19910|
|blakit1| 19880|
| litegr| 16710|
| grecor| 16030|
|somgre1| 15280|
|gnbcam2| 13945|
|rerswa1| 13660|
|colsun2| 12545|
|carcha1| 12440|
+-------+------+
only showing top 20 rows



                                                                                

+-------+-----+
|species|count|
+-------+-----+
|afpkin1|   15|
|whhsaw1|   20|
|whctur2|   20|
|golher1|   25|
|rehblu1|   40|
|lotlap1|   40|
|brtcha1|   50|
|lotcor1|   60|
|dotbar1|   60|
|crefra2|   70|
|brcwea1|   75|
|yebsto1|   90|
|fatwid1|  155|
|gobsta5|  160|
|darter3|  170|
|palpri1|  170|
|witswa1|  195|
|sacibi2|  200|
|rufcha2|  225|
|joygre1|  255|
+-------+-----+
only showing top 20 rows



                                                                                

264


In [5]:
from pyspark.sql import Window, functions as F

# For each original recording, keep the (species, track_stem, track_type) of
# the non-'original' row(s) with the highest energy.
# NOTE(review): the window spans the whole recording (stem without the
# "_partNNN" suffix) while the selected `track_stem` keeps the suffix, so only
# the part holding the top-energy row survives later joins -- confirm intended.

# Recording id: everything before the first underscore of the track stem.
stem_without_part = F.split(F.col("track_stem"), "_").getItem(0)

# Rows of one recording, loudest first.
by_energy_within_recording = Window.partitionBy("original_track_stem").orderBy(
    F.desc("energy")
)

highest_energy_channel = (
    df.withColumn("original_track_stem", stem_without_part)
    # the unseparated track never competes
    .where("track_type != 'original'")
    .withColumn("rank", F.rank().over(by_energy_within_recording))
    # rank() gives every tied top row rank 1; distinct() below collapses repeats
    .where(F.col("rank") == 1)
    # projecting the key columns also drops the helper columns
    .select("species", "track_stem", "track_type")
    .distinct()
)

highest_energy_channel.show()

[Stage 14:>                                                         (0 + 4) / 4]

+-------+----------+----------+
|species|track_stem|track_type|
+-------+----------+----------+
|refcro1|  XC239955|   source0|
|gyhspa1|  XC270259|   source0|
|yebbar1|  XC292826|   source2|
| reccor|  XC312724|   source0|
| litegr|  XC333916|   source2|
|gnbcam2|  XC395639|   source0|
|afpfly1|  XC418708|   source0|
|purgre2|  XC432646|   source2|
|combuz1|  XC463273|   source2|
|walsta1|  XC516711|   source0|
| hoopoe|  XC542705|   source2|
|laudov1|  XC558438|   source2|
|blakit1|  XC572730|   source0|
|afrjac1|  XC585200|   source0|
| comsan|  XC595918|   source2|
|blbpuf2|  XC633870|   source2|
|yertin1|  XC634144|   source1|
|eaywag1|  XC642065|   source0|
|afghor1|  XC720728|   source2|
|afrgos1|  XC147873|   source3|
+-------+----------+----------+
only showing top 20 rows



                                                                                

In [6]:
from pyspark.sql.functions import concat

# Keep only rows on the highest-energy channel, then reduce each row to its
# top-ranked prediction, provided that prediction is confident enough.

# Restrict to the (species, track_stem, track_type) keys chosen above.
_on_best_channel = df.join(
    highest_energy_channel,
    on=["species", "track_stem", "track_type"],
    how="inner",
)

# One output row per entry of the `predictions` array.
_one_row_per_prediction = _on_best_channel.withColumn(
    "predictions", F.explode("predictions")
)

exploded_embeddings = (
    _one_row_per_prediction.select(
        # clip identifier: name, start time and channel glued together
        concat(df.track_name, df.start_time, df.track_type).alias("track_id"),
        "species",
        "track_stem",
        "track_type",
        "start_time",
        "track_name",
        "embedding",
        # flatten the struct into rank / index / label / mapped_label / probability
        "predictions.*",
    )
    # simplifying assumption: we assume the prediction with the highest confidence is the true label
    .where("rank = 0 and probability > 0.4")
)
exploded_embeddings.drop("embedding").show()

                                                                                

+--------------------+-------+----------------+----------+----------+--------------------+----+-----+--------------------+------------+-------------------+
|            track_id|species|      track_stem|track_type|start_time|          track_name|rank|index|               label|mapped_label|        probability|
+--------------------+-------+----------------+----------+----------+--------------------+----+-----+--------------------+------------+-------------------+
|thrnig1/XC563365_...|thrnig1|XC563365_part003|   source3|        99|thrnig1/XC563365_...|   0| 1686|Luscinia luscinia...|     thrnig1| 0.9531567692756653|
|eubeat1/XC584820_...|eubeat1|XC584820_part000|   source3|        39|eubeat1/XC584820_...|   0| 1787|Merops apiaster_E...|     eubeat1|  0.992747962474823|
|blacuc1/XC420239_...|blacuc1|        XC420239|   source3|        63|blacuc1/XC420239_...|   0|  883|Cuculus clamosus_...|     blacuc1| 0.9119369387626648|
|eaywag1/XC642065_...|eaywag1|        XC642065|   source0|      

In [7]:
from collections import Counter

# Occurrences of each species among the confident predictions.
cnts = Counter(
    row["species"] for row in exploded_embeddings.select("species").collect()
)

# Species whose count is exactly 1 are left out of the set.
species = {name for name, occurrences in cnts.items() if occurrences != 1}
print(len(species))

                                                                                

253


In [8]:
# Same pipeline as `exploded_embeddings`, restricted to the species that are
# NOT in `species`, with a much lower confidence threshold (0.05, not 0.4).
rare_species = (
    df
    # join against the highest energy channel
    .join(
        highest_energy_channel,
        on=["species", "track_stem", "track_type"],
        how="inner",
    )
    # explode the predictions array: one row per prediction entry
    .withColumn("predictions", F.explode("predictions")).select(
        concat(df.track_name, df.start_time, df.track_type).alias("track_id"),
        "species",
        "track_stem",
        "track_type",
        "start_time",
        "track_name",
        "embedding",
        "predictions.*",
    )
    # keep only species missing from the well-represented set
    .filter(~df.species.isin(species))
    # simplifying assumption: we assume the prediction with the highest confidence is the true label
    .where("rank = 0 and probability > 0.05")
)
rare_species.drop("embedding").show()

# De-duplicate on the cluster so only one row per species reaches the driver,
# instead of collecting every matching row and de-duplicating in Python.
print(
    sorted(
        row["species"]
        for row in rare_species.select("species").distinct().collect()
    )
)


                                                                                

+--------------------+-------+----------+----------+----------+--------------------+----+-----+--------------------+------------+--------------------+
|            track_id|species|track_stem|track_type|start_time|          track_name|rank|index|               label|mapped_label|         probability|
+--------------------+-------+----------+----------+----------+--------------------+----+-----+--------------------+------------+--------------------+
|blksaw1/XC517147_...|blksaw1|  XC517147|   source0|         3|blksaw1/XC517147_...|   0| 3070|Thraupis palmarum...|     paltan1| 0.07397492229938507|
|blksaw1/XC517147_...|blksaw1|  XC517147|   source0|         0|blksaw1/XC517147_...|   0| 3070|Thraupis palmarum...|     paltan1|0.054684724658727646|
|blksaw1/XC292874_...|blksaw1|  XC292874|   source0|         0|blksaw1/XC292874_...|   0| 2935|Sturnus unicolor_...|     sposta1|0.060267090797424316|
|blksaw1/XC516846_...|blksaw1|  XC516846|   source1|         6|blksaw1/XC516846_...|   0|  123

[Stage 50:>                                                         (0 + 4) / 4]

['afpkin1', 'blksaw1', 'crefra2', 'gobsta5', 'golher1', 'lotlap1', 'rehblu1', 'sacibi2', 'whctur2', 'whhsaw1', 'witswa1']


                                                                                

In [9]:
# Append the low-threshold rows for the rare species to the confident rows.
# unionByName matches columns by name; plain union() matches by position and
# would silently misalign if the two hand-copied selects ever diverged.
# NOTE: this rebinds `exploded_embeddings`, so re-running the cell appends
# `rare_species` again. Re-run the cell that first builds it beforehand.
exploded_embeddings = exploded_embeddings.unionByName(rare_species)

exploded_embeddings.drop("embedding").show()

                                                                                

+--------------------+-------+----------------+----------+----------+--------------------+----+-----+--------------------+------------+-------------------+
|            track_id|species|      track_stem|track_type|start_time|          track_name|rank|index|               label|mapped_label|        probability|
+--------------------+-------+----------------+----------+----------+--------------------+----+-----+--------------------+------------+-------------------+
|thrnig1/XC563365_...|thrnig1|XC563365_part003|   source3|        99|thrnig1/XC563365_...|   0| 1686|Luscinia luscinia...|     thrnig1| 0.9531567692756653|
|eubeat1/XC584820_...|eubeat1|XC584820_part000|   source3|        39|eubeat1/XC584820_...|   0| 1787|Merops apiaster_E...|     eubeat1|  0.992747962474823|
|blacuc1/XC420239_...|blacuc1|        XC420239|   source3|        63|blacuc1/XC420239_...|   0|  883|Cuculus clamosus_...|     blacuc1| 0.9119369387626648|
|eaywag1/XC642065_...|eaywag1|        XC642065|   source0|      

In [10]:
# Share of all rows in `df`, in percent, that ended up in `exploded_embeddings`.
total = df.count()
positive = exploded_embeddings.count()
positive / total * 100

                                                                                

6.135578799859867

In [11]:
from pyspark.sql.functions import rand

# Seed for the row shuffle. The original `rand()` was unseeded, so every run
# produced a different row order.
SHUFFLE_SEED = 42

# Width of the embedding vector, read off a single row.
length = len(exploded_embeddings.select("embedding").first()["embedding"])

# One row per clip id; dropDuplicates keeps an arbitrary row for each id.
explode_preds_bird_calls = exploded_embeddings.dropDuplicates(["track_id"])

# Spread the embedding array over scalar columns embedding0 .. embedding{length-1}.
embedding_columns = [
    explode_preds_bird_calls.embedding[i].alias("embedding" + str(i))
    for i in range(length)
]

data = (
    explode_preds_bird_calls.select(
        ["track_id", "species", "probability"] + embedding_columns
    )
    # Seeded shuffle: repeatable as long as the partitioning stays the same.
    .orderBy(rand(seed=SHUFFLE_SEED))
    .toPandas()
)
print(len(data))
data.head()

                                                                                

23/05/13 14:33:04 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

73551


Unnamed: 0,track_id,species,probability,embedding0,embedding1,embedding2,embedding3,embedding4,embedding5,embedding6,...,embedding310,embedding311,embedding312,embedding313,embedding314,embedding315,embedding316,embedding317,embedding318,embedding319
0,wlwwar/XC511760_source0.mp33source0,wlwwar,0.990845,1.17717,1.746815,0.715745,2.384658,1.818686,1.827207,1.473857,...,1.126522,0.906444,0.285072,1.045414,1.230909,1.457099,0.959544,0.850391,2.009273,1.737481
1,afghor1/XC360437_source2.mp36source2,afghor1,0.6857,0.57778,1.400259,1.028308,0.66733,0.43521,0.775852,0.205147,...,0.557882,0.662112,1.197826,0.648741,0.40416,0.39775,0.566021,0.361707,1.284315,1.494746
2,fotdro5/XC338365_source3.mp333source3,fotdro5,0.4016,0.429557,0.963952,0.317579,0.731593,0.421621,1.461481,0.575758,...,2.391057,0.357111,1.558307,0.748199,1.260942,1.577218,0.716577,0.780029,1.546573,1.779061
3,combuz1/XC538678_source3.mp312source3,combuz1,0.997282,1.656315,0.714701,0.539804,1.122327,2.557522,1.375711,0.927457,...,2.710279,1.513208,0.790886,0.901417,1.176507,1.826294,1.698841,0.739044,0.565447,1.74061
4,grecor/XC323326_source3.mp3108source3,grecor,0.970848,1.704013,1.268294,0.321613,0.425985,1.554827,1.114459,0.206989,...,1.523228,0.876873,0.281301,0.8322,0.577132,0.438741,0.895232,1.552744,0.335283,1.287564


In [12]:
# Rows kept for modelling, then rows in the full dataset.
for n_rows in (len(data), df.count()):
    print(n_rows)

73551
1198860


Base classifiers

In [13]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, average_precision_score


# Fixed seed so the train/test partition (and every metric computed from it)
# is the same on each run; the original call left `random_state` unset.
SPLIT_SEED = 42

# Features are every column except the label and the clip id.
# NOTE(review): this leaves `probability` (the upstream model's confidence)
# among the features. Confirm that is intended.
train_x, test_x, train_y, test_y = train_test_split(
    data.loc[:, (data.columns != "species") & (data.columns != "track_id")],
    data["species"],
    test_size=0.33,
    stratify=data["species"],
    random_state=SPLIT_SEED,
)

# train_x = pd.concat([train_x, temp.loc[:, (temp.columns != "species") & (temp.columns != "track_id")]])
# train_y = pd.concat([train_y, temp["species"]])

def eval(truth, preds):
    """Print summary classification metrics for hard label predictions.

    The name shadows the builtin ``eval``; it is kept because other cells
    call the function by this name.

    Args:
        truth: 1-D sequence of true class labels.
        preds: 1-D sequence of predicted class labels, aligned with ``truth``.

    Returns:
        None. Prints macro average precision, accuracy, and macro
        precision / recall / F1.
    """
    truth = np.asarray(truth)
    preds = np.asarray(preds)

    # average_precision_score cannot score a vector of multiclass labels
    # against another vector of labels, so both sides are one-hot encoded
    # first. Only classes that occur in `truth` get a column, which keeps every
    # column's average precision defined; a prediction outside that set
    # becomes an all-zero row, i.e. a miss.
    # NOTE(review): computed from hard labels, so this only approximates a
    # score-based competition metric.
    classes = np.unique(truth)
    truth_onehot = (truth[:, None] == classes[None, :]).astype(int)
    preds_onehot = (preds[:, None] == classes[None, :]).astype(int)

    print(
        "Average Precision (Kaggle)",
        average_precision_score(truth_onehot, preds_onehot, average="macro"),
    )
    print("Accuracy:", accuracy_score(truth, preds))
    print(
        "Precision:",
        precision_score(truth, preds, average="macro"),
    )
    print(
        "Recall:",
        recall_score(truth, preds, average="macro"),
    )
    print(
        "F1 Score:",
        f1_score(truth, preds, average="macro"),
    )

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from skopt import BayesSearchCV
from skopt.space import Real

# Seed shared by the stochastic `saga` solver and the Bayesian search so the
# selected hyper-parameters are repeatable.
SEARCH_SEED = 42

# Single positional hold-out split of the training data: the first 67% of rows
# fit each candidate, the remainder scores it. The validation range starts at
# `n_fit` (the original started at `n_fit + 1` and silently dropped one row).
# A list is used in place of `zip(...)`, which is a one-shot iterator.
n_fit = int(len(train_x) * 0.67)
holdout_cv = [(np.arange(0, n_fit), np.arange(n_fit, len(train_x)))]

# Bayesian search over the elastic-net logistic regression's regularisation
# strength (C) and L1/L2 mix (l1_ratio), scored by macro precision.
logs = BayesSearchCV(
    LogisticRegression(
        max_iter=2000,
        solver='saga',
        penalty='elasticnet',
        random_state=SEARCH_SEED,
    ),
    {
        'C': Real(1e-3, 1e+3, prior='log-uniform'),
        'l1_ratio': Real(0, 1, prior='uniform')
    },
    n_iter=15,
    scoring="precision_macro",
    n_jobs=-1,
    n_points=1,
    cv=holdout_cv,
    random_state=SEARCH_SEED,
    verbose=4
)

logs.fit(train_x, train_y)
print(logs.best_params_)

# Score the refitted best model on the held-out test split.
preds = logs.predict(test_x)
eval(test_y, preds)

# params = {
#     "penalty": ['l2'],
#     "solver": ["newton-cg", "sag", "lbfgs"],
#     "C": [0.01, 1, 100],
# }


# search = GridSearchCV(
#     estimator=LogisticRegression(max_iter=2000), param_grid=params, scoring="precision_macro", n_jobs=-1, cv=zip([np.arange(0, int(len(train_x)*0.67))], [np.arange(int(len(train_x)*0.67) + 1, len(train_x))]), verbose=4
# )
# search.fit(train_x, train_y)
# print(search.best_params_)
# preds = search.predict(test_x)

# eval(test_y, preds)

Fitting 1 folds for each of 1 candidates, totalling 1 fits


In [14]:
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Gradient-boosted baseline with default settings. The species names are
# mapped to integer ids with a LabelEncoder fitted on the training labels.
le = LabelEncoder().fit(train_y)
train_y_encoded = le.transform(train_y)
test_y_encoded = le.transform(test_y)

xgbc = XGBClassifier()
xgbc.fit(train_x, train_y_encoded)
preds = xgbc.predict(test_x)

# Predictions come back as integer ids, so compare against the encoded truth.
eval(test_y_encoded, preds)

Saving models to disk

In [None]:
import pickle

# Persist the fitted logistic-regression search (`logs`). The original dumped
# `search`, which is only defined in commented-out code and raises NameError
# on a fresh kernel.
# The `with` block closes the file handle even if pickling fails.
# Only unpickle this file from a trusted location: loading a pickle can run
# arbitrary code.
with open("../../data/models/baseline/logistic_updated.pkl", "wb") as model_file:
    pickle.dump(logs, model_file, protocol=3)

Generating submission.csv

In [49]:
import pandas as pd

# Class probabilities for the held-out clips from the fitted logistic
# regression search (`logs`; the original referenced an undefined `log`).
results = logs.predict_proba(test_x)

# Column labels must come from the classifier itself: predict_proba columns
# follow `classes_`, which need not match the `species` set built earlier.
class_labels = list(logs.best_estimator_.classes_)
assert results.shape[1] == len(class_labels)

# Row i of `results` belongs to test_x.iloc[i], so look the clip ids up
# through test_x's index. `data.iloc[i]` would pair predictions with
# unrelated clips because test_x is a shuffled subset of `data`.
# The id is truncated to its first 16 characters, as before.
row_ids = data.loc[test_x.index, "track_id"].str[:16]

res_df = pd.DataFrame(results, columns=class_labels)
res_df.insert(0, "row_id", row_ids.to_numpy())
res_df.to_csv("../submission.csv", index=False)