In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

## Neural Network Classifier

Using the [MLPClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier) from sklearn to train the `consolidaded_v3` dataset.

In [2]:
from birdclef.utils import get_spark

spark = get_spark(cores=16, memory="20g")
df = spark.read.parquet(
    "../data/processed/birdclef-2023/train_embeddings/consolidated_v3"
    # "../data/processed/birdclef-2023/train_embeddings/consolidated_v4"
)
df.printSchema()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/20 18:26:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/20 18:26:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


                                                                                

root
 |-- species: string (nullable = true)
 |-- track_stem: string (nullable = true)
 |-- track_type: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- embedding: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- prediction_vec: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- predictions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- rank: long (nullable = true)
 |    |    |-- index: long (nullable = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- mapped_label: string (nullable = true)
 |    |    |-- probability: double (nullable = true)
 |-- start_time: long (nullable = true)
 |-- energy: double (nullable = true)



### 1. Data processing

In [3]:
from pyspark.sql import Window, functions as F

# keep the track_type for the highest energy
highest_energy_channel = (
    df
    # get the track stem without the part
    .withColumn("original_track_stem", F.split(F.col("track_stem"), "_").getItem(0))
    .where("track_type != 'original'")
    # get the track type that has the most energy
    .withColumn(
        "rank",
        F.rank().over(
            Window.partitionBy("original_track_stem").orderBy(F.desc("energy"))
        ),
    )
    # keep the first row
    .where(F.col("rank") == 1)
    # drop the rank column
    .select("species", "track_stem", "track_type")
    .distinct()
)

# get the highest predictions by exploding the values
exploded_embeddings = (
    df
    # join against the highest energy channel
    .join(
        highest_energy_channel,
        on=["species", "track_stem", "track_type"],
        how="inner",
    )
    # explode the embeddings, these are ordered by confidence
    .withColumn("predictions", F.explode("predictions")).select(
        "species",
        "track_stem",
        "track_type",
        "start_time",
        "track_name",
        "embedding",
        "predictions.*",
    )
    # simplifying assumption: we assume the prediction with the highest confidence is the true label
    .where("rank = 0")
).cache()

exploded_embeddings.drop("embedding").show(n=5)



+-------+----------+----------+----------+--------------------+----+-----+--------------------+------------+--------------------+
|species|track_stem|track_type|start_time|          track_name|rank|index|               label|mapped_label|         probability|
+-------+----------+----------+----------+--------------------+----+-----+--------------------+------------+--------------------+
|abythr1|  XC233199|   source0|         0|abythr1/XC233199_...|   0|  639|Chloropsis hardwi...|     orblea1|0.002208352088928...|
|abythr1|  XC233199|   source0|        57|abythr1/XC233199_...|   0| 1151|Erpornis zanthole...|     whbyuh1|0.025502817705273628|
|abythr1|  XC233199|   source0|        27|abythr1/XC233199_...|   0| 3164|Turdus abyssinicu...|     abythr1|0.024902962148189545|
|abythr1|  XC233199|   source0|        30|abythr1/XC233199_...|   0|  639|Chloropsis hardwi...|     orblea1|0.012038093991577625|
|abythr1|  XC233199|   source0|        21|abythr1/XC233199_...|   0| 3185|Turdus leucomela

                                                                                

In [4]:
# quick count of the number of samples
counts = (
    exploded_embeddings.groupBy("species")
    .agg(F.count("*").alias("n"))
    .orderBy(F.desc("n"))
)
counts.show(n=5)
counts.orderBy("n").show(n=5)

                                                                                

+-------+-----+
|species|    n|
+-------+-----+
|thrnig1|12987|
| wlwwar| 9249|
|combuz1| 7173|
| hoopoe| 6731|
| barswa| 6191|
+-------+-----+
only showing top 5 rows





+-------+---+
|species|  n|
+-------+---+
|afpkin1|  3|
|whhsaw1|  4|
|whctur2|  4|
|golher1|  5|
|lotlap1|  8|
+-------+---+
only showing top 5 rows



                                                                                

In [5]:
rarity_min_count = 100
rare_species_count = (
    exploded_embeddings.groupBy("species")
    .agg(F.count("*").alias("n"))
    .where(f"n < {rarity_min_count}")
)
rare_species_count.show(n=5)

# if there are a lot of examples, we can use a higher threshold
common_species = exploded_embeddings.where("probability > 0.4").join(
    rare_species_count.select("species"), on="species", how="left_anti"
)
# these ones are less common so we use a lower threshold so we have at least one
# example for each species
rare_species = exploded_embeddings.where("probability > 0.05").join(
    rare_species_count.select("species"), on="species", how="inner"
)
prepared = common_species.union(rare_species).select(
    "species", "probability", "embedding"
)
prepared.show(n=5)
prepared.count()

                                                                                

+-------+---+
|species|  n|
+-------+---+
|purgre2| 60|
|bubwar2| 90|
|rehwea1| 69|
|kvbsun1| 80|
|equaka1| 63|
+-------+---+
only showing top 5 rows



                                                                                

+-------+------------------+--------------------+
|species|       probability|           embedding|
+-------+------------------+--------------------+
|afghor1|0.9965255856513977|[0.57833033800125...|
|afghor1| 0.511886715888977|[1.00166213512420...|
|afghor1|0.9984956979751587|[0.88829582929611...|
|afghor1|0.9988522529602051|[1.26016914844512...|
|afghor1|0.9997662901878357|[1.16302716732025...|
+-------+------------------+--------------------+
only showing top 5 rows



                                                                                

74490

In [6]:
# lets check that we have the right number of classes, and how many examples we are working with
prepared_counts = (
    prepared.groupBy("species").agg(F.count("*").alias("n")).orderBy(F.desc("n"))
)
print(f"number of species {prepared_counts.count()}")

prepared_counts.show(n=5)
prepared_counts.orderBy("n").show(n=5)

                                                                                

number of species 264


                                                                                

+-------+----+
|species|   n|
+-------+----+
|thrnig1|3833|
| hoopoe|3822|
|eubeat1|3116|
| wlwwar|2687|
| barswa|2603|
+-------+----+
only showing top 5 rows



                                                                                

+-------+---+
|species|  n|
+-------+---+
|afpkin1|  2|
|whctur2|  2|
|rehblu1|  2|
|whhsaw1|  3|
|easmog1|  4|
+-------+---+
only showing top 5 rows



In [7]:
# Data for model training
data = prepared.toPandas()
data.head()

                                                                                

Unnamed: 0,species,probability,embedding
0,afghor1,0.996526,"[0.5783303380012512, 1.845029354095459, 0.2178..."
1,afghor1,0.511887,"[1.0016621351242065, 1.2551445960998535, 0.242..."
2,afghor1,0.998496,"[0.8882958292961121, 1.4398638010025024, 0.195..."
3,afghor1,0.998852,"[1.2601691484451294, 2.366661787033081, 0.2103..."
4,afghor1,0.999766,"[1.1630271673202515, 1.7402706146240234, 0.020..."


### 2. MLPClassifier model training

In [8]:
import time
import numpy as np
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    make_scorer,
    classification_report,
)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    np.stack(data["embedding"]),
    data["species"],
    test_size=0.33,
    stratify=data["species"],
)

# Data shape
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(49908, 320) (24582, 320)
(49908,) (24582,)


In [9]:
# Select a small subset of data to train the model and test the class implementation
# After training on the data subset, use the original train/test split data to train the model
X_train_sub = X_train[:5000]
X_test_sub = X_test[:1000]
y_train_sub = y_train[:5000]
y_test_sub = y_test[:1000]

# Data shape
print(X_train_sub.shape, X_test_sub.shape)
print(y_train_sub.shape, y_test_sub.shape)

(5000, 320) (1000, 320)
(5000,) (1000,)


In [10]:
# Learner class
class Learner:
    def __init__(self, pipe, params):
        self.pipe = pipe
        self.params = params
        self.clf = None
        self.scores = None
        self.search_name = None
        self.class_report = None
        self.dataset_name = None
        self.learning_curve = {}
        self.validation_curve = {}
        self.cv = StratifiedKFold(n_splits=5, shuffle=True)
        self.name = str(self.pipe["model"].__class__.__name__)

    def fit_gridsearch(self, search_func, X_train, y_train, verbose=False):
        """
        Method to train the model using a search algorithm.

        search_func: GridSearchCV, RandomizedSearchCV from sklearn.
        X_train: training features dataset.
        y_train: training labels dataset.
        verbose: int() Controls the verbosity: the higher, the more messages (1, 2, or 3).
        """
        np.random.seed(42)

        # Train learner
        self.clf = search_func(
            self.pipe,
            self.params,
            scoring={
                "accuracy": make_scorer(accuracy_score),
                "precision": make_scorer(precision_score),
                "recall": make_scorer(recall_score),
                "f1": make_scorer(f1_score),
            },
            refit="f1",
            cv=self.cv,
            verbose=verbose,
            n_jobs=-1,
        )
        # Fit the model
        self.clf.fit(X_train, y_train)
        self.search_name = str(self.clf.__class__.__name__)

    def get_scores(self, X_train, X_test, y_train, y_test, average=None):
        """
        Method to get model scores.

        X_train: training features dataset.
        X_test: test features dataset.
        y_train: training labels dataset.
        y_test: test labels dataset.
        """
        if self.search_name == "Benchmark":
            best_estimator = self.clf
        else:
            best_estimator = self.clf.best_estimator_

        np.random.seed(42)
        # Score on training data
        start_time = time.time()
        best_estimator.fit(X_train, y_train)
        end_time = time.time()
        wall_clock_fit = end_time - start_time
        # train_score = self.clf.score(X_train, y_train)
        train_score = best_estimator.score(X_train, y_train)

        # Score on test data
        start_time = time.time()
        # y_pred = self.clf.predict(X_test)
        y_pred = best_estimator.predict(X_test)
        end_time = time.time()
        wall_clock_pred = end_time - start_time
        # test_score = self.clf.score(X_test, y_test)
        test_score = best_estimator.score(X_test, y_test)
        # Metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average=average)
        recall = recall_score(y_test, y_pred, average=average)
        f1 = f1_score(y_test, y_pred, average=average)
        # Classification report
        self.class_report = classification_report(y_test, y_pred)

        self.scores = {
            "train_score": round(train_score, 3),
            "test_score": round(test_score, 3),
            "accuracy": round(accuracy, 3),
            "precision": round(precision, 3),
            "recall": round(recall, 3),
            "f1": round(f1, 3),
            "wall_clock_fit": wall_clock_fit,
            "wall_clock_pred": wall_clock_pred,
        }

    # Evaluate Learner class
    def evaluate_learner(self):
        """
        Print model scores
        """
        print(f"{'#################################'*2}")
        print(f"{self.search_name}:\t  {self.name}")
        print(f"Train score:     {round(self.scores['train_score'], 3)}")
        print(f"Test score:      {round(self.scores['test_score'], 3)}")
        print(f"Accuracy score:  {round(self.scores['accuracy'], 3)}")
        print(f"Precision score: {round(self.scores['precision'], 3)}")
        print(f"Recall score:    {round(self.scores['recall'], 3)}")
        print(f"F1 score:        {round(self.scores['f1'], 3)}")
        print(f"Wall Clock Fit:  {round(self.scores['wall_clock_fit'], 3)}")
        print(f"Wall Clock Pred: {round(self.scores['wall_clock_pred'], 3)}")
        # Classification report
        print(f"\nClassification report:\n{self.class_report}")

        # Best score and best params
        print(f"Best score: {round(self.clf.best_score_, 3)}")
        print("Best params:")
        for param in self.clf.best_params_.items():
            print(f"\t{param}")
        print()

In [11]:
# MLP pipeline
mlp_pipe = Pipeline(
    steps=[("scaler", StandardScaler()), ("model", MLPClassifier(random_state=42))]
)
# GridSearchCV params
mlp_param_grid = {
    "model__activation": ["relu", "tanh", "logistic"],
    "model__max_iter": [1000],
    "model__hidden_layer_sizes": [
        (
            100,
            100,
        ),
        (
            200,
            200,
        ),
        (
            200,
            200,
            200,
        ),
        (
            300,
            300,
        ),
        (
            300,
            300,
            300,
        ),
    ],
    "model__learning_rate": ["constant", "adaptive"],
    "model__learning_rate_init": [0.001, 0.01, 0.1],
}


# Init learners
mlp = Learner(pipe=mlp_pipe, params=mlp_param_grid)

In [12]:
# Fit model
mlp.fit_gridsearch(RandomizedSearchCV, X_train, y_train, verbose=2)
# Get model scores
mlp.get_scores(X_train, X_test, y_train, y_test, average="macro")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END model__activation=tanh, model__hidden_layer_sizes=(200, 200), model__learning_rate=adaptive, model__learning_rate_init=0.01, model__max_iter=1000; total time= 1.3min
[CV] END model__activation=tanh, model__hidden_layer_sizes=(200, 200), model__learning_rate=adaptive, model__learning_rate_init=0.01, model__max_iter=1000; total time= 1.4min
[CV] END model__activation=tanh, model__hidden_layer_sizes=(200, 200), model__learning_rate=adaptive, model__learning_rate_init=0.01, model__max_iter=1000; total time= 1.6min
[CV] END model__activation=tanh, model__hidden_layer_sizes=(200, 200), model__learning_rate=adaptive, model__learning_rate_init=0.01, model__max_iter=1000; total time= 1.8min
[CV] END model__activation=relu, model__hidden_layer_sizes=(300, 300), model__learning_rate=adaptive, model__learning_rate_init=0.01, model__max_iter=1000; total time= 1.5min
[CV] END model__activation=tanh, model__hidden_layer_sizes=(200,

In [13]:
# Print scores
mlp.evaluate_learner()

##################################################################
RandomizedSearchCV:	  MLPClassifier
Train score:     0.948
Test score:      0.852
Accuracy score:  0.852
Precision score: 0.647
Recall score:    0.61
F1 score:        0.615
Wall Clock Fit:  100.014
Wall Clock Pred: 0.26

Classification report:
              precision    recall  f1-score   support

     abethr1       0.33      0.25      0.29         8
     abhori1       0.86      0.93      0.90       268
     abythr1       0.60      0.60      0.60        30
     afbfly1       0.00      0.00      0.00        11
     afdfly1       1.00      0.80      0.89        46
     afecuc1       0.93      0.92      0.92       190
     affeag1       0.92      0.83      0.87        41
     afgfly1       1.00      0.57      0.73        14
     afghor1       0.86      0.86      0.86       136
     afmdov1       0.82      0.70      0.76        84
     afpfly1       0.70      0.79      0.74       125
     afpkin1       0.00      0.00      0

In [14]:
# MLPClassifier best estimator
mlp.clf.best_estimator_

In [15]:
def model_eval(truth, preds):
    print("Accuracy:", round(accuracy_score(truth, preds), 3))
    print("Precision:", round(precision_score(truth, preds, average="macro"), 3))
    print("Recall:", round(recall_score(truth, preds, average="macro"), 3))
    print("F1 Score:", round(f1_score(truth, preds, average="macro"), 3))

In [16]:
# Model evaluation
cv_model = mlp.clf.best_estimator_
model_eval(y_test, cv_model.predict(X_test))

Accuracy: 0.852
Precision: 0.647
Recall: 0.61
F1 Score: 0.615


In [17]:
import pickle
from pathlib import Path

# Write to pickle file
pickle.dump(
    cv_model,
    Path("../data/models/baseline/mlp-v1.pkl").open("wb"),
)

### 3. XGBoost model

In [None]:
from sklearn.preprocessing import LabelEncoder

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    np.stack(data["embedding"]),
    data["species"],
    test_size=0.33,
    stratify=data["species"],
)


# Create a label encoder object
le = LabelEncoder()

# Fit and transform the target with label encoder
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

In [None]:
from xgboost import XGBClassifier
from sklearn.base import BaseEstimator, TransformerMixin


# XGBoost pipeline
xgb_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", XGBClassifier(seed=42)),
    ]
)
# GridSearchCV params
xgb_param_grid = {
    "model__objective": ["multi:softmax"],
    "model__n_estimators": [400],  # np.arange(50, 200, 50),
    "model__max_depth": [
        3,
        4,
        5,
    ],  # np.arange(3, 10),
    # "model__min_child_weight": np.arange(1, 6),
    # "model__gamma": np.linspace(0, 0.6, 5),
    # "model__subsample": np.linspace(0.5, 1.0, 6),
    # "model__colsample_bytree": np.linspace(0.5, 1.0, 6),
}


# Init learners
xgb = Learner(pipe=xgb_pipe, params=xgb_param_grid)

In [None]:
# Fit model
xgb.fit_gridsearch(RandomizedSearchCV, X_train, y_train_enc, verbose=2)
# Get model scores
# xgb.get_scores(X_train, X_test, y_train_enc, y_test_enc, average="macro")

In [None]:
start_time = time.time()
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
xgb.fit(X=X_train, y=y_train_enc)
print("Fit time : ", time.time() - start_time)

In [None]:
y_preds = xgb.predict(X_test)
y_preds = le.inverse_transform(y_preds)
print("Accuracy:", round(accuracy_score(y_test, y_preds), 4))

In [None]:
import matplotlib.pyplot as plt


def plot_bird_classes(y_train):
    class_counts = y_train.value_counts()
    fig, ax = plt.subplots(figsize=(6.4, 4.8), dpi=200)
    ax.margins(x=0.01, y=0.1)  # No margins on x and y-axis
    x = np.arange(len(class_counts))
    width = 0.7
    ax.bar(class_counts.index, class_counts.values, width=width, color="tab:blue")
    ax.set_title(
        f"Total count of bird species for each class", weight="bold", fontsize=16
    )
    ax.set_xlabel("Bird Species")
    ax.set_ylabel("Total count")
    ax.set_xticks([])
    ax.grid(color="blue", linestyle="--", linewidth=1, alpha=0.2)
    for spine in ["top", "right", "bottom", "left"]:
        ax.spines[spine].set_visible(False)
    fig.tight_layout()
    plt.show()