# generating dataset of triples

The purpose of this notebook is to generate triplets to create a dataset for
training. It should contain a fairly large number of examples. Let's create three
datasets with 100k, 1M, and 10M rows. We'll train the unsupervised model against
all three of these.

We're going to be clever about how we sample from the motif dataset. We use
the sampling method described in Tile2Vec. Neighbors are drawn both from within
the same audio clip and from other clips in the same group (the taxonomic
family by default; species is also possible). The distant example is drawn from
the rest of the dataset, i.e. clips outside the anchor's group: half the time at
a motif location, and half the time at an entirely random location in the clip.

In [52]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

In [108]:
import pandas as pd
import matplotlib.pyplot as plt

# eBird taxonomy table; it supplies the species-code -> family mapping.
taxa = pd.read_csv("../data/raw/birdclef-2022/eBird_Taxonomy_v2021.csv")

# Consolidated motif table. In the rows shown below, source_name looks like
# "train_audio/<species>/<file>.ogg", so path element 1 is the species code.
df = pd.read_parquet(
    "../data/intermediate/2022-02-26-motif-consolidated.parquet"
).assign(
    species=lambda motifs: motifs.source_name.apply(
        lambda path: path.split("/")[1]
    ).astype(str)
)
df.head()

Unnamed: 0,source_name,cens_sample_rate,matrix_profile_window,motif_0,motif_1,sample_rate,duration_cens,duration_samples,duration_seconds,species
0,train_audio/afrsil1/XC125458.ogg,10,50,61.0,26.0,22050,113,244800,11.1,afrsil1
1,train_audio/afrsil1/XC175522.ogg,10,50,229.0,305.0,22050,477,1036800,47.02,afrsil1
2,train_audio/afrsil1/XC177993.ogg,10,50,492.0,353.0,22050,545,1185408,53.76,afrsil1
3,train_audio/afrsil1/XC205893.ogg,10,50,397.0,726.0,22050,1069,2325888,105.48,afrsil1
4,train_audio/afrsil1/XC207431.ogg,10,50,,,22050,35,74880,3.4,afrsil1


In [111]:
# Attach the taxonomic family to every recording by joining on the species code
# (left join, so recordings without a taxonomy match are kept with a missing
# family), then keep only the identifying, grouping, motif and duration columns.
df_fam = pd.merge(
    df,
    taxa.loc[:, ["SPECIES_CODE", "FAMILY"]].rename(
        columns={"SPECIES_CODE": "species", "FAMILY": "family"}
    ),
    how="left",
    on="species",
).loc[
    :,
    ["source_name", "species", "family", "motif_0", "motif_1", "duration_seconds"],
]
df_fam

Unnamed: 0,source_name,species,family,motif_0,motif_1,duration_seconds
0,train_audio/afrsil1/XC125458.ogg,afrsil1,Estrildidae (Waxbills and Allies),61.0,26.0,11.10
1,train_audio/afrsil1/XC175522.ogg,afrsil1,Estrildidae (Waxbills and Allies),229.0,305.0,47.02
2,train_audio/afrsil1/XC177993.ogg,afrsil1,Estrildidae (Waxbills and Allies),492.0,353.0,53.76
3,train_audio/afrsil1/XC205893.ogg,afrsil1,Estrildidae (Waxbills and Allies),397.0,726.0,105.48
4,train_audio/afrsil1/XC207431.ogg,afrsil1,Estrildidae (Waxbills and Allies),,,3.40
...,...,...,...,...,...,...
14847,train_audio/zebdov/XC629769.ogg,zebdov,Columbidae (Pigeons and Doves),362.0,326.0,61.06
14848,train_audio/zebdov/XC642415.ogg,zebdov,Columbidae (Pigeons and Doves),0.0,25.0,14.86
14849,train_audio/zebdov/XC665873.ogg,zebdov,Columbidae (Pigeons and Doves),397.0,365.0,45.58
14850,train_audio/zebdov/XC666194.ogg,zebdov,Columbidae (Pigeons and Doves),169.0,133.0,21.55


In [112]:
# Number of distinct values in the family column (a missing family, if any,
# counts as one value).
pd.unique(df_fam["family"]).shape

(41,)

In [123]:
import numpy as np

# Scratch check of randint: the upper bound is exclusive, so with low=0 and
# high=1 every entry is 0 (as the output below shows).
np.random.randint(low=0, high=1, size=(3, 2))

array([[0, 0],
       [0, 0],
       [0, 0]])

In [192]:
# Scratch check: a single uniform draw from [0, 1).
np.random.random_sample()

0.33228951014780406

In [195]:
import numpy as np

# Target number of triplets to generate (one million).
n_samples = 1_000_000

# We generate two sets of triplets and union them at the end. In one set the
# distant example is a completely random location in a clip; this makes up half
# of our distant data. In the other half the distant example is a motif from a
# randomly chosen clip. We always use stratified sampling in order to represent
# all groups equally.

# there's no reason that our embedding should try to cluster between classes
# based on the frequency of samples. We should instead try to embed based on the
# actual content of the samples that we hear. This is why this function will
# perform stratified sampling over the family that the audio comes from.
def generate_samples(df, n_samples, grouping_col="family", window_sec=5):
    res = pd.DataFrame()
    groups = df[grouping_col].unique()
    for group in groups:

        def sample_group(k, include=True):
            return (
                df[df[grouping_col] == group]
                if include
                else df[df[grouping_col] != group]
            ).sample(k, replace=True)

        k = n_samples // len(groups) // 2

        # inter_clip
        x, y, z = [sample_group(k, True).fillna(-1).reset_index() for _ in range(3)]

        tmp_ab = pd.concat(
            [
                pd.DataFrame(
                    {
                        "a": x.source_name,
                        "a_loc": x.motif_0,
                        "b": x.source_name,
                        "b_loc": x.motif_1,
                    }
                ),
                pd.DataFrame(
                    {
                        "a": y.source_name,
                        "a_loc": y.motif_0,
                        "b": z.source_name,
                        "b_loc": z.motif_1,
                    }
                ),
            ]
        )
        # now we randomly sample against clips outside the family which are
        # (motifs, random)

        x, y = [sample_group(k, False).fillna(-1).reset_index() for _ in range(2)]
        tmp_c = pd.concat(
            [
                pd.DataFrame({"c": x.source_name, "c_loc": x.motif_0}),
                pd.DataFrame(
                    {
                        "c": y.source_name,
                        "c_loc": y.duration_seconds.apply(
                            lambda s: -1
                            if s <= window_sec
                            else np.random.rand() * (s - window_sec) + (window_sec) / 2
                        ),
                    }
                ),
            ]
        )

        tmp = pd.concat([tmp_ab, tmp_c], axis=1)

        if res.empty:
            res = tmp
        else:
            res = pd.concat([res, tmp])
    return res


sampled_df = generate_samples(df_fam, n_samples)
# Only the last expression of a cell is rendered, so a bare
# `sampled_df.shape, sampled_df.dtypes` line is silently discarded; show it
# explicitly (`display` is provided by the IPython/Jupyter session).
display(sampled_df.shape, sampled_df.dtypes)
sampled_df

Unnamed: 0,a,a_loc,b,b_loc,c,c_loc
0,train_audio/redava/XC646730.ogg,46.0,train_audio/redava/XC646730.ogg,72.0,train_audio/rinduc/XC452614.ogg,7.000000
1,train_audio/comwax/XC464935.ogg,24.0,train_audio/comwax/XC464935.ogg,50.0,train_audio/whttro/XC303134.ogg,0.000000
2,train_audio/comwax/XC349832.ogg,52.0,train_audio/comwax/XC349832.ogg,78.0,train_audio/houfin/XC573019.ogg,-1.000000
3,train_audio/redava/XC110429.ogg,132.0,train_audio/redava/XC110429.ogg,98.0,train_audio/gnwtea/XC591881.ogg,39.000000
4,train_audio/comwax/XC400871.ogg,72.0,train_audio/comwax/XC400871.ogg,46.0,train_audio/sora/XC177334.ogg,-1.000000
...,...,...,...,...,...,...
12190,train_audio/whfibi/XC455508.ogg,32.0,train_audio/whfibi/XC109188.ogg,207.0,train_audio/gryfra/XC635699.ogg,7.018386
12191,train_audio/whfibi/XC510883.ogg,65.0,train_audio/whfibi/XC368518.ogg,0.0,train_audio/wesmea/XC213061.ogg,13.593726
12192,train_audio/whfibi/XC109189.ogg,142.0,train_audio/whfibi/XC147508.ogg,25.0,train_audio/brnowl/XC582638.ogg,4.154472
12193,train_audio/whfibi/XC572903.ogg,-1.0,train_audio/whfibi/XC382228.ogg,-1.0,train_audio/comgal1/XC492414.ogg,6.104915
