In [30]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm.autonotebook import tqdm
from joblib import Parallel, delayed
import pandas as pd
from datetime import datetime
from pathlib2 import Path

In [32]:
from avgn.utils.paths import DATA_DIR, most_recent_subdirectory, ensure_dir

In [33]:
DATASET_ID = 'fruitbat'

In [34]:
# create a unique datetime identifier for the files output by this notebook
DT_ID = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
DT_ID

'2023-03-26_17-25-15'

In [35]:
from avgn.utils.hparams import HParams
from avgn.dataset import DataSet

In [36]:
from avgn.signalprocessing.create_spectrogram_dataset import prepare_wav, create_label_df, get_row_audio

In [37]:
### prepare_wav
### Denoising Must be > Non-Stationary 
### https://github.com/timsainb/noisereduce

## Create dataset (Reflecting full vocalizations, not segmented)

In [50]:
hparams = HParams(
    num_mel_bins = 32,
    mel_lower_edge_hertz=1000,
    mel_upper_edge_hertz=80000, # Look at Data Exploration
    butter_lowcut = 500,
    butter_highcut = 120000,
    ref_level_db = 20,
    min_level_db = -60, # Look at Data Exploration
    mask_spec = True,
    win_length_ms = 0.5, 
    hop_length_ms = 0.05,
    mask_spec_kwargs = {"spec_thresh": 0.9, "offset": 1e-10},
    n_jobs = -1,
    verbosity=1,
    nex = -1
)

In [51]:
# create a dataset object
dataset = DataSet(DATASET_ID, hparams = hparams)

loading json:   0%|          | 0/87985 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 3232 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 46176 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 87401 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done 87985 out of 87985 | elapsed:    7.7s finished


getting unique individuals:   0%|          | 0/87985 [00:00<?, ?it/s]

In [41]:
import tensorflow as tf
tf.config.list_physical_devices('GPU')


[]

In [42]:
dataset.sample_json

OrderedDict([('fid', 23514),
             ('Treatment ID', 1),
             ('Recording channel', 11),
             ('Recording time', '2012-07-09 06:36:11'),
             ('indvs',
              OrderedDict([('0',
                            OrderedDict([('syllables',
                                          OrderedDict([('start_times',
                                                        [4e-06]),
                                                       ('end_times',
                                                        [2.98528]),
                                                       ('addressee', ['0']),
                                                       ('context',
                                                        ['Isolation']),
                                                       ('emit_prevoc_act',
                                                        [0]),
                                                       ('add_prevoc_act', [0]),
                       

In [15]:
dataset.__dict__;

In [52]:
!cat /data0/home/h21/luas6629/Thesis/data/processed/fruitbat/2023-03-05_00-30-35/JSON/130116080549387242.JSON

{
  "fid": 166697,
  "Treatment ID": 17,
  "Recording channel": 1,
  "Recording time": "2013-01-16 08:06:24",
  "indvs": {
    "-210": {
      "syllables": {
        "start_times": [4e-06],
        "end_times": [1.543488],
        "addressee": ["-207"],
        "context": ["Threat-like"],
        "emit_prevoc_act": [2],
        "add_prevoc_act": [2],
        "emit_postvoc_act": [3],
        "add_postvoc_act": [3]
      }
    }
  },
  "species": "Rousettus aegyptiacus",
  "common_name": "Egyptian fruit bat",
  "samplerate_hz": 250000,
  "original_wav": "data/raw/fruitbat/zip_contents/files212/130116080549387242.WAV",
  "length_s": 1.543488,
  "wav_loc": "data/raw/fruitbat/zip_contents/files212/130116080549387242.WAV"
}
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT

### Create dataset of whole vocalization, based on json

In [14]:
from joblib import Parallel, delayed
n_jobs = 4; verbosity = 1

In [15]:
with Parallel(n_jobs=n_jobs, verbose=verbosity) as parallel:
    syllable_dfs = parallel(
        delayed(create_label_df)(
            dataset.data_files[key].data,
            hparams=dataset.hparams,
            labels_to_retain=['context'],
            unit="syllables",
            dict_features_to_retain = [],
            key = key,
        )
        for key in tqdm(dataset.data_files.keys())
    )
syllable_df = pd.concat(syllable_dfs)
len(syllable_df)

PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis
PROJECT_DIR set on:  /data0/home/h21/luas6629/

  0%|          | 0/87985 [00:00<?, ?it/s]

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  68 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 15884 tasks      | elapsed:    7.5s
[Parallel(n_jobs=4)]: Done 47884 tasks      | elapsed:   19.3s
[Parallel(n_jobs=4)]: Done 87985 out of 87985 | elapsed:   35.0s finished


91079

In [18]:
len(syllable_df)

91079

In [24]:
key = list(dataset.data_files.keys())[0]

In [27]:
syllable_df.set_index('key').loc[key]

start_time     0.000004
end_time        2.98528
context       Isolation
indv                  0
indvi                 0
Name: 120709063505240110, dtype: object

In [49]:
get_row_audio(syllable_df[syllable_df.key == key], 
        dataset.data_files[key].data['wav_loc'], 
        dataset.hparams
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syllable_df["audio"] = [
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syllable_df["rate"] = rate


Unnamed: 0,start_time,end_time,context,indv,indvi,key,audio,rate
0,4e-06,2.98528,Isolation,0,0,120709063505240110,"[0.00023678757, 0.00012613685, -0.0001682282, ...",250000


In [29]:
syllable_df.head()

Unnamed: 0,start_time,end_time,context,indv,indvi,key
0,4e-06,2.98528,Isolation,0,0,120709063505240110
0,4e-06,1.34688,Sleeping,230,0,130511141449898364
0,4e-06,2.362688,General,-201,0,121004052532634115
0,4e-06,1.34688,Sleeping,215,0,130108045535754954
0,4e-06,2.428224,General,111,0,121111222747145516


In [28]:
file_info

NameError: name 'file_info' is not defined

### Get audio for dataset

In [None]:
with Parallel(n_jobs=n_jobs, verbose=verbosity) as parallel:
    syllable_df = pd.concat( parallel(
        delayed(get_row_audio)(
            syllable_df[syllable_df.key == key], 
            dataset.data_files[key].data['wav_loc'], 
            dataset.hparams
        )
        for key in tqdm(syllable_df.key.unique())
    ))
len(syllable_df)

  0%|          | 0/87985 [00:00<?, ?it/s]

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed:   10.1s
[Parallel(n_jobs=-1)]: Done 386 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done 736 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done 1186 tasks      | elapsed:   56.4s
[Parallel(n_jobs=-1)]: Done 1736 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 2386 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 3136 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3986 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 4936 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 5986 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 7136 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 8386 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 9736 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 11186 tasks      | elapsed:  8.8min
[Parallel(n_jobs=-1)]: Done 12736 tasks    