In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from joblib import Parallel, delayed
from tqdm.autonotebook import tqdm
import pandas as pd
import librosa
from datetime import datetime
import json
from IPython.display import display
from pathlib2 import Path

  from tqdm.autonotebook import tqdm


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
from avgn.utils.paths import DATA_DIR, ensure_dir

In [5]:
DATA_DIR = Path('./data')
DSLOC = DATA_DIR / 'raw' / 'fruitbat'

In [29]:
DSLOC

PosixPath('data/raw/fruitbat')

## Prepare DataFrames

In [6]:
# Annotations
annotations =  pd.read_csv(DSLOC/'Annotations.csv')

annotations = annotations.astype({
    'FileID' : int,
    'Emitter' : int,
    'Addressee' : int,
    'Context' :int,
    'Emitter pre-vocalization action' : int,
    'Addressee pre-vocalization action': int,
    'Emitter post-vocalization action' : int,
    'Addressee post-vocalization action' : int,
    'Start sample': float,
    'End sample' : float
})

len(annotations)

91080

In [7]:
# FileInfo

""" fruit bat db is malformed. 
Recreate the db with proper columns.
"""

with open(DSLOC/'FileInfo.csv', 'r') as temp_f:
    # get No of columns in each line
    col_count = [ len(l.split(",")) for l in temp_f.readlines() ]

### Generate column names  (names will be 0, 1, 2, ..., maximum columns - 1)
column_names = [i for i in range(0, max(col_count))]

### Read csv
file_info = pd.read_csv(DSLOC/'FileInfo.csv', header=None, delimiter=",", names=column_names)
file_info, file_info.columns = file_info[1:] , file_info.iloc[0]

# recreate the header
samples_boundaries = np.arange(1, 1 + file_info.shape[1] - 6)
voice_starts = samples_boundaries[::2]
voice_ends = samples_boundaries[1::2]
col_samples_boundaries = { val : float for pair in zip(['voice_start_sample_{}'.format(i + 1)for i in range(len(voice_starts))  ], 
        ['voice_end_sample_{}'.format(i + 1) for i in range(len(voice_ends))  ]) for val in pair }

col_names = ['FileID','Treatment ID','File name','File folder','Recording channel','Recording time'] + \
    [ val for pair in zip(['voice_start_sample_{}'.format(i + 1)for i in range(len(voice_starts))  ], 
        ['voice_end_sample_{}'.format(i + 1) for i in range(len(voice_ends))  ]) for val in pair]

columns_types = {
    'FileID' : int,
    'Treatment ID' : int,
    'File name' : str,
    'File folder' : str,
    'Recording channel' : int,
    'Recording time' : 'datetime64[ms]',
} | col_samples_boundaries
 

# set the new header
file_info.columns = columns_types.keys()


#ensure datatypes are meaningful
file_info = file_info.astype(columns_types)

# set index
file_info.set_index('FileID', inplace=True)

# shape
file_info.shape

  file_info = pd.read_csv(DSLOC/'FileInfo.csv', header=None, delimiter=",", names=column_names)


(293238, 331)

In [8]:
# Wav files
wavfiles = list(DSLOC.glob('zip_contents/*/*.WAV'))
len(wavfiles), wavfiles[0]

(293238,
 PosixPath('data/raw/fruitbat/zip_contents/files222/130527125845741428.WAV'))

In [9]:
wf_df = pd.DataFrame(
    [[wf, wf.name, wf.parent.stem] for wf in tqdm(wavfiles)],
    columns=["wav_loc", "id", "folder"],
)
wf_df = wf_df.set_index('id')
len(wf_df)

  0%|          | 0/293238 [00:00<?, ?it/s]

293238

In [10]:
wf_df.head()

Unnamed: 0_level_0,wav_loc,folder
id,Unnamed: 1_level_1,Unnamed: 2_level_1
130527125845741428.WAV,data/raw/fruitbat/zip_contents/files222/130527...,files222
130530225847479598.WAV,data/raw/fruitbat/zip_contents/files222/130530...,files222
130530004918782202.WAV,data/raw/fruitbat/zip_contents/files222/130530...,files222
130522231228763645.WAV,data/raw/fruitbat/zip_contents/files222/130522...,files222
130531182616717977.WAV,data/raw/fruitbat/zip_contents/files222/130531...,files222


## Prepare Json

In [11]:
context_dict = {
    0:'Unknown',
    1:'Separation',
    2:'Biting',
    3:'Feeding',
    4:'Fighting',
    5:'Grooming',
    6:'Isolation',
    7:'Kissing',
    8:'Landing',
    9:'Mating protest',
    10:'Threat-like',
    11:'General',
    12:'Sleeping',
}

In [12]:
from avgn.utils.audio import get_samplerate
from avgn.utils.json import NoIndent, NoIndentEncoder
from datetime import datetime
import librosa
import json

In [13]:
DATASET_ID = "fruitbat"

In [14]:
# create a unique datetime identifier for the files output by this notebook
DT_ID = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
DT_ID

'2023-03-05_00-30-35'

In [15]:
annotations.head()

Unnamed: 0,FileID,Emitter,Addressee,Context,Emitter pre-vocalization action,Addressee pre-vocalization action,Emitter post-vocalization action,Addressee post-vocalization action,Start sample,End sample
0,7,118,0,9,2,2,3,3,1.0,336720.0
1,11,0,0,11,0,0,0,0,1.0,787280.0
2,12,118,0,12,2,2,3,3,1.0,566096.0
3,15,0,0,12,0,0,0,0,1.0,402256.0
4,20,0,0,12,0,0,0,0,1.0,394064.0


In [16]:
wf_df.head()

Unnamed: 0_level_0,wav_loc,folder
id,Unnamed: 1_level_1,Unnamed: 2_level_1
130527125845741428.WAV,data/raw/fruitbat/zip_contents/files222/130527...,files222
130530225847479598.WAV,data/raw/fruitbat/zip_contents/files222/130530...,files222
130530004918782202.WAV,data/raw/fruitbat/zip_contents/files222/130530...,files222
130522231228763645.WAV,data/raw/fruitbat/zip_contents/files222/130522...,files222
130531182616717977.WAV,data/raw/fruitbat/zip_contents/files222/130531...,files222


In [17]:
file_info.head()

Unnamed: 0_level_0,Treatment ID,File name,File folder,Recording channel,Recording time,voice_start_sample_1,voice_end_sample_1,voice_start_sample_2,voice_end_sample_2,voice_start_sample_3,...,voice_start_sample_159,voice_end_sample_159,voice_start_sample_160,voice_end_sample_160,voice_start_sample_161,voice_end_sample_161,voice_start_sample_162,voice_end_sample_162,voice_start_sample_163,voice_end_sample_163
FileID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,120601000005102988.WAV,files101,11,2012-06-01 00:01:57,52100.0,120902.0,130577.0,158906.0,,...,,,,,,,,,,
2,1,120601000005108269.WAV,files101,4,2012-06-01 00:01:57,28895.0,89442.0,102612.0,129387.0,,...,,,,,,,,,,
3,1,120601000011795989.WAV,files101,11,2012-06-01 00:02:04,54143.0,100989.0,101607.0,121337.0,,...,,,,,,,,,,
4,2,120601000944859242.WAV,files101,12,2012-06-01 00:11:43,50134.0,72656.0,,,,...,,,,,,,,,,
5,1,120601002057003279.WAV,files101,4,2012-06-01 00:23:05,35596.0,57118.0,,,,...,,,,,,,,,,


In [18]:
from avgn.utils.json import NoIndent, NoIndentEncoder

In [19]:
file_info[file_info['voice_start_sample_5'].isna() == False]

Unnamed: 0_level_0,Treatment ID,File name,File folder,Recording channel,Recording time,voice_start_sample_1,voice_end_sample_1,voice_start_sample_2,voice_end_sample_2,voice_start_sample_3,...,voice_start_sample_159,voice_end_sample_159,voice_start_sample_160,voice_end_sample_160,voice_start_sample_161,voice_end_sample_161,voice_start_sample_162,voice_end_sample_162,voice_start_sample_163,voice_end_sample_163
FileID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1,120601002131259280.WAV,files101,4,2012-06-01 00:23:39,6108.0,19903.0,38157.0,69403.0,84157.0,...,,,,,,,,,,
8,1,120601002244324281.WAV,files101,4,2012-06-01 00:24:52,53299.0,72118.0,72370.0,89138.0,90062.0,...,,,,,,,,,,
11,1,120601002344862013.WAV,files101,11,2012-06-01 00:25:52,122946.0,198513.0,216776.0,250513.0,332767.0,...,,,,,,,,,,
12,1,120601003734539035.WAV,files101,11,2012-06-01 00:39:46,46916.0,85062.0,93914.0,103208.0,105404.0,...,,,,,,,,,,
13,1,120601003734781292.WAV,files101,4,2012-06-01 00:39:47,9019.0,33462.0,52121.0,71041.0,130720.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293224,20,140219102458100904.WAV,files224,2,2014-02-19 10:23:48,47227.0,95085.0,115060.0,147973.0,173227.0,...,,,,,,,,,,
293235,20,140219120345355439.WAV,files224,1,2014-02-19 12:02:35,51247.0,76790.0,230662.0,257995.0,275909.0,...,,,,,,,,,,
293236,20,140219120345357909.WAV,files224,2,2014-02-19 12:02:35,51498.0,80091.0,231596.0,258636.0,276996.0,...,,,,,,,,,,
293237,20,140219120422500440.WAV,files224,1,2014-02-19 12:03:12,55820.0,78202.0,94011.0,118152.0,128851.0,...,,,,,,,,,,


In [20]:
# check
missing_wav = 0
missing_folder = set()
# for each unique wav
for FID in tqdm(annotations.FileID.unique()):
    file_row = file_info.loc[FID]
    try:
        wav_row = wf_df.loc[file_row["File name"]]
    except KeyError as e:
        missing_wav += 1
        missing_folder.add(file_row["File folder"])
        continue

  0%|          | 0/87986 [00:00<?, ?it/s]

In [59]:
missing_wav, missing_folder

(0, set())

In [94]:
annotations.head()

Unnamed: 0,FileID,Emitter,Addressee,Context,Emitter pre-vocalization action,Addressee pre-vocalization action,Emitter post-vocalization action,Addressee post-vocalization action,Start sample,End sample
0,7,118,0,9,2,2,3,3,1.0,336720.0
1,11,0,0,11,0,0,0,0,1.0,787280.0
2,12,118,0,12,2,2,3,3,1.0,566096.0
3,15,0,0,12,0,0,0,0,1.0,402256.0
4,20,0,0,12,0,0,0,0,1.0,394064.0


In [101]:
annotations[annotations.Emitter == 215]

Unnamed: 0,FileID,Emitter,Addressee,Context,Emitter pre-vocalization action,Addressee pre-vocalization action,Emitter post-vocalization action,Addressee post-vocalization action,Start sample,End sample
14742,61822,215,217,6,0,0,0,0,1.0,320336.0
14782,61879,215,217,6,0,0,0,0,1.0,328528.0
14784,61882,215,217,6,0,0,0,0,1.0,320336.0
14790,61894,215,217,6,0,0,0,0,1.0,557904.0
14794,61900,215,217,6,0,0,0,0,1.0,320336.0
...,...,...,...,...,...,...,...,...,...,...
91062,293146,215,221,9,2,2,3,3,1.0,4514640.0
91063,293155,215,221,9,2,2,3,3,1.0,2065232.0
91066,293179,215,221,9,2,2,3,3,1.0,426832.0
91067,293182,215,221,9,2,2,3,3,1.0,2409296.0


In [113]:
# a long vocalizations from Bat 215
file_row = file_info.loc[293146]
wav_row = wf_df.loc[file_row["File name"]]

In [115]:
# get the file
file_annotations = annotations[annotations.FileID == 293146].sort_values(
    by="Start sample"
)

In [116]:
file_annotations

Unnamed: 0,FileID,Emitter,Addressee,Context,Emitter pre-vocalization action,Addressee pre-vocalization action,Emitter post-vocalization action,Addressee post-vocalization action,Start sample,End sample
91062,293146,215,221,9,2,2,3,3,1.0,4514640.0


In [117]:
rate = get_samplerate(wav_row.wav_loc.as_posix())

In [118]:
get_samplerate(wav_row.wav_loc.as_posix())

250000

In [119]:
bout_duration = librosa.get_duration(path=wav_row.wav_loc.as_posix())

In [120]:
bout_duration

18.05856

In [121]:
file_annotations.Emitter.unique()

array([215])

In [122]:
file_annotations[file_annotations.Emitter == 215]

Unnamed: 0,FileID,Emitter,Addressee,Context,Emitter pre-vocalization action,Addressee pre-vocalization action,Emitter post-vocalization action,Addressee post-vocalization action,Start sample,End sample
91062,293146,215,221,9,2,2,3,3,1.0,4514640.0


In [139]:
annotations[annotations['FileID'] == 2587]

Unnamed: 0,FileID,Emitter,Addressee,Context,Emitter pre-vocalization action,Addressee pre-vocalization action,Emitter post-vocalization action,Addressee post-vocalization action,Start sample,End sample
277,2587,-112,0,12,2,2,3,3,1.0,398691.0
278,2587,-107,0,12,2,2,3,3,398692.0,770896.0


In [138]:
annotations[annotations['FileID'].duplicated()]

Unnamed: 0,FileID,Emitter,Addressee,Context,Emitter pre-vocalization action,Addressee pre-vocalization action,Emitter post-vocalization action,Addressee post-vocalization action,Start sample,End sample
278,2587,-107,0,12,2,2,3,3,398692.0,770896.0
387,3790,-101,0,11,2,2,3,3,605313.0,1409872.0
410,3912,118,0,11,2,2,3,3,394312.0,729936.0
436,4005,-118,0,11,2,2,3,3,753151.0,820048.0
441,4011,-118,0,11,2,2,3,3,409281.0,1360720.0
...,...,...,...,...,...,...,...,...,...,...
90341,289901,207,221,4,2,2,3,3,509771.0,2507600.0
90466,290331,-208,-221,4,2,2,3,3,543787.0,2622288.0
90518,290547,-208,-221,4,2,2,3,3,562935.0,975696.0
90768,291695,207,221,4,2,2,3,3,352229.0,1786704.0


In [64]:
missing_wav = 0
# for each unique wav
for FID in tqdm(annotations.FileID.unique()):
    file_row = file_info.loc[FID]
    try:
        wav_row = wf_df.loc[file_row["File name"]]
    except KeyError as e:
        missing_wav += 1
        continue
    
    # get the file
    file_annotations = annotations[annotations.FileID == FID].sort_values(
        by="Start sample"
    )
            
    json_out = (
        DATA_DIR
        / "processed"
        / DATASET_ID
        / DT_ID
        / "JSON"
        / (wav_row.wav_loc.stem.replace(" ", "_") + ".JSON")
    )

    rate = get_samplerate(wav_row.wav_loc.as_posix())
    bout_duration = librosa.get_duration(path=wav_row.wav_loc.as_posix())

    # wav general information
    json_dict = {}
    json_dict["fid"] = int(file_row.name)
    json_dict["Treatment ID"] = int(file_row["Treatment ID"])
    json_dict["Recording channel"] = int(file_row["Recording channel"])
    json_dict["Recording time"] = str(file_row["Recording time"])

    json_dict["indvs"] = {}
    for emitter in file_annotations.Emitter.unique():
        emitter_anno = file_annotations[file_annotations.Emitter == int(emitter)]

        json_dict["indvs"][str(emitter)] = {
            "syllables": {
                "start_times": NoIndent(
                    list(emitter_anno["Start sample"].astype("float32") / rate)
                ),
                "end_times": NoIndent(
                    list(emitter_anno["End sample"].astype("float32") / rate)
                ),
                "addressee": NoIndent(list(emitter_anno["Addressee"].astype(str))),
                "context": NoIndent([context_dict[i] for i in emitter_anno["Context"]]),
                "emit_prevoc_act": NoIndent(
                    list(
                       emitter_anno["Emitter pre-vocalization action"]
                    )
                ),
                "add_prevoc_act": NoIndent(
                    list(
                       emitter_anno["Addressee pre-vocalization action"]
                    )
                ),
                "emit_postvoc_act": NoIndent(
                    list(emitter_anno["Emitter post-vocalization action"])
                ),
                "add_postvoc_act": NoIndent(
                    list(
                       emitter_anno["Addressee post-vocalization action"].tolist()
                    )
                ),
            }
        }


    json_dict["species"] = "Rousettus aegyptiacus"
    json_dict["common_name"] = "Egyptian fruit bat"
    json_dict["samplerate_hz"] = int(rate)
    json_dict["original_wav"] = wav_row.wav_loc.as_posix()
    json_dict["length_s"] = bout_duration
    json_dict["wav_loc"] = wav_row.wav_loc.as_posix()

    json_txt = json.dumps(json_dict, cls=NoIndentEncoder, indent=2)

    # save json
    ensure_dir(json_out.as_posix())

    print(json_txt, file=open(json_out.as_posix(), "w"))

    #break

  0%|          | 0/87986 [00:00<?, ?it/s]

In [63]:
print(json_txt)

{
  "fid": 7,
  "Treatment ID": 1,
  "Recording channel": 11,
  "Recording time": "2012-06-01 00:23:40",
  "indvs": {
    "118": {
      "syllables": {
        "start_times": [4e-06],
        "end_times": [1.34688],
        "addressee": ["0"],
        "context": ["Mating protest"],
        "emit_prevoc_act": [2],
        "add_prevoc_act": [2],
        "emit_postvoc_act": [3],
        "add_postvoc_act": [3]
      }
    }
  },
  "species": "Rousettus aegyptiacus",
  "common_name": "Egyptian fruit bat",
  "samplerate_hz": 250000,
  "original_wav": "data/raw/fruitbat/zip_contents/files101/120601002132055008.WAV",
  "length_s": 1.34688,
  "wav_loc": "data/raw/fruitbat/zip_contents/files101/120601002132055008.WAV"
}
