In [1]:
%load_ext autoreload
%autoreload 2

In [7]:
from joblib import Parallel, delayed
from tqdm.autonotebook import tqdm
import pandas as pd
import librosa
from datetime import datetime
import json
from IPython.display import display
from pathlib2 import Path

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
from avgn.utils.paths import DATA_DIR, ensure_dir

In [8]:
DATA_DIR = Path('./data')
DSLOC = DATA_DIR / 'raw' / 'fruitbat'

## Prepare DataFrames

In [None]:
# Annotations
annotations =  pd.read_csv(DSLOC/'Annotations.csv')

annotations = annotations.astype({
    'FileID' : int,
    'Emitter' : int,
    'Addressee' : int,
    'Context' :int,
    'Emitter pre-vocalization action' : int,
    'Addressee pre-vocalization action': int,
    'Emitter post-vocalization action' : int,
    'Addressee post-vocalization action' : int,
    'Start sample': float,
    'End sample' : float
})

len(annotations)

91080

In [12]:
# FileInfo

""" fruit bat db is malformed. 
Recreate the db with proper columns.
"""

with open(DSLOC/'FileInfo.csv', 'r') as temp_f:
    # get No of columns in each line
    col_count = [ len(l.split(",")) for l in temp_f.readlines() ]

### Generate column names  (names will be 0, 1, 2, ..., maximum columns - 1)
column_names = [i for i in range(0, max(col_count))]

### Read csv
file_info = pd.read_csv(DSLOC/'FileInfo.csv', header=None, delimiter=",", names=column_names)
file_info, file_info.columns = file_info[1:] , file_info.iloc[0]

# recreate the header
samples_boundaries = np.arange(1, 1 + file_info.shape[1] - 6)
voice_starts = samples_boundaries[::2]
voice_ends = samples_boundaries[1::2]
col_samples_boundaries = { val : float for pair in zip(['voice_start_sample_{}'.format(i + 1)for i in range(len(voice_starts))  ], 
        ['voice_end_sample_{}'.format(i + 1) for i in range(len(voice_ends))  ]) for val in pair }

col_names = ['FileID','Treatment ID','File name','File folder','Recording channel','Recording time'] + \
    [ val for pair in zip(['voice_start_sample_{}'.format(i + 1)for i in range(len(voice_starts))  ], 
        ['voice_end_sample_{}'.format(i + 1) for i in range(len(voice_ends))  ]) for val in pair]

columns_types = {
    'FileID' : int,
    'Treatment ID' : int,
    'File name' : str,
    'File folder' : str,
    'Recording channel' : int,
    'Recording time' : 'datetime64[ms]',
} | col_samples_boundaries
 

# set the new header
file_info.columns = columns_types.keys()


#ensure datatypes are meaningful
file_info = file_info.astype(columns_types)

# set index
file_info.set_index('FileID', inplace=True)

# shape
file_info.shape

  file_info = pd.read_csv(DSLOC/'FileInfo.csv', header=None, delimiter=",", names=column_names)


(293238, 331)

In [53]:
# Wav files
wavfiles = list(DSLOC.glob('zip_contents/*/*.WAV'))
len(wavfiles), wavfiles[0]

(293238,
 PosixPath('data/raw/fruitbat/zip_contents/files222/130527125845741428.WAV'))

In [54]:
wf_df = pd.DataFrame(
    [[wf, wf.name, wf.parent.stem] for wf in tqdm(wavfiles)],
    columns=["wav_loc", "id", "folder"],
)
wf_df = wf_df.set_index('id')
len(wf_df)

  0%|          | 0/293238 [00:00<?, ?it/s]

293238

In [55]:
wf_df.head()

Unnamed: 0_level_0,wav_loc,folder
id,Unnamed: 1_level_1,Unnamed: 2_level_1
130527125845741428.WAV,data/raw/fruitbat/zip_contents/files222/130527...,files222
130530225847479598.WAV,data/raw/fruitbat/zip_contents/files222/130530...,files222
130530004918782202.WAV,data/raw/fruitbat/zip_contents/files222/130530...,files222
130522231228763645.WAV,data/raw/fruitbat/zip_contents/files222/130522...,files222
130531182616717977.WAV,data/raw/fruitbat/zip_contents/files222/130531...,files222


## Prepare Json

In [20]:
context_dict = {
    0:'Unknown',
    1:'Separation',
    2:'Biting',
    3:'Feeding',
    4:'Fighting',
    5:'Grooming',
    6:'Isolation',
    7:'Kissing',
    8:'Landing',
    9:'Mating protest',
    10:'Threat-like',
    11:'General',
    12:'Sleeping',
}

In [16]:
from avgn.utils.audio import get_samplerate
from avgn.utils.json import NoIndent, NoIndentEncoder
from datetime import datetime
import librosa
import json

In [17]:
DATASET_ID = "fruitbat"

In [18]:
# create a unique datetime identifier for the files output by this notebook
DT_ID = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
DT_ID

'2023-03-04_21-59-19'

In [19]:
display(annotations[:1])
display(wf_df[:1])
display(file_info[:1])

Unnamed: 0,FileID,Emitter,Addressee,Context,Emitter pre-vocalization action,Addressee pre-vocalization action,Emitter post-vocalization action,Addressee post-vocalization action,Start sample,End sample
0,7,118,0,9,2,2,3,3,1.0,336720.0


Unnamed: 0_level_0,wav_loc,folder
id,Unnamed: 1_level_1,Unnamed: 2_level_1
121112003518920255.WAV,data/raw/fruitbat/zip_contents/files205/121112...,files205


Unnamed: 0_level_0,Treatment ID,File name,File folder,Recording channel,Recording time,voice_start_sample_1,voice_end_sample_1,voice_start_sample_2,voice_end_sample_2,voice_start_sample_3,...,voice_start_sample_159,voice_end_sample_159,voice_start_sample_160,voice_end_sample_160,voice_start_sample_161,voice_end_sample_161,voice_start_sample_162,voice_end_sample_162,voice_start_sample_163,voice_end_sample_163
FileID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,120601000005102988.WAV,files101,11,2012-06-01 00:01:57,52100.0,120902.0,130577.0,158906.0,,...,,,,,,,,,,


In [21]:
from avgn.utils.json import NoIndent, NoIndentEncoder

In [56]:
# check
missing_wav = 0
missing_folder = set()
# for each unique wav
for FID in tqdm(annotations.FileID.unique()):
    file_row = file_info.loc[FID]
    try:
        wav_row = wf_df.loc[file_row["File name"]]
    except KeyError as e:
        missing_wav += 1
        missing_folder.add(file_row["File folder"])
        continue

  0%|          | 0/87986 [00:00<?, ?it/s]

In [57]:
missing_wav

0

In [58]:
missing_folder

set()

In [44]:
annotations.iloc[4101]

FileID                                 21722.0
Emitter                                 -116.0
Addressee                                  0.0
Context                                   11.0
Emitter pre-vocalization action            1.0
Addressee pre-vocalization action          3.0
Emitter post-vocalization action           3.0
Addressee post-vocalization action         3.0
Start sample                               1.0
End sample                            623440.0
Name: 4101, dtype: float64

In [47]:
file_info.loc[21722]

Treatment ID                                   2
File name                 120707013847142851.WAV
File folder                             files103
Recording channel                             12
Recording time               2012-07-07 01:49:35
                                   ...          
voice_end_sample_161                         NaN
voice_start_sample_162                       NaN
voice_end_sample_162                         NaN
voice_start_sample_163                       NaN
voice_end_sample_163                         NaN
Name: 21722, Length: 331, dtype: object

In [46]:
wf_df.loc['120707013847142851.WAV']

wav_loc    data/raw/fruitbat/zip_contents/files103/120707...
folder                                              files103
Name: 120707013847142851.WAV, dtype: object

In [None]:
# get the file
file_annotations = annotations[annotations.FileID == FID].sort_values(
    by="Start sample"
)