In [1]:
%load_ext autoreload
%autoreload 2

## Load data

In [2]:
# Utils
from joblib import Parallel, delayed
from tqdm.autonotebook import tqdm
from datetime import datetime
import json
from IPython.display import display, Audio
from pathlib2 import Path

# data libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import librosa

  from tqdm.autonotebook import tqdm


In [3]:
# wrapper lib
from avgn.utils.paths import DATA_DIR, ensure_dir

# Rebind DATA_DIR to the local ./data folder; this replaces the DATA_DIR
# name imported from avgn.utils.paths just above.
DATA_DIR = Path('./data')
# Directory holding the raw fruitbat files, relative to DATA_DIR.
DSLOC = DATA_DIR.joinpath('raw', 'fruitbat')

PROJECT_DIR set on:  /data0/home/h21/luas6629/Thesis


In [4]:
# annotations
# Read Annotations.csv and cast every listed column to an explicit dtype.
annotation_int_columns = [
    'FileID',
    'Emitter',
    'Addressee',
    'Context',
    'Emitter pre-vocalization action',
    'Addressee pre-vocalization action',
    'Emitter post-vocalization action',
    'Addressee post-vocalization action',
]
annotation_float_columns = ['Start sample', 'End sample']

# One mapping column -> dtype, built from the two groups above.
annotation_dtypes = {
    **dict.fromkeys(annotation_int_columns, int),
    **dict.fromkeys(annotation_float_columns, float),
}

annotations = pd.read_csv(DSLOC / 'Annotations.csv').astype(annotation_dtypes)

# number of annotation rows (displayed as the cell output)
len(annotations)

91080

In [6]:
# file info
# The fruit bat FileInfo.csv is malformed (its rows are not guaranteed to have
# the same number of fields, hence the per-line count below), so the table is
# re-created here with a regenerated, properly named header.

file_info_path = DSLOC / 'FileInfo.csv'

with open(file_info_path, 'r') as temp_f:
    # Number of comma-separated fields on each line. Iterating the handle
    # streams the file instead of materialising it with readlines().
    col_count = [len(line.split(",")) for line in temp_f]

### Generate column names  (names will be 0, 1, 2, ..., maximum columns - 1)
# Placeholder names wide enough for the longest row, so read_csv accepts
# rows of differing length.
column_names = list(range(max(col_count)))

### Read csv
# skiprows=1 leaves the file's own header line out of the data (it used to be
# read as a data row and sliced off afterwards), so numeric columns are no
# longer polluted with header strings. low_memory=False makes pandas infer
# each column's dtype over the whole file instead of chunk by chunk
# (NOTE(review): this is presumably what the warning previously emitted on
# this read_csv call was about - confirm it is gone after a re-run).
file_info = pd.read_csv(
    file_info_path,
    header=None,
    delimiter=",",
    names=column_names,
    skiprows=1,
    low_memory=False,
)

# recreate the header
# Leading metadata columns and the dtype each one is cast to.
meta_columns_types = {
    'FileID' : int,
    'Treatment ID' : int,
    'File name' : str,
    'File folder' : str,
    'Recording channel' : int,
    'Recording time' : 'datetime64[ms]',
}

# Every column after the metadata ones is named as an alternating
# voice_start / voice_end pair, so their count has to be even. Fail with a
# clear message instead of a length-mismatch error when assigning the header.
n_boundary_cols = file_info.shape[1] - len(meta_columns_types)
if n_boundary_cols < 0 or n_boundary_cols % 2:
    raise ValueError(
        'FileInfo.csv has {} columns; expected {} metadata columns followed by '
        'an even number of voice start/end columns'.format(
            file_info.shape[1], len(meta_columns_types)
        )
    )

# 1-based positions of the boundary columns, split into starts and ends
# (names kept so any later cell that refers to them keeps working).
samples_boundaries = np.arange(1, 1 + n_boundary_cols)
voice_starts = samples_boundaries[::2]
voice_ends = samples_boundaries[1::2]

# voice_start_sample_1, voice_end_sample_1, voice_start_sample_2, ...
boundary_col_names = [
    name
    for i in range(1, len(voice_starts) + 1)
    for name in ('voice_start_sample_{}'.format(i), 'voice_end_sample_{}'.format(i))
]
col_samples_boundaries = { name : float for name in boundary_col_names }

col_names = list(meta_columns_types) + boundary_col_names
columns_types = meta_columns_types | col_samples_boundaries

# set the new header
file_info.columns = col_names

# ensure datatypes are meaningful, then index by FileID
# (chained rather than inplace, so the cell yields a fresh frame on every run)
file_info = file_info.astype(columns_types).set_index('FileID')

# shape
file_info.shape

  file_info = pd.read_csv(DSLOC/'FileInfo.csv', header=None, delimiter=",", names=column_names)


(293238, 331)

In [14]:
# test file

# Folder (value of the 'File folder' column) the example recording is taken from.
sample_rawdata = 'files102'
samples_files_info = file_info[file_info['File folder'] == sample_rawdata]

# Positional index of the example recording among that folder's rows.
exnm = 56

# Take the metadata row by position from the folder-filtered frame, and build
# the path from that very row. Looking the row up again by bare file name over
# the whole table could also match rows from other folders (or several rows),
# which would corrupt the boundary extraction done on wav_info later on.
# The double brackets keep wav_info a one-row DataFrame.
wav_info = samples_files_info.iloc[[exnm]]
wav_loc = DSLOC / 'zip_contents' / sample_rawdata / wav_info['File name'].iloc[0]

## Load file test

In [12]:
from avgn.signalprocessing.spectrogramming import spectrogram
from avgn.utils.audio import load_wav, float32_to_int16, int16_to_float32, write_wav
from avgn.visualization.spectrogram import visualize_spec, plot_spec

In [15]:
# load file

# Read the example wav and print the returned rate next to the length of
# the returned data.
rate, data = load_wav(wav_loc)
print(rate, len(data))

# Column masks selecting the voice start / voice end boundary columns.
start_columns = wav_info.columns.str.startswith('voice_start_')
end_columns = wav_info.columns.str.startswith('voice_end_')

# Keep only the boundary columns without missing values and take the first
# row of wav_info as a flat array.
start_times = wav_info.loc[:, start_columns].dropna(axis=1).values[0]
end_times = wav_info.loc[:, end_columns].dropna(axis=1).values[0]

# every start boundary must be matched by an end boundary
assert len(start_times) == len(end_times)

250000 861008


In [None]:
# load segment