# Exporting fe_03_p1 to 8kHz Mono Wav files

## Introduction

The main goal is to create splits of the dataset for validation, training and testing.
Additionally, another goal is to convert all the audio files in all the splits into a standard uniform format.
This format is usually wav, with the same samplerate and number of channels.

That will entail:

- For each split, make individual pairs of audio and labels
    - along with audio-export parameters, out-dir (upto group-number)
    - Making of such pairs should be based on some goal, perhaps random
- Make the necessary folders
    - upto group-number for each split, inferred from the pairs made above.
- For each split:
    - copy the `calldata.tbl` to `<split>/labels/`
    - for each pair:
        - export the audio with parameters
        - copy the label file



## Imports and finding the filepaths

### Imports

In [1]:
from __future__ import print_function, division

import os
import sys

rennet_root = os.environ['RENNET_ROOT']
sys.path.append(rennet_root)

%load_ext autoreload
%autoreload 1

In [2]:
import glob
import numpy as np
from collections import defaultdict
import warnings

import shutil as sh
from tqdm import tqdm_notebook as tqdm
import multiprocessing as mp

In [3]:
# rennet specific imports
# autoreload for debugging
%aimport rennet.datasets.fisher
import rennet.datasets.fisher as fe

%aimport rennet.utils.audio_utils
import rennet.utils.audio_utils as au

### Gather all audio, labels and calldata filepaths

> **NOTE**
>
> If it is not clear by now, we will be, from now on, working exclusively with the working directory we created earlier. Keep that in mind for all the instructions to come.

***


In [4]:
# Making glob queries for audio, label and calldata files

# UPDATE HERE - BEGIN #######################################################

rennet_working_dir = os.path.join(rennet_root, 'data', 'working')

provider = 'fisher'
dataset = 'fe_03_p1'
rennet_working_raw_dir = os.path.join(rennet_working_dir, 
                                      provider, dataset, 'raw')

raw_audio_dir = os.path.join(rennet_working_raw_dir, 'audio', 'data')
raw_audio_groups_glob_str = str(os.path.join(raw_audio_dir, "*", "*"))

raw_labels_dir = os.path.join(rennet_working_raw_dir, 'labels', 'data')
raw_labels_groups_glob_str = str(os.path.join(raw_labels_dir, "*", "*"))

raw_calldata_glob_str = str(os.path.join(rennet_working_raw_dir, 
                                         'labels', "*calldata.tbl"))

expected_audio_groups = 59
expected_labels_groups = 59

# UPDATE HERE - END #########################################################

print("Audio Groups Query:\n", raw_audio_groups_glob_str)
print()
print("Labels Groups Query:\n", raw_labels_groups_glob_str)
print()
print("Calldata Query:\n", raw_calldata_glob_str)
print()

Audio Groups Query:
 /home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/audio/data/*/*

Labels Groups Query:
 /home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/labels/data/*/*

Calldata Query:
 /home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/labels/*calldata.tbl



In [5]:
# Finding audio and label files in each group and matching pairs

audio_groups_dirs = sorted(glob.glob(raw_audio_groups_glob_str))
audio_fps = [sorted(glob.glob(os.path.join(gd, '*.sph'))) for gd in audio_groups_dirs]

label_groups_dirs = sorted(glob.glob(raw_labels_groups_glob_str))
label_fps = [sorted(glob.glob(os.path.join(gd, '*.txt'))) for gd in label_groups_dirs]

print("AUDIO:###########################################################\n\n", 
      "{} Groups".format(len(audio_groups_dirs)), 
      "{} Total Files, as:\n\n".format(sum(map(len, audio_fps))), list(map(len, audio_fps)),
      "\n\nFor Example:\n\n{}\n...".format("\n".join(audio_fps[0][:5])))
print()

if len(audio_groups_dirs) != expected_audio_groups:
    warnings.warn("\nNot all audio groups were found")
    
print("LABEL:###########################################################\n\n", 
      "{} Groups".format(len(label_groups_dirs)), 
      "{} Total Files, as:\n\n".format(sum(map(len, label_fps))), list(map(len, label_fps)),
      "\n\nFor Example:\n\n{}\n...".format("\n".join(label_fps[0][:5])))
print()

if len(label_groups_dirs) != expected_labels_groups:
    warnings.warn("\nNot all label groups were found")
    
    
def callid_from_fp(fp):
    return os.path.basename(fp).split('_')[-1].split('.')[0]


# find if there are all groups
if len(audio_groups_dirs) != len(label_groups_dirs):
    warnings.warn("\nMismatch in number of audio and label groups:"
                  "\nAudios {} v/s {} Labels".format(
                      len(audio_groups_dirs), 
                      len(label_groups_dirs)))

# NOTE: The following checks use zip for matching and assume order
# if the first n of the total N groups or files are available, there will be no warning
# if the n groups or files are from somewhere in the middle, there will be mismatch

# find if certain groups mismatch
if any(lag != llg for lag, llg in zip(map(len, audio_fps), map(len, label_fps))):
    warnings.warn("\nMismatch in number of files in certain groups")

# check if each pair has the same callid
audio_callids = [map(callid_from_fp, fps) for fps in audio_fps]
label_callids = [map(callid_from_fp, fps) for fps in label_fps]

if any(any(ac != lc for ac, lc in zip(gac, glc)) for gac, glc in zip(audio_callids, label_callids)):
    warnings.warn("\nMismatch in callids for certain files")
        

AUDIO:###########################################################

 59 Groups 5850 Total Files, as:

 [99, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 51] 

For Example:

/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/audio/data/disc1/000/fe_03_00001.sph
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/audio/data/disc1/000/fe_03_00002.sph
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/audio/data/disc1/000/fe_03_00003.sph
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/audio/data/disc1/000/fe_03_00004.sph
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/audio/data/disc1/000/fe_03_00005.sph
...

LABEL:###########################################################

 59 Groups 5850 Total

In [6]:
# Finding the calldata.tbl file

calldata_fp = glob.glob(raw_calldata_glob_str)

if len(calldata_fp) > 1:
    warnings.warn("More than one calldata file found")
elif len(calldata_fp) < 1:
    warnings.warn("Calldata file was not found")
else:
    calldata_fp = calldata_fp[0]
    print("CALLDATA filepath: ", calldata_fp)

CALLDATA filepath:  /home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/labels/fe_03_p1_calldata.tbl


In [7]:
group_pair_fps = list(zip(audio_fps, label_fps))

print("GROUP PAIRS: {}".format(len(group_pair_fps)), 
      list(map(lambda x: list(map(len, x)), group_pair_fps)), sep='\n')

GROUP PAIRS: 59
[[99, 99], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [100, 100], [51, 51]]


#### Result

- `audio_fps` and `labels_fps` have are list of lists, with grouped filepaths to audio and label files respectively
    + the checks above also ensure that corresponding pairs are at-least at the same indices, or missing altogether
- `calldata_fp` has the filepath to the calldata file
- `group_pair_fps` has pairs of audio and label filepaths for which both are present
    + for this dataset, only the first 8 audio groups are present, hence only 8 group pairs

## Read the calldata and activespeakers

Reading activespeakers internally also reads the annotations, so any errors should not go unnoticed.

NOTE: the `warn=False` parameter is passed to suppress the internal warnings for duplicate annotations for a speaker for the same time segment. 
This is ignored since we are only concerned with 'who' is speaking 'when', and not 'what' is being spoken. 

NOTE: We also read calldata.tbl and pass the relevant instance to the activespeakers so that anything unexpected pops up.

In [8]:
calldatas = fe.FisherAllCallData.from_file(calldata_fp)

print("CALLDATA read for {} callids".format(len(calldatas.allcalldata)))
print("\nExample calldata:\n")
print(calldatas.calldata_for_callid('00434'), sep='\n')

CALLDATA read for 5850 callids

Example calldata:

FisherCallData(callid='00434', topicid='ENG30', signalgrade=4.0, convgrade=4.0, channelspeakers=[FisherChannelSpeaker(id='2872', gender='f', dialect='a', phone_service='2'), FisherChannelSpeaker(id='5019', gender='f', dialect='a', phone_service='2')])


In [9]:
activespeakers = []
for _, label_group_fps in group_pair_fps:
    activespeakers.append([fe.FisherActiveSpeakers.from_file(lfp, 
                                                             warn=False, 
                                                             allcalldata=calldatas)
                           for lfp in label_group_fps])
    
print("ACTIVESPEAKERS read for total {} label files with calldata".format(sum(map(len, activespeakers))))

ACTIVESPEAKERS read for total 5850 label files with calldata


## Exporting

### Making Splits for validation, training and testing - based on groups

[] TODO: explanation

#### Checking parity of durations from audio and labels

In [10]:
grouped_dur_label = []
grouped_dur_audio = []

for g, (agfps, _) in enumerate(group_pair_fps):
    # durations from labels
    dur_label = []
    for act in activespeakers[g]:
        with act.samplerate_as(100):
            dur_label.append(int(act.ends[-1]))
    grouped_dur_label.append(np.array(dur_label) / 100)
    
    # durations from audios
    dur_audio = []
    for fp in agfps:
        audio_meta = au.get_audio_metadata(fp)
        dur_audio.append(audio_meta.seconds)
    grouped_dur_audio.append(np.array(dur_audio))
    
    
files_dur_label = np.concatenate(grouped_dur_label)
files_dur_audio = np.concatenate(grouped_dur_audio)

group_dur_label = np.array(list(map(sum, grouped_dur_label)))
group_dur_audio = np.array(list(map(sum, grouped_dur_audio)))

In [11]:
def split_seconds(seconds):
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return int(h), int(m), round(s, 2)

def print_splitsec(duration, name):
    print("{} : {:10.2f} seconds = {}".format(
        name, duration, 
        "{:4} : {:2} : {:5.2f}".format(*split_seconds(duration))
    ))

In [12]:
print("Per File Audio Durations")
print_splitsec(np.sum(files_dur_audio),   "Audio Total   ")
print_splitsec(np.sum(files_dur_label),   "Label Total   ")
print_splitsec(np.min(files_dur_audio),   "Audio Minimum ")
print_splitsec(np.min(files_dur_label),   "Label Minimum ")
print_splitsec(np.max(files_dur_audio),   "Audio Maximum ")
print_splitsec(np.max(files_dur_label),   "Label Maximum ")
print_splitsec(np.mean(files_dur_audio),  "Audio Mean    ")
print_splitsec(np.mean(files_dur_label),  "Label Mean    ")
print_splitsec(np.std(files_dur_audio),   "Audio Std     ")
print_splitsec(np.std(files_dur_label),   "Label Std     ")

Per File Audio Durations
Audio Total    : 3542124.21 seconds =  983 : 55 : 24.21
Label Total    : 3482482.03 seconds =  967 : 21 : 22.03
Audio Minimum  :     366.79 seconds =    0 :  6 :  6.79
Label Minimum  :     339.43 seconds =    0 :  5 : 39.43
Audio Maximum  :     719.96 seconds =    0 : 11 : 59.96
Label Maximum  :     719.96 seconds =    0 : 11 : 59.96
Audio Mean     :     605.49 seconds =    0 : 10 :  5.49
Label Mean     :     595.30 seconds =    0 :  9 : 55.30
Audio Std      :      31.44 seconds =    0 :  0 : 31.44
Label Std      :      32.99 seconds =    0 :  0 : 32.99


In [13]:
print("Per Group Audio Durations")
print_splitsec(np.sum(group_dur_audio),   "Audio Total   ")
print_splitsec(np.sum(group_dur_label),   "Label Total   ")
print_splitsec(np.min(group_dur_audio),   "Audio Minimum ")
print_splitsec(np.min(group_dur_label),   "Label Minimum ")
print_splitsec(np.max(group_dur_audio),   "Audio Maximum ")
print_splitsec(np.max(group_dur_label),   "Label Maximum ")
print_splitsec(np.mean(group_dur_audio),  "Audio Mean    ")
print_splitsec(np.mean(group_dur_label),  "Label Mean    ")
print_splitsec(np.std(group_dur_audio),   "Audio Std     ")
print_splitsec(np.std(group_dur_label),   "Label Std     ")

Per Group Audio Durations
Audio Total    : 3542124.21 seconds =  983 : 55 : 24.21
Label Total    : 3482482.03 seconds =  967 : 21 : 22.03
Audio Minimum  :   30597.64 seconds =    8 : 29 : 57.64
Label Minimum  :   30434.81 seconds =    8 : 27 : 14.81
Audio Maximum  :   71337.89 seconds =   19 : 48 : 57.89
Label Maximum  :   63725.15 seconds =   17 : 42 :  5.15
Audio Mean     :   60036.00 seconds =   16 : 40 : 36.00
Label Mean     :   59025.12 seconds =   16 : 23 : 45.12
Audio Std      :    4605.41 seconds =    1 : 16 : 45.41
Label Std      :    4013.30 seconds =    1 :  6 : 53.30


In [14]:
print("All audio durations >= label durations? : ", 
      all(adur >= ldur for adur, ldur in zip(files_dur_audio, files_dur_label)))

print()
print("Where the audio durations are <= than label durations\n", 
      list(filter(lambda durs: durs[0] <= durs[1], zip(files_dur_audio, files_dur_label))))


All audio durations >= label durations? :  False

Where the audio durations are <= than label durations
 [(599.952, 599.97000000000003), (599.952, 599.97000000000003), (599.952, 599.97000000000003), (599.98800000000006, 599.99000000000001), (599.98800000000006, 599.99000000000001), (599.976, 599.98000000000002), (599.98800000000006, 599.99000000000001), (599.98800000000006, 599.99000000000001), (599.94000000000005, 599.94000000000005), (599.976, 599.98000000000002), (599.98800000000006, 599.99000000000001), (599.98800000000006, 599.99000000000001), (599.94000000000005, 599.94000000000005), (599.91600000000005, 599.91999999999996), (599.91600000000005, 599.91999999999996), (599.98800000000006, 599.99000000000001), (599.94000000000005, 599.94000000000005), (600.0, 600.0), (599.976, 599.98000000000002), (599.91600000000005, 599.91999999999996), (600.0, 600.0), (599.94000000000005, 599.94000000000005), (599.98800000000006, 599.99000000000001), (599.928, 599.92999999999995), (599.9880000000

##### Results

As expected, the audio files are mostly longer than the labels.
The exception cases are rare and have a mismatch of less than 20 milliseconds.
This is not deemed to be huge issue.

We also see how, with so many files, the total disparity can add up to a lot.

#### Assigning groups to splits

In [15]:
takeat = lambda l, i: [l[ii] for ii in i]

In [16]:
val_groups = [0]

tis = 53
tst_groups = list(range(tis, len(group_dur_audio)))

trn_groups = list(filter(lambda v: ((v not in val_groups) and 
                                    (v not in tst_groups)), 
                         range(len(group_pair_fps))))

trn_val_groups = np.concatenate([val_groups, trn_groups])

print("    VAL with {:2} groups and {:4} files".format(len(val_groups), sum(map(len, [activespeakers[i] for i in val_groups]))))
print("    TRN with {:2} groups and {:4} files".format(len(trn_groups), sum(map(len, [activespeakers[i] for i in trn_groups]))))
print("    TST with {:2} groups and {:4} files".format(len(tst_groups), sum(map(len, [activespeakers[i] for i in tst_groups]))))
print("TRN+VAL with {:2} groups and {:4} files".format(len(trn_val_groups), sum(map(len, [activespeakers[i] for i in trn_val_groups]))))

    VAL with  1 groups and   99 files
    TRN with 52 groups and 5200 files
    TST with  6 groups and  551 files
TRN+VAL with 53 groups and 5299 files


In [17]:
print("Train-Val Split - Groups Durations")
print_splitsec(np.sum(group_dur_audio[trn_groups]),
               "TRN    : {:5.2f}%".format(100 * group_dur_audio[trn_groups].sum()/group_dur_audio[trn_val_groups].sum()))
print_splitsec(np.sum(group_dur_audio[val_groups]),
               "VAL    : {:5.2f}%".format(100 * group_dur_audio[val_groups].sum()/group_dur_audio[trn_val_groups].sum()))

Train-Val Split - Groups Durations
TRN    : 97.82% : 3141667.39 seconds =  872 : 41 :  7.39
VAL    :  2.18% :   70004.44 seconds =   19 : 26 : 44.44


#### Making audio-label filepath pairs for each split

In [18]:
def _make_pairs(grouped_pair_fps, group_ids):
    grouped_pairs = []
    for g in group_ids:
        audio_fps, label_fps = grouped_pair_fps[g]
        grouped_pairs.extend(list(zip(audio_fps, label_fps)))
        
    return grouped_pairs

In [19]:
val_pairs = _make_pairs(group_pair_fps, val_groups)
tst_pairs = _make_pairs(group_pair_fps, tst_groups)
trn_pairs = _make_pairs(group_pair_fps, trn_groups)

print("This should match the number of files in each split, when the groups were assigned above")
print(*map(len, [val_pairs, tst_pairs, trn_pairs]))

print()
print("VAL\n", *val_pairs[:5], '...\n', sep='\n')
print("TST\n", *tst_pairs[:5], '...\n', sep='\n')
print("TRN\n", *trn_pairs[:5], '...\n', sep='\n')

This should match the number of files in each split, when the groups were assigned above
99 551 5200

VAL

('/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/audio/data/disc1/000/fe_03_00001.sph', '/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/labels/data/disc1/000/fe_03_00001.txt')
('/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/audio/data/disc1/000/fe_03_00002.sph', '/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/labels/data/disc1/000/fe_03_00002.txt')
('/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/audio/data/disc1/000/fe_03_00003.sph', '/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/labels/data/disc1/000/fe_03_00003.txt')
('/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/audio/data/disc1/000/fe_03_00004.sph', '/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/labels/data/disc1/000/fe_03_00004.txt')
('/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/raw/audio

### Make the Folders

The folder structure expected:

```
<rennet_data_working>/<provider>/<dataset>/<export_name>/
    - test/
        - audio/
            - data/
                - <group-number>/
                    - <call>.wav
                    - ...
         - labels/
             - data/
                 - <group-number>/
                     - <call>.txt
                     - ...
             - <calldata>.tbl
        - pickles/
            - <date-stamped-feature-info>/
                - <date-stamped-part-xx>.hdf5
                - ...
     - train/
         - < SAME AS TEST >
     - val/
         - < SAME AS TEST >
```

In [20]:
# UPDATE HERE - BEGIN #######################################################

export_name = 'wav-8k-mono'
val_dirname = 'val'
trn_dirname = 'train'
tst_dirname = 'test'

out_format = 'wav'
out_samplerate = 8000
out_channels = 1
out_channels_split = False

# UPDATE HERE - END #########################################################

export_name_dir = os.path.join(rennet_working_dir, provider, dataset, export_name)
val_dir = os.path.join(export_name_dir, val_dirname)
trn_dir = os.path.join(export_name_dir, trn_dirname)
tst_dir = os.path.join(export_name_dir, tst_dirname)

val_labeldata_dir = os.path.join(val_dir, 'labels', 'data')
trn_labeldata_dir = os.path.join(trn_dir, 'labels', 'data')
tst_labeldata_dir = os.path.join(tst_dir, 'labels', 'data')

val_audiodata_dir = os.path.join(val_dir, 'audios', 'data')
trn_audiodata_dir = os.path.join(trn_dir, 'audios', 'data')
tst_audiodata_dir = os.path.join(tst_dir, 'audios', 'data')

val_pickledata_dir = os.path.join(val_dir, 'pickles')
trn_pickledata_dir = os.path.join(trn_dir, 'pickles')
tst_pickledata_dir = os.path.join(tst_dir, 'pickles')

print("VAL:", val_dir, val_labeldata_dir, val_audiodata_dir, val_pickledata_dir, sep='\n')
print()
print("TRN:", trn_dir, trn_labeldata_dir, trn_audiodata_dir, trn_pickledata_dir, sep='\n')
print()
print("TST:", tst_dir, tst_labeldata_dir, tst_audiodata_dir, tst_pickledata_dir, sep='\n')

VAL:
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/val
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/val/labels/data
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/val/audios/data
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/val/pickles

TRN:
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/train
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/train/labels/data
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/train/audios/data
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/train/pickles

TST:
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/test
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/test/labels/data
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/test/audios/data
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/

In [21]:
# make the data directories for the splits

# these directories shouldn't exist right now

# ##### DO NOT RERUN! without changing ddeo to True ######
ddeo = False

os.makedirs(val_labeldata_dir, exist_ok=ddeo)
os.makedirs(val_audiodata_dir, exist_ok=ddeo)
os.makedirs(val_pickledata_dir, exist_ok=ddeo)
os.makedirs(trn_labeldata_dir, exist_ok=ddeo)
os.makedirs(trn_audiodata_dir, exist_ok=ddeo)
os.makedirs(trn_pickledata_dir, exist_ok=ddeo)
os.makedirs(tst_labeldata_dir, exist_ok=ddeo)
os.makedirs(tst_audiodata_dir, exist_ok=ddeo)
os.makedirs(tst_pickledata_dir, exist_ok=ddeo)

In [22]:
# Create group dirs for each split
group_dir_for_file = lambda fp: os.path.basename(os.path.dirname(fp))

def groups_for_split(split_pairs):
    # the group dirs required for labels and audios for each split are assumed to be the same
    # But check anyway
    
    audio_fps = [afp for afp, _ in split_pairs]
    label_fps = [lfp for _, lfp in split_pairs]
    
    audio_groups = set(map(group_dir_for_file, audio_fps))
    label_groups = set(map(group_dir_for_file, label_fps))
    
    assert audio_groups == label_groups
    return audio_groups


In [23]:
val_groupdirnames = sorted(list(groups_for_split(val_pairs)))
trn_groupdirnames = sorted(list(groups_for_split(trn_pairs)))
tst_groupdirnames = sorted(list(groups_for_split(tst_pairs)))

print(val_groupdirnames, trn_groupdirnames, tst_groupdirnames, sep='\n')

['000']
['001', '002', '003', '004', '005', '006', '007', '008', '009', '010', '011', '012', '013', '014', '015', '016', '017', '018', '019', '020', '021', '022', '023', '024', '025', '026', '027', '028', '029', '030', '031', '032', '033', '034', '035', '036', '037', '038', '039', '040', '041', '042', '043', '044', '045', '046', '047', '048', '049', '050', '051', '052']
['053', '054', '055', '056', '057', '058']


In [24]:
val_audiogroup_dirs = [os.path.join(val_audiodata_dir, g) for g in val_groupdirnames]
val_labelgroup_dirs = [os.path.join(val_labeldata_dir, g) for g in val_groupdirnames]
trn_audiogroup_dirs = [os.path.join(trn_audiodata_dir, g) for g in trn_groupdirnames]
trn_labelgroup_dirs = [os.path.join(trn_labeldata_dir, g) for g in trn_groupdirnames]
tst_audiogroup_dirs = [os.path.join(tst_audiodata_dir, g) for g in tst_groupdirnames]
tst_labelgroup_dirs = [os.path.join(tst_labeldata_dir, g) for g in tst_groupdirnames]

print("VAL:", *val_audiogroup_dirs, '', *val_labelgroup_dirs, sep='\n')
print()
print("TRN:", *trn_audiogroup_dirs, '', *trn_labelgroup_dirs, sep='\n')
print()
print("TST:", *tst_audiogroup_dirs, '', *tst_labelgroup_dirs, sep='\n')

VAL:
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/val/audios/data/000

/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/val/labels/data/000

TRN:
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/train/audios/data/001
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/train/audios/data/002
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/train/audios/data/003
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/train/audios/data/004
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/train/audios/data/005
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/train/audios/data/006
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/train/audios/data/007
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/train/audios/data/008
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/tr

In [25]:
exist_ok = False
for agd, lgd in zip(val_audiogroup_dirs, val_labelgroup_dirs):
    os.makedirs(agd, exist_ok=exist_ok)
    os.makedirs(lgd, exist_ok=exist_ok)
    
for agd, lgd in zip(trn_audiogroup_dirs, trn_labelgroup_dirs):
    os.makedirs(agd, exist_ok=exist_ok)
    os.makedirs(lgd, exist_ok=exist_ok)
    
for agd, lgd in zip(tst_audiogroup_dirs, tst_labelgroup_dirs):
    os.makedirs(agd, exist_ok=exist_ok)
    os.makedirs(lgd, exist_ok=exist_ok)

### Copy the calldata.tbl files to the labels dirs of each split

In [26]:
val_labeldir = os.path.dirname(val_labeldata_dir)
trn_labeldir = os.path.dirname(trn_labeldata_dir)
tst_labeldir = os.path.dirname(tst_labeldata_dir)

print("labels dirs where calldata will be copied", val_labeldir, trn_labeldir, tst_labeldir, sep='\n')

labels dirs where calldata will be copied
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/val/labels
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/train/labels
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/test/labels


In [27]:
print(sh.copy(calldata_fp, val_labeldir, follow_symlinks=True))
print(sh.copy(calldata_fp, trn_labeldir, follow_symlinks=True))
print(sh.copy(calldata_fp, tst_labeldir, follow_symlinks=True))

/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/val/labels/fe_03_p1_calldata.tbl
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/train/labels/fe_03_p1_calldata.tbl
/home/aabdullah/delve/rennet/data/working/fisher/fe_03_p1/wav-8k-mono/test/labels/fe_03_p1_calldata.tbl


### Copy all the label files to appropriate groups

In [28]:
def copy_labelfile_split(labelfile_splitlabeldatadir):
    labelfile, split_labeldatadir = labelfile_splitlabeldatadir
    groupname = group_dir_for_file(labelfile)
    todir = os.path.join(split_labeldatadir, groupname)
    return os.path.basename(sh.copy(labelfile, todir, follow_symlinks=True))

In [36]:
p = mp.Pool(mp.cpu_count())

In [30]:
splits_pbar = tqdm(zip([val_pairs, trn_pairs, tst_pairs],
                       [val_labeldata_dir, trn_labeldata_dir, tst_labeldata_dir],
                       ['val', 'trn', 'tst']))

for splitpairs, splitlabeldatadir, name in splits_pbar:
    params = [(lfp, splitlabeldatadir) for _, lfp in splitpairs]
    files_pbar = tqdm(total=len(params))
    for fn in p.imap_unordered(copy_labelfile_split, params):
        files_pbar.update()
        files_pbar.set_description(fn)
    splits_pbar.set_description(name)




In [31]:
splits_pbar.close()
files_pbar.close()




### Export audio files

In [32]:
def export_audiofile_split(audiofile_splitaudiodatadir_tofmt_tosr_tonchannels_issplit):
    audiofile, splitaudiodatadir = audiofile_splitaudiodatadir_tofmt_tosr_tonchannels_issplit[:2]
    tofmt, tosr, tonchannels, issplit = audiofile_splitaudiodatadir_tofmt_tosr_tonchannels_issplit[2:]
    
    groupname = os.path.basename(os.path.dirname(audiofile))
    todir = os.path.join(splitaudiodatadir, groupname)
    
    if issplit:
        tofns = au.convert_to_standard_split(audiofile, todir, tofmt, tosr)
    else:
        tofns = au.convert_to_standard(audiofile, todir, tofmt, tosr, tonchannels)
        
    return os.path.basename(tofns[0])

In [38]:
splits_pbar = tqdm(zip([val_pairs, trn_pairs, tst_pairs],
                       [val_audiodata_dir, trn_audiodata_dir, tst_audiodata_dir],
                       ['val', 'trn', 'tst']))

for splitpairs, splitaudiodatadir, name in splits_pbar:
    params = [(afp, splitaudiodatadir, 
               out_format, out_samplerate, 
               out_channels, out_channels_split) 
              for afp, _ in splitpairs]
    files_pbar = tqdm(total=len(params))
    for fn in p.imap_unordered(export_audiofile_split, params):
        files_pbar.update()
        files_pbar.set_description(fn)
    files_pbar.close()
    splits_pbar.set_description(name)
    
splits_pbar.close()





In [39]:
p = None

## Confirm all files in the splits are as expected

In [40]:
# val split

val_afns = sorted([os.path.basename(afp).split('.')[0] for afp, _ in val_pairs])
val_lfns = sorted([os.path.basename(lfp).split('.')[0] for _, lfp in val_pairs])

val_xafps = glob.glob(os.path.join(val_audiodata_dir, "**", "*.{}".format(out_format)))
val_xlfps = glob.glob(os.path.join(val_labeldata_dir, "**", "*.txt"))

val_xafns = sorted([os.path.basename(xafp).split('.')[0] for xafp in val_xafps])
val_xlfns = sorted([os.path.basename(xlfp).split('.')[0] for xlfp in val_xlfps])

# check all calls were exported / copied
assert len(val_afns) == len(val_xafns), "Mismatched number of audio files"
assert len(val_lfns) == len(val_xlfns), "Mismatched number of label files"

assert set(val_afns) == set(val_xafns), "Mismatched in sets of audio files"
assert set(val_lfns) == set(val_xlfns), "Mismatched in sets of label files"

# check all exported audios have the correct formatting

val_audiometa = list(map(au.get_audio_metadata, val_xafps))

assert all(map(lambda m: m.format == out_format, 
               val_audiometa)), "exported audio format is not {} for all".format(out_format)
assert all(map(lambda m: m.samplerate == out_samplerate, 
               val_audiometa)), "exported audio samplerate is not {} for all".format(out_samplerate)
assert all(map(lambda m: m.nchannels == out_channels, 
               val_audiometa)), "exported audio nchannels is not {} for all".format(out_channels)

# ALL OKAY IF NO ASSERTION ERRORS

In [41]:
# trn split

trn_afns = sorted([os.path.basename(afp).split('.')[0] for afp, _ in trn_pairs])
trn_lfns = sorted([os.path.basename(lfp).split('.')[0] for _, lfp in trn_pairs])

trn_xafps = glob.glob(os.path.join(trn_audiodata_dir, "**", "*.{}".format(out_format)))
trn_xlfps = glob.glob(os.path.join(trn_labeldata_dir, "**", "*.txt"))

trn_xafns = sorted([os.path.basename(xafp).split('.')[0] for xafp in trn_xafps])
trn_xlfns = sorted([os.path.basename(xlfp).split('.')[0] for xlfp in trn_xlfps])

# check all calls were exported / copied
assert len(trn_afns) == len(trn_xafns), "Mismatched number of audio files"
assert len(trn_lfns) == len(trn_xlfns), "Mismatched number of label files"

assert set(trn_afns) == set(trn_xafns), "Mismatched in sets of audio files"
assert set(trn_lfns) == set(trn_xlfns), "Mismatched in sets of label files"

# check all exported audios have the correct formatting

trn_audiometa = list(map(au.get_audio_metadata, trn_xafps))

assert all(map(lambda m: m.format == out_format, 
               trn_audiometa)), "exported audio format is not {} for all".format(out_format)
assert all(map(lambda m: m.samplerate == out_samplerate, 
               trn_audiometa)), "exported audio samplerate is not {} for all".format(out_samplerate)
assert all(map(lambda m: m.nchannels == out_channels, 
               trn_audiometa)), "exported audio nchannels is not {} for all".format(out_channels)

# ALL OKAY IF NO ASSERTION ERRORS


In [42]:
# tst split

tst_afns = sorted([os.path.basename(afp).split('.')[0] for afp, _ in tst_pairs])
tst_lfns = sorted([os.path.basename(lfp).split('.')[0] for _, lfp in tst_pairs])

tst_xafps = glob.glob(os.path.join(tst_audiodata_dir, "**", "*.{}".format(out_format)))
tst_xlfps = glob.glob(os.path.join(tst_labeldata_dir, "**", "*.txt"))

tst_xafns = sorted([os.path.basename(xafp).split('.')[0] for xafp in tst_xafps])
tst_xlfns = sorted([os.path.basename(xlfp).split('.')[0] for xlfp in tst_xlfps])

# check all calls were exported / copied
assert len(tst_afns) == len(tst_xafns), "Mismatched number of audio files"
assert len(tst_lfns) == len(tst_xlfns), "Mismatched number of label files"

assert set(tst_afns) == set(tst_xafns), "Mismatched in sets of audio files"
assert set(tst_lfns) == set(tst_xlfns), "Mismatched in sets of label files"

# check all exported audios have the correct formatting

tst_audiometa = list(map(au.get_audio_metadata, tst_xafps))

assert all(map(lambda m: m.format == out_format, 
               tst_audiometa)), "exported audio format is not {} for all".format(out_format)
assert all(map(lambda m: m.samplerate == out_samplerate, 
               tst_audiometa)), "exported audio samplerate is not {} for all".format(out_samplerate)
assert all(map(lambda m: m.nchannels == out_channels, 
               tst_audiometa)), "exported audio nchannels is not {} for all".format(out_channels)

# ALL OKAY IF NO ASSERTION ERRORS


### Results

- All labels were copied
- All audio files were converted to 8kHz wav files with 1 channel formed on merging the two channels in original