In [None]:
# for auto-reloading rennet modules
%load_ext autoreload
%autoreload 1

# py2.7 compat
from __future__ import division, print_function
from six.moves import zip, range, zip_longest

In [None]:
import os
import sys

# Root of all data used in this notebook. The path is relative, so it is resolved
# against the current working directory of the kernel; the `working` directory
# used by the cells below is built from it.
rennet_data_root = os.path.join("..", "..", "data")  # path to the data directory, AKA $RENNET_DATA_ROOT

---
***

> **NOTE:**
>
> This is not the original notebook used in double-talk detection research by me (Abdullah).
> But it is a faithful, easier-to-use copy with some modifications, and some parts skipped.
>
> You should still be able to use it to meet the main goals of this notebook.
>
> For anything marked as `[SKIPPED]`, please refer to the following original notebooks in `notebooks/dtfinale/`:
> - `2017-02-09-fisher-fe_03_p1.ipynb`
> - `2017-03-22-fisher-fe_03_p1m-analysis.ipynb`
> - `2017-03-23-fisher-fe_03_p1m-export-wav8kmono.ipynb`
> - `2017-03-31-fisher-fe_03_p1-export-wav8kmono.ipynb`

---
***

# Acquisition and Analysis of raw dataset `fisher/fe_03_p1`

The goal of this notebook is to document how:
- the raw `fisher/fe_03_p1` dataset was acquired,
- the dataset was copied into a structured `working` directory to be used for analysis,
- the required classes and functions were implemented to analyze the dataset,
- the analysis of the various relevant properties of the dataset was performed. `[SKIPPED]`
- the dataset was split into training, validation and testing sub-sets.
- the data was exported in a standardized format for the three splits
    + exporting to standard format involved converting all audio files to ones with the following properties:
        * format: `wav`
        * channels: `mono`
        * samplerate: `8000 Hz`

## Data Acquisition

The corpus is the first half of a collection of conversational telephone speech (CTS) created at LDC during 2003.

It contains **5850** audio files, each with one full conversation of **up to 10 minutes** between **2 participants**.

**Origin Location on disk**

- AUDIO
    + `/nm-raid/audio/data/corpora/LDC/fisher_eng_tr_sp_LDC2004S13/fisher_eng_tr_sp_LDC2004S13.zip`
- LABELS
    + `/nm-raid/audio/data/corpora/LDC/Other/LDC2004T19.tgz`

The audio files are NIST Sphere files (`.sph`), with two channels, one per speaker, `(0: A, 1: B)`. 
The files are grouped into directories of 100 files each, while the groups are available on 7 different discs.

The `filetable.txt` has complete listing of all the files in this part of the dataset, including the gender of the speakers.

The labels come in two forms. One was extracted in an automated way, marking speech parts, and is in `data/bbn_orig/`.
The other, the relevant transcriptions, is in `data/trans/`, as `.txt` files in groups of 100 files as above.
The labels are however not divided based on discs.
The `doc` folder has useful readmes and metadata for the recordings, with more information about the conversation and the speakers involved.

The raw files mentioned above were **manually** copied into `$RENNET_DATA_ROOT/raw/fisher/fe_03_p1/` to maintain a local canonical copy.

---

## Copying files to `working` folder

The files were copied **manually, again**, to `$RENNET_DATA_ROOT/working/fisher/fe_03_p1/raw` with the following directory structure.

- `audio`
    + has the readme file, and the `filetable.txt` with list of all audio files, and corresponding speaker genders
    + `data/disc1` to `data/disc7` with grouped audio sph files, each group having roughly a hundred of them.
        * the groups are named based on the first 3 digits of the conversation IDs of the files in them.
- `labels`
    + has readmes and doc files with more info about the transcriptions and the metadata in the same folder
        * `fe_03_p1_calldata.tbl` has most of the relevant speaker and annotation metadata
        * `fe_03_pindata.tbl` has deeper information about the speakers themselves.
        * `fe_03_topics.sgm` is an XML-like file with the information about the topics of conversation, referred to in the `calldata` file
    + Same as above, `data/disc1` to `data/disc7` with grouped transcription txt files, each group having roughly a hundred of them.
        * the groups are named based on the first 3 digits of the conversation IDs of the files in them.
        
---
***

## Implementing useful classes and functions

Useful classes and functions will be implemented in this stage to be able to read the data and labels, and, more specifically, be able to work with the audio and label files for training double-talk detection model later on.

### Gather all filepaths

> **NOTE:**
>
> If it is not clear by now, we will be, from now on, working exclusively with the working directory we created in the previous step. Keep that in mind for all the instructions to come.


To gather:
- Paths to all audio files (.sph)
- Paths to all transcriptions to be used as labels (.txt)
- Path to the calldata
    * There is more information available in other files, but the one in this file is enough for double-talk detection
    * Actually, even that is not necessary ... but ... maybe it will be useful later. It has speaker PINs.

In [None]:
from rennet.utils.py_utils import recursive_glob

In [None]:
# Locate every raw audio file (.sph) inside the working directory

provider = 'fisher'
dataset = 'fe_03_p1'
rennet_workingdir = os.path.join(rennet_data_root, 'working')

working_rawaudio_dir = os.path.join(
    rennet_workingdir, provider, dataset, 'raw', 'audio', 'data'
)

glob_root, glob_pattern = working_rawaudio_dir, "*.sph"

print("Query:\n", "Root:{}\n".format(glob_root), "Pattern:{}\n".format(glob_pattern))

# sorted() accepts any iterable and returns a list; the sort order is relied upon
# later, when audio and label paths are compared index-by-index
audio_fp = sorted(recursive_glob(glob_root, glob_pattern))

print("Found audio files: {}".format(len(audio_fp)))
print("\n".join(audio_fp[:10]), "\n...")

In [None]:
# Locate every transcription file (.txt) inside the working directory

working_rawlabel_dir = os.path.join(
    rennet_workingdir, provider, dataset, 'raw', 'labels', 'data'
)

glob_root, glob_pattern = working_rawlabel_dir, "*.txt"

print("Query:\n", "Root: {}\n".format(glob_root), "Pattern: {}\n".format(glob_pattern))

# sorted() accepts any iterable and returns a list; the sort order is relied upon
# later, when audio and label paths are compared index-by-index
label_fp = sorted(recursive_glob(glob_root, glob_pattern))

print("Found transcription files: {}".format(len(label_fp)))
print("\n".join(label_fp[:10]), "\n...")

In [None]:
# Finding the calldata file

glob_root = os.path.join(rennet_workingdir, provider, dataset, 
                         'raw', 'labels')
glob_pattern = "*calldata.tbl"

print("Query:\n", "Root: {}\n".format(glob_root), "Pattern: {}\n".format(glob_pattern))

# Keep the list of candidates under its own name, so that `calldata_fp` is only
# ever a single path (a string) and never a list.
calldata_fps = sorted(recursive_glob(glob_root, glob_pattern))

print("Found calldata files: {}".format(len(calldata_fps)))
print("\n".join(calldata_fps), "\n")

# Fail with an explicit message instead of a bare IndexError when nothing matched
if not calldata_fps:
    raise IOError(
        "No calldata file matching '{}' found under '{}'".format(glob_pattern, glob_root)
    )

calldata_fp = calldata_fps[0]
print("Choosing FIRST ONE:\n{}".format(calldata_fp))

***
### Make sure all audio files have corresponding labels, and vice-versa

We are going to do this by comparing the `CALLID` in the filename of the audios and labels (transcriptions).

For this, a new module has been created as `rennet/rennet/datasets/fisher.py` which will house all the necessary classes and functions to be used for working with `fisher/fe_03_p1` dataset.

In [None]:
# load the created datasets.fisher module with support for autoreload
# messing with complicated classes will require restarting this notebook's kernel ... but .. oh well
%aimport rennet.datasets.fisher
import rennet.datasets.fisher as fe

import warnings

In [None]:
# Compare the callids parsed from the audio filenames with those from the label filenames
audio_callids = [fe.callid_for_filename(audio_path) for audio_path in audio_fp]
label_callids = [fe.callid_for_filename(label_path) for label_path in label_fp]

n_audios, n_labels = len(audio_callids), len(label_callids)
if n_audios != n_labels:
    warnings.warn("\nMismatch in number of audios ({}) vs. labels ({})".format(n_audios, n_labels))

# NOTE: it is assumed that audio_fp and label_fp are sorted, and hence are their callids.
# zip stops at the shorter list, so when the lengths differ only the leading entries
# are compared: no warning here means every callid in the shorter list has a matching
# callid in the other one, at the same index.
if any(ac != lc for ac, lc in zip(audio_callids, label_callids)):
    warnings.warn("\nMismatch in callids for certain files")

# No UserWarning means all okay, hopefully

***
### How to read `.sph` audio files?

A method was implemented in `rennet.utils.pydub_utils.AudioIO` class to read `.sph` files.
It uses the `sph2pipe` tool.

We will choose one audio file and try reading it, then export it to `.wav`.

The file exported as-is will have the two speaker channels separated.
There is also code below on how to merge the two channels and export the mono-channel file.
Once an audio has been read into an `AudioIO` object, (hopefully) any operations applicable on `pydub.AudioSegment` objects are available. (Google `pydub` for more info).


We won't be converting all the files to `.wav` right now. 
That will be done while creating the training, validation and testing splits later.

In [None]:
%aimport rennet.utils.pydub_utils
import rennet.utils.pydub_utils as bu

%aimport rennet.utils.audio_utils
import rennet.utils.audio_utils as au

In [None]:
# test if we can read .sph files and convert them to .wav files
# (two files are written next to this notebook: `<name>.wav` and `<name>.mono.wav`)
audio_0 = audio_fp[0]

# Uncomment below and change after sph2pipe has been compiled. 
# Run as is to get instructions from the error on how.
#
# NOTE(review): `rennet_root` is not defined anywhere in this notebook; set it to the
# root of the rennet checkout before uncommenting the lines below.
#
# sph2pipe_path = os.path.join(rennet_root, 'rennet', 'utils', 'sph2pipe_v2.5', 'sph2pipe')
# print("Path to compiled sph2pipe binary to be used: {}".format(sph2pipe_path))

# audio_0_io = bu.AudioIO.from_file(audio_0, sph2pipe_path=sph2pipe_path)
audio_0_io = bu.AudioIO.from_file(audio_0)  # comment this out when you have sph2pipe. Use the line above.

# convert to .wav without any changes
to_fp = os.path.abspath(os.path.join(".", os.path.basename(audio_0) + ".wav"))
audio_0_io.export(out_f=to_fp, format='wav')
wav_meta = au.get_audio_metadata(to_fp)  # read the metadata back from the exported file
print("Exported wav file can be found at:\n{}".format(wav_meta.filepath))
print("\n", wav_meta, "\n")

# convert to mono-channel .wav
audio_0_io_mono = audio_0_io.set_channels(1)
to_fp = os.path.abspath(os.path.join(".", os.path.basename(audio_0) + ".mono.wav"))
audio_0_io_mono.export(out_f=to_fp, format='wav')
wav_meta = au.get_audio_metadata(to_fp)  # read the metadata back from the exported file
print("\n\n")
print("Exported mono wav file can be found at:\n{}".format(wav_meta.filepath))
print("\n", wav_meta, "\n")

---
### Reading the label (transcription) files

The class `rennet.datasets.fisher.Annotations` was implemented to read transcriptions into the `SequenceLabels` structure.

`Annotations.from_file(...)` was implemented using `csv` module to instantiate the class.

**NOTE:** It is assumed that there are only 2 speakers in every file, one per channel, with `A` representing the first speaker, and the `B` for the second speaker, in order.

In [None]:
# What does a transcription file look like?
label_0 = label_fp[0]

with open(label_0, 'r') as f:
    # iterate over the file lazily instead of loading every line with readlines()
    for line in f:
        # each line still carries its own newline; suppress print's extra one so
        # the output mirrors the file instead of being double-spaced
        print(line, end='')

In [None]:
# Create instance of Annotations for the file
# (parses the first transcription file, `label_0`, without any calldata)
ann = fe.Annotations.from_file(label_0)

print(ann)

In [None]:
# Smoke test: parse every transcription file; any unreadable file raises here
for label_path in label_fp:
    fe.Annotations.from_file(label_path)

***
### Reading extra call information from the calldata file

The transcription file doesn't have more information about the speakers or the conversation.
This potentially useful information is available in the calldata file found earlier.

`rennet.datasets.fisher.AllCallData` was implemented as a slots-only class that parses **all** the calldata from the given path to the calldata table file found earlier.

It has more convenience methods; an interesting one is getting the calldata for a `CALLID`, or even a filename, by using the `__getitem__` operator.

Nevertheless, the `Annotations.from_file(...)` method now also accepts an `AllCallData` object (or the filepath to it), and automatically parses it and makes a copy of the relevant `CallData` from it.
`CallData.channelspeakers` is a list of `Speaker` objects, with 2 speakers, for channel `A` and channel `B`, in order.

For more information about what the parsed `CallData` means, refer to `doc_calldata_tbl.txt`.

In [None]:
# Parse the calldata table found earlier
allcalldata = fe.AllCallData.from_file(calldata_fp)
print(allcalldata)
print()
# Look up the calldata of a single call through indexing by its callid
callid_0 = label_callids[0]
print(callid_0)
print(allcalldata[callid_0])

In [None]:
# read Annotations with calldata
# (same file as before, now passing the parsed calldata along)
ann = fe.Annotations.from_file(label_0, allcalldata=allcalldata)
print()
print(ann)

In [None]:
# the calldata attached to the annotations read with `allcalldata` above
print(ann.calldata)

---
### Inferring overlapping speech instances from the transcriptions

It was seen that the transcriptions only consist of annotations for which speaker-channel is active during which intervals, and what is being spoken. 
Silences (or non-speech) events are implicitly annotated for by there being no entry for any speaker within that duration. 

This has been more or less faithfully parsed into the `Annotations` object above. `ann.start_ends` has the start- and end-time-stamps for a line in the file, and `ann.labels` has the corresponding parsed transcription.

`rennet.datasets.fisher.ActiveSpeakers` was implemented to parse from the `Annotations` object which speaker is active at _**all**_ the intervals between the first and the last entry (when sorted based on start and end time-stamps).

This was done by _**assuming**_ that an absent `Transcription` (because of an absent entry for a speaker) indicates silence. 
This is a dangerous assumption since there can be other non-speech or even speech events missing from the transcription.
However, nothing can be done for missing entries for durations between the first and last entries, and both speakers are assumed inactive (represented by `[0 0]` entry in the labels below).
It is, nevertheless, unsafe to assume this beyond the first and last entries, so that is not done in `ActiveSpeakers`, and it should be kept in mind later when extracting labels for feature vectors at time-stamps outside this range.

`ActiveSpeakers`'s parent class's `labels_at` method can be used to find which speaker is active at a given time-stamp. And from that, it can be determined whether the label for that time-stamp would be non-speech, single-speech, or overlap.

In [None]:
# extract ActiveSpeakers from Annotations
act = fe.ActiveSpeakers.from_annotations(ann)
print(act)

# How to read the printed labels (one column per channel, A then B):
# [1, 0] : only speaker in channel A active (single-speech)
# [0, 1] : only speaker in channel B active (single-speech)
# [1, 1] : both speakers active (overlap)
# [0, 0] : no speaker active (silence, or non-speech)

In [None]:
# ActiveSpeakers directly from filepath
# (skips the explicit intermediate `Annotations` object used in the previous cell)
act = fe.ActiveSpeakers.from_file(label_0, allcalldata=allcalldata)
print(act)

In [None]:
# The two values below bound the range covered by the parsed entries.

# first entry's start time
print(act.min_start)

# last entry's end time
print(act.max_end)

In [None]:
# Getting active-speakers for time-stamps

# NOTE(review): the unit comments below look inconsistent. If `samplerate` means
# "units per second" (as `ann.samplerate` is used elsewhere in this notebook), then
# a samplerate of 100 makes these time-stamps 1/100ths of a second, not
# milliseconds -- confirm against `labels_at`.
ts = [0, 375, 376, 1350, 1359, 1360, 1390, 1396, 1397, 1400, 50734, 50735]  # milliseconds
ts_samplerate = 100  # milliseconds

ts_actspk = act.labels_at(ends=ts, samplerate=ts_samplerate, 
                          default_label='zeros')  # assume no speaker active at ts outside (default behavior)

# one (time-stamp, active-speakers label) pair per line
print("\n".join(str(z) for z in zip(ts, ts_actspk)))

In [None]:
# raise error (not the default) if query ts are outside the first and last annotations
#
# The error is the point of this demonstration, so it is caught and displayed
# rather than left to abort a "Restart & Run All" of the notebook.
try:
    ts_actspk = act.labels_at(ends=ts, samplerate=ts_samplerate, 
                              default_label='raise')  # raise error if any of ts is beyond first and last annotation
except Exception as err:  # exception type is chosen by rennet; caught broadly on purpose
    print("labels_at raised {}: {}".format(type(err).__name__, err))
else:
    print("No error raised: all query time-stamps lie within the annotated range")

In [None]:
# getting labels at time-stamps
ts_actspk = act.labels_at(ends=ts, samplerate=ts_samplerate)  # assume no speaker active at ts outside (default behavior)

# summing over the speaker axis gives the number of simultaneously active speakers
ts_labels = ts_actspk.sum(axis=1)  # labels_at(...) returns a numpy array when possible, esp. for default behavior

# `ndarray.clip` returns a NEW array and leaves the original untouched, so the
# result must be assigned back (the bare call was a silent no-op).
# Not strictly needed here since there are only two speakers, but it keeps the
# lookup below safe if there were ever more.
ts_labels = ts_labels.clip(0, 2)

labels_dict = {
    0: 'non-speech',
    1: 'single-speech',
    2: 'overlap',
}


print("\n".join(str((t, labels_dict[l])) for t, l in zip(ts, ts_labels)))

In [None]:
# Smoke test: build ActiveSpeakers for every transcription file; any failure raises here
for label_path in label_fp:
    # duplicate-entry warnings are silenced: nothing can be done about them here
    fe.ActiveSpeakers.from_file(label_path, warn_duplicates=False)

## Analysis `[SKIPPED]`

> Here, various analyses would have been performed on the acquired, and now parsed, data.
> For example, what is the typical duration of overlaps, etc.
> These analyses influenced some decisions in the next steps below and in the subsequent notebooks.
>
> This section has been skipped from this notebook.
> 
> Please refer to the original notebooks listed near the top of this notebook for inspiration.

---
## Assigning callids to training, validation and testing splits

All audio and label files have been grouped based on the first 3 digits of their respective `CALLID`.
These 3 digits will be called the `GROUPID` from here on forwards.
A convenience function to do this was implemented in `rennet.datasets.fisher`

> **NOTE:**
> 
> The data-analysis that led to the following decisions has been skipped from this notebook.
> As noted previously for concrete analyses, please refer to the original notebooks.
> The decisions, however, have been documented and implemented below.

The idea now is that, there is so much data, that making splits based just on groups should be enough.

- The variances across the groups are expected to be insignificant, wrt double-talk and its contributing factors
    + even 1% of the data is in actuality 9 hours long, roughly 10x what ka3 has.
- It is very easy to assign groups, or even entire discs to a particular split.
    + ***IDEALLY*** this whole thing should be randomized
- Cross-validation on this dataset, even if arguably good, is definitely impractical
    + a 90-10 train-test split seems more than enough, given 10% will be roughly 90 hours
    + Again, **Variances should be tested for** beforehand




In [None]:
def groupid_for_filename(fn):
    """Return the GROUPID for a file, derived from the CALLID parsed out of its filename."""
    return fe.groupid_for_callid(fe.callid_for_filename(fn))

from itertools import groupby

In [None]:
# pair the audio and label filepaths and group them by groupid
def _groupid_for_pair(audio_label):
    """Return the groupid of an (audio path, label path) pair.

    `zip_longest` pads the shorter list with `None`, so the label path may be
    missing; in that case the groupid is taken from the audio path instead of
    crashing while parsing `None`.
    """
    audio_path, label_path = audio_label
    return groupid_for_filename(label_path if label_path is not None else audio_path)


# NOTE: groupby only merges *consecutive* items with equal keys, which is fine
# because both path lists are sorted.
grouped_pair_fps = [(groupid, tuple(pairs))
                    for groupid, pairs in groupby(zip_longest(audio_fp, label_fp),
                                                  _groupid_for_pair)]

print("Number of Groups: {}\n".format(len(grouped_pair_fps)))
print("List of groupids:\n{}\n".format([g for g, _ in grouped_pair_fps]))
print("Number of files in each groupid:\n{}\n".format([len(p) for _, p in grouped_pair_fps]))
print("For Example: ")
print(grouped_pair_fps[0][0])
print("\n".join(str(pair) for pair in grouped_pair_fps[0][1][:3]), "\n...")

In [None]:
# find the per-file durations for these pairs, based on labels
def duration_from_labelfp(fp):
    """Return the duration, in seconds, spanned by the annotations in label file `fp`.

    The span runs from the start of the first entry to the end of the last entry.
    """
    annotations = fe.Annotations.from_file(fp)
    # the span is expressed in units of (1 / samplerate) seconds
    span = annotations.max_end - annotations.min_start
    return span / annotations.samplerate

import numpy as np

# one duration (in seconds) per label file, in the same order as `label_fp`
perfile_durs_from_labels = np.array([duration_from_labelfp(label_path) for label_path in label_fp])


def _groupid_of_label_dur(label_dur):
    """Return the groupid for a (label path, duration) pair."""
    return groupid_for_filename(label_dur[0])


# per group: (groupid, array with the durations of the files in that group)
grouped_durs_from_labels = [
    (groupid, np.array([dur for _, dur in members]))
    for groupid, members in groupby(zip(label_fp, perfile_durs_from_labels),
                                    _groupid_of_label_dur)
]

# NOTE: doing based on audios is possible, but not done here. 
# There will be disparities, but they were found to be negligible.

In [None]:
def split_seconds(seconds):
    """Break a duration in seconds into an `(hours, minutes, seconds)` triple.

    Hours and minutes are returned as ints; the remaining seconds are rounded
    to 2 decimal places.
    """
    total_minutes, secs = divmod(seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return int(hours), int(minutes), round(secs, 2)


def print_splitsec(duration, name):
    """Print `duration` (seconds) labelled with `name`, both raw and as h : m : s."""
    hms = "{:4} : {:2} : {:5.2f}".format(*split_seconds(duration))
    print("{} : {:10.2f} seconds = {}".format(name, duration, hms))

In [None]:
print("durations statistics for files")

# (label, reducer) table; labels are padded so that the printed columns line up
for stat_label, stat_fn in (
        ("Minimum ", np.min),
        ("Maximum ", np.max),
        ("Mean    ", np.mean),
        ("Std     ", np.std),
        ("Total   ", np.sum),
):
    print_splitsec(stat_fn(perfile_durs_from_labels), stat_label)

In [None]:
print("duration statistics for groups")

# total duration of each group, in the order of `grouped_durs_from_labels`
group_dur_from_labels = np.array([durs.sum() for _, durs in grouped_durs_from_labels])

# (label, reducer) table; labels are padded so that the printed columns line up
for stat_label, stat_fn in (
        ("Minimum ", np.min),
        ("Maximum ", np.max),
        ("Mean    ", np.mean),
        ("Std     ", np.std),
        ("Total   ", np.sum),
):
    print_splitsec(stat_fn(group_dur_from_labels), stat_label)

### Splitting decisions

- Assigning the group `000` exclusively for validation.
    + affords ~17 hours of data, ~1.7% of non-test data.
    + It is still a lot of data to run validation after every-epoch.
    + Therefore, some specific `callids` have been chosen, and are available at `rennet.datasets.fisher.chosen_val_callids` based on analysis on gender ratios, double-talk ratios, etc.
        * We will still extract features for all validation calls, but will run our validations only on these calls during training.
        * The entire validation set will be available for any post-training validation, or something.
- Assigning all groups from `001` to `052` (inclusive) for training.
    + This is a giant dataset.
    + Definitely not all will be used for training, because of hard-disk limitation as is.
        * It will even be possible to have completely disjoint training sub-splits!!!
        * All while maintaining disjoint testing and validation splits.
    + And, also, all of it will probably be unnecessary for training on only 3 classes.
    + Assigning it now anyway, even though not even features will be extracted for all of them.
        * Mainly so that there is a hard separation between all the splits.
        * No calls from one split will be used for any other purpose.
    + **The final training dataset will be formed using groups `001` to `012`** (1200 files)
        * Based on ... how much the hard-disk could handle.
        * It will still be ~189 hours of data.
- Assigning all groups from `053` onwards for testing.
    + All files from this assignment will be used during evaluation, and, hence, also feature extraction

In [None]:
# the callids hand-picked for the validations that are run during training
print(fe.chosen_val_callids)

In [None]:
# assign grouped pairs to splits
# NOTE(review): the slicing below is by list *index*; it matches the groupids named in
# the comments only if the groups are contiguous and start at `000` -- confirm against
# the "List of groupids" printed earlier.
val_groups = [grouped_pair_fps[0]]  # group `000`

tis = 53  # index of the first testing group
trn_groups = grouped_pair_fps[1:tis]  # groups `001` to `052` (inclusive)

tst_groups = grouped_pair_fps[tis:]  # groups `053` to `058` (inclusive)

# each group is a hashable (groupid, tuple-of-pairs) tuple, so set operations work
assert set(val_groups).isdisjoint(trn_groups)
assert set(val_groups).isdisjoint(tst_groups)
assert set(tst_groups).isdisjoint(trn_groups)

# No Assertion errors means that all splits are disjoint

In [None]:
print("Number of Groups and Files per split:\n")

trn_val_groups = trn_groups + val_groups


def _n_files(groups):
    """Total number of (audio, label) pairs over all the given groups."""
    return sum(len(pairs) for _, pairs in groups)


# one summary line per split: number of groups and total number of files
for line_fmt, groups in (
        ("    VAL with {:2} groups of total {:4} files", val_groups),
        ("    TRN with {:2} groups of total {:4} files", trn_groups),
        ("    TST with {:2} groups of total {:4} files", tst_groups),
        ("TRN+VAL with {:2} groups of total {:4} files", trn_val_groups),
):
    print(line_fmt.format(len(groups), _n_files(groups)))

In [None]:
# total duration (seconds) of each group, keyed by groupid
grouped_durs_dict = {g: d.sum() for g, d in grouped_durs_from_labels}

# total duration of each split: sum over the groups assigned to it
val_group_dur = sum(grouped_durs_dict[g] for g, _ in val_groups)
trn_group_dur = sum(grouped_durs_dict[g] for g, _ in trn_groups)
tst_group_dur = sum(grouped_durs_dict[g] for g, _ in tst_groups)
trn_val_group_dur = val_group_dur + trn_group_dur
tot_dur = trn_val_group_dur + tst_group_dur

# each line: share of the total duration, then the duration itself
print("Precentages and Durations of Splits")
print_splitsec(val_group_dur,     "    VAL  == {:6.2f}%".format(100 *     val_group_dur/tot_dur))
print_splitsec(trn_group_dur,     "    TRN  == {:6.2f}%".format(100 *     trn_group_dur/tot_dur))
print_splitsec(tst_group_dur,     "    TST  == {:6.2f}%".format(100 *     tst_group_dur/tot_dur))
print_splitsec(trn_val_group_dur, "TRN+VAL  == {:6.2f}%".format(100 * trn_val_group_dur/tot_dur))
print_splitsec(tot_dur,           "  TOTAL  == {:6.2f}%".format(100 *           tot_dur/tot_dur))

---
## Exporting splits to `wav-8k-mono` standard format



### The folder structure expected for exports

```
$RENNET_DATA_ROOT/working/<provider>/<dataset>/<export_name>/
    - test/
        - audio/
            - data/
                - <group-number>/
                    - <call>.wav
                    - ...
        - labels/
            - data/
                - <group-number>/
                    - <call>.txt
                    - ...
            - <calldata>.tbl
    - train/
        - < SAME AS TEST >
    - val/
        - < SAME AS TEST >
    - pickles/
        - <date-stamped-features_1-info>/
            - trn.h5
            - tst.h5
            - val.h5
        - <date-stamped-features_2-info>/
            - trn.h5
            - tst.h5
            - val.h5
        - ...
```

> **NOTE:**
>
> The directories inside `pickles` above will be created in the feature-extraction notebook; shown here for reference only.

In [None]:
# UPDATE HERE - BEGIN #######################################################

# name of the export, and of the per-split directories created inside it
export_name = 'wav-8k-mono'
val_dirname = 'val'
trn_dirname = 'train'
tst_dirname = 'test'

# properties of the exported audio files
out_format = 'wav'
out_samplerate = 8000
out_channels = 1
out_channels_split = False  # passed to the export function below as its "is split" flag

# UPDATE HERE - END #########################################################
rennet_working_dir = rennet_workingdir  # alias of the working directory defined earlier

export_name_dir = os.path.join(rennet_working_dir, provider, dataset, export_name)
val_dir = os.path.join(export_name_dir, val_dirname)
trn_dir = os.path.join(export_name_dir, trn_dirname)
tst_dir = os.path.join(export_name_dir, tst_dirname)

val_labeldata_dir = os.path.join(val_dir, 'labels', 'data')
trn_labeldata_dir = os.path.join(trn_dir, 'labels', 'data')
tst_labeldata_dir = os.path.join(tst_dir, 'labels', 'data')

# NOTE(review): the directory created here is named 'audios', while the folder
# structure documented in this notebook lists `audio/` -- confirm which one the
# downstream notebooks expect.
val_audiodata_dir = os.path.join(val_dir, 'audios', 'data')
trn_audiodata_dir = os.path.join(trn_dir, 'audios', 'data')
tst_audiodata_dir = os.path.join(tst_dir, 'audios', 'data')

pickles_dir = os.path.join(export_name_dir, 'pickles')

print("Directories that will be created, upto group-name (exclusive)\n")
print("VAL:", val_dir, val_labeldata_dir, val_audiodata_dir, sep='\n')
print()
print("TRN:", trn_dir, trn_labeldata_dir, trn_audiodata_dir, sep='\n')
print()
print("TST:", tst_dir, tst_labeldata_dir, tst_audiodata_dir, sep='\n')
print()
print("PICKLES:", pickles_dir)

In [None]:
# make the data directories for the splits
from rennet.utils.py_utils import makedirs_with_existok

# these directories shouldn't exist right now

# ##### DO NOT RERUN! without changing ddeo to True ######
ddeo = False

# created in this order: labels then audios for val, trn and tst; pickles last
for new_dir in (
        val_labeldata_dir, val_audiodata_dir,
        trn_labeldata_dir, trn_audiodata_dir,
        tst_labeldata_dir, tst_audiodata_dir,
        pickles_dir,
):
    makedirs_with_existok(new_dir, exist_ok=ddeo)

In [None]:
# group-name dir paths
# a split is a list of (groupid, pairs) tuples; keep only the groupids
groups_for_split = lambda split: [g for g, _ in split]
val_groupdirnames = groups_for_split(val_groups)
trn_groupdirnames = groups_for_split(trn_groups)
tst_groupdirnames = groups_for_split(tst_groups)

# one audio directory and one label directory per group, for every split
val_audiogroup_dirs = [os.path.join(val_audiodata_dir, g) for g in val_groupdirnames]
val_labelgroup_dirs = [os.path.join(val_labeldata_dir, g) for g in val_groupdirnames]
trn_audiogroup_dirs = [os.path.join(trn_audiodata_dir, g) for g in trn_groupdirnames]
trn_labelgroup_dirs = [os.path.join(trn_labeldata_dir, g) for g in trn_groupdirnames]
tst_audiogroup_dirs = [os.path.join(tst_audiodata_dir, g) for g in tst_groupdirnames]
tst_labelgroup_dirs = [os.path.join(tst_labeldata_dir, g) for g in tst_groupdirnames]

# show the audio group directories, then the label group directories, of each split
print("VAL:", "\n".join(val_audiogroup_dirs), '', "\n".join(val_labelgroup_dirs))
print()
print("TRN:", "\n".join(trn_audiogroup_dirs), '', "\n".join(trn_labelgroup_dirs))
print()
print("TST:", "\n".join(tst_audiogroup_dirs), '', "\n".join(tst_labelgroup_dirs))
print()

In [None]:
# Make group-name directories for all splits (val first, then trn, then tst)
exist_ok = False

for audiogroup_dirs, labelgroup_dirs in (
        (val_audiogroup_dirs, val_labelgroup_dirs),
        (trn_audiogroup_dirs, trn_labelgroup_dirs),
        (tst_audiogroup_dirs, tst_labelgroup_dirs),
):
    # within a split, the audio and the label directory of a group are created together
    for agd, lgd in zip(audiogroup_dirs, labelgroup_dirs):
        makedirs_with_existok(agd, exist_ok=exist_ok)
        makedirs_with_existok(lgd, exist_ok=exist_ok)

In [None]:
# path to which calldata tbl for each split's label dir will be copied to
# it is the same file, copying for consistency
# (the parent of each `<split>/labels/data` directory, i.e. `<split>/labels`)
val_labeldir = os.path.dirname(val_labeldata_dir)
trn_labeldir = os.path.dirname(trn_labeldata_dir)
tst_labeldir = os.path.dirname(tst_labeldata_dir)

print("labels dirs where calldata will be copied to:", val_labeldir, trn_labeldir, tst_labeldir, sep='\n')

In [None]:
# copy the calldata tbl file
import shutil as sh

# the same calldata table goes into the labels directory of every split;
# the value returned by each copy is printed
for split_labeldir in (val_labeldir, trn_labeldir, tst_labeldir):
    print(sh.copy(calldata_fp, split_labeldir))

### Copy all label files to appropriate directories

In [None]:
def copy_labelfile_split(labelfile_splitlabeldatadir):
    """Copy one label file into its group directory inside a split's label-data dir.

    Takes a single `(labelfile, split_labeldatadir)` tuple, so that it can be
    mapped over by a process pool. Returns the basename of the copied file, or
    an empty string when `labelfile` is `None` (nothing is copied then).
    """
    src, labeldata_root = labelfile_splitlabeldatadir

    if src is not None:
        # the destination is the sub-directory named after the file's groupid
        dest_dir = os.path.join(labeldata_root, groupid_for_filename(src))
        sh.copy(src, dest_dir)
        return os.path.basename(src)

    return ""

In [None]:
import multiprocessing as mp
from tqdm import tqdm_notebook as tqdm

In [None]:
p = mp.Pool(mp.cpu_count())

# A list (rather than a bare zip iterator) has a length, which lets the outer
# progress bar know its total.
splits = list(zip([val_groups, trn_groups, tst_groups],
                  [val_labeldata_dir, trn_labeldata_dir, tst_labeldata_dir],
                  ['val', 'trn', 'tst']))
splits_pbar = tqdm(splits)

for split, splitlabeldatadir, name in splits_pbar:
    # label the outer bar BEFORE the work starts, so it names the split in progress
    splits_pbar.set_description(name)

    # one (label filepath, destination label-data dir) job per file of the split;
    # a missing label (None) is handled by copy_labelfile_split itself
    params = []
    for _, pair in split:
        for _, lfp in pair:
            params.append((lfp, splitlabeldatadir))
            
    files_pbar = tqdm(total=len(params))
    for fn in p.imap_unordered(copy_labelfile_split, params):
        files_pbar.update()
        files_pbar.set_description(fn)
    # close the per-split bar, as the audio export loop does, instead of leaving
    # the bars of all but the last split open
    files_pbar.close()

In [None]:
# Shut the worker pool down properly before dropping the reference: merely
# rebinding `p` leaves the worker processes to be reaped by garbage collection.
if p is not None:  # makes this cell safe to re-run
    p.close()  # no more jobs will be submitted
    p.join()   # wait for the worker processes to exit
p = None
files_pbar.close()
splits_pbar.close()

### Export all audio files to the appropriate format, to the appropriate directories

In [None]:
def export_audiofile_split(audiofile_splitaudiodatadir_tofmt_tosr_tonchannels_issplit):
    """Export one audio file into its group directory inside a split's audio-data dir.

    Takes a single tuple, so that it can be mapped over by a process pool:
    `(audiofile, splitaudiodatadir, tofmt, tosr, tonchannels, issplit)`.
    Returns the basename of the first exported file, or an empty string when
    `audiofile` is `None` (nothing is exported then).
    """
    args = audiofile_splitaudiodatadir_tofmt_tosr_tonchannels_issplit

    src, audiodata_root = args[:2]
    if src is None:
        return ""

    out_fmt, out_sr, out_nchannels, split_channels = args[2:]

    # the group name is the name of the directory that holds the source file
    dest_dir = os.path.join(audiodata_root, os.path.basename(os.path.dirname(src)))

    if not split_channels:
        written = bu.convert_to_standard(src, dest_dir, out_fmt, out_sr, out_nchannels)
    else:
        written = bu.convert_to_standard_split(src, dest_dir, out_fmt, out_sr)

    return os.path.basename(written[0])

In [None]:
p = mp.Pool(mp.cpu_count())

# A list (rather than a bare zip iterator) has a length, which lets the outer
# progress bar know its total.
splits = list(zip([val_groups, trn_groups, tst_groups],
                  [val_audiodata_dir, trn_audiodata_dir, tst_audiodata_dir],
                  ['val', 'trn', 'tst']))
splits_pbar = tqdm(splits)

for split, splitaudiodatadir, name in splits_pbar:
    # label the outer bar BEFORE the (long) export starts, so it names the split
    # in progress rather than the one that has just finished
    splits_pbar.set_description(name)

    # one job per available audio file of the split, carrying the export settings
    params = []
    for _, pairs in split:
        for afp, _ in pairs:
            if afp is None:
                continue
            params.append((afp, splitaudiodatadir, 
                          out_format, out_samplerate, 
                          out_channels, out_channels_split))
            
    files_pbar = tqdm(total=len(params))
    for fn in p.imap_unordered(export_audiofile_split, params):
        files_pbar.update()
        files_pbar.set_description(fn)
    files_pbar.close()
    


In [None]:
# Shut the worker pool down properly before dropping the reference: merely
# rebinding `p` leaves the worker processes to be reaped by garbage collection.
if p is not None:  # makes this cell safe to re-run
    p.close()  # no more jobs will be submitted
    p.join()   # wait for the worker processes to exit
p = None
splits_pbar.close()
files_pbar.close()

### Check all files have been moved to the appropriate paths `[SKIPPED]`

> **NOTE:**
>
> Sorry, you'll have to check the files manually.
>
> They should be fine, but some sanity check would be much better.
>
> Scripted checks are performed at the end of the notebook:
> 
> `notebooks/dtfinale/2017-03-31-fisher-fe_03_p1-export-wav8kmono.ipynb`
>
> You can adapt that code, if you smell something fishy.