# Automated Post-Extraction Pipeline

This notebook summarizes how the automated post-extraction pipeline on the DigitalOcean cloud works. The scripts can only be run after the feature extraction process has finished, i.e. once the `npz` and `txt` files have been produced.

## Libraries Import

In [6]:
import glob
import os
import shutil

import numpy as np
import pandas as pd

## Post-Extraction Pipeline

### Eligibility Filter

The first step is to select the `npz` and `txt` files that meet the criteria for training: only extracted files whose `transcription_length` and `frame_length` fall within the defined bounds are kept. Broken `npz` files are also filtered out during this process.

In [26]:
def _summarize_lengths(lengths):
    """Return ``(mean, median, min, max)`` of *lengths*; all NaN when it is empty."""
    if not lengths:
        # np.mean([]) only warns, but min([]) raises ValueError, so short-circuit.
        nan = float("nan")
        return nan, nan, nan, nan
    return np.mean(lengths), np.median(lengths), min(lengths), max(lengths)


def filter_eligibility(origin_dir, destination_dir="eligible_files", lower_fl_bound=0, upper_fl_bound=99999, lower_tl_bound=0, upper_tl_bound=99999, print_metadata=False, write_output=True, output_filename="eligibility_list.csv", copy=True):
    """Copy or move the eligible ``npz``/``txt`` pairs of ``origin_dir`` into a sub-directory.

    A pair ``<name>.npz`` / ``<name>.txt`` is eligible when

    * the ``npz`` file can be loaded and the first dimension of its ``arr_0``
      array (the feature length) satisfies
      ``lower_fl_bound <= feature_length <= upper_fl_bound`` (inclusive), and
    * the ``txt`` file exists and its total number of characters, newlines
      included (the transcription length), satisfies
      ``lower_tl_bound < transcription_length < upper_tl_bound`` (exclusive,
      so the default lower bound of 0 rejects empty transcriptions).

    Files that cannot be read or transferred are skipped and counted as errors
    instead of aborting the run.

    Args:
        origin_dir: Directory that contains the extracted ``npz`` and ``txt`` files.
        destination_dir: Name of the directory, created inside ``origin_dir``,
            that receives the eligible files.
        lower_fl_bound: Inclusive lower bound of the feature length.
        upper_fl_bound: Inclusive upper bound of the feature length.
        lower_tl_bound: Exclusive lower bound of the transcription length.
        upper_tl_bound: Exclusive upper bound of the transcription length.
        print_metadata: Print the summary statistics (and the error count) when true.
        write_output: Write the summary statistics followed by the per-file
            lengths (CSV) to ``output_filename`` when true.
        output_filename: Name of the report file, created inside ``origin_dir``.
        copy: Copy the eligible files when true, move them otherwise.

    Returns:
        None. The statistics are NaN when no file is eligible.
    """

    destination_dir = f"{origin_dir}/{destination_dir}"
    os.makedirs(destination_dir, exist_ok=True)

    transfer = shutil.copy if copy else shutil.move

    # Number of transferred pairs and number of skipped (unreadable) files
    count = 0
    error_count = 0

    # Lengths of the transferred pairs, in transfer order
    transcription_lengths = []
    feature_lengths = []

    for npz in glob.glob(f"{origin_dir}/*.npz"):
        # A broken archive can fail in many ways (bad zip, pickled payload,
        # missing "arr_0", 0-d array, ...), hence the broad but non-bare
        # handler. The context manager closes the archive handle right away.
        try:
            with np.load(npz) as archive:
                feature_length = archive["arr_0"].shape[0]
        except Exception:
            error_count += 1
            continue

        if not lower_fl_bound <= feature_length <= upper_fl_bound:
            continue

        txt_conversion = f"{npz[:-4]}.txt"
        if not os.path.exists(txt_conversion):
            continue

        try:
            with open(txt_conversion, "r") as f:
                transcription_length = sum(len(line) for line in f)
        except (OSError, UnicodeDecodeError):
            error_count += 1
            continue

        if not lower_tl_bound < transcription_length < upper_tl_bound:
            continue

        # Keep the run best-effort: a failed transfer is counted, not fatal.
        try:
            transfer(npz, destination_dir)
            transfer(txt_conversion, destination_dir)
        except OSError:
            error_count += 1
            continue

        # Only count pairs that were actually transferred so that ``count``
        # always matches the number of rows in the report.
        count += 1
        transcription_lengths.append(transcription_length)
        feature_lengths.append(feature_length)

    (
        mean_transcription_length,
        median_transcription_length,
        min_transcription_length,
        max_transcription_length,
    ) = _summarize_lengths(transcription_lengths)

    (
        mean_feature_length,
        median_feature_length,
        min_feature_length,
        max_feature_length,
    ) = _summarize_lengths(feature_lengths)

    if write_output:
        df = pd.DataFrame({
            "transcription_length": transcription_lengths,
            "feature_length": feature_lengths
        })

        output_filename = f"{origin_dir}/{output_filename}"
        with open(output_filename, "w") as f:
            f.write("ELIGIBILITY_LIST\n")
            f.write("----------------\n")
            f.write(f"COUNT: {count}\n")
            f.write(f"MEAN TX LENGTH: {mean_transcription_length}\n")
            f.write(f"MEDIAN TX LENGTH: {median_transcription_length}\n")
            f.write(f"MIN TX LENGTH: {min_transcription_length}\n")
            f.write(f"MAX TX LENGTH: {max_transcription_length}\n\n")

            f.write(f"MEAN FEATURE LENGTH: {mean_feature_length}\n")
            f.write(f"MEDIAN FEATURE LENGTH: {median_feature_length}\n")
            f.write(f"MIN FEATURE LENGTH: {min_feature_length}\n")
            f.write(f"MAX FEATURE LENGTH: {max_feature_length}\n")
            f.write("----------------\n\n")

            df.to_csv(f, index=False)

    if print_metadata:
        print(f"COUNT: {count}")
        print(f"MEAN TX LENGTH: {mean_transcription_length}")
        print(f"MEDIAN TX LENGTH: {median_transcription_length}")
        print(f"MIN TX LENGTH: {min_transcription_length}")
        print(f"MAX TX LENGTH: {max_transcription_length}")
        print("---------------------------------------")
        print(f"MEAN FEATURE LENGTH: {mean_feature_length}")
        print(f"MEDIAN FEATURE LENGTH: {median_feature_length}")
        print(f"MIN FEATURE LENGTH: {min_feature_length}")
        print(f"MAX FEATURE LENGTH: {max_feature_length}")
        # Surface the files that were skipped because they could not be processed.
        print(f"ERROR COUNT: {error_count}")

### Train - Validation - Test Split

The next step in the post-extraction pipeline is to split the eligible files into train, validation and test. This is done by moving the files to their corresponding directories (`train`, `test`, `val`) so they can easily be loaded to the model.

In [69]:
def split_train_val_test(origin_dir, destination_dir=".", train_ratio=0.6, val_ratio=0.2, test_ratio=0.2):
    """Copy the ``npz``/``txt`` pairs of ``origin_dir`` into ``train``, ``val`` and ``test``.

    The ``npz`` files are taken in the order returned by ``glob`` (no
    shuffling is performed): the first ``round(train_ratio * n)`` files go to
    ``train``, the next ``round(val_ratio * n)`` files go to ``val`` and the
    remaining files go to ``test``. Each ``npz`` file is copied together with
    the ``txt`` file that shares its name.

    Args:
        origin_dir: Directory that contains the ``npz`` and ``txt`` files.
        destination_dir: Directory in which the ``train``, ``val`` and ``test``
            directories are created.
        train_ratio: Fraction of the files copied to ``train``.
        val_ratio: Fraction of the files copied to ``val``.
        test_ratio: Fraction of the files copied to ``test``. It is only used
            to validate the ratios; ``test`` receives whatever is left after
            the ``train`` and ``val`` files have been assigned.

    Raises:
        ValueError: If the three ratios do not sum to 1.
        FileNotFoundError: If an ``npz`` file has no matching ``txt`` file.
            This is checked before anything is copied.
    """

    # Compare with a tolerance: e.g. 0.7 + 0.2 + 0.1 is not exactly 1.0.
    if abs((train_ratio + val_ratio + test_ratio) - 1) > 1e-9:
        raise ValueError("The sum of the ratios didn't add up to 1.")

    # Create the split directories where the files are copied to. Copying to
    # a directory path that does not exist would create one regular file.
    directory_names = ["train", "val", "test"]
    split_dirs = {name: f"{destination_dir}/{name}" for name in directory_names}
    for split_dir in split_dirs.values():
        os.makedirs(split_dir, exist_ok=True)

    npzs = glob.glob(f"{origin_dir}/*.npz")

    # Fail before copying anything so that no half-populated split is left behind.
    missing_txts = [txt for txt in (f"{npz[:-4]}.txt" for npz in npzs) if not os.path.exists(txt)]
    if missing_txts:
        raise FileNotFoundError(
            f"{len(missing_txts)} npz file(s) have no matching txt file, e.g. {missing_txts[0]}"
        )

    total_file_count = len(npzs)
    train_size = round(train_ratio * total_file_count)
    val_size = round(val_ratio * total_file_count)
    val_end = train_size + val_size

    # Slice with positive indices: a negative index built from a test size of
    # 0 (``npzs[-0:]``) would select every file instead of none.
    split_files = {
        "train": npzs[:train_size],
        "val": npzs[train_size:val_end],
        "test": npzs[val_end:],
    }

    for name, files in split_files.items():
        for npz in files:
            txt_conversion = f"{npz[:-4]}.txt"
            shutil.copy(npz, split_dirs[name])
            shutil.copy(txt_conversion, split_dirs[name])

### Encoding

The third step is to encode the filenames of the `npz` and `txt` files as numbers, since the `DataGenerator` can only use those numbers as input. This lowers memory consumption, because the `DataGenerator` does not have to store long `string` names.

In [75]:
def encode(origin_dir, output_filename="encoding.json"):
    """Rename every ``npz``/``txt`` pair in ``origin_dir`` to a number and save the mapping.

    The ``npz`` files are processed in sorted order; the pair at position
    ``i`` is renamed to ``<i>.npz`` / ``<i>.txt`` inside ``origin_dir``. The
    mapping from ``i`` to the original path without its ``.npz`` extension is
    written as JSON.

    Args:
        origin_dir: Directory that contains the ``npz`` and ``txt`` files.
        output_filename: Path of the JSON mapping file. When left at the
            default ``"encoding.json"``, the mapping is written to
            ``<origin_dir>/<YYYYMMDD>_audio_encoding.json`` instead, using
            today's date.

    Raises:
        FileNotFoundError: If an ``npz`` file has no matching ``txt`` file.
        FileExistsError: If a numbered target name is already taken by
            another file. Nothing is renamed in either error case.
    """
    # Imported locally so the function does not depend on imports made elsewhere.
    import json
    from datetime import datetime

    # glob returns files in arbitrary order; sorting makes the ids reproducible.
    npzs = sorted(glob.glob(f"{origin_dir}/*.npz"))

    # Swap only the extension: str.replace("npz", "txt") would also rewrite
    # any other "npz" occurring in the path (e.g. in a directory name).
    pairs = []
    for npz in npzs:
        stem = os.path.splitext(npz)[0]
        pairs.append((stem, npz, f"{stem}.txt"))

    # Every npz file needs its transcription before anything is renamed.
    missing_txts = [txt for _, _, txt in pairs if not os.path.exists(txt)]
    if missing_txts:
        raise FileNotFoundError(
            f"{len(missing_txts)} npz file(s) have no matching txt file, e.g. {missing_txts[0]}"
        )

    # os.rename silently replaces an existing target on POSIX, so refuse to
    # start when a numbered name is already occupied by a different file.
    for i, (_, npz, txt) in enumerate(pairs):
        for source, target in ((npz, f"{origin_dir}/{i}.npz"), (txt, f"{origin_dir}/{i}.txt")):
            if os.path.exists(target) and not os.path.samefile(source, target):
                raise FileExistsError(f"Cannot rename {source}: {target} already exists")

    encoded_dict = {}
    for i, (stem, npz, txt) in enumerate(pairs):
        encoded_dict[i] = stem
        os.rename(npz, f"{origin_dir}/{i}.npz")
        os.rename(txt, f"{origin_dir}/{i}.txt")

    if output_filename == "encoding.json":
        date = datetime.today().strftime("%Y%m%d")
        output_path = f"{origin_dir}/{date}_audio_encoding.json"
    else:
        output_path = f"{output_filename}"

    with open(output_path, "w") as f:
        json.dump(encoded_dict, f)

### Downloadable Link