# Efficiently process dataframes

## Import dependencies

In [1]:
# For basic data manipulation
import pandas as pd

# For wav files processing
import librosa
import wavfile

# For parallelization
from tqdm import tqdm

#### <font color='orange'>Comment 1</font>
- check differences between `import wavfile` and `from scipy.io import wavfile`


## Define auxiliary functions and parameters

In [2]:
# Hyperparameters
class HParams(object):
    """ Hparams was removed from tf 2.0alpha so this is a placeholder
    """

    def __init__(self, **kwargs):
        self.set_defaults()
        self.__dict__.update(kwargs)

    def set_defaults(self):
        self.win_length_ms = 5
        self.hop_length_ms = 1
        self.n_fft = 1024
        self.ref_level_db = 20
        self.min_level_db = -60
        self.preemphasis = 0.97
        self.num_mel_bins = 64
        self.mel_lower_edge_hertz = 200
        self.mel_upper_edge_hertz = 15000
        self.power = 1.5  # for spectral inversion
        self.griffin_lim_iters = 50
        self.butter_lowcut = 500
        self.butter_highcut = 15000
        self.reduce_noise = False
        self.noise_reduce_kwargs = {}
        self.mask_spec = False
        self.mask_spec_kwargs = {"spec_thresh": 0.9, "offset": 1e-10}
        self.nex = -1
        self.n_jobs = -1
        self.verbosity = 1

    def save(self):
        raise NotImplementedError

    def load(self):
        raise NotImplementedError
hparams = HParams(
    num_mel_bins = 64,
    mel_lower_edge_hertz=500,
    mel_upper_edge_hertz=15000,
    butter_lowcut = 500,
    butter_highcut = 15000,
    ref_level_db = 20,
    min_level_db = -30,
    mask_spec = True,
    win_length_ms = 10,
    hop_length_ms = 2,
    nex=-1,
    n_jobs=-1,
    verbosity = 1,
    )

In [3]:
# Functions
def read_wav(wav_loc, method="librosa", **kwargs):
    """ read wav using either librosa or scipy
    """
    if method == "librosa":
        if "sr" not in kwargs.keys():
            kwargs["sr"] = None
        data, rate = librosa.core.load(wav_loc, **kwargs)
    elif method == "scipy":
        rate, data = wavfile.read(wav_loc) # TODO: requires package wavfile, not scipy
    return rate, data

def load_wav(wav_loc, catch_errors=True, method="librosa", **kwargs):
    if catch_errors:
        try:
            rate, data = read_wav(wav_loc, method=method, **kwargs)
            return rate, data
        except Exception as e:
            print(e)
            return None, None
    else:
        rate, data = read_wav(wav_loc, method=method, **kwargs)
        return rate, data
    
def prepare_wav(wav_loc, hparams=None): # TODO: like this, this function is entirely equivalent to `load_wav`
    """ load wav and convert to correct format
    """
    # get rate and date
    rate, data = load_wav(wav_loc)

    return rate, data

## Parse the dataframe

In [4]:
df = pd.read_csv("test_input.csv")
df.head()

Unnamed: 0,start_time,end_time,participant,utterance,key,language,uid
0,629.96,630.51,A,aha,/german1/5298,german,german-059-255-629960
1,398.87,399.33,A,aha,/german1/5298,german,german-059-151-398870
2,2009.1,2009.5,tx@ADUSBS,aoq,/sambas1/SBS-20111031,sambas,sambas-24-0883-2009100
3,1782.89,1783.4,tx@JEPSBS,aoq,/sambas1/SBS-20111031,sambas,sambas-24-0764-1782890
4,341.41,341.83,B,mhm,/german1/4123,german,german-008-097-341410


## Define the analysis
The function below defines our analysis. For each row in the dataframe it:
1. Opens the `.wav` file listed in `key`.
2. Extracts the values between `start_time` and `end_time`.
3. Appends it in a new column called "audio".
4. Extracts the audio bit rate.
5. Appends it in a new column called "rate".
6. Returns the enriched data frame.

This is the process we want to optimize.

In [5]:
def get_row_audio(df, wav_loc, hparams):
    """ load audio and grab individual snippets
    TODO: for large sparse WAV files, the audio should be loaded only for the syllable
    """

    # load audio
    rate, data = prepare_wav(wav_loc, hparams)
    data = data.astype('float32')

    # get audio for each word (row) in data frame #TODO: why not stick to the function name, extract only one row, and perform the loop outside?
    df["audio"] = [
        data[int(st * rate) : int(et * rate)]
        for st, et in zip(df.start_time.values, df.end_time.values)
    ]

    df["rate"] = rate

    return df

The wrapper below will be convenient, as most of the information on file location is already contained in `df`.

In [6]:
def get_row_audio_path(df, hparams = hparams, path='audio_files'):
    wav_loc = path + df.key[0] + ".wav" # Build file location from data frame
    return get_row_audio(df, wav_loc, hparams)


In [7]:
%%time
get_row_audio_path(df)

CPU times: user 3.89 s, sys: 700 ms, total: 4.59 s
Wall time: 4.62 s


Unnamed: 0,start_time,end_time,participant,utterance,key,language,uid,audio,rate
0,629.96,630.51,A,aha,/german1/5298,german,german-059-255-629960,"[-0.00012207031, -0.00061035156, -0.0008544922...",8000
1,398.87,399.33,A,aha,/german1/5298,german,german-059-151-398870,"[-0.0009765625, -0.0008544922, -0.0010986328, ...",8000
2,2009.1,2009.5,tx@ADUSBS,aoq,/sambas1/SBS-20111031,sambas,sambas-24-0883-2009100,[],8000
3,1782.89,1783.4,tx@JEPSBS,aoq,/sambas1/SBS-20111031,sambas,sambas-24-0764-1782890,"[-0.0044555664, -0.0037231445, -0.0022583008, ...",8000
4,341.41,341.83,B,mhm,/german1/4123,german,german-008-097-341410,"[0.19720459, 0.15008545, 0.22064209, 0.1969604...",8000
5,622.02,622.37,A,ja,/german1/4123,german,german-008-223-622020,"[-0.0049438477, -0.0009765625, 0.0037231445, 0...",8000
6,220.343,220.682,f37ln,sí,/catalan1/ca_f37s_f38s_und,catalan,catalan-12-091-220343,"[-0.022521973, -0.027770996, -0.030822754, -0....",8000
7,266.974,267.346,f37ln,sí,/catalan1/ca_f37s_f38s_und,catalan,catalan-12-108-266974,"[-0.038024902, -0.034362793, -0.030456543, -0....",8000
8,145.13,145.82,tx@39,yeah,/arapaho1/25b,arapaho,arapaho-22-076-145130,"[0.0013427734, 0.002746582, 0.0020141602, 0.00...",8000
9,417.9,418.31,tx@5,yeah,/arapaho1/25b,arapaho,arapaho-22-206-417900,"[-0.00024414062, 0.0, -0.00061035156, -0.00036...",8000


#### <font color='orange'>Comment 2</font>
- The rate column is not correct (cf test_output.csv) 
- Row 2 and 16 are missing wav data  

## Optimizations

I propose the following approach.
First of all, define a function that operates on a single row.
This would be at atomic task.
Afterwards, we'll compare different methods to complete the whole task.

In [8]:
def get_row_audio(df, hparams = hparams, path = 'audio_files'):
    wav_loc = path + df.key + ".wav" # Build file location from data frame

    # load audio
    rate, data = prepare_wav(wav_loc, hparams)
    data = data.astype('float32')

    # get audio on a per-row-basis
    audio = data[int(df.start_time * rate) : int(df.end_time * rate)]

    return rate, audio

### Loop

In [9]:
%%time

rates = []
audios = []
for i in range(len(df)):
    row = df.iloc[i]
    rate, audio = get_row_audio(row)
    rates.append(rate)
    audios.append(audio)

CPU times: user 43.1 s, sys: 32.1 s, total: 1min 15s
Wall time: 1min 17s


### Loop with preallocated memory

In [10]:
%%time

rates = [None]*len(df)
audios = [None]*len(df)
for i in range(len(df)):
    row = df.iloc[i]
    rate, audio = get_row_audio(row)
    rates[i] = rate
    audios[i] = audio

CPU times: user 35.5 s, sys: 15 s, total: 50.5 s
Wall time: 52.2 s


### Using `map`

In [11]:
rows = [df.iloc[i] for i in range(len(df))]
#list(map(get_row_audio, rows)) #TODO: reactivate after figuring out why this causes memory problems