## Adding operations between audio data files

In [1]:

import os
import glob
from pathlib import Path
import datetime
#import random
import numpy as np
import pandas as pd
from threading import Thread

from microfaune import audio

In [2]:

def mesure_delta(time_start, time_end):
    """ time mesurements
    input a (= datetime.datetime.now())
    input b (= datetime.datetime.now())
    output : seconds, ms, string to display
    """
    c = time_end - time_start
    s = c.seconds
    ms = int(c.microseconds/1000)
    t = str(s) + 's ' + str(ms) + 'ms'
    return s, ms, t


def load_wav_dataonly(wav_filename):
    """ load data without fs """
    fs, data = audio.load_wav(wav_filename)
    return data 


### Basic audio adding files operation 
- audio with same fs size
- audio with different fs size


In [3]:
def add_dataWav1todataWav2(data1, data2, percent1=1, percent2=1):
    """ addition of two wave np array, with weight percentage
    input : 
    data1 first data audio, in flat vector
    data1 first data audio, in flat vector
    percent 1 percentage of data taken
    percent 1 percentage of data added
    output :
    p1*data1 + p2*data2     
    """
    
    if len(data2)==0:
        return data1    
    if len(data2)>len(data1):
        data2 = data2[:len(data1)]
    if len(data2)<len(data1):
        data2 = list(data2) + [1] * (len(data1)-len(data2))
    if percent1==1 and percent2==1:
        list3 = list(map(lambda x, y: x + y, data1, data2))                             
    else:
        list3 = list(map(lambda x, y: (percent1 * x) + (percent2 * y), \
                             data1, data2))
    return np.array(list3, dtype=np.float32) 




##### Test function 

In [4]:
def test_add_dataWav1todataWav2(wave_file='../toy_data/bird1.wav'):
    """ basic tests of wave addition function """
    """ on audio of having the same fs size """
    
    fs, data = audio.load_wav(wave_file)
    data1 = data.copy()
    data2 = np.array([1] * len(data1), dtype=np.float32) 
    
    print('data1==data2')
    print('data1', data1)
    print('data2', data2)
    print()
    
    debut_t = datetime.datetime.now()
    data3 = add_dataWav1todataWav2(data1, data2) 
    s, ms, t = mesure_delta(debut_t, datetime.datetime.now())
    print('Time for data3 = data2 + data1 :', t)
    print('data3', data3)
    print()
    
    debut_t = datetime.datetime.now()
    data3 = add_dataWav1todataWav2(data1, data2, 1, 0.5) 
    s, ms, t = mesure_delta(debut_t, datetime.datetime.now())
    print('Time for data3 = data1 + 50%(data2) :', t)
    print('data3', data3)
    print()
    
    debut_t = datetime.datetime.now()
    data3 = add_dataWav1todataWav2(data1, data2, 0.9, 0) 
    s, ms, t = mesure_delta(debut_t, datetime.datetime.now())
    print('Time for data3 = 90%(data1) + 0%(data2) :', t)
    print('data3', data3)
    print()
    
    data3 = add_dataWav1todataWav2(data1, data2, 1, 0.8) 
    print('data3 = data1 + 80%(data2)')
    print('data3', data3)
    
test_add_dataWav1todataWav2()


data1==data2
data1 [  0.   0.   0. ... 558. 741. 620.]
data2 [1. 1. 1. ... 1. 1. 1.]

Time for data3 = data2 + data1 : 0s 316ms
data3 [  1.   1.   1. ... 559. 742. 621.]

Time for data3 = data1 + 50%(data2) : 3s 822ms
data3 [5.000e-01 5.000e-01 5.000e-01 ... 5.585e+02 7.415e+02 6.205e+02]

Time for data3 = 90%(data1) + 0%(data2) : 3s 911ms
data3 [  0.    0.    0.  ... 502.2 666.9 558. ]

data3 = data1 + 80%(data2)
data3 [  0.8   0.8   0.8 ... 558.8 741.8 620.8]


#### Discussion
- time mesurements are betters with simple addition than when we apply percentages

### Class and method for loading audio file by  file list
- limit size for test

In [9]:
class load_bythread(Thread):
    """Thread to load audio file """
    wavefile = None
    data = None

    def __init__(self, wavefile):
        Thread.__init__(self)
        self.wavefile = wavefile

    def run(self):
        """Code du thread."""
        #fs, self.data = wavfile.read(self.wavefile)
        self.data = load_wav_dataonly(self.wavefile)

    def join(self):
        Thread.join(self)
        return self.data
    
def load_audiofile_list(pathfiles, by_thread=True, traceon=False):
    """ load audio file list
    input :
        pathfiles wave path files 
        two loading mode is proposed : 
        by_thread = True with multithreading
        by_thread = False without multithreading
        traceon : to display performance
    ouput :
        DataFrame with loaded data from wave files
    """    
    if traceon:
        debut_t = datetime.datetime.now()
    
    wavefiles = glob.glob(pathfiles + "//*.wav")
    df = pd.DataFrame(wavefiles)
    
    # limit size for test
    length = len(df)
    if length>2000:
        length = 2000  
    df = df[:length] 
    
    print(len(df))
    display(df.head())
    df.columns = ['fullname']
    df['file_name'] = df['fullname'].apply(lambda x: Path(x).name)
    
    if not by_thread:
        df['data'] = df['fullname'].apply(lambda x: load_wav_dataonly(x))
    
    if by_thread:
        df['load_bythread'] = df['fullname'].apply(lambda x: load_bythread(x))
        df.apply(lambda x: x['load_bythread'].start(), axis=1)
        df['data'] = df.apply(lambda x: x['load_bythread'].join(), axis=1)
        
    if traceon:
        s, ms, t = mesure_delta(debut_t, datetime.datetime.now())
        print('Time for loadind', len(wavefiles), ',audio files with mode mutithread=',by_thread,':', t)

    return df


### Test loading wave audio file list operation

ulimit -n     
1024  = nombre de fichiers ouvrable simmultaneement
   
solution :
ulimit -S -n 40960    
ulimit -n    
40960   


In [10]:
def test_load_audiofile_list(pathfiles=None):
    """ test loading wav audio file list """
    if pathfiles is None:
        #pathfiles = '/mnt/d/ecomdata/DataForGood_wazo/audio-annotator/split_data-20191021T201218Z-001/split_data'
        pathfiles = '/mnt/d/ecomdata/DataForGood_wazo/input/ff1010bird_wav/wav'
    df = load_audiofile_list(pathfiles, by_thread=False, traceon=True)
    df = load_audiofile_list(pathfiles, by_thread=True, traceon=True)

test_load_audiofile_list()

2000


Unnamed: 0,0
0,/mnt/d/ecomdata/DataForGood_wazo/input/ff1010b...
1,/mnt/d/ecomdata/DataForGood_wazo/input/ff1010b...
2,/mnt/d/ecomdata/DataForGood_wazo/input/ff1010b...
3,/mnt/d/ecomdata/DataForGood_wazo/input/ff1010b...
4,/mnt/d/ecomdata/DataForGood_wazo/input/ff1010b...


Time for loadind 7690 ,audio files with mode mutithread= False : 55s 437ms
2000


Unnamed: 0,0
0,/mnt/d/ecomdata/DataForGood_wazo/input/ff1010b...
1,/mnt/d/ecomdata/DataForGood_wazo/input/ff1010b...
2,/mnt/d/ecomdata/DataForGood_wazo/input/ff1010b...
3,/mnt/d/ecomdata/DataForGood_wazo/input/ff1010b...
4,/mnt/d/ecomdata/DataForGood_wazo/input/ff1010b...


Time for loadind 7690 ,audio files with mode mutithread= True : 15s 28ms


### class for adding audio operation with file list

In [12]:
class addwav_bythread(Thread):
    """Thread to load audio file """
    data = None

    def __init__(self, data1, data2, p1=1, p2=1):
        Thread.__init__(self)
        self.data1 = data1
        self.data2 = data2
        self.p1 = p1
        self.p2 = p2
        
    def run(self):
        """Code du thread."""
        self.data = add_dataWav1todataWav2(self.data1, self.data2, self.p1, self.p2) 

    def join(self):
        Thread.join(self)
        return self.data

In [13]:

def test_dataWavAdding():
    """ test adding operation on file list """        
    pathfiles = '/mnt/d/ecomdata/DataForGood_wazo/input/ff1010bird_wav/wav'
    df1 = load_audiofile_list(pathfiles, by_thread=True, traceon=True)
    
    """
    Time for loadind 7690 ,audio files with mode mutithread= True : 149s 834ms
    """
    
    #for n in [100, 300, 500, 1000, len(df1)]:
    for n in [100, 300, 500, 2000]:
        print('test adding on ', n, ' files')
        df2 = df1[:n].copy()    
        debut_t = datetime.datetime.now()        
        df2['data2'] = df2['data']
        df2['addwav_bythread'] = df2.apply(lambda x: addwav_bythread(x['data'], x['data2']), axis=1)
        
        df2.apply(lambda x: x['addwav_bythread'].start(), axis=1)
        df2['data3'] = df2.apply(lambda x: x['addwav_bythread'].join(), axis=1)
        
        s, ms, t = mesure_delta(debut_t, datetime.datetime.now())
        print('Time for Add. operation on /', len(df2), 'wav files :', t)
        
test_dataWavAdding()

2000


Unnamed: 0,0
0,/mnt/d/ecomdata/DataForGood_wazo/input/ff1010b...
1,/mnt/d/ecomdata/DataForGood_wazo/input/ff1010b...
2,/mnt/d/ecomdata/DataForGood_wazo/input/ff1010b...
3,/mnt/d/ecomdata/DataForGood_wazo/input/ff1010b...
4,/mnt/d/ecomdata/DataForGood_wazo/input/ff1010b...


Time for loadind 7690 ,audio files with mode mutithread= True : 13s 864ms
test adding on  100  files
Time for Add. operation on / 100 wav files : 39s 122ms
test adding on  300  files
Time for Add. operation on / 300 wav files : 114s 693ms
test adding on  500  files
Time for Add. operation on / 500 wav files : 191s 85ms
test adding on  2000  files
Time for Add. operation on / 2000 wav files : 748s 444ms
