# Audio Data Bunch

by @ste & @zachcaceres

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
#Export
import mimetypes
from fastai.vision import *
import torchaudio
from torchaudio import transforms

#for jupyter Display
from IPython.display import Audio

## Sample data for test

In [None]:
# Standard path notation for fast.ai
# The files will be saved in $HOME/.fastai/data/timit/
path = Path.home()/'.fastai/data/timit'
# `exists` is a method: it must be called, otherwise the bound method object
# is always truthy and the "missing" branch can never run.
if path.exists(): print(f'Working directory: {path}')
else: print('Missing data folder')

## Data Block classes

In [None]:
#Export

#Parameters
# NOTE(review): not referenced anywhere in the visible part of this notebook.
MIN_SAMPLE_SIZE = 201

# File extensions (keys of `mimetypes.types_map`) whose MIME type is `audio/*`
AUDIO_EXTENSIONS = {
    ext for ext, mime in mimetypes.types_map.items()
    if mime.startswith('audio/')
}

### AudioItem
This is the base class of our audio data. It holds two basic pieces of information about the "sound":
* sr: the sample rate
* data: the actual signal

**IMPORTANT:** the audio signal is one-dimensional.

In [None]:
#Export        
class AudioItem(ItemBase):
    """An audio clip stored as a flat (one-dimensional) signal plus its sample rate.

    Attributes are set in `__init__`: `data` (the flattened signal), `sr` (the
    sample rate) and `kwargs` (any extra keyword arguments, kept untouched).
    """
    def __init__(self, data=None, sr=16000, **kwargs):
        """Store `data` flattened to 1-D together with the sample rate `sr`.

        `data` must provide `reshape`; the `None` default is not usable and
        fails on the first line below.
        """
        self.data = data.reshape(-1) # Always flatten out to single dimension signal!
        self.sr = sr
        self.kwargs = kwargs

    def __str__(self): return f'Duration: {self.duration} seconds.'
    def __len__(self): return self.data.shape[0]
    def _repr_html_(self): return f'{self.__str__()}<br />{self.ipy_audio._repr_html_()}'

    def show(self, title:Optional[str]=None, **kwargs):
        "Play the sound, printing `title` first when given; extra `kwargs` are ignored."
        self.hear(title=title)

    def hear(self, title=None, **kwargs):
        "Print `title` (if any) and display the audio player widget."
        # `**kwargs` is accepted and ignored because `AudioList.hear_xys`
        # forwards its own keyword arguments to this method.
        if title is not None: print(title)
        display(self.ipy_audio)

    def apply_tfms(self, tfms, **kwargs):
        """Apply each callable in `tfms` to `self.data`, in order, and return `self`.

        The item is modified in place. `**kwargs` is accepted (and ignored) so
        that callers passing extra transform arguments do not fail.
        """
        for tfm in tfms:
            self.data = tfm(self.data)
        return self

    @property
    def shape(self):
        "Shape of the stored signal."
        return self.data.shape

    @property
    def ipy_audio(self):
        "IPython `Audio` widget for the signal at sample rate `sr`."
        return Audio(data=self.data, rate=self.sr)

    @property
    def duration(self):
        "Number of samples divided by the sample rate."
        return len(self.data)/self.sr

    @classmethod
    def open(cls, fileName, **kwargs):
        """Load `fileName` with torchaudio and wrap it in an item of type `cls`.

        `kwargs` are accepted for signature compatibility and currently unused.

        Raises:
            FileNotFoundError: if `fileName` does not exist.
        """
        p = Path(fileName)
        if not p.exists():
            # Raising a plain string is itself a TypeError in Python 3; raise a
            # real exception so the caller sees the actual problem.
            raise FileNotFoundError(f'File not found: {fileName}')
        signal,samplerate = torchaudio.load(str(fileName))
        return cls(signal,samplerate)

In [None]:
def test_AudioItem_create_from_data():
    "Build an `AudioItem` from a signal loaded with torchaudio and sanity-check it."
    wav_file = path/'TRAIN/DR1/MDPK0/SA1.WAV'
    waveform, rate = torchaudio.load(str(wav_file))
    item = AudioItem(waveform, rate)

    # The item must hold a flat, non-trivial signal sampled at 16 kHz
    assert len(item.data.shape) == 1, 'Single dimension data'
    assert item.data.shape[0] > 100, 'Has data'
    assert item.sr == 16000
    display(item)

test_AudioItem_create_from_data()

In [None]:
def test_AudioItem_create_from_audio_file():
    "Build an `AudioItem` with `AudioItem.open` and sanity-check it."
    wav_file = str(path/'TRAIN/DR1/MDPK0/SA1.WAV')
    item = AudioItem.open(wav_file)

    # The item must hold a flat, non-trivial signal sampled at 16 kHz
    assert len(item.data.shape) == 1, 'Single dimension data'
    assert item.data.shape[0] > 100, 'Has data'
    assert item.sr == 16000
    display(item)

test_AudioItem_create_from_audio_file()

## AudioDataBunch

In [None]:
#Export
class AudioDataBunch(DataBunch):
    "`DataBunch` with a helper for listening to a few of its samples."
    def hear_ex(self, rows:int=3, ds_type:DatasetType=DatasetType.Valid, **kwargs):
        "Take the first `rows` items of the `ds_type` dataset and pass their x/y to `hear_xys`."
        dataset = self.dl(ds_type).dataset
        sample = dataset[:rows]
        # Playback is delegated to `hear_xys`, looked up on the training dataset
        play = self.train_ds.hear_xys
        play(sample.x, sample.y, **kwargs)

### AudioList
This class holds a list of `AudioItem` objects.

In [None]:
#Export
class AudioList(ItemList):
    "`ItemList` of audio clips; each entry is turned into an `AudioItem` by `get`."
    _bunch = AudioDataBunch

    # TODO: __REPR__
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def get(self, i):
        """Return item `i` as an `AudioItem`.

        Supported raw items:
        * a file name (`str` or `Path`), loaded with `AudioItem.open`;
        * a `(data, sr)` tuple, wrapped directly.

        Raises:
            TypeError: if the raw item is of any other type.
        """
        item = self.items[i]
        # `Path` is accepted too: folder-based lists hold `Path` objects, not strings.
        if isinstance(item, (str, Path)):
            return AudioItem.open(item)
        if isinstance(item, tuple): #data,sr
            return AudioItem(item[0],item[1])
        # Raising a bare string is invalid in Python 3; raise a real exception.
        raise TypeError(f'Format not supported! Got {type(item).__name__}')

    # NOTE(review): carried over from the vision API; presumably expects a 3-D
    # tensor (it swaps dims 1 and 2) -- confirm this is what audio needs.
    def reconstruct(self, t:Tensor): return Image(t.transpose(1,2))

    def hear_xys(self, xs, ys, **kwargs):
        "Play every item in `xs`, using the matching entry of `ys` as its title."
        for x, y in zip(xs, ys): x.hear(title=y, **kwargs)

    @classmethod
    def from_folder(cls, path:PathOrStr='.', extensions:Collection[str]=None, **kwargs)->ItemList:
        "Same as `ItemList.from_folder`, with `extensions` defaulting to `AUDIO_EXTENSIONS`."
        extensions = ifnone(extensions, AUDIO_EXTENSIONS)
        return super().from_folder(path=path, extensions=extensions, **kwargs)


In [None]:
def test_AudioList_from_df_file_names():
    "Build an `AudioList` from a DataFrame of file names and fetch one item."
    import glob

    # Create a one-column DataFrame holding the first ten WAV paths
    wav_files = glob.glob(str(path/'**/*.WAV'), recursive=True)
    df = pd.DataFrame(wav_files[:10])
    df.columns = ['FileName']
    display(df.head())

    # Create the AudioList from the file-name column
    ils = AudioList.from_df(df, path, cols=['FileName'])

    # Fetch a single item and show it
    i = 5
    print(f'FileName: {df.FileName[i]}')
    a = ils.get(i)
    print(a.data.shape, a.sr)
    display(a)

test_AudioList_from_df_file_names()

In [None]:
# Load one utterance relative to the dataset root (`path`) rather than through
# a user-specific absolute path, then play it inline.
sig,srt = torchaudio.load(str(path/'TRAIN/DR7/MPAR0/SX406.WAV'))
display(Audio(sig,rate=srt))

In [None]:
def test_AudioList_from_df_data_and_sr():
    "Build an `AudioList` from a DataFrame column of `torchaudio.load` results and fetch one item."
    import glob

    def load_clip(file_name):
        # Whatever `torchaudio.load` returns is stored as-is in the column
        return torchaudio.load(file_name)

    # Create a DataFrame with the first ten WAV paths and their loaded content
    wav_files = glob.glob(str(path/'**/*.WAV'), recursive=True)
    df = pd.DataFrame(wav_files[:10])
    df.columns = ['FileName']
    df['SampleAndSr'] = df['FileName'].apply(load_clip)

    display(df.head())

    # Create the AudioList from the already-loaded column
    ils = AudioList.from_df(df, path, cols=['SampleAndSr'])

    # Fetch a single item and show it
    i = 4
    print(f'FileName: {df.FileName[i]}')
    a = ils.get(i)
    print(a.data.shape, a.sr)
    display(a)

test_AudioList_from_df_data_and_sr()

# ---- TODO: CONTINUE FROM HERE ----

# Audio transformations

In [None]:
def tfm_to_mel(x, sr=16000, n_fft=1024):
    """Transform an audio signal (or an `AudioItem`) into a mel spectrogram.

    `x` may be the raw signal, which is what `AudioItem.apply_tfms` passes, or
    an `AudioItem`, whose `data` is used. The signal is reshaped to a single
    row before being handed to `transforms.MelSpectrogram`.

    `sr` and `n_fft` are forwarded to `MelSpectrogram`; their defaults are the
    values that used to be hard-coded here.
    """
    signal = x.data if isinstance(x, AudioItem) else x
    src = signal.reshape(1,-1) # expect single sample...
    return transforms.MelSpectrogram(
        sr=sr,
        n_fft=n_fft,
#        hop_length=512, 
#        n_mels=128,                  
#        power=1.0, 
#        fmin=20, fmax=8000        
    )(src)

In [None]:
def test_tfm_to_mel():
    "Run `tfm_to_mel` on one utterance and plot the resulting spectrogram."
    sig,sr = torchaudio.load(str(path/'TRAIN/DR1/MDPK0/SA1.WAV'))
    x = AudioItem(sig, sr=sr)
    display(x)
    # Pass the raw signal, exactly as `AudioItem.apply_tfms` does; the item
    # itself defines no `reshape`, which the transform calls on its input.
    img = tfm_to_mel(x.data)
    print(img.shape)
    plt.imshow(img[0])
    
test_tfm_to_mel()

# from_df

In [None]:
def pad_to_max(t, mx=1000, value=0):
    """Pad tensor `t` with `value` at the end of dimension 1 until it has length `mx`.

    `t` is returned untouched when dimension 1 already has length `mx`.
    Otherwise the difference `mx - t.shape[1]` is handed to `F.pad` as the
    right-hand padding of dimension 1 (a negative difference is passed through
    unchanged, as before).

    The padding always targets dimension 1, the same one that is checked, for
    any tensor with at least two dimensions. For 3-D input this is identical to
    the previous fixed `(0,0, 0,n)` padding.
    """
    missing = mx - t.shape[1]
    if missing == 0: return t
    # F.pad reads (left, right) pairs starting from the LAST dimension, so every
    # dimension after dim 1 gets a (0, 0) pair before dim 1 receives its own.
    trailing = (0, 0) * (t.dim() - 2)
    return F.pad(t, trailing + (0, missing), value=value)

In [None]:
def process_phn_file(p_file, sig, sr, delimiter=' '):
    """Read a header-less phoneme file and attach the matching signal slice to each row.

    The file is parsed with `pd.read_csv` into the columns `Start`, `End` and
    `Phn`. A fourth column, `SampleAndSr`, holds for every row the tuple
    `(sig[-1][Start:End], sr)`, i.e. a slice of the last entry of `sig`.
    """
    def cut_segment(row):
        # Slice the last entry of `sig` between the row's boundaries
        start, end = row['Start'], row['End']
        return sig[-1][start:end], sr

    df = pd.read_csv(p_file, delimiter=delimiter, header=None)
    df.columns = ['Start', 'End', 'Phn']
    df['SampleAndSr'] = df.apply(cut_segment, axis=1)
    return df

In [None]:
import glob 

def create_phn_df(path, count=100):
    """Build one DataFrame with the phoneme rows of up to `count` `.PHN` files under `path`.

    For every `.PHN` file found recursively, the audio file with the same name
    and a `.WAV` suffix is loaded, `process_phn_file` produces the per-phoneme
    rows, and a `Source` column records the `.PHN` file they came from.

    Returns the concatenation of all per-file frames with a fresh index, or an
    empty DataFrame when no `.PHN` file is found.
    """
    frames = []
    for phn_file in glob.glob(str(path/'**/*.PHN'), recursive=True)[:count]:
        # Swap only the suffix: a plain str.replace would also rewrite any
        # 'PHN' occurring in a directory name.
        wav_file = Path(phn_file).with_suffix('.WAV')
        sig,sr = torchaudio.load(str(wav_file))
        df = process_phn_file(phn_file, sig, sr, delimiter=' ')
        df['Source'] = phn_file
        frames.append(df)
    if not frames: return pd.DataFrame()
    # Return the accumulated rows of every file, not just the last file's
    # frame; concatenating once also avoids re-copying the frame per iteration.
    return pd.concat(frames, ignore_index=True)

df = create_phn_df(path/'TRAIN')
df.head()

In [None]:
# Inspect the type of the first `SampleAndSr` entry (built in
# `process_phn_file` as a `(signal_slice, sr)` tuple).
type(df.SampleAndSr[0])

In [None]:
def tfm_log(x):
    """Pass-through transform for debugging: print the shape of `x` and return `x` unchanged."""
    shape = x.shape
    print(f'Shape of transform input: {shape}')
    return x

In [None]:
def tfm_flatten(x):
    """Return `x` reshaped to a single dimension."""
    flat = x.reshape(-1)
    return flat

In [None]:
# Normal datablock setup from our AudioList from above:
#   * items come from the `SampleAndSr` column of `df` ((signal, sr) tuples),
#   * labels come from the `Phn` column,
#   * `.transform` receives two identical transform lists (fastai takes the
#     training list first, then the validation one): log the input shape,
#     convert to a mel spectrogram, pad, then log the output shape.
# The split arguments (0.1, seed=1) and the batch size (8) are passed through
# to fastai as-is.
data_from_df = (AudioList.from_df(df, path, cols=['SampleAndSr'])
    .split_by_rand_pct(0.1, seed=1)
    .label_from_df('Phn')
    .transform([[tfm_log, tfm_to_mel, pad_to_max, tfm_log], 
                [tfm_log, tfm_to_mel, pad_to_max, tfm_log]])
    .databunch(bs=8))        

In [None]:
# Pull one batch from the databunch. The positional `1` goes to `one_batch`'s
# first parameter (presumably the dataset type -- TODO confirm it is accepted
# as a plain int).
data_from_df.one_batch(1)

In [None]:
# Build an AudioList straight from the (signal, sr) column and fetch one item.
# The column is `SampleAndSr` (created by `process_phn_file`); `df` has no
# `Sample` column.
t0 = AudioList.from_df(df, path, cols=['SampleAndSr']); print(type(t0));
#t1 = t0.audio_transform([[tfm_log],[tfm_log]]); print(type(t1));
# `t1` only exists in the commented-out line above, so read from `t0`.
t0.get(1)

In [None]:
# Length of the training dataloader (its number of batches, assuming the usual
# DataLoader semantics).
len(data_from_df.train_dl)

In [None]:
# Fetch item 0 through the databunch.
# NOTE(review): `AudioDataBunch` defines no `get` in this notebook, so this
# relies on attribute delegation inside fastai -- confirm it resolves.
data_from_df.get(0)

# from_folder

# <span style="color:red">TODO: merge with Yes/No sample</span>

In [None]:
# Build an AudioList from the files under `path`, restricted to the extensions
# in AUDIO_EXTENSIONS.
ils = AudioList.from_folder(path,extensions=AUDIO_EXTENSIONS)

# Export

In [None]:
!python notebook2script.py DataBlock.ipynb