# Audio Data Bunch

by @ste & @zachcaceres

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
#Export
import mimetypes
from fastai.vision import *
import torchaudio
from torchaudio import transforms

#for jupyter Display
from IPython.display import Audio

## Data Block classes

In [3]:
#Export

#Parameters
MIN_SAMPLE_SIZE = 201

# These are valid file extensions for audio files
AUDIO_EXTENSIONS = set(k for k,v in mimetypes.types_map.items() if v.startswith('audio/'));

In [4]:
#Export
class AudioDataBunch(DataBunch):
    def hear_ex(self, rows:int=3, ds_type:DatasetType=DatasetType.Valid, **kwargs):
        batch = self.dl(ds_type).dataset[:rows]
        self.train_ds.hear_xys(batch.x, batch.y, **kwargs)

class AudioItem(ItemBase):
    def __init__(self, signal, data=None, **kwargs):
        self.signal = signal
        self.sr = 16000
        self.kwargs = kwargs
        if (self.signal.shape[0] < MIN_SAMPLE_SIZE):
            self.signal = F.pad(self.signal, (0, MIN_SAMPLE_SIZE - self.signal.shape[0]), value=0)

        self.signal = self.signal[None,:] # add channel to signal
        self.data = transforms.MelSpectrogram(sr=self.sr, **kwargs)(self.signal)

    def __str__(self): return f'Duration: {self.duration} seconds.'
    def __len__(self): return self.signal.shape[1]
    def _repr_html_(self): return f'{self.__str__()}<br />{self.ipy_audio._repr_html_()}'
    
    def show(self, title:Optional[str]=None, **kwargs):
        "Show sound on `ax` with `title`, using `cmap` if single-channel, overlaid with optional `y`"
        self.hear(title=title)

    def hear(self, title=None):
        if title is not None: print(title)
        display(self.ipy_audio)

    def apply_tfms(self, tfms):
        for tfm in tfms:
            self.data = tfm(self.data)
        return self
        
    @property
    def shape(self):
        return self.data.shape

    @property
    def size(self)->Tuple[int,int]: return self.data.shape[-2:]

    @property
    def ipy_audio(self): return Audio(data=self.signal, rate=self.sr)

    @property
    def duration(self): return len(self)/self.sr

    @classmethod
    def open(cls, fn, **kwargs):
        return AudioItem(fn)

In [5]:
#Export
class AudioList(ItemList):
    _bunch = AudioDataBunch
    
    # TODO: __REPR__    
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
            
    def get(self, i): return AudioItem.open(self.items[i])
    
    def reconstruct(self, t:Tensor): return Image(t.transpose(1,2))

    def hear_xys(self, xs, ys, **kwargs):
        for x, y in zip(xs, ys): x.hear(title=y, **kwargs)

    @classmethod
    def from_folder(cls, path:PathOrStr='.', extensions:Collection[str]=None, **kwargs)->ItemList:
        extensions = ifnone(extensions, AUDIO_EXTENSIONS)
        return super().from_folder(path=path, extensions=extensions, **kwargs)


## Sample data for test

In [6]:
# Standard path notation for fast.ai
# The files willbe saved on $HOME/.fastai/data/timit/
path = Path(Path.home()/'.fastai/data/timit')
if path.exists: print(f'Working directory: {path}')
else: print('Missing data folder')

Working directory: /home/ste/.fastai/data/timit


# Sample usage 

# from_df

In [7]:
def pad_to_max(t, mx=1000, value=0):
    """Pad tensor with `value` until it reaches length `mx`"""
    if t.shape[1] == mx: return t
    return F.pad(t, (0,0, 0,mx-t.shape[1]), value=value)

In [8]:
def process_phn_file(p_file, sig, sr, delimiter=' '):
    df = pd.read_csv(p_file, delimiter=delimiter, header=None)
    df.columns = ['Start', 'End', 'Phn']
    df['Sample'] = df.apply(lambda x : sig[-1][x['Start']: x['End']], axis=1)
    return df

In [9]:
import glob 

def create_phn_df(path, count=100):
    phns = []
    final = pd.DataFrame()

    for phn_file in glob.glob(str(path/'**/*.PHN'), recursive=True)[:count]:
        sig,sr = torchaudio.load(str(phn_file.replace('PHN', 'WAV')))
        df = process_phn_file(phn_file, sig, sr, delimiter=' ')
        df['Source'] = phn_file
        final = final.append(df)
    return df

df = create_phn_df(path/'TRAIN')
df.head()

Unnamed: 0,Start,End,Phn,Sample,Source
0,0,2240,h#,"[tensor(-6.1035e-05), tensor(-3.0518e-05), ten...",/home/ste/.fastai/data/timit/TRAIN/DR7/MSAH1/S...
1,2240,2480,d,"[tensor(0.0003), tensor(0.0003), tensor(0.0003...",/home/ste/.fastai/data/timit/TRAIN/DR7/MSAH1/S...
2,2480,3920,iy,"[tensor(-0.0010), tensor(0.0002), tensor(-0.00...",/home/ste/.fastai/data/timit/TRAIN/DR7/MSAH1/S...
3,3920,5170,pcl,"[tensor(-9.1553e-05), tensor(-0.0002), tensor(...",/home/ste/.fastai/data/timit/TRAIN/DR7/MSAH1/S...
4,5170,5380,p,"[tensor(-0.0006), tensor(-0.0008), tensor(-0.0...",/home/ste/.fastai/data/timit/TRAIN/DR7/MSAH1/S...


In [10]:
# Normal datablock setup from our AudioList from above.
data_from_df = (AudioList.from_df(df, path, cols=['Sample'])
    .split_by_rand_pct(0.1, seed=1)
    .label_from_df('Phn')
    .transform([[pad_to_max], [pad_to_max]])
    .databunch(bs=8))        

Your valid set contained the following unknown labels, the corresponding items have been discarded.
pcl, dx
  if getattr(ds, 'warn', False): warn(ds.warn)


In [11]:
data_from_df.get(6)

# from_folder

# <span style="color:red">TODO: merge with Yes/No sample</span>.

In [12]:
ils = AudioList.from_folder(path,extensions=AUDIO_EXTENSIONS)

# Export

In [13]:
!python notebook2script.py DataBlock.ipynb

Converted DataBlock.ipynb to nb_DataBlock.py
