In [None]:
# default_exp data

In [None]:
#export

from fastai2_resnet_audio.model import * 
from fastai2.torch_core import TensorBase
from fastai2.data.block import TransformBlock
from fastcore.transform import Transform
from fastai2.vision.augment import RandTransform
import torch.nn.functional as F
import torch
import torchaudio
import torchaudio.transforms

In [None]:
#hide
from nbdev.showdoc import *

# fastai2-resnet-audio data

> DataBlock and transforms for fastai2-resnet-audio model

## DataBlock

`AudioBlock` returns a `TransformBlock` whose type transform is `TensorAudio.create`. `TensorAudio.create` uses torchaudio to load the sound file and returns it as a `TensorAudio`.

In [None]:
#export

def AudioBlock():
    "Build a `TransformBlock` that uses `TensorAudio.create` as its type transform and has no batch transforms"
    block = TransformBlock(
        type_tfms=TensorAudio.create,
        batch_tfms=None,
    )
    return block

class TensorAudio(TensorBase):
    "Tensor type for audio; `create` loads a file with torchaudio and attaches `sr` and `mode` to the result"

    @classmethod
    def create(cls, o, norm=True):
        "Load `o` with `torchaudio.load` (forwarding `norm` as `normalization`) and wrap the samples in `cls`"
        waveform, sample_rate = torchaudio.load(o, normalization=norm)
        audio = cls(waveform)
        # `sr` is read by the audio transforms in this module; `mode` is set to 'raw' on every freshly loaded clip.
        audio.sr, audio.mode = sample_rate, 'raw'
        return audio
    # The string literal below is an inert, disabled draft of a `show` method (it is never executed).
    '''
    def show(self, ctx=None):
        if self.mode == 'raw':
            print(self.shape)
            librosa.display.waveplot(np.asarray(self.squeeze()), sr=self.sr)
            #print(img.shape)
    '''   

## Transforms

In [None]:
#export

class AudioFixLength(Transform):
    "Trim or zero-pad a `TensorAudio` along its last (sample) axis to `length` seconds; `length <= 0` disables it"

    def __init__(self, length=0.0):
        self.length = length

    def encodes(self, o: TensorAudio):
        "Return `o` with exactly `int(o.sr * self.length)` samples on the last axis"
        if not self.length > 0.0:
            return o
        sr = o.sr
        n_samples = int(sr * self.length)
        # Read the sample count from the last axis. `len(o.squeeze())` is only the sample
        # count for single-channel input; for (channels, samples) with more than one
        # channel it is the channel count, which made stereo clips look far too short.
        n_current = o.shape[-1]
        if n_samples < n_current:
            # Too long: keep the first `n_samples` samples of every channel.
            o = o[..., :n_samples]
        else:
            # Too short (or exact): split the missing samples randomly between front and back.
            n_pad = n_samples - n_current
            # Plain Python ints, because `F.pad` takes integer pad sizes.
            n_pre = int(torch.rand(1) * n_pad)
            n_post = n_pad - n_pre
            o = F.pad(input=o, pad=(n_pre, n_post), mode='constant', value=0)
        # Re-wrap and re-attach `sr` the same way AudioResample and AudioToMono do, so the
        # result is a TensorAudio that still carries its sample rate for later transforms.
        o = TensorAudio(o)
        o.sr = sr
        return o
    
class AudioResample(Transform):
    "Resample a `TensorAudio` to `target_sr` Hz; a `target_sr` that is `None` or not positive leaves the audio unchanged"

    def __init__(self, target_sr=0, device='cpu'):
        self.target_sr = target_sr
        # Stored but not read by `encodes`; kept so existing callers passing `device` keep working.
        self.device = device

    def encodes(self, o: TensorAudio):
        "Return `o` resampled from `o.sr` to `self.target_sr`, with `sr` updated on the result"
        # The default `target_sr=0` previously fell through to `Resample(new_freq=0)`,
        # which is not a usable target rate. Treat it as "resampling disabled", in the
        # same way `length=0.0` disables AudioFixLength and AudioRandomCrop.
        if self.target_sr is None or self.target_sr <= 0:
            return o
        if self.target_sr == o.sr:
            return o
        resample = torchaudio.transforms.Resample(orig_freq=o.sr, new_freq=self.target_sr)
        # Wrap the resampler output again and record the new sample rate on it.
        o = TensorAudio(resample(o))
        o.sr = self.target_sr
        return o
    
class AudioToMono(Transform):
    "Collapse a `TensorAudio` to one channel by averaging over dim 0, keeping `sr`"

    def __init__(self, device='cpu'):
        # Stored only; `encodes` does not read it.
        self.device = device

    def encodes(self, o: TensorAudio):
        "Return the mean of `o` over dim 0 as a `TensorAudio` with a leading axis of size 1 and the original `sr`"
        sample_rate = o.sr
        # `keepdim=True` keeps the averaged axis as size 1.
        mono = TensorAudio(torch.mean(o, dim=0, keepdim=True))
        mono.sr = sample_rate
        return mono

    
class AudioRandomCrop(RandTransform):
    "Cut a randomly positioned window of `length` seconds out of a `TensorAudio`; `p` is forwarded to `RandTransform`"

    def __init__(self, p=1.0, length=0.0):
        super().__init__(p=p)
        self.length = length

    def encodes(self, o: TensorAudio):
        "Return a random `int(o.sr * self.length)`-sample slice of `o`; clips that are not longer than that pass through"
        if not self.length > 0.0:
            return o
        window = int(o.sr * self.length)
        available = len(o[0])
        if not window < available:
            return o
        # Random start offset within the surplus samples; `.int()` truncates the product.
        surplus = available - window
        start = (surplus * torch.rand(1)).int()
        return o[:, start:(window + start)]
    

class AudioAddNoise(RandTransform):
    "Randomly add normally distributed noise with probability `p`"
    def __init__(self, p=0.5, device='cpu'):
        super().__init__(p=p)
        self.device = device

    def encodes(self, o: TensorAudio):
        "Return `o` plus normal noise scaled by `0.001 * rand * max(o)`, with scale and noise moved to `self.device`"
        peak = torch.max(o)
        # One uniform draw sets the noise amplitude as a fraction of the signal maximum.
        amplitude = (0.001*torch.rand(1) * peak).to(self.device)
        # A standard-normal tensor with the same shape as `o`.
        noise = torch.empty(o.shape).normal_().to(self.device)
        return o + amplitude * noise
    

class AudioToTensor(Transform):
    "Convert a `TensorAudio` with `tensor(o)` and cast the result to float"

    def encodes(self, o: TensorAudio):
        # NOTE(review): `tensor` is not imported by name in the visible cells; presumably it
        # comes in through the star import from `fastai2_resnet_audio.model` (confirm).
        return tensor(o).float()
    

# Tests

## Transforms

### AudioFixLength

In [None]:
#hide

# AudioFixLength test: a clip shorter than the target is padded, a longer one is cut.
# Short clip: 0.5 second, single channel, at 44100 Hz
ta1 = TensorAudio(torch.randn(1,22050))
ta1.sr = 44100
assert ta1.shape == (1,22050)

# Long clip: 2.0 seconds, single channel, at 44100 Hz
ta2 = TensorAudio(torch.randn(1,88200))
ta2.sr = 44100
assert ta2.shape == (1,88200)

# set length to 1.0 seconds -> 44100 samples
tfm = AudioFixLength(1.0)
# `encodes` is called directly; the short clip must come back padded to 44100 samples
ta1 = tfm.encodes(ta1)
assert ta1.shape == (1,44100)

# the long clip must come back cut down to 44100 samples
ta2 = tfm.encodes(ta2)
assert ta2.shape == (1,44100)

### AudioResample

In [None]:
#hide

# AudioResample test: halving the sample rate halves the number of samples.
# 1 second, single channel sample at 44100 Hz
ta1 = TensorAudio(torch.randn(1,44100))
ta1.sr = 44100
assert ta1.shape == (1,44100)

# resample to 22050 Hz (calling `encodes` directly)
tfm = AudioResample(target_sr=22050)
ta1 = tfm.encodes(ta1)

# 1 second at 22050 Hz -> 22050 samples, and `sr` must be updated on the result
assert ta1.shape == (1,22050)
assert ta1.sr == 22050

### AudioToMono

In [None]:
#hide

# AudioToMono test: two constant channels (0 and 1) must average to 0.5.
# 1 second 2 channel sample at 44100 Hz
ta1 = TensorAudio(torch.empty(2,44100))
ta1.sr = 44100
# channel 0 is all zeros, channel 1 is all ones
ta1[0].fill_(0.)
ta1[1].fill_(1.)
assert ta1.shape == (2,44100)

tfm = AudioToMono()
ta1 = tfm.encodes(ta1)

# one channel remains and every sample is the mean of 0 and 1
assert ta1.shape == (1,44100)
assert ta1[0][0] == 0.5
assert ta1.mean() == 0.5