# Training on Speech Commands

If you haven't installed fastaudio yet, install it by uncommenting and executing the following cell.

In [None]:
#!pip install git+https://github.com/fastaudio/fastaudio.git

In [None]:
from fastai.torch_basics import *
from fastai.basics import *
from fastai.data.all import *
from fastai.callback.all import *
from fastai.vision.all import *

from fastaudio.core.all import *
from fastaudio.augment.all import *

import torchaudio

## Baseline

The dataset is about 2.26 GB.

In [None]:
# Open the Speech Commands dataset rooted at the current directory;
# download=True asks torchaudio to fetch it. The bare `r` displays the object.
r = torchaudio.datasets.SPEECHCOMMANDS(".", download=True)
r

In [None]:
# Collect the audio files under ./SpeechCommands and show how many were found.
# NOTE(review): presumably this is the folder created by the torchaudio download above - confirm.
commands_path = Path("SpeechCommands")
audio_files = get_audio_files(commands_path)
len(audio_files)

In [None]:
# Sanity check: show ten file paths drawn at random (with replacement).
for draw in range(10):
    sample_path = random.choice(audio_files)
    print(sample_path)

In [None]:
# Sanity check: for ten random files, show the path next to the label
# that parent_label derives from it.
for draw in range(10):
    sample_path = random.choice(audio_files)
    sample_label = parent_label(sample_path)
    print("File:", sample_path)
    print("Label:", sample_label)

In [None]:
# Baseline item transforms: resize every signal, then convert it to a spectrogram.
# SpectrogramTransformer(mel=True, to_db=True) returns a transform class; calling it
# with no arguments builds the transform with its defaults.
DBMelSpec = SpectrogramTransformer(mel=True, to_db=True)
a2s = DBMelSpec()
# NOTE(review): the variable name suggests ResizeSignal takes a duration in milliseconds - confirm.
crop_4000ms = ResizeSignal(4000)
tfms = [crop_4000ms, a2s]

In [None]:
# DataBlock pairing an audio input with a categorical target. Items are found with
# get_audio_files, labelled via parent_label, split with RandomSplitter's defaults,
# and passed through the item transforms in `tfms`.
auds = DataBlock(blocks=(AudioBlock, CategoryBlock),  
                 get_items=get_audio_files, 
                 splitter=RandomSplitter(),
                 item_tfms=tfms,
                 get_y=parent_label)

In [None]:
# Build the dataloaders. The item transforms are already configured on the `auds`
# DataBlock, so only the batch size is passed here (same call shape as the later cells).
audio_dbunch = auds.dataloaders(commands_path, bs=64)

In [None]:
# credit to Kevin Bird and Hiromi Suenaga for the original trick of adjusting a CNN model to take 1 channel input
def alter_learner(learn, channels=1):
    """Rewire the first convolution of ``learn.model`` to take ``channels`` inputs.

    The layer at ``learn.model[0][0]`` is modified in place: its
    ``in_channels`` attribute is set to ``channels`` and its weight is
    replaced by a new parameter with ``channels`` input channels.

    The new weight is built from input-channel slice 1 of the existing
    kernel. For ``channels=1`` that slice is used as is. For larger values
    it is repeated across the new input channels and divided by
    ``channels``, so the weight shape always agrees with ``in_channels``.

    Args:
        learn: Object whose ``model[0][0]`` is the convolution to alter.
        channels: Number of input channels the model should accept.

    Raises:
        ValueError: If ``channels`` is smaller than 1.
        IndexError: If the existing kernel has fewer than two input channels.
    """
    if channels < 1:
        raise ValueError(f"channels must be >= 1, got {channels}")

    first_conv = learn.model[0][0]
    # Integer indexing (rather than a 1:2 slice) keeps the IndexError for a
    # kernel that has no input channel 1.
    base_kernel = first_conv.weight.detach()[:, 1, :, :].unsqueeze(1)
    # Dividing by `channels` is a no-op for the single-channel case and keeps
    # the summed response comparable when the kernel is replicated.
    new_weight = base_kernel.repeat(1, channels, 1, 1) / channels

    first_conv.in_channels = channels
    first_conv.weight = torch.nn.parameter.Parameter(new_weight)

In [None]:
# Baseline learner: an xresnet18 trained from scratch on the spectrograms.
# xresnet18 defaults to a 1000-way output layer, so size the head to the number
# of classes in the data (`audio_dbunch.c`) instead.
learn = Learner(audio_dbunch,
                xresnet18(n_out=audio_dbunch.c),
                torch.nn.CrossEntropyLoss(),
                metrics=[accuracy])
# Read the channel count from one batch and adapt the first conv layer to it.
nchannels = audio_dbunch.one_batch()[0].shape[1]
alter_learner(learn, nchannels)

In [None]:
# Run the learning-rate finder to choose lr_max for the first training run.
learn.lr_find()

In [None]:
# Train for 5 epochs with the one-cycle policy, peak learning rate 1e-2.
learn.fit_one_cycle(5, lr_max=slice(1e-2))

In [None]:
# Re-run the learning-rate finder on the partially trained model before continuing.
learn.lr_find()

In [None]:
# Unfreeze all parameter groups and train 5 more epochs with a peak learning rate of 1e-3.
learn.unfreeze()
learn.fit_one_cycle(5, lr_max=slice(1e-3))

## Customize our AudioToSpec Function using a config

In [None]:
# Build the spectrogram transform from the predefined Voice config instead of the defaults.
voice_cfg = AudioConfig.Voice()
a2s = AudioToSpec.from_cfg(voice_cfg)
tfms = [crop_4000ms, a2s]
# Swap the item transforms on the existing DataBlock rather than rebuilding it.
# NOTE(review): direct assignment replaces whatever list the DataBlock assembled at
# construction - confirm no default item transforms are lost.
auds.item_tfms = tfms
# tfms = Pipeline([ResizeSignal(4000),  a2s, MaskFreq(size=12), MaskTime(size=15), SGRoll()], as_item=True)
dbunch250B = auds.dataloaders(commands_path, bs=64)

In [None]:
# Fresh xresnet18 for the Voice-config spectrograms.
# xresnet18 defaults to a 1000-way output layer, so size the head to the number
# of classes in the data (`dbunch250B.c`) instead.
learn = Learner(dbunch250B,
                xresnet18(n_out=dbunch250B.c),
                torch.nn.CrossEntropyLoss(),
                metrics=[accuracy])
# Read the channel count from one batch and adapt the first conv layer to it.
nchannels = dbunch250B.one_batch()[0].shape[1]
alter_learner(learn, nchannels)

In [None]:
# Run the learning-rate finder for the Voice-config model.
learn.lr_find()

In [None]:
# Better results even without fine-tuning, but much slower. We need to move a2s to the GPU and
# then add data augmentation!
# Train for 5 epochs with the one-cycle policy, peak learning rate 2e-2.
learn.fit_one_cycle(5, lr_max=slice(2e-2))

## Training an MFCC with Delta

In [None]:
# only grab 1500ms of the clip, voice identity can be done with shorter sections and it will speed it up
# this is really slow for mfcc, even for 45k files, need to figure out what's going on here. Also the results
# shouldn't be this much worse than melspectrogram
# `n_mfcc` is the keyword for the number of cepstral coefficients; it must be spelled
# exactly like this for the value to take effect.
a2mfcc = AudioToMFCC(n_mfcc=20, melkwargs={"n_fft":2048, "hop_length":256, "n_mels":128})
# Resize the signal, compute MFCCs, then apply the Delta transform.
tfms = [ResizeSignal(1500), a2mfcc, Delta()]
auds.item_tfms = tfms
# tfms = Pipeline([ResizeSignal(4000),  a2s, MaskFreq(size=12), MaskTime(size=15), SGRoll()], as_item=True)
dbunch_mfcc = auds.dataloaders(commands_path, bs=64)

In [None]:
# n_mfcc isn't getting passed down? Inspect the shape of one input batch to check.
# NOTE(review): if the coefficient axis does not match the requested count, check the
# keyword spelling where `a2mfcc` is constructed.
dbunch_mfcc.one_batch()[0].shape

In [None]:
# Fresh xresnet18 for the MFCC + Delta features. The first conv layer is left
# untouched here (no alter_learner call).
# xresnet18 defaults to a 1000-way output layer, so size the head to the number
# of classes in the data (`dbunch_mfcc.c`) instead.
learn = Learner(dbunch_mfcc,
                xresnet18(n_out=dbunch_mfcc.c),
                torch.nn.CrossEntropyLoss(),
                metrics=[accuracy])

In [None]:
# Run the learning-rate finder for the MFCC model.
learn.lr_find()

In [None]:
# Train for 5 epochs with the one-cycle policy, peak learning rate 2e-2.
learn.fit_one_cycle(5, lr_max=slice(2e-2))

<div class="alert alert-block alert-info">
<strong>From Here:</strong><br>
    1. Get transforms on the GPU <br>
    2. Once it's faster test signal and spectrogram augments for speed/efficacy<br>
    3. Fine-tune and see how high we can push results on 250 speakers
</div>