# GPU-Backed Spectrograms

In [1]:
from torchaudio.transforms import MelSpectrogram, PadTrim
from fastai.vision import Image, open_image, image2np
from fastai.basics import *
import time
import PIL
import numpy as np
from torchaudio import load as load_audio
from concurrent.futures import ProcessPoolExecutor
import multiprocessing

In [2]:
# Lower-cased file extensions whose registered MIME type is in the audio/* family.
AUDIO_EXTENSIONS = tuple(
    ext.lower()
    for ext, mime in mimetypes.types_map.items()
    if mime.startswith('audio/')
)

In [3]:
#Export
class SPEC2DB(object):
    """Turns a spectrogram from the power/amplitude scale to the decibel scale.

    The output is expressed relative to the largest element of the input, so
    the peak maps to 0 dB and every other element is <= 0 dB.

    Args:
        stype (str): scale of input spectrogram ("power" or "magnitude").  The
            power being the elementwise square of the magnitude. "power" uses
            a multiplier of 10, any other value uses 20. default: "power"
        top_db (float, optional): lower cut-off in decibels.  A positive value
            is negated, so 80 and -80 both floor the output at -80 dB.
            ``None`` (the default) disables the cut-off.
    """
    def __init__(self, stype="power", top_db=None):
        self.stype = stype
        # Guard the None default: `None > 0` raises TypeError on Python 3,
        # which previously made `SPEC2DB()` unusable without an explicit top_db.
        if top_db is not None and top_db > 0:
            top_db = -top_db
        self.top_db = top_db
        self.multiplier = 10. if stype == "power" else 20.

    def __call__(self, spec):
        """Return `spec` converted to dB relative to its own maximum.

        Args:
            spec (Tensor): spectrogram values on the scale given by `stype`.

        Returns:
            Tensor: same shape as `spec`, floored at `top_db` when it is set.
        """
        spec_db = self.multiplier * torch.log10(spec / spec.max())  # power -> dB
        if self.top_db is not None:
            # clamp(min=...) applies the floor without allocating a helper tensor.
            spec_db = torch.clamp(spec_db, min=self.top_db)
        return spec_db

In [4]:
## The archive is published at http://www.openslr.org/resources/45/ST-AEDS-20180100_1-OS.tgz,
## but the extension is stripped off here because fastai gets confused otherwise
## (presumably untar_data appends the extension itself -- TODO confirm).
data_url = 'http://www.openslr.org/resources/45/ST-AEDS-20180100_1-OS'
## The source tar file doesn't extract to its own folder, so an explicit
## destination directory derived from the URL name is passed as `dest`.
p = datapath4file(url2name(data_url))
untar_data(data_url, dest=p)
p  # last expression of the cell: displays the resolved data directory

PosixPath('/home/h/.fastai/data/ST-AEDS-20180100_1-OS')

In [27]:
def _to_uint8_image(arr):
    """Min-max rescale a numeric array into the 0..255 range as ``uint8``.

    Non-finite entries are tolerated: +/-inf are clipped to the finite
    extremes and NaN becomes 0.  A constant (or entirely non-finite) input
    yields an all-zero image.
    """
    arr = np.asarray(arr, dtype=np.float32)
    finite = np.isfinite(arr)
    if not finite.any():
        return np.zeros(arr.shape, dtype=np.uint8)
    lo, hi = arr[finite].min(), arr[finite].max()
    if hi <= lo:
        return np.zeros(arr.shape, dtype=np.uint8)
    scaled = (np.clip(arr, lo, hi) - lo) / (hi - lo)
    scaled = np.nan_to_num(scaled)
    return np.round(scaled * 255).astype(np.uint8)


def create_spectogram(p, to_db_scale=True, force_cache=False, n_fft=1024,
                ws=None, hop=72, f_min=0.0, f_max=200, pad=0, n_mels=224, max_to_pad=16000):
    """Render the mel spectrogram of audio file `p` to a JPEG next to it.

    The image is written to `p` with its suffix replaced by ``.jpg``.

    Args:
        p (Path): audio file to convert (must support ``with_suffix``).
        to_db_scale (bool): convert the spectrogram to decibels before saving.
        force_cache (bool): regenerate the image even if it already exists.
            When False, an existing image is kept and the file is skipped.
        n_fft, ws, hop, f_min, f_max, pad, n_mels: forwarded to
            ``MelSpectrogram``.  `f_max` is additionally used as the dB
            cut-off (`top_db`) of the decibel conversion.
        max_to_pad (int): length passed to ``PadTrim`` as ``max_len``.

    Returns:
        None.  The result is the image file written to disk.
    """
    image_path = p.with_suffix('.jpg')
    if image_path.exists() and not force_cache:
        # Cached result is still on disk; skip the load and transform.
        return

    sig, sr = load_audio(p)
    # Use the GPU when one is present, but stay runnable on CPU-only machines.
    if torch.cuda.is_available():
        sig = sig.cuda()
    sig = PadTrim(max_len=max_to_pad)(sig).squeeze()
    mel = MelSpectrogram(sr=sr, n_mels=n_mels, n_fft=n_fft, ws=ws, hop=hop,
                    f_min=f_min, f_max=f_max, pad=pad)(sig.reshape(1, -1))
    # Swap the last two axes (presumably so mel bins become image rows --
    # confirm against the installed torchaudio's output layout).
    mel = mel.permute(0, 2, 1)
    if to_db_scale:
        # NOTE(review): `f_max` (a frequency bound) doubles as the dB floor, and
        # stype is 'magnitude' although MelSpectrogram may emit power values.
        # Both are kept as-is for backward compatibility -- confirm intent.
        mel = SPEC2DB(stype='magnitude', top_db=f_max)(mel)
    # Rescale before the uint8 cast: casting raw (negative) dB floats would wrap
    # around instead of mapping to pixel intensities.
    x = _to_uint8_image(image2np(mel))
    PIL.Image.fromarray(x).save(image_path)

def spectogram_files(folder, **kwargs):
    """Create a spectrogram image for every audio file found under `folder`.

    Files are selected with ``get_files`` using ``AUDIO_EXTENSIONS``; all
    keyword arguments are forwarded unchanged to ``create_spectogram``.
    """
    audio_paths = get_files(folder, extensions=AUDIO_EXTENSIONS)
    render = partial(create_spectogram, **kwargs)
    for audio_path in audio_paths:
        render(audio_path)

In [28]:
# Pad/trim target in samples (presumably 4 s at 16 kHz -- confirm the
# dataset's sample rate).
max_length = 4 * 16000

# Keyword arguments forwarded to create_spectogram for every file.
tfm_params = dict(
    max_to_pad=max_length,
    to_db_scale=True,
    f_max=80,
)

# Wall-clock timing of the whole folder conversion; the final expression is
# the cell's output (elapsed seconds).
t = time.time()
spectogram_files(p, **tfm_params)
time.time() - t

11.725131034851074