# Spectrogram data preparation

Aim: build a pipeline that produces a dataset of (spectrogram, caption) pairs.

1. Load the dataset from the Hugging Face Hub
2. Inspect the data
3. Prepare the dataset for the first speaker


In [2]:
import torchaudio
import librosa
import librosa.display
import numpy as np
from torchaudio.transforms import Spectrogram
from torchvision.transforms import functional as tf
import matplotlib.pyplot as plt

In [None]:
# Check which datasets are available

from datasets import load_dataset, list_datasets

dataset_lists = list_datasets()

# The listing is passed straight to join; no per-item transformation is needed.
print(', '.join(dataset_lists))

In [2]:
# Load the train split of the `Loie/VGGSound` repository from the Hugging Face Hub.
# NOTE(review): the recorded output shows this resolving to an `imagefolder`
# dataset with a single PNG row rather than audio clips -- confirm this is the
# intended source.
dataset = load_dataset('Loie/VGGSound', split='train')

Downloading readme:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading and preparing dataset imagefolder/Loie--VGGSound to /home/ryan/.cache/huggingface/datasets/Loie___imagefolder/Loie--VGGSound-76634360ec67a488/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/116k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/7.75M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset imagefolder downloaded and prepared to /home/ryan/.cache/huggingface/datasets/Loie___imagefolder/Loie--VGGSound-76634360ec67a488/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f. Subsequent calls will reuse this data.


In [3]:
# Print the dataset summary (features and row count), then display the first record.
print(dataset)
dataset[0]

Dataset({
    features: ['image'],
    num_rows: 1
})


{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=1924x494>}

## Mel spectrogram conversion

In [37]:
import torchaudio
import torchaudio.transforms as transforms

# Takes an existing dataset and creates a copy with a spectrogram column.
# An optional id prefix restricts which records are processed.
def transform_wav(dataset, prefix=None, n_fft=256, n_mels=80):
    """Return copies of the dataset records with a mel spectrogram attached.

    This single definition replaces two same-named functions: the later
    ``transform_wav(dataset, prefix)`` shadowed the earlier
    ``transform_wav(dataset)``, so calling it without a prefix raised
    ``TypeError``. Both call styles are supported here.

    Args:
        dataset: Iterable of mapping-like records. Every processed record must
            provide ``record['audio']['path']``, a file that
            ``torchaudio.load`` can read. ``record['id']`` (a string) is only
            read when ``prefix`` is given.
        prefix: When not ``None``, only records whose ``id`` starts with this
            string are processed. ``None`` (the default) processes all records.
        n_fft: FFT size passed to ``MelSpectrogram``.
        n_mels: Number of mel filter banks passed to ``MelSpectrogram``.

    Returns:
        A list of new dicts, one per processed record, holding the original
        fields plus a ``'spectrogram'`` entry with the ``MelSpectrogram``
        output for that record's audio file. The input records are not
        modified.
    """
    if prefix is None:
        selected = dataset
    else:
        selected = (record for record in dataset if record["id"].startswith(prefix))

    # For fixed n_fft / n_mels the transform only varies with the sample rate,
    # so build it once per distinct rate instead of once per file.
    mel_transforms = {}
    transformed_data = []

    for record in selected:
        waveform, sample_rate = torchaudio.load(record['audio']['path'], normalize=True)

        mel_transform = mel_transforms.get(sample_rate)
        if mel_transform is None:
            mel_transform = transforms.MelSpectrogram(
                sample_rate=sample_rate,
                n_fft=n_fft,
                n_mels=n_mels
            )
            mel_transforms[sample_rate] = mel_transform

        # Shallow-copy the record so the caller's data is left untouched.
        transformed_data.append({**record, 'spectrogram': mel_transform(waveform)})

    return transformed_data
        
"""
Sketch (not executed): transforming a mel spectrogram back into a waveform.

inverse_mel_transform = torchaudio.transforms.InverseMelScale(n_stft=n_fft // 2 + 1, n_mels=n_mels, sample_rate=sample_rate)
griffin_lim_transform = torchaudio.transforms.GriffinLim(n_fft=n_fft)

mel_specgram = transform(waveform)
linear_specgram = inverse_mel_transform(mel_specgram)
pseudo_waveform = griffin_lim_transform(linear_specgram)
"""
        
# The `dataset` loaded earlier in this notebook is the VGGSound image folder
# (a single 'image' column); it has no 'id' or 'audio' fields, so passing it
# here only worked through leftover kernel state. Load LJSpeech explicitly so
# the cell runs on a fresh kernel.
# NOTE(review): the Hub id "lj_speech" is inferred from the recorded output
# (LJSpeech-1.1 paths; id/audio/file/text/normalized_text fields) -- confirm.
lj_speech_dataset = load_dataset("lj_speech", split="train")
test_dataset = transform_wav(lj_speech_dataset, "LJ001")
    



{'id': 'LJ001-0001',
 'audio': {'path': '/home/ryan/.cache/huggingface/datasets/downloads/extracted/767c76e13c24c1cc26025e0fbcf1d4fe53cfaa6521aac12a67eac69b9fa07ab2/LJSpeech-1.1/wavs/LJ001-0001.wav',
  'array': array([-7.32421875e-04, -7.62939453e-04, -6.40869141e-04, ...,
          7.32421875e-04,  2.13623047e-04,  6.10351562e-05]),
  'sampling_rate': 22050},
 'file': '/home/ryan/.cache/huggingface/datasets/downloads/extracted/767c76e13c24c1cc26025e0fbcf1d4fe53cfaa6521aac12a67eac69b9fa07ab2/LJSpeech-1.1/wavs/LJ001-0001.wav',
 'text': 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition',
 'normalized_text': 'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition',
 'spectrogram': tensor([[[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [0.

In [None]:
# Convert tf records to pytorch tensors
