In [None]:
import pandas as pd
import numpy as np
import soundfile as sf
import librosa
from IPython.display import Audio
import os
from scipy.io.wavfile import write as write_wav
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from transformers import (
    AutoFeatureExtractor, ClapFeatureExtractor, GraniteSpeechFeatureExtractor, Speech2TextFeatureExtractor,
    MCTCTFeatureExtractor, UnivNetFeatureExtractor, ParakeetFeatureExtractor, SeamlessM4TFeatureExtractor,
    MusicgenMelodyFeatureExtractor, DiaFeatureExtractor,DacFeatureExtractor, EncodecFeatureExtractor,
    Speech2TextFeatureExtractor, SeamlessM4TFeatureExtractor, SpeechT5FeatureExtractor, UnivNetFeatureExtractor, 
    Wav2Vec2FeatureExtractor
)

  from .autonotebook import tqdm as notebook_tqdm
  import pynvml  # type: ignore[import]


### spotify playlists

* [jamaican patois](https://open.spotify.com/playlist/54AaUsgTHhmsdfhT5lMMdl)
* [ghanaian pidgin](https://open.spotify.com/playlist/74Nm56ijqC9O9Z2qF3kC5X)
* [american english](https://open.spotify.com/playlist/4WjFFH3mIMD6EEpDmPZKc3)
* [european spanish](https://open.spotify.com/playlist/38hpr6Usrjtey7IkdNHlaN)
* [chinese mandarin](https://open.spotify.com/playlist/76XEwiwfRRtg3QDeB2ttCD)
* [indian hindu](https://open.spotify.com/playlist/4m9VYOmySJiM7wv14um6lZ)

### audio pre-processing techniques
* [short-time fourier transform](https://librosa.org/doc/main/generated/librosa.stft.html)
    * 704 • 257
* [mel-scaled spectrogram](https://librosa.org/doc/main/generated/librosa.feature.melspectrogram.html)
    * 704 • 96
* [mel-frequency cepstral coefficients](https://librosa.org/doc/main/generated/librosa.feature.mfcc.html)
    * 704 • 20
* [audio spectrogram transformer](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/audio-spectrogram-transformer#transformers.ASTFeatureExtractor)
    * 1024 • 128
* [clap](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/clap#transformers.ClapFeatureExtractor)
    * 1001 • 64
* [descript audio codec](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/dac#overview)
    * 1 • 240,000
* [dia](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/dia#overview)
    * 1 • 240,128
* [encodec neural codec](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/encodec#overview)
    * 1 • 360,001
* [granite speech](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/granite_speech#overview)
    * 1034 • 160
* [kyutai mimi](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/mimi)
    * 1 • 330,750
* [m-ctc-t](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/mctct#overview)
    * 2065 • 80
* [moonshine](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/moonshine)
    * 1 • 330,750
* [moshiko](http://huggingface.co/docs/transformers/v4.57.1/en/model_doc/moshi)
    * couldn't load
* [musicGen melody](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/musicgen_melody#overview)
    * 81 • 12
* [parakeet](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/parakeet#overview)
    * 2068 • 80
* [pop2Piano](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/pop2piano#overview)
    * 40 • 512
* [seamlessM4T](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/seamless_m4t#overview)
    * 1033 • 160
* [speech2Text](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/speech_to_text)
    * 2065 • 80
* [speechT5](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/speecht5)
    * 1 • 330,750
* [uniSpeech](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/unispeech#overview)
    * 1 • 330,750
* [uniSpeechSAT](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/unispeech-sat#overview)
    * 1 • 330,750
* [univNet](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/univnet#overview)
    * 937 • 100
* [wav2vec2](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/wav2vec2#overview)
    * 1 • 330,750
* [wav2vec2 conformer](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/wav2vec2-conformer)
    * 1 • 330,750
* [wavLM](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/wavlm)
    * 1 • 330,750
* [whisper](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/whisper#whisper)
    * 3000 • 80
* [x-codec](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/xcodec#overview)
    * 1 • 330,880

### future considerations
* [stft-spectrogram layer](https://keras.io/examples/audio/stft/)
* [melspectrogram layer](https://keras.io/api/layers/preprocessing_layers/audio_preprocessing/mel_spectrogram/)
* [comprehensive AutoFeatureExtractor library](https://huggingface.co/docs/transformers/en/model_doc/auto#audio)

### questions
* how long does it take for each method to produce the preprocessed data ?
    * **reasoning:** if it takes too long, no sense being part of the evolutionary process. it already takes a very long time to build and train the models, adding preprocessing at run time on top of that is intense, on top of it making the search space even more vast with all the specifications for each feature extraction technique. it will take even longer for this evo process to run if every single time a model runs it preprocesses the data. this can be a future step where there are GPU and computational resources to accelerate this pipeline. i can even provide code that does this for others with more computing power to run.


In [None]:
# create a class object to store song information
class Song:
    
    def __init__(self, name, audio_vector, lang, samplerate, length, split):
        self.name = name
        self.audio = audio_vector     
        self.lang = lang              
        self.samplerate = samplerate
        self.length = length
        self.split = split

# load data 
* save values into Song objects (language + sample rate)
* load audio into 1D vectors
* assign train-test-validation splits to object instances

In [3]:
def process_song(path, lang, sr=22050, clip_len=15):
    
    import subprocess, io, soundfile as sf, numpy as np

    # try fast path with librosa; on failure, robust fallback via ffmpeg -> wav bytes
    try:
        y, _ = librosa.load(path, sr=sr, mono=True)
    except Exception:
        try:
            proc = subprocess.run(
                [
                    'ffmpeg', '-v', 'error', '-nostdin',
                    '-i', path,
                    '-ac', '1', '-ar', str(sr),
                    '-f', 'wav', '-' 
                ],
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            wav_bytes = io.BytesIO(proc.stdout)
            y, _ = sf.read(wav_bytes, dtype='float32')
            if y.ndim > 1:
                y = np.mean(y, axis=1)
        except Exception:
            return []  # skip unreadable/corrupt file

    name = os.path.splitext(os.path.basename(path))[0].lower()

    # validate audio
    if y is None or len(y) == 0:
        return []

    # total duration in seconds
    duration = librosa.get_duration(y=y, sr=sr)
    if duration < 120:
        return []  # not enough audio for 2-minute center segment
    center = duration / 2

    # find center 2-minute segment
    start = int((center - 120 / 2) * sr)
    end = int((center + 120 / 2) * sr)
    y = y[start:end]

    # split into 15-second clips (8 clips from 2-minute segment)
    clip_size = clip_len * sr
    clips = [y[i:i + clip_size] for i in range(0, len(y), clip_size) if len(y[i:i + clip_size]) == clip_size]

    # assign splits deterministically
    num_clips = len(clips)
    if num_clips == 0:
        return []
    
    split_assignments = []
    for i in range(num_clips):
        if i == 0:
            split_assignments.append('train')
        elif i == 1:
            split_assignments.append('test')
        elif i == 2:
            split_assignments.append('train')
        elif i == 3:
            split_assignments.append('validation')
        elif i == 4:
            split_assignments.append('train')
        elif i == 5:
            split_assignments.append('test')
        else:
            split_assignments.append('train')
    
    # wrap into Song objects with appropriate split assignments
    return [Song(name, clip, lang, sr, clip_len, split_assignments[i]) for i, clip in enumerate(clips)]


In [None]:
# create list of Song objects for all languages
languages = ['patois', 'mandarin', 'english', 'spanish', 'hindi', 'pidgin']
mp3_files = []

total_files = sum(len([f for f in os.listdir(f'./{lang}') if f.endswith('.mp3')]) for lang in languages)
processed = 0

for lang in languages:
    folder = f'./{lang}'
    for file in os.listdir(folder):
        if file.endswith('.mp3'):
            path = os.path.join(folder, file)
            try:
                clips = process_song(path, lang)
                mp3_files.extend(clips)
                processed += 1
                if processed % 50 == 0:
                    print(f"Processed {processed}/{total_files} files...")
            except Exception as e:
                print(f"Error processing {file}: {e}")

print(f"Completed! Processed {processed} files, created {len(mp3_files)} clips total.")

Processed 50/93 files...
Completed! Processed 93 files, created 739 clips total.


# visualize and listen to song audios

In [5]:
# include UI to listen to songs before and after sampling to ensure there is no data corruption
def visualize_audio(audio_vector, sampling_rate=22050, seconds=None, out_dir=None):
    """
    purpose:
        visualize your 1D audio vector as playable audio
    params:
        audio_vector: 1D numpy array of raw audio data
        sampling_rate: sample rate of your audio
        seconds: number of seconds to play
        out_dir: directory to save WAV file (None = just display)
    output:
        None if just display, audio saved location if input provodied for out_dir
    """
    
    # play only a portion
    if seconds is not None:
        samples_to_play = seconds * sampling_rate
        waveform = audio_vector[:samples_to_play]
    else:
        waveform = audio_vector
    
    # normalize to prevent clipping
    waveform = waveform / np.max(np.abs(waveform))
    
    # display the audio if no path is provided
    if out_dir is None:
        return Audio(waveform, rate=sampling_rate)
    
    # save the audio to a file
    os.makedirs(out_dir, exist_ok=True)
    audio_path = os.path.join(out_dir, "generated_audio.wav")
    
    # convert to 16-bit PCM for WAV file
    waveform_int16 = (waveform * 32767).astype(np.int16)
    write_wav(audio_path, sampling_rate, waveform_int16)
    return audio_path


In [6]:
# listen to the first song in the dataset
song = mp3_files[0]
sample = song.audio
visualize_audio(sample)

In [7]:
# listen to a downsample version of the first song in the dataset
downsample = librosa.resample(y=sample, orig_sr=22050, target_sr=12000)
visualize_audio(downsample, 12000)

# feature extraction methods

In [8]:
print(song._mel_spectrogram_features().T.shape)
print(song._stft_features().T.shape)
print(song._mfcc_features().T.shape)

(704, 96)
(704, 257)
(704, 20)


In [9]:
# load the AST model (trained with sr of 16000Hz - matched audio to be compatible with the model)
downsample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
feature_extractor(downsample, sampling_rate=16000, return_tensors="pt", hop_length=256)['input_values'][0].shape

torch.Size([1024, 128])

In [10]:
# load the CLAP model (trained with sr of 48000Hz - matched audio to be compatible with the model)
downsample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=48000)
feature_extractor = ClapFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
feature_extractor(downsample, sampling_rate=48000, return_tensors="pt", hop_length=256)['input_features'][0][0].shape

torch.Size([1001, 64])

In [11]:
# load the DAC model (trained with sr of 16000Hz - matched audio to be compatible with the model)
downsample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = DacFeatureExtractor.from_pretrained("descript/dac_16khz")
feature_extractor(downsample, sampling_rate=16000, return_tensors="pt")['input_values'][0].shape


torch.Size([1, 240000])

In [12]:
# load the DIA model (trained with sr of 16000Hz - matched audio to be compatible with the model)
downsample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = DiaFeatureExtractor()  # use the exact Dia repo ID
feature_extractor(downsample, return_tensors="pt", sampling_rate=16000)['input_values'][0].shape

torch.Size([1, 240128])

In [13]:
# load the Encodec model (trained with sr of 24000Hz - matched audio to be compatible with the model)
downsample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=24000)
feature_extractor = EncodecFeatureExtractor()
unspecific = feature_extractor(downsample, sampling_rate=24000, return_tensors="pt")['input_values']
feature_extractor = EncodecFeatureExtractor.from_pretrained("facebook/encodec_24khz")
specific = feature_extractor(downsample, sampling_rate=24000, return_tensors="pt")['input_values']
specific[0].shape, unspecific[0].shape

(torch.Size([1, 360001]), torch.Size([1, 360001]))

In [14]:
# load the Granite Speech model (trained with sr of 16000Hz - matched audio to be compatible with the model)
downsample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = GraniteSpeechFeatureExtractor()
feature_extractor(song.audio)["input_features"][0].shape  # not 'input_values'

torch.Size([1034, 160])

In [15]:
# load the Mimi model (trained with sr of 24000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=24000)
feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/mimi")
feature_extractor(raw_audio=song.audio, sampling_rate=24000, return_tensors="pt")['input_values'].shape

torch.Size([1, 1, 330750])

In [16]:
# load the MCTCT model (trained with sr of 16000Hz - matched audio to be compatible with the model)
downsample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = MCTCTFeatureExtractor()
feature_extractor(raw_speech=song.audio, sampling_rate=16000, return_tensors="pt")['input_features'][0].shape


torch.Size([2065, 80])

In [17]:
# load the Moonshine model (trained with sr of 16000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_values'].shape

torch.Size([1, 330750])

In [18]:
# TODO: cannot find the exact Moshiko repo ID

# # load the Kyutai Moshiko model (trained with sr of 16000Hz - matched audio to be compatible with the model)
# resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
# feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/moshiko-pytorch-bf16")
# feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")#['input_values'].shape

In [19]:
# load the MusicGen model (trained with sr of 16000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=32000)
feature_extractor = MusicgenMelodyFeatureExtractor.from_pretrained("facebook/musicgen-small")
feature_extractor(audio=song.audio, sampling_rate=32000)['input_features'][0].shape

torch.Size([81, 12])

In [20]:
# load the Parakeet model (trained with sr of 16000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = ParakeetFeatureExtractor()
feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_features'][0].shape

torch.Size([2068, 80])

In [21]:
from transformers import Pop2PianoFeatureExtractor

# feel free to change the sr to a suitable value.
feature_extractor = Pop2PianoFeatureExtractor.from_pretrained("sweetcocoa/pop2piano")
feature_extractor(audio=song.audio, sampling_rate=22050, return_attention_mask=True, return_tensors="pt")['input_features'][0].shape


torch.Size([57, 512])

In [22]:
# load the SeamlessM4T model (trained with sr of 16000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained("facebook/hf-seamless-m4t-medium") #""
feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_features'][0].shape


torch.Size([1033, 160])

In [23]:
# load the Speech2Text model (trained with sr of 16000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = Speech2TextFeatureExtractor.from_pretrained("facebook/s2t-small-librispeech-asr")
feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_features'][0].shape


torch.Size([2065, 80])

In [24]:
# load the SpeechT5 model (trained with sr of 16000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
# feature_extractor = SpeechT5FeatureExtractor.from_pretrained("microsoft/speecht5-v2-large")
feature_extractor = SpeechT5FeatureExtractor()
feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_values'].shape


torch.Size([1, 330750])

In [25]:
# load the UniSpeech model (trained with sr of 16000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_values'].shape

torch.Size([1, 330750])

In [26]:
# load the UniSpeech SAT model (trained with sr of 16000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-sat-base-100h-libri-ft")
feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_values'].shape

torch.Size([1, 330750])

In [27]:
# load the UnivNet model (trained with sr of 24000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=24000)
feature_extractor = UnivNetFeatureExtractor.from_pretrained("dg845/univnet-dev")
feature_extractor(song.audio, sampling_rate=24000, return_tensors="pt")['input_features'][0].shape

torch.Size([937, 100])

In [28]:
# load the Wav2Vec2 model (trained with sr of 22050Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_values'].shape

torch.Size([1, 330750])

In [29]:
# load the Wav2Vec2 Conformer model (trained with sr of 16000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large")
feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_values'].shape

torch.Size([1, 330750])

In [30]:
# load the WavLM model (trained with sr of 16000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/wavlm-base")
feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_values'].shape

torch.Size([1, 330750])

In [31]:
# load the Whisper model (trained with sr of 16000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = AutoFeatureExtractor.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id")
feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_features'][0].T.shape

torch.Size([3000, 80])

In [32]:
# load the x-codec model (trained with sr of 22050Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = AutoFeatureExtractor.from_pretrained("hf-audio/xcodec-hubert-librispeech")
feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_values'][0].shape

torch.Size([1, 330880])