In [1]:
import pandas as pd
import numpy as np
import soundfile as sf
import librosa
from IPython.display import Audio
import os
from scipy.io.wavfile import write as write_wav
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

### spotify playlists

* [jamaican patois](https://open.spotify.com/playlist/54AaUsgTHhmsdfhT5lMMdl)
* [ghanaian pidgin](https://open.spotify.com/playlist/74Nm56ijqC9O9Z2qF3kC5X)
* [american english](https://open.spotify.com/playlist/4WjFFH3mIMD6EEpDmPZKc3)
* [european spanish](https://open.spotify.com/playlist/38hpr6Usrjtey7IkdNHlaN)
* [chinese mandarin](https://open.spotify.com/playlist/76XEwiwfRRtg3QDeB2ttCD)
* [indian hindu](https://open.spotify.com/playlist/4m9VYOmySJiM7wv14um6lZ)

### audio pre-processing techniques
* [short-time fourier transform ](https://librosa.org/doc/main/generated/librosa.stft.html)
* [mel-scaled spectrogram](https://librosa.org/doc/main/generated/librosa.feature.melspectrogram.html)
* [mel-frequency cepstral coefficients](https://librosa.org/doc/main/generated/librosa.feature.mfcc.html)
* [audio spectrogram transformer](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/audio-spectrogram-transformer#transformers.ASTFeatureExtractor)
* [clap](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/clap#transformers.ClapFeatureExtractor)
* [!descript audio codec](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/dac#overview)
* [dia](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/dia#overview)
* [encodec neural codec](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/encodec#overview)
* [granite speech](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/granite_speech#overview)
* [kyutai mimi](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/mimi)
* [m-ctc-t](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/mctct#overview)
* [moonshine](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/moonshine)
* [moshiko](http://huggingface.co/docs/transformers/v4.57.1/en/model_doc/moshi)
* [musicGen melody](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/musicgen_melody#overview)
* [parakeet](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/parakeet#overview)
* [pop2Piano](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/pop2piano#overview)
* [seamlessM4T](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/seamless_m4t#overview)
* [speech2Text](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/speech_to_text)
* [speechT5](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/speecht5)
* [uniSpeech](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/unispeech#overview)
* [uniSpeechSAT](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/unispeech-sat#overview)
* [univNet](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/univnet#overview)
* [wav2vec2](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/wav2vec2#overview)
* [wav2vec2 conformer](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/wav2vec2-conformer)
* [wavLM](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/wavlm)
* [whisper](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/whisper#whisper)
* [x-codec](https://huggingface.co/docs/transformers/v4.57.1/en/model_doc/xcodec#overview)

### future considerations
* [stft-spectrogram layer](https://keras.io/examples/audio/stft/)
* [melspectrogram layer](https://keras.io/api/layers/preprocessing_layers/audio_preprocessing/mel_spectrogram/)
* [comprehensive AutoFeatureExtractor library](https://huggingface.co/docs/transformers/en/model_doc/auto#audio)

### questions
* how long does it take for each method to produce the preprocessed data ?
    * **reasoning:** if it takes too long, no sense being part of the evolutionary process. it already takes a very long time to build and train the models, adding preprocessing at run time on top of that is intense, on top of it making the search space even more vast with all the specifications for each feature extraction technique. it will take even longer for this evo process to run if every single time a model runs it preprocesses the data. this can be a future step where there are GPU and computational resources to accelerate this pipeline. i can even provide code that does this for others with more computing power to run.


In [2]:
# create a class object to store song information
class Song:
    
    def __init__(self, name, audio_vector, lang, samplerate, length, split):
        self.name = name
        self.audio = audio_vector     
        self.lang = lang              
        self.samplerate = samplerate
        self.length = length
        self.split = split

    def create_feature_representations(self):
        
        self.stft = self._stft_features()
        self.mel_specs = self._mel_spectrogram_features()
        self.mfccs = self._mfcc_features()

        self.ast = self._ast_features()
        self.clap = self._clap_features()
        self.dac = self._dac_features()
        
        self.dia = self._dia_features()
        self.encodec = self._encodec_features()
        self.granite = self._granite_speech_features()
        self.kyutai = self._kyutai_mimi_features()
        
        self.mctct = self._m_ctc_t_features()
        self.moonshine = self._moonshine_features()
        self.moshiko = self._moshiko_features()
        
        self.musicgen = self._musicGen_features()
        self.parakeet = self._parakee_features()
        self.pop2piano = self._pop2Piano_features()
        
        self.seamlessm4t = self._seamlessM4T_features()
        self.speech2text = self._speech2Text_features()
        self.speecht5 = self._speechT5_features()
        
        self.unispeech = self._uniSpeech_features()
        self.univnet = self._univNet_features()
        self.wav2vec2 = self._wav2vec2_features()
        
        self.wav2vec2_con = self._wav2vec2_conformer_features()
        self.wavLM = self._wavLM_features()
        self.whisper = self._whisper_features()
        self.xcodec = self._x_codec_features()

    def _resample_audio(self, target_sr): 
        resample = librosa.resample(y=self.audio, orig_sr=self.samplerate, target_sr=target_sr)
        return resample


    def _stft_features(self):
        # create short-time fourier transform spectrograms
        resample = self._resample_audio(12000)
        self.stft = librosa.stft(
            y = resample,
            n_fft = 512,
            hop_length = 256,
        )
        return self.stft

    def _mel_spectrogram_features(self):
        # create mel-scaled spectrograms
        resample = self._resample_audio(12000)
        self.mel_spec = librosa.feature.melspectrogram(
            y=resample, 
            sr=12000, 
            hop_length = 256, 
            n_fft = 512, 
            n_mels=96
        )
        return self.mel_spec


        
    def _mfcc_features(self):
        # create mel-frequency cepstral coefficients
        resample = self._resample_audio(12000)
        self.mfccs = librosa.feature.mfcc(
            y=resample, 
            sr=12000, 
            hop_length = 256, 
            n_fft = 512, 
        )
        return self.mfccs

    def _ast_features(self):
        # TODO: create audio spectrogram transformers
        pass

    def _clap_features(self):
        # TODO: create features using clap model embeddings
        pass

    def _dac_features(self):
        # TODO: create features using descript audio codec model embeddings
        pass

    def _dia_features(self):
        # TODO: create features using dia model embeddings
        pass

    def _encodec_features(self):
        # TODO: create features using encodec neural codec model embeddings
        pass

    def _granite_speech_features(self):
        # TODO: create features using granite speech model embeddings
        pass

    def _kyutai_mimi_features(self):
        # TODO: create features using kyutai mimi model embeddings
        pass

    def _m_ctc_t_features(self):
        # TODO: create features using m-ctc-t model embeddings
        pass

    def _moonshine_features(self):
        # TODO: create features using moonshine model embeddings
        pass

    def _moshiko_features(self):
        # TODO: create features using moshiko model embeddings
        pass

    def _musicGen_features(self):
        # TODO: create features using musicGen melody model embeddings
        pass

    def _parakeet_features(self):
        # TODO: create features using parakeet model embeddings
        pass
        
    def _pop2Piano_features(self):
        # TODO: create features using pop2Piano model embeddings  
        pass      

    def _seamlessM4T_features(self):
        # TODO: create features using seamlessM4T model embeddings
        pass

    def _speech2Text_features(self):
        # TODO: create features using speech2Text model embeddings
        pass

    def _speechT5_features(self):
        # TODO: create features using speechT5 model embeddings
        pass

    def _uniSpeech_features(self):
        # TODO: create features using uniSpeech model embeddings
        pass

    def _univNet_features(self):
        # TODO: create features using univNet model embeddings
        pass

    def _wav2vec2_features(self):
        # TODO: create features using wav2vec2 model embeddings
        pass

    def _wav2vec2_conformer_features(self):
        # TODO: create features using wav2vec2 conformer model embeddings
        pass
 
    def _wavLM_features(self):
        # TODO: create features using wavLM model embeddings
        pass

    def _whisper_features(self):
        # TODO: create features using whisper model embeddings
        pass

    def _x_codec_features(self):
        # TODO: create features using x-codec model embeddings
        pass





# load data 
* save values into Song objects (language + sample rate)
* load audio into 1D vectors
* assign train-test-validation splits to object instances

In [3]:
def process_song(path, lang, sr=22050, clip_len=15):
    
    # load song with the same samplerate as patois songs
    y, _ = librosa.load(path, sr=sr, mono=True)
    name = os.path.splitext(os.path.basename(path))[0].lower()

    # total duration in seconds
    duration = librosa.get_duration(y=y, sr=sr)
    center = duration / 2

    # find center 2-minute segment
    start = int((center - 120 / 2) * sr)
    end = int((center + 120 / 2) * sr)
    y = y[start:end]

    # split into 15-second clips (8 clips from 2-minute segment)
    clip_size = clip_len * sr
    clips = [y[i:i + clip_size] for i in range(0, len(y), clip_size) if len(y[i:i + clip_size]) == clip_size]

    # assign splits: 1st and last clip to train, 2nd clip to test, 3rd clip to validation
    num_clips = len(clips)
    if num_clips == 0:
        return []  # return empty list if no valid clips
    
    split_assignments = []
    for i in range(num_clips):
        if i == 0:  # first clip
            split_assignments.append('train')
        elif i == 1:  # second clip
            split_assignments.append('test')
        elif i == 2:  # third clip
            split_assignments.append('train')
        elif i == 3:  # fourth clip
            split_assignments.append('validation')
        elif i == 4:  # fifth clip
            split_assignments.append('train')
        elif i == 5:  # sixth clip
            split_assignments.append('test')
        elif i == 6:  # seventh clip
            split_assignments.append('train')
        elif i == 7:  # eighth clip
            split_assignments.append('train')
    
    # wrap into Song objects with appropriate split assignments
    return [Song(name, clip, lang, sr, clip_len, split_assignments[i]) for i, clip in enumerate(clips)]


In [4]:
# create list of Song objects for all languages
languages = ['patois', 'mandarin', 'english', 'spanish', 'hindi', 'pidgin']
mp3_files = []

total_files = sum(len([f for f in os.listdir(f'./{lang}') if f.endswith('.mp3')]) for lang in languages)
processed = 0

for lang in languages:
    folder = f'./{lang}'
    for file in os.listdir(folder):
        if file.endswith('.mp3'):
            path = os.path.join(folder, file)
            try:
                clips = process_song(path, lang)
                mp3_files.extend(clips)
                processed += 1
                if processed % 50 == 0:
                    print(f"Processed {processed}/{total_files} files...")
            except Exception as e:
                print(f"Error processing {file}: {e}")

print(f"Completed! Processed {processed} files, created {len(mp3_files)} clips total.")

Processed 50/575 files...
Processed 100/575 files...
Processed 150/575 files...
Processed 200/575 files...
Processed 250/575 files...
Processed 300/575 files...


[src/libmpg123/layer3.c:INT123_do_layer3():1844] error: dequantization failed!
Note: Illegal Audio-MPEG-Header 0x77bd7d56 at offset 945978.
Note: Trying to resync...
Note: Skipped 134 bytes in input.
Note: Illegal Audio-MPEG-Header 0x0e9b81e3 at offset 1082816.
Note: Trying to resync...
Note: Skipped 98 bytes in input.
Note: Illegal Audio-MPEG-Header 0xd30bc33e at offset 3202978.
Note: Trying to resync...
Note: Skipped 152 bytes in input.


Processed 350/575 files...
Processed 400/575 files...
Processed 450/575 files...
Processed 500/575 files...
Processed 550/575 files...
Completed! Processed 575 files, created 4562 clips total.


In [5]:
# check how many of each language is in the train, test and validation splits
languages = ['patois', 'mandarin', 'english', 'spanish', 'hindi', 'pidgin']
splits = ['train', 'test', 'validation']

# create a dictionary to store counts
split_counts = {split: {lang: 0 for lang in languages} for split in splits}

# count songs by split and language
for song in mp3_files:
    if song.split in splits and song.lang in languages:
        split_counts[song.split][song.lang] += 1

# display results
print(f"{'language':<12} {'train':<8} {'test':<8} {'validation':<12} {'total':<8}")
print("-" * 50)

for lang in languages:
    train_count = split_counts['train'][lang]
    test_count = split_counts['test'][lang]
    val_count = split_counts['validation'][lang]
    total = train_count + test_count + val_count
    print(f"{lang:<12} {train_count:<8} {test_count:<8} {val_count:<12} {total:<8}")

# print totals
train_total = sum(split_counts['train'].values())
test_total = sum(split_counts['test'].values())
val_total = sum(split_counts['validation'].values())
grand_total = train_total + test_total + val_total

print("-" * 50)
print(f"{'TOTAL':<12} {train_total:<8} {test_total:<8} {val_total:<12} {grand_total:<8}")

language     train    test     validation   total   
--------------------------------------------------
patois       519      210      105          834     
mandarin     473      190      95           758     
english      460      186      93           739     
spanish      491      198      99           788     
hindi        404      162      81           647     
pidgin       496      200      100          796     
--------------------------------------------------
TOTAL        2843     1146     573          4562    


# visualize and listen to song audios

In [6]:
# include UI to listen to songs before and after sampling to ensure there is no data corruption
def visualize_audio(audio_vector, sampling_rate=22050, seconds=None, out_dir=None):
    """
    purpose:
        visualize your 1D audio vector as playable audio
    params:
        audio_vector: 1D numpy array of raw audio data
        sampling_rate: sample rate of your audio
        seconds: number of seconds to play
        out_dir: directory to save WAV file (None = just display)
    output:
        None if just display, audio saved location if input provodied for out_dir
    """
    
    # play only a portion
    if seconds is not None:
        samples_to_play = seconds * sampling_rate
        waveform = audio_vector[:samples_to_play]
    else:
        waveform = audio_vector
    
    # normalize to prevent clipping
    waveform = waveform / np.max(np.abs(waveform))
    
    # display the audio if no path is provided
    if out_dir is None:
        return Audio(waveform, rate=sampling_rate)
    
    # save the audio to a file
    os.makedirs(out_dir, exist_ok=True)
    audio_path = os.path.join(out_dir, "generated_audio.wav")
    
    # convert to 16-bit PCM for WAV file
    waveform_int16 = (waveform * 32767).astype(np.int16)
    write_wav(audio_path, sampling_rate, waveform_int16)
    return audio_path


In [7]:
# listen to the first song in the dataset
song = mp3_files[0]
sample = song.audio
visualize_audio(sample)

In [8]:
# listen to a downsample version of the first song in the dataset
downsample = librosa.resample(y=sample, orig_sr=22050, target_sr=12000)
visualize_audio(downsample, 12000)

# feature extraction methods

In [9]:
print(song._mel_spectrogram_features().T.shape)
print(song._stft_features().T.shape)
print(song._mfcc_features().T.shape)


(704, 96)
(704, 257)
(704, 20)


In [10]:
from transformers import AutoFeatureExtractor

# load the AST model (trained with sr of 16000Hz - matched audio to be compatible with the model)
downsample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
feature_extractor(downsample, sampling_rate=16000, return_tensors="pt", hop_length=256)['input_values'][0].shape

  from .autonotebook import tqdm as notebook_tqdm
  import pynvml  # type: ignore[import]


OSError: dlopen(/Users/anandafrancis/ananda/c-resources/d-leverage/credentials/accreditation/degrees/northeastern/fa25/thesis/evolutionary-ml/venv-arm/lib/python3.11/site-packages/torchaudio/lib/libtorchaudio.so, 0x0006): Symbol not found: __ZNK3c106SymInt16sym_ne_slow_pathERKS0_
  Referenced from: <DE680148-C6F1-3330-99B6-BC1C0317F015> /Users/anandafrancis/ananda/c-resources/d-leverage/credentials/accreditation/degrees/northeastern/fa25/thesis/evolutionary-ml/venv-arm/lib/python3.11/site-packages/torchaudio/lib/libtorchaudio.so
  Expected in:     <F9AA07B6-BB5F-3065-AD69-63C99B7E4340> /Users/anandafrancis/ananda/c-resources/d-leverage/credentials/accreditation/degrees/northeastern/fa25/thesis/evolutionary-ml/venv-arm/lib/python3.11/site-packages/torch/lib/libc10.dylib

In [None]:
from transformers import ClapFeatureExtractor

# load the CLAP model (trained with sr of 48000Hz - matched audio to be compatible with the model)
downsample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=48000)
feature_extractor = ClapFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
feature_extractor(downsample, sampling_rate=48000, return_tensors="pt", hop_length=256)['input_features'][0][0].shape

torch.Size([1001, 64])

In [None]:
from transformers import DacFeatureExtractor

# load the DAC model (trained with sr of 16000Hz - matched audio to be compatible with the model)
downsample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = DacFeatureExtractor.from_pretrained("descript/dac_16khz")
feature_extractor(downsample, sampling_rate=16000, return_tensors="pt")['input_values'].shape


torch.Size([1, 1, 240000])

In [None]:
from transformers import DiaFeatureExtractor        # can't find the exact Dia repo ID

# load the DIA model (trained with sr of 16000Hz - matched audio to be compatible with the model)
downsample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = DiaFeatureExtractor()  # use the exact Dia repo ID
feature_extractor(downsample, return_tensors="pt", sampling_rate=16000)['input_values'].shape

torch.Size([1, 1, 240128])

In [None]:
from transformers import EncodecFeatureExtractor 

# load the Encodec model (trained with sr of 24000Hz - matched audio to be compatible with the model)
downsample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=24000)
feature_extractor = EncodecFeatureExtractor()
unspecific = feature_extractor(downsample, sampling_rate=24000, return_tensors="pt")['input_values']
feature_extractor = EncodecFeatureExtractor.from_pretrained("facebook/encodec_24khz")
specific = feature_extractor(downsample, sampling_rate=24000, return_tensors="pt")['input_values']
specific.shape, unspecific.shape

(torch.Size([1, 1, 360001]), torch.Size([1, 1, 360001]))

In [None]:
from transformers import GraniteSpeechFeatureExtractor  # can't find the exact Granite Speech repo ID

# load the Granite Speech model (trained with sr of 16000Hz - matched audio to be compatible with the model)
downsample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = GraniteSpeechFeatureExtractor()
feature_extractor(song.audio)["input_features"][0].shape  # not 'input_values'

torch.Size([1034, 160])

In [None]:
# load the Mimi model (trained with sr of 24000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=24000)
feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/mimi")
feature_extractor(raw_audio=song.audio, sampling_rate=24000, return_tensors="pt")['input_values'].shape

torch.Size([1, 1, 330750])

In [None]:
from transformers import MCTCTFeatureExtractor

# load the MCTCT model (trained with sr of 16000Hz - matched audio to be compatible with the model)
downsample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = MCTCTFeatureExtractor()
feature_extractor(raw_speech=song.audio, sampling_rate=16000, return_tensors="pt")['input_features'][0].shape


torch.Size([2065, 80])

In [None]:
# load the Moonshine model (trained with sr of 16000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_values'].shape

torch.Size([1, 330750])

In [None]:
# from transformers import AutoFeatureExtractor TODO: cannot find the exact Moshiko repo ID

# # load the Kyutai Moshiko model (trained with sr of 16000Hz - matched audio to be compatible with the model)
# resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
# feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/moshiko-pytorch-bf16")
# feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")#['input_values'].shape

In [None]:
# from transformers import MusicgenMelodyFeatureExtractor TODO: took too long to load

# # # load the MusicGen model (trained with sr of 16000Hz - matched audio to be compatible with the model)
# resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
# feature_extractor = MusicgenMelodyFeatureExtractor.from_pretrained("facebook/musicgen-small")
# feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_values'].shape

In [None]:
from transformers import ParakeetFeatureExtractor

# load the Parakeet model (trained with sr of 16000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = ParakeetFeatureExtractor.from_pretrained("facebook/parakeet-small")
feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_values'].shape

In [None]:
from transformers import Pop2PianoFeatureExtractor

# feel free to change the sr to a suitable value.
feature_extractor = Pop2PianoFeatureExtractor.from_pretrained("sweetcocoa/pop2piano")
feature_extractor(
    audio=song.audio, 
    sampling_rate=22050, 
    return_attention_mask=True, 
    return_tensors="pt",
)

In [None]:
from transformers import SeamlessM4TFeatureExtractor

# load the SeamlessM4T model (trained with sr of 16000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained("google/seamless-m4t-medium") #"facebook/hf-seamless-m4t-medium"
feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_values'].shape


In [None]:
from transformers import Speech2TextFeatureExtractor

# load the Speech2Text model (trained with sr of 16000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = Speech2TextFeatureExtractor.from_pretrained("facebook/s2t-small-librispeech-asr")
feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_values'].shape

# from transformers import Speech2TextProcessor

# processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
# processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt")



In [None]:
from transformers import SpeechT5FeatureExtractor

# load the SpeechT5 model (trained with sr of 16000Hz - matched audio to be compatible with the model)
resample = librosa.resample(y=song.audio, orig_sr=22050, target_sr=16000)
feature_extractor = SpeechT5FeatureExtractor.from_pretrained("microsoft/speecht5-v2-large")
feature_extractor(song.audio, sampling_rate=16000, return_tensors="pt")['input_values'].shape


In [None]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
feature_extractor(song.audio, sampling_rate=22050, return_tensors="pt")['input_values'].shape

In [None]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-sat-base-100h-libri-ft")
feature_extractor(song.audio, sampling_rate=22050, return_tensors="pt")['input_values'].shape

In [None]:
from transformers import UnivNetFeatureExtractor

feature_extractor = UnivNetFeatureExtractor.from_pretrained("dg845/univnet-dev")
feature_extractor(song.audio, sampling_rate=22050, return_tensors="pt")['input_values'].shape

In [None]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
feature_extractor(song.audio, sampling_rate=22050, return_tensors="pt")['input_values'].shape

In [None]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large")
feature_extractor(song.audio, sampling_rate=22050, return_tensors="pt")['input_values'].shape

In [None]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/wavlm-base")
feature_extractor(song.audio, sampling_rate=22050, return_tensors="pt")['input_values'].shape

In [None]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id")
feature_extractor(song.audio, sampling_rate=22050, return_tensors="pt")['input_values'].shape

In [None]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("hf-audio/xcodec-hubert-librispeech")
feature_extractor(song.audio, sampling_rate=22050, return_tensors="pt")['input_values'].shape

In [None]:
def build_dataset(files):

    # intialize variables
    X, y = [], []
    curr_sr, down_sr = 22050, 12000
    hop_size = 256
    dft = 512
    mel_bins = 96

    
    for track in files:

        # downmix and downsample the signals to 12 kHz 
        downsample = librosa.resample(y=track.audio, orig_sr=curr_sr, target_sr=down_sr)
        
        # extract Mel spectrogram
        mel_spec = librosa.feature.melspectrogram(
            y=downsample, sr=down_sr, hop_length = hop_size, 
            n_fft = dft, n_mels=mel_bins
        )

        # convert to log scale (dB)
        log_mel = librosa.power_to_db(mel_spec, ref=np.max)
        
        # add transposed matrix since RNN expect 3D input: samples, timesteps, features
        X.append(log_mel.T)
        y.append(track.lang)
    
    # convert to numpy arrays
    X = np.array(X)
    y = np.array(y)
    
    # encode targets
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(y)
    y_onehot = to_categorical(y_encoded)
    labels_inorder = list(encoder.classes_)
    
    print(f"Dataset shape: {X.shape}")
    print(f"Labels (In Order): {labels_inorder}")
    
    # split into train, validation and test sets
    x_train, x_eval, y_train, y_eval = train_test_split(
        X, y_onehot, test_size=0.4, random_state=42, stratify=y_encoded
    )
    y_eval_encoded = np.argmax(y_eval, axis=1)
    x_val, x_test, y_val, y_test = train_test_split(
        x_eval, y_eval, test_size=0.5, random_state=42, stratify=y_eval_encoded
    )
    
    # store preprocessed data
    data = {
        'labels_inorder': labels_inorder,
        'train_features': x_train, 'train_labels': y_train,
        'val_features': x_val, 'val_labels': y_val,
        'test_features': x_test, 'test_labels': y_test
    }
    
    print(f"Train set: {x_train.shape}")
    print(f"Validation set: {x_val.shape}")
    print(f"Test set: {x_test.shape}")
    
    return data

In [None]:
data = build_dataset(mp3_files)

In [None]:
# save data as a file to load for analysis
np.savez_compressed(
    "spotify_dataset.npz",
    labels_inorder=data['labels_inorder'],
    train_features=data["train_features"],
    train_labels=data["train_labels"],
    val_features=data["val_features"],
    val_labels=data["val_labels"],
    test_features=data["test_features"],
    test_labels=data["test_labels"]
)