In [49]:
import pandas as pd
import numpy as np
import soundfile as sf
import librosa
from IPython.display import Audio
import os
from scipy.io.wavfile import write as write_wav
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import random as rnd

### spotify playlists

* [jamaican patois](https://open.spotify.com/playlist/54AaUsgTHhmsdfhT5lMMdl)
* [ghanaian pidgin](https://open.spotify.com/playlist/74Nm56ijqC9O9Z2qF3kC5X)
* [american english](https://open.spotify.com/playlist/4WjFFH3mIMD6EEpDmPZKc3)
* [european spanish](https://open.spotify.com/playlist/38hpr6Usrjtey7IkdNHlaN)
* [chinese mandarin](https://open.spotify.com/playlist/76XEwiwfRRtg3QDeB2ttCD)
* [indian hindi](https://open.spotify.com/playlist/4m9VYOmySJiM7wv14um6lZ)

In [2]:
# container object holding one audio clip plus its metadata
class Song:
    """One audio clip together with the metadata stored alongside it.

    attributes:
        name: identifier given by the caller
        audio: the audio vector passed in as `audio_vector`
        lang: language label
        samplerate: sample rate of `audio`
        length: clip length as supplied by the caller
    """

    def __init__(self, name, audio_vector, lang, samplerate, length):
        # note: the constructor argument `audio_vector` is exposed as `.audio`
        self.name, self.audio, self.lang = name, audio_vector, lang
        self.samplerate, self.length = samplerate, length

# load data 
* save values into Song objects (language + sample rate)
* load audio into 1D vectors

In [3]:
def process_song(path, lang, sr=22050, clip_len=10, segment_len=120):
    """
    purpose:
        load one song and cut the window centred on its midpoint into
        fixed-length clips
    params:
        path: path to the audio file
        lang: language label attached to every clip
        sr: sample rate the audio is resampled to on load
        clip_len: length of each clip in seconds
        segment_len: length in seconds of the centred window the clips are cut from
    output:
        list of Song objects, one per full-length clip; a trailing partial
        clip is dropped. Song.name is the lower-cased file name without extension
    raises:
        ValueError if clip_len * sr is not a positive number of samples
    """

    # load song as mono, resampled to a common samplerate
    y, _ = librosa.load(path, sr=sr, mono=True)
    name = os.path.splitext(os.path.basename(path))[0].lower()

    # total duration in seconds
    duration = librosa.get_duration(y=y, sr=sr)
    center = duration / 2

    # find the centred window (2 minutes by default).
    # clamp the start at 0: for a song shorter than segment_len the start index
    # would be negative, and a negative slice start wraps around so that only
    # the tail of the song would be kept
    start = max(0, int((center - segment_len / 2) * sr))
    end = int((center + segment_len / 2) * sr)
    y = y[start:end]

    # split into clip_len-second clips (12 clips from a full 2-minute window
    # with the defaults), keeping only clips of the full length
    clip_size = int(clip_len * sr)
    if clip_size <= 0:
        raise ValueError("clip_len * sr must be a positive number of samples")
    n_clips = len(y) // clip_size
    clips = [y[k * clip_size:(k + 1) * clip_size] for k in range(n_clips)]

    # wrap each clip into a Song object carrying the shared metadata
    return [Song(name, clip, lang, sr, clip_len) for clip in clips]


In [4]:
# create list of Song objects for all languages
languages = ['patois', 'mandarin', 'english', 'spanish', 'hindi', 'pidgin']
mp3_files = []

# list every language folder once and sort the result: os.listdir returns
# entries in arbitrary order, so without sorting the clip order (and therefore
# any later seeded shuffle/split of mp3_files) is not reproducible
paths_by_lang = {
    lang: sorted(
        os.path.join(f'./{lang}', file)
        for file in os.listdir(f'./{lang}')
        if file.endswith('.mp3')
    )
    for lang in languages
}

total_files = sum(len(paths) for paths in paths_by_lang.values())
processed = 0

for lang, paths in paths_by_lang.items():
    for path in paths:
        file = os.path.basename(path)
        try:
            clips = process_song(path, lang, clip_len=10)
            mp3_files.extend(clips)
            processed += 1
            if processed % 50 == 0:
                print(f"Processed {processed}/{total_files} files...")
        except Exception as e:
            # best effort: report the unreadable file and keep going
            print(f"Error processing {file}: {e}")

print(f"Completed! Processed {processed} files, created {len(mp3_files)} clips total.")

Processed 50/575 files...
Processed 100/575 files...
Processed 150/575 files...
Processed 200/575 files...
Processed 250/575 files...
Processed 300/575 files...


[src/libmpg123/layer3.c:INT123_do_layer3():1844] error: dequantization failed!
Note: Illegal Audio-MPEG-Header 0x77bd7d56 at offset 945978.
Note: Trying to resync...
Note: Skipped 134 bytes in input.
Note: Illegal Audio-MPEG-Header 0x0e9b81e3 at offset 1082816.
Note: Trying to resync...
Note: Skipped 98 bytes in input.
Note: Illegal Audio-MPEG-Header 0xd30bc33e at offset 3202978.
Note: Trying to resync...
Note: Skipped 152 bytes in input.


Processed 350/575 files...
Processed 400/575 files...
Processed 450/575 files...
Processed 500/575 files...
Processed 550/575 files...
Completed! Processed 575 files, created 6854 clips total.


In [None]:
def extract_features(y, sr=22050, n_fft=2048, hop_length=512, n_mels=128, 
                        n_mfcc=40, n_chroma=12, n_bands=6):
    """
    purpose:
        compute frame-level spectral features for one audio vector and stack
        them into a single (timesteps, features) matrix
    params:
        y: 1D audio vector
        sr: sample rate of y
        n_fft: FFT window size used by every feature
        hop_length: hop between frames used by every feature
        n_mels: number of mel bands
        n_mfcc: number of MFCC coefficients
        n_chroma: number of chroma bins
        n_bands: number of spectral-contrast bands
    output:
        2D array with one row per frame; the columns are log-mel, MFCC, chroma
        and spectral-contrast features in that order
    """

    # log-scaled mel spectrogram (128 bands by default).
    # use the caller's sr here as well: a hard-coded 22050 would disagree with
    # fmax=sr/2 and with the other features whenever sr is not the default
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, fmax=sr/2)
    log_mel = librosa.power_to_db(mel_spectrogram, ref=np.max)

    # MFCCs (40 coefficients by default)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mfcc=n_mfcc)

    # chroma (12 chroma bins by default)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_chroma=n_chroma)

    # spectral contrast (7 coefficients with the default n_bands=6)
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_bands=n_bands)

    # every feature must have the same number of frames before stacking
    T = log_mel.shape[1]
    assert mfccs.shape[1] == T and chroma.shape[1] == T and spectral_contrast.shape[1] == T, \
        "feature matrices disagree on the number of frames"

    # stack features along the feature axis
    # total with the defaults: 128 + 40 + 12 + 7 = 187 features per frame
    features = np.vstack([log_mel, mfccs, chroma, spectral_contrast])

    # transpose -> (T, 187) for RNN input (timesteps, features)
    return features.T


In [47]:
# split the clips into one list per language, then print how many each has
(patois_songs, mandarin_songs, english_songs,
 spanish_songs, hindi_songs, pidgin_songs) = (
    [clip for clip in mp3_files if clip.lang == name]
    for name in ('patois', 'mandarin', 'english', 'spanish', 'hindi', 'pidgin')
)

for lang in languages:
    n_clips = sum(1 for clip in mp3_files if clip.lang == lang)
    print(f"{lang}: {n_clips} clips")

patois: 1254 clips
mandarin: 1138 clips
english: 1111 clips
spanish: 1184 clips
hindi: 971 clips
pidgin: 1196 clips


In [66]:
# balance the classes: randomly draw the same number of clips for each language.
# the draw size is the size of the smallest class (971 with the counts printed
# above) rather than a hard-coded number, so sampling cannot fail if the data changes
n_per_lang = min(
    len(songs)
    for songs in (patois_songs, mandarin_songs, english_songs,
                  spanish_songs, hindi_songs, pidgin_songs)
)

# dedicated seeded generator so re-running the cell gives the same draw
# without touching the global random state
_sampler = rnd.Random(42)

patois_sampled = _sampler.sample(patois_songs, n_per_lang)
mandarin_sampled = _sampler.sample(mandarin_songs, n_per_lang)
english_sampled = _sampler.sample(english_songs, n_per_lang)
spanish_sampled = _sampler.sample(spanish_songs, n_per_lang)
hindi_sampled = _sampler.sample(hindi_songs, n_per_lang)
pidgin_sampled = _sampler.sample(pidgin_songs, n_per_lang)

In [None]:
# sanity check: size of one balanced sample (len() with no argument raises TypeError)
len(patois_sampled)

971

In [7]:
# number of samples in the first clip's audio vector
len(mp3_files[0].audio)

220500

# visualize and listen to song audio

In [8]:
# include UI to listen to songs before and after sampling to ensure there is no data corruption
def visualize_audio(audio_vector, sampling_rate=22050, seconds=None, out_dir=None):
    """
    purpose:
        visualize your 1D audio vector as playable audio
    params:
        audio_vector: 1D numpy array of raw audio data
        sampling_rate: sample rate of your audio
        seconds: number of seconds to play from the start (None = everything);
                 may be fractional
        out_dir: directory to save WAV file (None = just display)
    output:
        an IPython Audio object if out_dir is None, otherwise the path of the
        saved WAV file ("generated_audio.wav" inside out_dir)
    """

    waveform = np.asarray(audio_vector)

    # play only a portion; cast to int so a fractional `seconds` still
    # produces a valid slice index
    if seconds is not None:
        samples_to_play = int(seconds * sampling_rate)
        waveform = waveform[:samples_to_play]

    # normalize to prevent clipping; silent or empty audio is left as is,
    # since dividing by a zero peak would fill the waveform with NaNs
    peak = np.max(np.abs(waveform)) if waveform.size else 0
    if peak > 0:
        waveform = waveform / peak

    # display the audio if no path is provided
    if out_dir is None:
        return Audio(waveform, rate=sampling_rate)

    # save the audio to a file
    os.makedirs(out_dir, exist_ok=True)
    audio_path = os.path.join(out_dir, "generated_audio.wav")

    # convert to 16-bit PCM for WAV file
    waveform_int16 = (waveform * 32767).astype(np.int16)
    write_wav(audio_path, sampling_rate, waveform_int16)
    return audio_path


In [91]:
# listen to a downsampled version of the first song in the dataset
first_clip = mp3_files[0]
sample = first_clip.audio

# resample from the rate stored on the clip (rather than a repeated literal)
# and play back at the same target rate that was used for resampling
target_sr = 12000
downsample = librosa.resample(y=sample, orig_sr=first_clip.samplerate, target_sr=target_sr)
visualize_audio(downsample, target_sr)

# feature extraction

[paper](https://ieeexplore.ieee.org/document/9287802)

In [11]:
def build_dataset(files):
    """
    purpose:
        turn a collection of clips into feature arrays with one-hot labels,
        split into train / validation / test sets
    params:
        files: iterable of objects exposing `.audio` and `.lang`
    output:
        dict with the ordered label names ('labels_inorder') and the
        features/labels of each split ('train_*', 'val_*', 'test_*')
    """

    # one (feature matrix, language) pair per clip, gathered in a single pass
    pairs = [(extract_features(track.audio), track.lang) for track in files]

    # stack into arrays; the RNN expects 3D input: samples, timesteps, features
    X = np.array([feature_matrix for feature_matrix, _ in pairs])
    y = np.array([language for _, language in pairs])

    # integer-encode the language names, then one-hot encode them
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(y)
    y_onehot = to_categorical(y_encoded)
    labels_inorder = list(encoder.classes_)

    print(f"Dataset shape: {X.shape}")
    print(f"Labels (In Order): {labels_inorder}")

    # first cut: 60% train, 40% held out, stratified by language
    # NOTE(review): the split is per clip, so clips cut from the same song can
    # end up in different splits - confirm this is acceptable for evaluation
    x_train, x_eval, y_train, y_eval = train_test_split(
        X,
        y_onehot,
        test_size=0.4,
        random_state=42,
        stratify=y_encoded,
    )

    # second cut: halve the held-out part into validation and test,
    # stratifying on the class index recovered from the one-hot rows
    y_eval_encoded = np.argmax(y_eval, axis=1)
    x_val, x_test, y_val, y_test = train_test_split(
        x_eval,
        y_eval,
        test_size=0.5,
        random_state=42,
        stratify=y_eval_encoded,
    )

    print(f"Train set: {x_train.shape}")
    print(f"Validation set: {x_val.shape}")
    print(f"Test set: {x_test.shape}")

    # bundle the preprocessed data
    return {
        'labels_inorder': labels_inorder,
        'train_features': x_train,
        'train_labels': y_train,
        'val_features': x_val,
        'val_labels': y_val,
        'test_features': x_test,
        'test_labels': y_test,
    }

In [12]:
# build the train/validation/test arrays from every clip in mp3_files
# NOTE(review): this passes the full clip list, not the balanced *_sampled
# lists - confirm the class imbalance is intended
data = build_dataset(mp3_files)

Dataset shape: (6854, 431, 187)
Labels (In Order): ['english', 'hindi', 'mandarin', 'patois', 'pidgin', 'spanish']
Train set: (4112, 431, 187)
Validation set: (1371, 431, 187)
Test set: (1371, 431, 187)


In [13]:
# write the dataset to one compressed file, each array under its own key,
# so it can be loaded later for analysis
saved_keys = (
    "labels_inorder",
    "train_features",
    "train_labels",
    "val_features",
    "val_labels",
    "test_features",
    "test_labels",
)
np.savez_compressed("spotify_dataset.npz", **{key: data[key] for key in saved_keys})