# Preprocessing Audio

In [1]:
import stempeg
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

In [2]:
# Raw MUSDB18 dataset root and its train/test subdirectories.
data_path = Path('../data/raw/musdb18')
train_path = data_path.joinpath('train')
test_path = data_path.joinpath('test')

In [None]:
def generate_stft(audio, n_stems=5):
    """Compute a magnitude spectrogram for each stem of a track.

    Every stem is down-mixed to mono by averaging over its last axis, run
    through ``librosa.stft`` with librosa's default settings, and reduced to
    magnitudes (the phase is discarded).

    Args:
        audio: 3-D array indexed as ``audio[stem, :, :]``. Presumably laid
            out as (stems, samples, channels), as returned by
            ``stempeg.read_stems`` -- TODO confirm against the caller.
        n_stems: Number of leading stems to transform. Defaults to 5, the
            stem count this function previously hard-coded.

    Returns:
        ``np.ndarray`` stacking the per-stem magnitude spectrograms along a
        new first axis, in stem order.

    Raises:
        IndexError: If ``audio`` holds fewer than ``n_stems`` stems.
    """
    spectrograms = [
        # mean(axis=1) collapses the last axis of the 2-D stem slice to mono.
        np.abs(librosa.stft(audio[i, :, :].mean(axis=1)))
        for i in range(n_stems)
    ]
    return np.array(spectrograms)

In [6]:
def preprocess_dataset(path: "Path | str", output_path: "Path | str" = "") -> None:
    """Write per-stem magnitude spectrograms for every track under ``path``.

    Each ``*.stem.mp4`` file directly inside ``path`` is read with
    ``stempeg.read_stems`` (its second return value is discarded), converted
    with ``generate_stft``, and saved as ``mix.npy``, ``drums.npy``,
    ``bass.npy``, ``other.npy`` and ``vocals.npy`` inside
    ``output_path / <track name>``.

    A track is skipped only when all five files already exist, so a run that
    was interrupted part-way through a track is redone on the next call
    instead of being mistaken for finished work.

    Args:
        path: Directory containing the ``*.stem.mp4`` tracks.
        output_path: Directory under which one sub-directory per track is
            created (including missing parents). The default ``""`` resolves
            to the current working directory.
    """
    # Accept plain strings as well as Path objects: the "" default is a str
    # and would otherwise fail on the "/" operator below.
    path = Path(path)
    output_path = Path(output_path)
    stem_names = ['mix', 'drums', 'bass', 'other', 'vocals']

    tracks = sorted(path.glob('*.stem.mp4'))
    print("Generating spectrograms for every stem...")
    processed = 0
    for track in tracks:
        track_name = track.stem.replace('.stem', '')
        track_dir = output_path / track_name
        targets = [track_dir / f"{name}.npy" for name in stem_names]
        # The directory alone is not proof of completion: an interrupted run
        # can leave it empty or only partially filled.
        if all(target.exists() for target in targets):
            print(f"Track {track_name} already processed. Skipping.")
            continue

        audio, _ = stempeg.read_stems(str(track))
        spectrograms = generate_stft(audio)
        track_dir.mkdir(parents=True, exist_ok=True)

        for target, spectrogram in zip(targets, spectrograms):
            # Write under a temporary name and rename afterwards, so a file
            # that carries its final name is always complete.
            partial = target.with_name(target.name + ".part")
            with open(partial, "wb") as fh:
                np.save(fh, spectrogram)
            partial.replace(target)
        processed += 1

    skipped = len(tracks) - processed
    print(f"{processed} tracks processed successfully ({skipped} skipped).")

# Destination directory for the test split's spectrograms.
processed_test_path = Path('../data/processed/test')
preprocess_dataset(path=test_path, output_path=processed_test_path)

# Working: on a re-run, tracks that were already processed are skipped.

Generating spectrograms for every stem...
Track AM Contra - Heart Peripheral already processed. Skipping.
Track Al James - Schoolboy Facination already processed. Skipping.
Track Angels In Amplifiers - I'm Alright already processed. Skipping.


KeyboardInterrupt: 