In [2]:
from google.colab import files

print("Please upload your audio file (e.g., song.mp3):")
uploaded = files.upload()

# `uploaded` is a mapping keyed by filename. It can be empty (for example if
# the upload dialog is dismissed), in which case indexing the first key would
# fail with an unhelpful IndexError, so fail early with a clear message.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and choose an audio file.")

# Only one file is processed downstream: take the first key and say so
# explicitly if the user selected more than one.
input_file = next(iter(uploaded))
if len(uploaded) > 1:
    print("Multiple files were uploaded; only the first one will be processed.")
print(f"Uploaded file: {input_file}")


Please upload your audio file (e.g., song.mp3):


Saving [iSongs (mp3cut.net).mp3 to [iSongs (mp3cut.net) (1).mp3
Uploaded file: [iSongs (mp3cut.net) (1).mp3


In [3]:
# Install required packages and FFmpeg (if not already installed)
# !pip install demucs soundfile torchaudio
# !apt-get install -y ffmpeg

import io
import base64
import torch
import torchaudio
import soundfile as sf
from demucs import pretrained
from demucs.apply import apply_model
from IPython.display import HTML, display

# Step 1: Load the uploaded audio file into memory using torchaudio
waveform, sr = torchaudio.load(input_file)  # waveform shape: (channels, time)

# Step 2: Load the Demucs model ('htdemucs') and set it to evaluation mode
print("Loading Demucs model...")
model = pretrained.get_model('htdemucs')
model.eval()
model.cpu()  # ensure the model is on CPU

# The model only accepts audio at its own sample rate and channel count.
# Feeding it a mono file or a file recorded at a different rate either fails
# or silently degrades the separation, so convert the input first. From here
# on `sr` is the model's sample rate, which is also the rate of the output.
from demucs.audio import convert_audio  # local import: only needed for this step

waveform = convert_audio(waveform, sr, model.samplerate, model.audio_channels)
sr = model.samplerate
waveform = waveform.unsqueeze(0)  # add batch dimension: now (1, channels, time)

# Step 3: Run source separation using apply_model
print("Running source separation...")
with torch.no_grad():
    # Common parameters: shifts=1, split=True, overlap=0.25
    estimates = apply_model(model, waveform, shifts=1, split=True, overlap=0.25)

# `estimates` is indexed as (batch, stem, channels, time). Look the stem
# positions up from the model instead of hard-coding them, so the code stays
# correct if a model with a different stem order or stem count is loaded.
stem_names = list(model.sources)
vocals_index = stem_names.index('vocals')
backing_indices = [i for i in range(len(stem_names)) if i != vocals_index]

vocals = estimates[0, vocals_index]
# Accompaniment (instrumental) is the sum of every non-vocal stem.
accompaniment = estimates[0, backing_indices].sum(dim=0)

# Step 4: Write the accompaniment audio to an in-memory WAV file
# soundfile.write expects data with shape (samples, channels), so we transpose.
accompaniment_np = accompaniment.cpu().numpy().T
buffer = io.BytesIO()
sf.write(buffer, accompaniment_np, sr, format='WAV')
buffer.seek(0)  # Rewind the buffer to the beginning

# Step 5: Offer the in-memory WAV as a browser download.
# The WAV bytes are base64-encoded and embedded in a data: URI on an anchor
# tag, so no file has to be written to disk.
data = buffer.read()
b64 = str(base64.b64encode(data), "ascii")  # base64 output is pure ASCII

# Assemble the markup line by line; the resulting string is the same as a
# single triple-quoted template with a leading and trailing newline.
html = (
    "\n"
    f'<a download="accompaniment.wav" href="data:audio/wav;base64,{b64}" target="_blank">\n'
    '    <button style="font-size:16px;padding:10px;">Download Accompaniment (Instrumental)</button>\n'
    "</a>\n"
)

download_widget = HTML(html)
display(download_widget)
print("Processing complete! Click the button above to download the instrumental (accompaniment) WAV file.")


Loading Demucs model...
Running source separation...


Processing complete! Click the button above to download the instrumental (accompaniment) WAV file.
