# Setup

In [None]:
# os.environ['LC_ALL'] = 'C.UTF-8'
# os.environ['LANG'] = 'C.UTF-8'

In [None]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


Get MusicGen git repo

In [None]:
!python3 -m pip install -U git+https://github.com/facebookresearch/audiocraft#egg=audiocraft

Collecting audiocraft
  Cloning https://github.com/facebookresearch/audiocraft to /tmp/pip-install-78h8gb88/audiocraft_c0bd75f6aab848e78ca732df747b0846
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/audiocraft /tmp/pip-install-78h8gb88/audiocraft_c0bd75f6aab848e78ca732df747b0846
  Resolved https://github.com/facebookresearch/audiocraft to commit 72cb16f9fb239e9cf03f7bd997198c7d7a67a01c
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting av==11.0.0 (from audiocraft)
  Downloading av-11.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops (from audiocraft)
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting flashy>=0.0.1 (from audiocraf

Access to Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Imports

In [None]:
import os
import random
from pydub import AudioSegment

import soundfile as sf
import numpy as np

# Data Processing

Create auxiliary folders

In [32]:
new_folders = ["raw", "output", "musicgen_trainer_dir"]

for folder_name in new_folders:
  if not os.path.exists(folder_name):
    os.makedirs(folder_name)
    print(f"'{folder_name}'folder successfully created!")

'raw'folder successfully created!


Function to clean a directory

In [18]:
def clean_dir(dir_path):
  for file_name in os.listdir(dir_path):
    file_path = os.path.join(dir_path, file_name)
    if os.path.isfile(file_path):
      os.remove(file_path)
      print(f"File {file_path} was deleted")

Clean output directory

In [29]:
clean_dir('raw')

File raw/1727.wav was deleted


Remove musicgen_trainer dir and all its content

In [31]:
import shutil

shutil.rmtree("/content/musicgen_trainer_dir")

Function to process audio data: segments audio in 30 seconds each and save .txt labels and set sample rate to 32000 Hz

In [21]:
def process_audios(file_path, output_dir, segment_length=30): # 30 seconds
  # Load audio
  audio = AudioSegment.from_file(file_path)

  # Extract file name for .txt
  file_name = os.path.splitext(os.path.basename(file_path))[0]

  # Convert segment length to ms
  segment_length_ms = segment_length * 1000

  # Set the sample rate to 32000 Hz
  audio = audio.set_frame_rate(32000)

  # Calculate number of segments
  num_segments = (len(audio) + segment_length_ms - 1) // segment_length_ms

  for i in range(num_segments):
    start_time = i * segment_length_ms

    # Last segment
    if i == num_segments - 1:
      start_time = len(audio) - segment_length_ms

    end_time = start_time + segment_length_ms

    # Get segment
    segment = audio[start_time:end_time]

    # Save segment
    segment.export(os.path.join(output_dir, f'segment_{i:03d}.wav'), format='wav')

    # Save label
    with open(os.path.join(output_dir, f'segment_{i:03d}.txt'), 'w') as f:
      f.write(file_name)

Process all audios

In [37]:
output_dir = 'output'
samples_dir = 'raw'

if not os.path.exists(output_dir):
  os.makedirs(output_dir)

for file_name in os.listdir(samples_dir):
  print(file_name)
  if file_name.endswith('.wav') or file_name.endswith('.mp3'):
    file_path = os.path.join(samples_dir, file_name)
    process_audios(file_path, output_dir, segment_length=30)


classic, beethoven.wav


In [38]:
import librosa

output_dir = 'output'

for file_name in os.listdir(output_dir):
  if file_name.endswith('.wav'):
    file_path = os.path.join(output_dir, file_name)
    audio, sample_rate = librosa.load(file_path, sr=None)

    if audio.shape[0] == 32000 * 30:
      print(f"{file_name} has the correct shape: {audio.shape[0]}")
    else:
      print(f"{file_name} does not have the correct shape: {audio.shape[0]}")


segment_011.wav has the correct shape: 960000
segment_006.wav has the correct shape: 960000
segment_002.wav has the correct shape: 960000
segment_004.wav has the correct shape: 960000
segment_010.wav has the correct shape: 960000
segment_005.wav has the correct shape: 960000
segment_003.wav has the correct shape: 960000
segment_008.wav has the correct shape: 960000
segment_009.wav has the correct shape: 960000
segment_007.wav has the correct shape: 960000
segment_001.wav has the correct shape: 960000
segment_000.wav has the correct shape: 960000


# Training process

Get MusicGen Trainer

In [39]:
!cd "/content/musicgen_trainer_dir" && git clone https://github.com/chavinlo/musicgen_trainer.git

Cloning into 'musicgen_trainer'...
remote: Enumerating objects: 166, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 166 (delta 14), reused 19 (delta 6), pack-reused 136[K
Receiving objects: 100% (166/166), 10.53 MiB | 25.50 MiB/s, done.
Resolving deltas: 100% (93/93), done.


In [40]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.17.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.1.1-py2.py3-none-any.whl (277 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.3/277.3 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86

In [41]:
import wandb

In [42]:
!python /content/musicgen_trainer_dir/musicgen_trainer/run.py --dataset_path /content/output

2024-05-14 15:40:11.491748: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-14 15:40:11.491858: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-14 15:40:11.613855: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
state_dict.bin: 100% 841M/841M [00:03<00:00, 248MB/s]
spiece.model: 100% 792k/792k [00:00<00:00, 3.19MB/s]
tokenizer.json: 100% 1.39M/1.39M [00:00<00:00, 19.3MB/s]
config.json: 100% 1.21k/1.21k [00:00<00:00, 7.24MB/s]
model.safetensors: 100% 892M/892M [00:09<00:00, 94.7MB/s]
compression_state_dict.bin: 100% 236M/236M [00:00<00:00, 243MB/s]
Tuning everything
Epoch

In [43]:
from audiocraft.models import musicgen
from audiocraft.utils.notebook import display_audio
import torch

Load model

In [44]:
model = musicgen.MusicGen.get_pretrained('small', device='cuda')
model.set_generation_params(duration=8)
model.lm.load_state_dict(torch.load('models/lm_final.pt'))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


<All keys matched successfully>

Prompt

In [45]:
res = model.generate([
    'classic, beethoven'
],
  progress=True)

display_audio(res, 32000)

