In [1]:
!pip install musiclm-pytorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting musiclm-pytorch
  Downloading musiclm_pytorch-0.2.2-py3-none-any.whl (12 kB)
Collecting accelerate
  Downloading accelerate-0.18.0-py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.3/215.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting beartype
  Downloading beartype-0.13.1-py3-none-any.whl (707 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m708.0/708.0 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting x-clip
  Downloading x_clip-0.12.1-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting vector-quantize-pytorch>=1.0.0
  Downloading vector_quantize_pytorch-1.2.2-py3-none-any.whl (10 kB)
Collecting audiolm-pytorch>=0.17.0
  Downloading audiolm_pytorch-0.27.4-py3-none-any.whl (36 kB

In [2]:
import torch
from musiclm_pytorch import MuLaN, AudioSpectrogramTransformer, TextTransformer

# Audio tower: operates on spectrogram patches computed from raw waveforms.
audio_transformer = AudioSpectrogramTransformer(
    dim=512,
    depth=6,
    heads=8,
    dim_head=64,
    spec_n_fft=128,
    spec_win_length=24,
    spec_aug_stretch_factor=0.8,
)

# Text tower: same width/depth/heads as the audio tower.
text_transformer = TextTransformer(
    dim=512,
    depth=6,
    heads=8,
    dim_head=64,
)

# MuLaN ties the two towers together; calling it on (wavs, texts) yields a loss.
mulan = MuLaN(
    audio_transformer=audio_transformer,
    text_transformer=text_transformer,
)

# Placeholder batch of random tensors standing in for real <sound, text> pairs.
# NOTE(review): only a single backward pass runs here and no optimizer step is
# taken, so `mulan` is effectively untrained after this cell.
wavs = torch.randn((2, 1024))
texts = torch.randint(low=0, high=20000, size=(2, 256))

loss = mulan(wavs, texts)
loss.backward()

# Once trained, both modalities can be embedded into the joint space that
# conditions the audio LM. `embeds` is deliberately overwritten: the audio
# latents are what is used while training, the text latents at inference.
embeds = mulan.get_audio_latents(wavs)

embeds = mulan.get_text_latents(texts)

spectrogram yielded shape of (65, 86), but had to be cropped to (64, 80) to be patchified for transformer


In [3]:
from musiclm_pytorch import MuLaNEmbedQuantizer

# The quantizer holds conditioning embeddings that are namespaced, i.e. unique
# per quantizer and per namespace (one namespace per downstream transformer).
quantizer = MuLaNEmbedQuantizer(
    mulan=mulan,                            # the MuLaN instance built in the cell above
    conditioning_dims=(1024, 1024, 1024),   # all three transformers use a model dimension of 1024
    namespaces=('semantic', 'coarse', 'fine'),
)

# Example: fetch the conditioning embeddings for the semantic transformer.
wavs = torch.randn((2, 1024))
conds = quantizer(wavs=wavs, namespace='semantic')  # (2, 8, 1024), where 8 is the number of quantizers

In [8]:
# Mount Google Drive: the training recordings and HuBERT checkpoints referenced
# by the cells below live under /content/drive/MyDrive/ArtML_Final.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
from audiolm_pytorch import SoundStream, SoundStreamTrainer

# Audio codec used later by the coarse/fine trainers and by AudioLM.
soundstream = SoundStream(
    codebook_size = 1024,
    rq_num_quantizers = 8,
    attn_window_size = 128,       # local attention receptive field at bottleneck
    attn_depth = 2                # 2 local attention transformer blocks at the bottleneck
)

# Every trainer in this notebook defaults to writing into ./results, and each later
# trainer asks whether to clear that folder - answering "y" deletes the checkpoints
# saved here. Give the codec its own folder so its checkpoints survive.
# NOTE(review): 20 steps is a smoke test only; the log below shows the discriminator
# losses essentially flat at ~2.0.
trainer = SoundStreamTrainer(
    soundstream,
    folder = '/content/drive/MyDrive/ArtML_Final/recordings',
    batch_size = 4,
    grad_accum_every = 8,         # effective batch size of 32
    data_max_length_seconds = 2,  # train on 2 second audio
    num_train_steps = 20,
    results_folder = './results_soundstream'
).cuda()

trainer.train()

training with dataset of 2850 samples and validating with randomly splitted 150 samples
0: soundstream total loss: 11.102, soundstream recon loss: 0.010 | discr (scale 1) loss: 2.000 | discr (scale 0.5) loss: 2.000 | discr (scale 0.25) loss: 2.000
0: saving to results
0: saving model to results
1: soundstream total loss: 7.185, soundstream recon loss: 0.002 | discr (scale 1) loss: 2.001 | discr (scale 0.5) loss: 2.001 | discr (scale 0.25) loss: 2.001
2: soundstream total loss: 9.004, soundstream recon loss: 0.003 | discr (scale 1) loss: 2.000 | discr (scale 0.5) loss: 2.001 | discr (scale 0.25) loss: 2.002
3: soundstream total loss: 12.161, soundstream recon loss: 0.003 | discr (scale 1) loss: 1.998 | discr (scale 0.5) loss: 2.000 | discr (scale 0.25) loss: 2.000
4: soundstream total loss: 12.067, soundstream recon loss: 0.003 | discr (scale 1) loss: 1.997 | discr (scale 0.5) loss: 2.000 | discr (scale 0.25) loss: 2.000
5: soundstream total loss: 8.345, soundstream recon loss: 0.001 | 

In [9]:
import torch
from audiolm_pytorch import HubertWithKmeans, SemanticTransformer, SemanticTransformerTrainer

# HuBERT + k-means model loaded from the two checkpoints on Drive; its codebook size
# fixes the semantic token vocabulary below.
wav2vec = HubertWithKmeans(
    checkpoint_path = '/content/drive/MyDrive/ArtML_Final/hubert_base_ls960.pt',
    kmeans_path = '/content/drive/MyDrive/ArtML_Final/hubert_base_ls960_L9_km500.bin'
)

semantic_transformer = SemanticTransformer(
    num_semantic_tokens = wav2vec.codebook_size,
    dim = 1024,                      # matches the 1024 conditioning dim configured on the quantizer
    depth = 6,
    audio_text_condition = True      # this must be set to True (same for CoarseTransformer and FineTransformers)
).cuda()

# Use a dedicated results folder: with the shared default (./results) this trainer
# prompts to clear the folder, which wipes the checkpoints of the previous stage,
# and the next stage would in turn wipe this one's.
trainer = SemanticTransformerTrainer(
    transformer = semantic_transformer,
    wav2vec = wav2vec,
    audio_conditioner = quantizer,   # pass in the MulanEmbedQuantizer instance above
    folder ='/content/drive/MyDrive/ArtML_Final/recordings',
    batch_size = 1,
    data_max_length = 320 * 32,
    num_train_steps = 1000,
    results_folder = './results_semantic'
)

trainer.train()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Downloading (…)lve/main/config.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

training with dataset of 2850 samples and validating with randomly splitted 150 samples
do you want to clear previous experiment checkpoints and results? (y/n) y
0: loss: 6.196114540100098
0: valid loss 6.128756999969482
0: saving model to results
1: loss: 6.42691707611084
2: loss: 5.791701793670654
3: loss: 4.1556620597839355
4: loss: 4.668427467346191
5: loss: 5.722742557525635
6: loss: 4.83859395980835
7: loss: 5.2559733390808105
8: loss: 6.645999431610107
9: loss: 4.512092113494873
10: loss: 6.284344673156738
11: loss: 6.3512725830078125
12: loss: 5.7947797775268555
13: loss: 4.565145015716553
14: loss: 4.949483394622803
15: loss: 4.728059768676758
16: loss: 5.563417911529541
17: loss: 4.71204948425293
18: loss: 4.797524929046631
19: loss: 6.340610027313232
20: loss: 4.45046329498291
21: loss: 4.403565406799316
22: loss: 5.140037536621094
23: loss: 4.427911758422852
24: loss: 4.4180779457092285
25: loss: 4.2119293212890625
26: loss: 4.418458938598633
27: loss: 5.197200775146484
28:

# Train the coarse and fine transformers

In [10]:
import os
# Debugging aid: intended to make CUDA kernel launches synchronous so that errors
# are reported at the call that caused them.
# NOTE(review): earlier cells already call .cuda(), so CUDA is probably initialised
# before this variable is set - confirm it still takes effect here, and remove it once
# debugging is finished since it slows down training.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch
from audiolm_pytorch import HubertWithKmeans, SoundStream, CoarseTransformer, CoarseTransformerTrainer

# wav2vec = HubertWithKmeans(
#     checkpoint_path = '/content/drive/MyDrive/hubert_base_ls960.pt',
#     kmeans_path = '/content/drive/MyDrive/hubert_base_ls960_L9_km500.bin'
# )

#soundstream = SoundStream.init_and_load_from('/path/to/trained/soundstream.pt')

# Coarse stage. `wav2vec`, `soundstream` and `quantizer` are the in-memory objects
# created by earlier cells, so those cells must have run in this kernel.
coarse_transformer = CoarseTransformer(
    num_semantic_tokens = wav2vec.codebook_size,
    codebook_size = 1024,            # same codebook size as the SoundStream codec
    num_coarse_quantizers = 3,
    dim = 1024,                      # matches the 1024 conditioning dim configured on the quantizer
    depth = 6,
    audio_text_condition = True,
    has_condition = True
).cuda()

# Use a dedicated results folder: with the shared default (./results) this trainer
# prompts to clear the folder, which wipes the checkpoints of the previous stage,
# and the next stage would in turn wipe this one's.
trainer = CoarseTransformerTrainer(
    transformer = coarse_transformer,
    codec = soundstream,
    wav2vec = wav2vec,
    audio_conditioner = quantizer,
    folder = '/content/drive/MyDrive/ArtML_Final/recordings',
    batch_size = 1,
    data_max_length = 320 * 32,
    num_train_steps = 1000,
    results_folder = './results_coarse'
)

trainer.train()

training with dataset of 2850 samples and validating with randomly splitted 150 samples
do you want to clear previous experiment checkpoints and results? (y/n) y
0: loss: 97.36138916015625
0: valid loss 217.82882690429688
0: saving model to results
1: loss: 175.84210205078125
2: loss: 151.42623901367188
3: loss: 75.79421997070312
4: loss: 86.1459732055664
5: loss: 91.30323028564453
6: loss: 197.7296905517578
7: loss: 121.2177505493164
8: loss: 36.627708435058594
9: loss: 62.315460205078125
10: loss: 67.99581146240234
11: loss: 67.14136505126953
12: loss: 56.054569244384766
13: loss: 192.08349609375
14: loss: 155.24398803710938
15: loss: 88.91699981689453
16: loss: 95.12409210205078
17: loss: 63.50566101074219
18: loss: 55.24103546142578
19: loss: 52.708641052246094
20: loss: 74.2483901977539
21: loss: 41.16132736206055
22: loss: 52.91457748413086
23: loss: 53.62237548828125
24: loss: 47.7381706237793
25: loss: 49.69729232788086
26: loss: 29.14586639404297
27: loss: 24.07536506652832
28

In [11]:
import torch
from audiolm_pytorch import SoundStream, FineTransformer, FineTransformerTrainer

#soundstream = SoundStream.init_and_load_from('/path/to/trained/soundstream.pt')

# Fine stage. 3 coarse + 5 fine quantizers add up to the 8 residual quantizers
# configured on the SoundStream codec. `soundstream` and `quantizer` are the
# in-memory objects created by earlier cells.
fine_transformer = FineTransformer(
    num_coarse_quantizers = 3,
    num_fine_quantizers = 5,
    codebook_size = 1024,
    dim = 1024,                      # matches the 1024 conditioning dim configured on the quantizer
    depth = 6,
    audio_text_condition = True,
    has_condition = True
).cuda()

# Use a dedicated results folder: with the shared default (./results) this trainer
# prompts to clear the folder, which wipes the checkpoints saved by the earlier stages.
# NOTE(review): this stage runs 100 steps while the semantic and coarse stages run
# 1000 - confirm that is intentional.
trainer = FineTransformerTrainer(
    transformer = fine_transformer,
    codec = soundstream,
    audio_conditioner = quantizer,
    folder = '/content/drive/MyDrive/ArtML_Final/recordings',
    batch_size = 1,
    data_max_length = 320 * 32,
    num_train_steps = 100,
    results_folder = './results_fine'
)

trainer.train()

training with dataset of 2850 samples and validating with randomly splitted 150 samples
do you want to clear previous experiment checkpoints and results? (y/n) y
0: loss: 102.89920806884766
0: valid loss 98.99147033691406
0: saving model to results
1: loss: 101.76252746582031
2: loss: 169.89581298828125
3: loss: 121.8134765625
4: loss: 83.03973388671875
5: loss: 67.33219909667969
6: loss: 38.159908294677734
7: loss: 87.90650939941406
8: loss: 10.048135757446289
9: loss: 92.53950500488281
10: loss: 72.66663360595703
11: loss: 42.208412170410156
12: loss: 54.060035705566406
13: loss: 44.994590759277344
14: loss: 40.80534362792969
15: loss: 53.74730682373047
16: loss: 44.59950256347656
17: loss: 39.492698669433594
18: loss: 49.04771423339844
19: loss: 41.361663818359375
20: loss: 15.554722785949707
21: loss: 33.83832550048828
22: loss: 5.972051620483398
23: loss: 41.04780197143555
24: loss: 29.863597869873047
25: loss: 33.68241500854492
26: loss: 41.20784378051758
27: loss: 25.86466217041

In [12]:
from audiolm_pytorch import AudioLM
from musiclm_pytorch import MusicLM

# Chain the three transformers trained above, together with the semantic
# tokenizer (`wav2vec`) and the codec (`soundstream`), into one AudioLM.
audiolm = AudioLM(
    wav2vec=wav2vec,
    codec=soundstream,
    semantic_transformer=semantic_transformer,
    coarse_transformer=coarse_transformer,
    fine_transformer=fine_transformer,
)

# MusicLM wraps the AudioLM with the MuLaN embed quantizer used for conditioning.
musiclm = MusicLM(
    audio_lm=audiolm,
    mulan_embed_quantizer=quantizer,
)

In [13]:
# Generate a single sample. Raise num_samples (e.g. to 4) to draw several candidates
# and keep the one MuLaN ranks as the best match for the prompt - each extra sample
# repeats the full semantic/coarse/fine generation shown in the progress bars below.
music = musiclm('the crystalline sounds of the piano in a ballroom', num_samples = 1)

# Persist the result to Drive straight away: generation takes the better part of an
# hour and the variable is lost if the runtime restarts.
# Reload with torch.load('/content/drive/MyDrive/ArtML_Final/generated_music.pt').
torch.save(music.detach().cpu(), '/content/drive/MyDrive/ArtML_Final/generated_music.pt')

generating semantic: 100%|██████████| 2048/2048 [05:28<00:00,  6.23it/s]
generating coarse: 100%|██████████| 512/512 [14:18<00:00,  1.68s/it]
generating fine: 100%|██████████| 512/512 [26:19<00:00,  3.08s/it]


In [1]:
# Requires `music` from the generation cell to exist in the current kernel; the
# NameError below comes from running this cell in a freshly restarted runtime
# (execution count 1) where that cell had not been run.
print(music)

NameError: ignored