In [1]:
# cloning the musiclm-pytorch github repository
!git clone https://github.com/lucidrains/musiclm-pytorch.git

Cloning into 'musiclm-pytorch'...
remote: Enumerating objects: 293, done.[K
remote: Counting objects: 100% (137/137), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 293 (delta 129), reused 105 (delta 102), pack-reused 156[K
Receiving objects: 100% (293/293), 193.50 KiB | 11.38 MiB/s, done.
Resolving deltas: 100% (180/180), done.


In [2]:
!pip install beartype
!pip install lion_pytorch
!pip install einops

Collecting beartype
  Downloading beartype-0.14.0-py3-none-any.whl (720 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m720.2/720.2 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: beartype
Successfully installed beartype-0.14.0
[0mCollecting lion_pytorch
  Downloading lion_pytorch-0.1.2-py3-none-any.whl (4.4 kB)
Installing collected packages: lion_pytorch
Successfully installed lion_pytorch-0.1.2
[0mCollecting einops
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.6.1
[0m

In [3]:
!pip install audiolm_pytorch
!pip install x_clip
!pip install musiclm_pytorch

Collecting audiolm_pytorch
  Downloading audiolm_pytorch-1.0.4-py3-none-any.whl (37 kB)
Collecting ema-pytorch>=0.2.2
  Downloading ema_pytorch-0.2.3-py3-none-any.whl (4.4 kB)
Collecting encodec
  Downloading encodec-0.1.1.tar.gz (3.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m76.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting vector-quantize-pytorch>=1.4.1
  Downloading vector_quantize_pytorch-1.5.3-py3-none-any.whl (11 kB)
Collecting local-attention>=1.8.4
  Downloading local_attention-1.8.6-py3-none-any.whl (8.1 kB)
Collecting fairseq
  Downloading fairseq-0.12.2.tar.gz (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?

In [4]:
import torch
from musiclm_pytorch import MuLaN, AudioSpectrogramTransformer, TextTransformer

audio_transformer = AudioSpectrogramTransformer(
    dim = 512,
    depth = 6,
    heads = 8,
    dim_head = 64,
    spec_n_fft = 128,
    spec_win_length = 24,
    spec_aug_stretch_factor = 0.8
)

text_transformer = TextTransformer(
    dim = 512,
    depth = 6,
    heads = 8,
    dim_head = 64
)

mulan = MuLaN(
    audio_transformer = audio_transformer,
    text_transformer = text_transformer
)

# get a ton of <sound, text> pairs and train

wavs = torch.randn(2, 1024)
texts = torch.randint(0, 20000, (2, 256))

loss = mulan(wavs, texts)
loss.backward()

# after much training, you can embed sounds and text into a joint embedding space
# for conditioning the audio LM

embeds = mulan.get_audio_latents(wavs)  # during training

embeds = mulan.get_text_latents(texts)  # during inference



spectrogram yielded shape of (65, 86), but had to be cropped to (64, 80) to be patchified for transformer


In [6]:
from musiclm_pytorch import MuLaNEmbedQuantizer

# setup the quantizer with the namespaced conditioning embeddings, unique per quantizer as well as namespace (per transformer)

quantizer = MuLaNEmbedQuantizer(
    mulan = mulan,                          # pass in trained mulan from above
    conditioning_dims = (1024, 1024, 1024), # say all three transformers have model dimensions of 1024
    namespaces = ('semantic', 'coarse', 'fine')
)

# now say you want the conditioning embeddings for semantic transformer

wavs = torch.randn(2, 1024)
conds = quantizer(wavs = wavs, namespace = 'semantic') # (2, 8, 1024) - 8 is number of quantizers

In [14]:
pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
[0mNote: you may need to restart the kernel to use updated packages.


In [12]:
from torch import nn
net = nn.Sequential(
    nn.Linear(18*18, 80),
    nn.ReLU(),
    nn.Linear(80, 80),
    nn.ReLU(),
    nn.Linear(80, 10),
    nn.LogSoftmax()
)

In [15]:
import torch
from audiolm_pytorch import HubertWithKmeans, SemanticTransformer, SemanticTransformerTrainer

wav2vec = HubertWithKmeans(
    checkpoint_path = '/kaggle/input/somethingsomethign/hubert_base_ls960.pt',
    kmeans_path = '/kaggle/input/somethingpart2/hubert_base_ls960_L9_km500.bin'
)

semantic_transformer = SemanticTransformer(
    num_semantic_tokens = wav2vec.codebook_size,
    dim = 1024,
    depth = 6,
    audio_text_condition = True      # this must be set to True (same for CoarseTransformer and FineTransformers)
).cuda()

trainer = SemanticTransformerTrainer(
    transformer = semantic_transformer,
    wav2vec = wav2vec,
    audio_conditioner = quantizer,   # pass in the MulanEmbedQuantizer instance above
    folder ='/kaggle/input/midi-files',
    batch_size = 1,
    data_max_length = 320 * 32,
    num_train_steps = 1
)

trainer.train()

AssertionError: Torch not compiled with CUDA enabled