In [3]:
import argparse
import librosa

from coco_mulla.models import CoCoMulla
from coco_mulla.utilities import *
from coco_mulla.utilities.encodec_utils import extract_rvq, save_rvq
from coco_mulla.utilities.symbolic_utils import process_midi, process_chord

from coco_mulla.utilities.sep_utils import separate
from config import TrainCfg
import torch.nn.functional as F

device = get_device()


def generate(model_path, batch, num_layers=None, latent_dim=None):
    """Build a CoCoMulla model, load a checkpoint and run one no-grad forward pass.

    Args:
        model_path: Checkpoint path passed to ``model.load_weights``.
        batch: Dict of keyword arguments forwarded as ``model(**batch)``.
        num_layers: Value for CoCoMulla's ``num_layers`` argument. ``None``
            falls back to the notebook-level ``args.num_layers``.
        latent_dim: Value for CoCoMulla's ``latent_dim`` argument. ``None``
            falls back to the notebook-level ``args.latent_dim``.

    Returns:
        Whatever the model's forward pass returns for ``batch``.
    """
    # The module-level `args` is consulted only when the caller did not pass
    # explicit values, so the function no longer *requires* that global.
    if num_layers is None:
        num_layers = args.num_layers
    if latent_dim is None:
        latent_dim = args.latent_dim

    model = CoCoMulla(TrainCfg.sample_sec,
                      num_layers=num_layers,
                      latent_dim=latent_dim).to(device)
    model.load_weights(model_path)
    model.eval()
    with torch.no_grad():
        gen_tokens = model(**batch)

    return gen_tokens


def generate_mask(xlen):
    """Create the four condition masks used at inference time.

    Args:
        xlen: Length of the last mask axis.

    Returns:
        ``(mask, names)`` where ``mask`` is a ``(4, 2, xlen)`` tensor on
        ``device`` and ``names[i]`` labels row ``i`` of the mask.
    """
    names = ["chord-only", "chord-drums", "chord-midi", "chord-drums-midi"]
    mask = torch.zeros([len(names), 2, xlen]).to(device)
    # Going by the labels, channel 0 appears to gate the MIDI condition and
    # channel 1 the drums condition — TODO confirm against the model.
    mask[2, 0, :] = 1    # "chord-midi"
    mask[1, 1, :] = 1    # "chord-drums"
    mask[3, :, :] += 1   # "chord-drums-midi": both channels on
    return mask, names


def load_data(audio_path, chord_path, midi_path, offset):
    """Load and window the three conditioning inputs used for inference.

    Args:
        audio_path: Audio file loaded (mono, at ``TrainCfg.sample_rate``) and
            passed through ``separate``; the ``"drums"`` stem is tokenized
            with ``extract_rvq``.
        chord_path: File handed to ``process_chord``.
        midi_path: File handed to ``process_midi``.
        offset: Window start forwarded to ``crop`` for the MIDI and drums
            inputs.

    Returns:
        ``(drums_rvq, midi, chord)`` on ``device``: ``drums_rvq`` as a long
        tensor, ``midi`` and ``chord`` as float tensors. ``chord`` carries one
        extra trailing feature that is set where all other chord features of
        a frame sum to zero.
    """
    sr = TrainCfg.sample_rate
    res = TrainCfg.frame_res
    sample_sec = TrainCfg.sample_sec

    # Drums condition: mono audio -> source separation -> RVQ tokens.
    wav, _ = librosa.load(audio_path, sr=sr, mono=True)
    wav = np2torch(wav).to(device)[None, None, ...]
    wavs = separate(wav, sr)
    drums_rvq = extract_rvq(wavs["drums"], sr=sr)

    chord, _ = process_chord(chord_path)
    midi, _ = process_midi(midi_path)

    # NOTE(review): the chord window always starts at 0, whereas MIDI and
    # drums below honour `offset` — confirm this asymmetry is intended.
    chord = crop(chord[None, ...], "chord", sample_sec, res)
    # Frames whose chord features are all zero get a dedicated flag channel.
    pad_chord = chord.sum(-1, keepdims=True) == 0
    chord = np.concatenate([chord, pad_chord], -1)

    midi = crop(midi[None, ...], "midi", sample_sec, res, offset=offset)
    drums_rvq = crop(drums_rvq[None, ...], "drums_rvq", sample_sec, res, offset=offset)

    chord = torch.from_numpy(chord).to(device).float()
    midi = torch.from_numpy(midi).to(device).float()
    drums_rvq = drums_rvq.to(device).long()

    return drums_rvq, midi, chord


def crop(x, mode, sample_sec, res, offset=0):
    """Cut or zero-pad a condition sequence to ``int(sample_sec * res) + 1`` frames.

    Args:
        x: For ``mode`` ``"chord"`` or ``"midi"``: a 3-D NumPy array whose
            time axis is axis 1. For any other mode: a torch tensor whose time
            axis is the last one (it is sliced as a 3-D tensor).
        mode: Selects the array layout described above.
        sample_sec: Window length; multiplied by ``res`` to get frames.
        res: Number of frames per unit of ``sample_sec`` / ``offset``.
        offset: Window start, in the same unit as ``sample_sec``. May be
            fractional. Ignored when the input is shorter than one window.

    Returns:
        The input zero-padded at the end of its time axis when it is shorter
        than one window, otherwise the slice ``[st:ed]`` of the time axis.

    Raises:
        AssertionError: If the requested window starts beyond the data
            (chord/midi) or extends past its end (other modes).
    """
    is_symbolic = mode == "chord" or mode == "midi"
    xlen = x.shape[1] if is_symbolic else x.shape[-1]
    sample_len = int(sample_sec * res) + 1

    if xlen < sample_len:
        missing = sample_len - xlen
        if is_symbolic:
            return np.pad(x, ((0, 0), (0, missing), (0, 0)))
        return F.pad(x, (0, missing), "constant", 0)

    # Slice bounds must be integers: without the cast a fractional offset
    # produced a float start index and the slice below raised.
    st = int(offset * res)
    ed = int((offset + sample_sec) * res) + 1
    if is_symbolic:
        assert x.shape[1] > st
        return x[:, st: ed]
    # `ed` is an exclusive bound, so a sequence of exactly `ed` frames is long
    # enough; the former strict `>` rejected inputs of exactly one window.
    assert x.shape[2] >= ed
    return x[:, :, st: ed]


def save_pred(output_folder, tags, pred):
    """Write predictions into ``output_folder``, one output path per tag.

    The folder is created through ``mkdir`` first; each tag is joined onto the
    folder to form the path list passed to ``save_rvq`` along with ``pred``.
    """
    mkdir(output_folder)
    targets = []
    for tag in tags:
        targets.append(os.path.join(output_folder, tag))
    save_rvq(output_list=targets, tokens=pred)


def wrap_batch(drums_rvq, midi, chord, cond_mask, prompt):
    """Assemble the keyword-argument dict for an inference forward pass.

    Every conditioning tensor is repeated along its first axis once per row
    of ``cond_mask``, and the text prompt is duplicated the same number of
    times, so each mask row gets its own copy of the inputs.

    Returns:
        Dict with keys ``seq`` (always ``None``), ``desc``, ``chords``,
        ``num_samples``, ``cond_mask``, ``drums``, ``piano_roll`` and ``mode``
        (always ``"inference"``).
    """
    n_variants = len(cond_mask)

    def tile(tensor):
        # One copy per mask row along the leading axis.
        return tensor.repeat(n_variants, 1, 1)

    return {
        "seq": None,
        "desc": [prompt for _ in range(n_variants)],
        "chords": tile(chord),
        "num_samples": n_variants,
        "cond_mask": cond_mask,
        "drums": tile(drums_rvq),
        "piano_roll": tile(midi),
        "mode": "inference",
    }


def inference(args):
    """Run the full inference pipeline for one input and save the results.

    Steps: load and crop the conditions (``load_data``), build one condition
    mask per control combination (``generate_mask``), wrap everything into a
    batch using the first entry of ``read_lst(args.prompt_path)`` as the text
    prompt, generate, and write the outputs under ``args.output_folder``.

    Args:
        args: Namespace providing ``audio_path``, ``chord_path``,
            ``midi_path``, ``offset``, ``prompt_path``, ``model_path`` and
            ``output_folder``.

    Returns:
        The prediction returned by ``generate`` (also saved to disk), so that
        callers writing ``pred = inference(args)`` receive it instead of
        ``None``.
    """
    drums_rvq, midi, chord = load_data(audio_path=args.audio_path,
                                       chord_path=args.chord_path,
                                       midi_path=args.midi_path,
                                       offset=args.offset)
    cond_mask, names = generate_mask(drums_rvq.shape[-1])
    batch = wrap_batch(drums_rvq, midi, chord, cond_mask, read_lst(args.prompt_path)[0])
    pred = generate(model_path=args.model_path,
                    batch=batch)
    save_pred(output_folder=args.output_folder,
              tags=names,
              pred=pred)
    return pred

from types import SimpleNamespace
# Demo configuration for the "let_it_be" example.
# NOTE(review): these are absolute, user-specific paths; the cell will not run
# elsewhere without editing them.
args = SimpleNamespace(
    num_layers=48,
    latent_dim=12,
    output_folder="/l/users/fathinah.izzati/coco-mulla-repo/demo/output",
    model_path="/l/users/fathinah.izzati/coco-mulla-repo/diff_9_end.pth",
    audio_path="/l/users/fathinah.izzati/coco-mulla-repo/demo/input/let_it_be.flac",
    prompt_path="/l/users/fathinah.izzati/coco-mulla-repo/demo/input/let_it_be.prompt.txt",
    chord_path="/l/users/fathinah.izzati/coco-mulla-repo/demo/input/let_it_be.flac.chord.lab",
    midi_path="/l/users/fathinah.izzati/coco-mulla-repo/demo/input/let_it_be.mid.piano.mid",
    drums_path=None,
    offset=0,
)

# Intermediate values kept at notebook level for inspection. inference() below
# repeats these same load / mask / batch steps internally.
drums_rvq, midi, chord = load_data(args.audio_path, args.chord_path, args.midi_path, args.offset)
cond_mask, names = generate_mask(drums_rvq.shape[-1])
batch = wrap_batch(drums_rvq, midi, chord, cond_mask, read_lst(args.prompt_path)[0])

pred = inference(args)



load....musicgen bk
lm_bk, here


CLIPPING /l/users/fathinah.izzati/coco-mulla-repo/demo/output/chord-only happening with proba (a bit of clipping is okay): 0.00012656250328291208 maximum scale:  1.2986297607421875
CLIPPING /l/users/fathinah.izzati/coco-mulla-repo/demo/output/chord-drums happening with proba (a bit of clipping is okay): 7.812499825377017e-05 maximum scale:  1.1229928731918335
CLIPPING /l/users/fathinah.izzati/coco-mulla-repo/demo/output/chord-midi happening with proba (a bit of clipping is okay): 3.12499992105586e-06 maximum scale:  1.0149791240692139
CLIPPING /l/users/fathinah.izzati/coco-mulla-repo/demo/output/chord-drums-midi happening with proba (a bit of clipping is okay): 4.687500222644303e-06 maximum scale:  1.1103787422180176


## apply to training

In [1]:
import argparse
from torch.utils.tensorboard import SummaryWriter

import torch.distributed as dist
from torch.multiprocessing import spawn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
from coco_mulla.utilities.trainer_utils import Trainer

import torch
import torch.nn as nn
import os
from config import TrainCfg
import numpy as np

os.environ["TOKENIZERS_PARALLELISM"] = "false"

from tqdm import tqdm

from coco_mulla.data_loader.dataset_sampler import Dataset, collate_fn
from coco_mulla.models import CoCoMulla

device = "cuda"
N_GPUS = 2


def _get_free_port():
    import socketserver
    with socketserver.TCPServer(('localhost', 0), None) as s:
        return s.server_address[1]



def get_dataset(dataset_split, sampling_strategy, sampling_prob, rid=0):
    """Create the training ``Dataset`` and a ``DataLoader`` over it.

    Args:
        dataset_split: Path of the list file; passed to ``Dataset`` as the
            single entry of ``path_lst``.
        sampling_strategy: Forwarded to ``Dataset``.
        sampling_prob: Forwarded to ``Dataset``.
        rid: Replica id forwarded to ``Dataset``. Defaults to 0, the value
            that used to be hard-coded, so single-process callers are
            unaffected while distributed callers can pass their rank.

    Returns:
        ``(dataset, dataloader)``. The loader uses ``TrainCfg.batch_size``,
        keeps dataset order (``shuffle=False``), drops the last incomplete
        batch and loads in the main process (``num_workers=0``).
    """
    dataset = Dataset(
        rid=rid,
        path_lst=[dataset_split],
        sampling_prob=sampling_prob,
        sampling_strategy=sampling_strategy,
        cfg=TrainCfg)

    # NOTE(review): no DistributedSampler is attached, so under DDP every
    # replica iterates the dataset in the same loader order.
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=TrainCfg.batch_size,
        collate_fn=collate_fn,
        shuffle=False,
        num_workers=0,
        pin_memory=True,
        drop_last=True)

    return dataset, dataloader


def train_dist(replica_id, replica_count, port, model_dir, args):
    """Per-process entry point for multi-GPU training (target of ``spawn``).

    Joins the NCCL process group, builds the model on GPU ``replica_id``,
    wraps it in ``DistributedDataParallel``, builds the data pipeline and
    runs ``train``.

    Args:
        replica_id: Rank of this process; also used as the CUDA device index.
        replica_count: World size.
        port: Rendezvous port on localhost shared by all replicas.
        model_dir: Directory handed to ``train`` for logs and checkpoints.
        args: Namespace providing ``num_layers``, ``latent_dim``, ``dataset``,
            ``sampling_strategy``, ``sampling_prob_a``, ``sampling_prob_b``
            and ``learning_rate``.
    """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = str(port)
    torch.distributed.init_process_group('nccl', rank=replica_id, world_size=replica_count)
    device = torch.device('cuda', replica_id)
    torch.cuda.set_device(device)
    model = CoCoMulla(TrainCfg.sample_sec, num_layers=args.num_layers, latent_dim=args.latent_dim).to(device)
    model.set_training()
    model = DDP(model, [replica_id])
    # Call get_dataset()/train() with the parameters they declare in this
    # cell: the previous calls passed an extra rank argument to both, which
    # raises TypeError before any training starts.
    dataset, dataloader = get_dataset(dataset_split=args.dataset,
                                      sampling_strategy=args.sampling_strategy,
                                      sampling_prob=[args.sampling_prob_a, args.sampling_prob_b])

    # NOTE(review): train() in this cell takes no rank, so each replica runs
    # the same logging and checkpointing code — confirm that is acceptable.
    train(model, dataset, dataloader, device, model_dir,
          args.learning_rate)


def loss_fn(outputs, y):
    """Cross-entropy between the masked logits and the masked targets.

    Args:
        outputs: Object exposing ``logits`` (class scores on the last axis)
            and ``mask`` (boolean index selecting the positions to score).
        y: Integer class targets, indexable by the same ``mask``.

    Returns:
        Scalar loss tensor from ``nn.CrossEntropyLoss`` (mean reduction).
    """
    mask = outputs.mask
    logits = outputs.logits[mask]
    targets = y[mask]
    # Flatten to (N, num_classes). The class count is read from the logits
    # instead of being fixed at 2048, so other codebook sizes work as well.
    logits = logits.view(-1, logits.shape[-1])
    return nn.CrossEntropyLoss()(logits, targets)


def train(model, dataset, dataloader, device, model_dir, learning_rate):
    """Train ``model`` for ``TrainCfg.epoch`` epochs, logging to TensorBoard.

    Per step the loss, the gradient value returned by ``trainer.step`` and the
    learning rate are logged; per epoch the mean loss is logged and a
    checkpoint ``diff_{epoch}_end.pth`` is written to ``model_dir``.

    Args:
        model: Either a bare model or a ``DistributedDataParallel`` wrapper.
        dataset: Dataset exposing ``reset_random_seed(seed, epoch)``.
        dataloader: Iterable of batches with keys ``desc``, ``mix``,
            ``drums``, ``chords``, ``piano_roll`` and ``cond_mask``.
        device: Device the model and batch tensors are moved to.
        model_dir: Directory for TensorBoard events and checkpoints.
        learning_rate: Learning rate handed to ``Trainer``.
    """
    # optimizer and lr scheduler
    num_steps = len(dataloader)
    epochs = TrainCfg.epoch
    rng = np.random.RandomState(569)
    writer = SummaryWriter(model_dir, flush_secs=20)

    trainer = Trainer(params=model.parameters(), lr=learning_rate, num_epochs=epochs, num_steps=num_steps)

    model = model.to(device)
    # Only a DDP wrapper exposes `.module`; fall back to the model itself so
    # single-process runs can save checkpoints too.
    checkpoint_model = getattr(model, "module", model)
    step = 0
    try:
        for e in range(0, epochs):
            mean_loss = 0
            n_element = 0
            model.train()

            dl = tqdm(dataloader, desc=f"Epoch {e}")
            # Fresh sampling seed for the dataset every epoch.
            r = rng.randint(0, 233333)
            dataset.reset_random_seed(r, e)
            for batch in dl:
                desc = batch["desc"]
                mix = batch["mix"].to(device).long()
                drums = batch["drums"].to(device).long()
                chords = batch["chords"].to(device).float()
                piano_roll = batch["piano_roll"].to(device).float()
                cond_mask = batch["cond_mask"].to(device).long()

                batch_1 = {
                    "seq": mix,
                    "drums": drums,
                    "chords": chords,
                    "piano_roll": piano_roll,
                    "cond_mask": cond_mask,
                    "desc": desc,
                }
                outputs = model(**batch_1)
                r_loss = loss_fn(outputs, mix)

                grad_1, lr_1 = trainer.step(r_loss, model.parameters())

                step += 1
                n_element += 1
                loss_value = r_loss.item()
                writer.add_scalar("r_loss", loss_value, step)
                writer.add_scalar("grad_1", grad_1, step)
                writer.add_scalar("lr_1", lr_1, step)

                mean_loss += loss_value

            with torch.no_grad():
                # drop_last=True on a small dataset can yield zero batches;
                # skip the mean rather than dividing by zero.
                if n_element > 0:
                    writer.add_scalar('train/mean_loss', mean_loss / n_element, step)
                checkpoint_model.save_weights(os.path.join(model_dir, f"diff_{e}_end.pth"))
    finally:
        # Flush pending events even when training aborts (e.g. CUDA OOM).
        writer.close()


def main(args):
    """Create the experiment directory and launch one training process per GPU.

    Args:
        args: Namespace providing ``experiment_folder`` and
            ``experiment_name`` plus everything ``train_dist`` reads.
    """
    experiment_folder = args.experiment_folder
    experiment_name = args.experiment_name

    # makedirs(exist_ok=True) replaces the check-then-mkdir pairs: it creates
    # missing parent directories and does not fail if the directory appears
    # between the check and the creation.
    os.makedirs(experiment_folder, exist_ok=True)
    model_dir = os.path.join(experiment_folder, experiment_name)
    os.makedirs(model_dir, exist_ok=True)

    world_size = N_GPUS
    port = _get_free_port()
    # spawn() prepends the process index, which becomes train_dist's replica_id.
    spawn(train_dist, args=(world_size, port, model_dir, args), nprocs=world_size, join=True)



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch

# CUDA version reported by the installed PyTorch build.
print(torch.version.cuda)


12.1


In [2]:
 from types import SimpleNamespace
# Configuration for a single-process training run.
# NOTE(review): absolute, user-specific paths — adjust before running elsewhere.
args = {
    "num_layers": 48,
    "latent_dim": 12,
    "experiment_folder": "/l/users/fathinah.izzati/coco-mulla-repo/expe",
    "experiment_name": "experiment_1",
    "prompt_path": "/l/users/fathinah.izzati/coco-mulla-repo/demo/input/let_it_be.prompt.txt",
    'sampling_strategy':'prob-based',
    "dataset": '/l/users/fathinah.izzati/coco-mulla-repo/train.lst',
    'learning_rate':0.1
}
args = SimpleNamespace(**args)

# Create <experiment_folder>/<experiment_name>; exist_ok avoids the separate
# existence checks and also creates missing parent directories.
experiment_folder = args.experiment_folder
experiment_name = args.experiment_name
os.makedirs(experiment_folder, exist_ok=True)
model_dir = os.path.join(experiment_folder, experiment_name)
os.makedirs(model_dir, exist_ok=True)

# This statement used to carry a stray leading space, which is an
# IndentationError outside IPython's lenient cell handling.
dataset, dataloader = get_dataset(
    dataset_split=args.dataset,
    sampling_strategy=args.sampling_strategy,
    sampling_prob=None,
)

0 /l/users/fathinah.izzati/coco-mulla-repo/train.lst
[{'path': '/l/users/fathinah.izzati/coco-mulla-repo/demo/data/let_it_be', 'data': {'piano_roll': array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])}}, {'path': '/l/users/fathinah.izzati/coco-mulla-repo/demo/data/let_it_be', 'data': {'piano_roll': array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])}}, {'path': '/l/users/fathinah.izzati/coco-mulla-repo/demo/data/let_it_be', 'data': {'piano_roll': array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
      

In [3]:
# Build the model from the notebook configuration, move it to `device`,
# then switch it into its training setup.
model = CoCoMulla(
    TrainCfg.sample_sec,
    num_layers=args.num_layers,
    latent_dim=args.latent_dim,
)
model = model.to(device)
model.set_training()



load....musicgen bk
lm_bk, here
cp_transformer.masked_embedding
cp_transformer.pos_emb
cp_transformer.gates
cp_transformer.encodec_emb.weight
cp_transformer.merge_linear.0.weight
cp_transformer.merge_linear.1.weight
cp_transformer.merge_linear.2.weight
cp_transformer.merge_linear.3.weight
cp_transformer.merge_linear.4.weight
cp_transformer.merge_linear.5.weight
cp_transformer.merge_linear.6.weight
cp_transformer.merge_linear.7.weight
cp_transformer.merge_linear.8.weight
cp_transformer.merge_linear.9.weight
cp_transformer.merge_linear.10.weight
cp_transformer.merge_linear.11.weight
cp_transformer.merge_linear.12.weight
cp_transformer.merge_linear.13.weight
cp_transformer.merge_linear.14.weight
cp_transformer.merge_linear.15.weight
cp_transformer.merge_linear.16.weight
cp_transformer.merge_linear.17.weight
cp_transformer.merge_linear.18.weight
cp_transformer.merge_linear.19.weight
cp_transformer.merge_linear.20.weight
cp_transformer.merge_linear.21.weight
cp_transformer.merge_linear.22.w

In [4]:
# Start training with the objects built in the previous cells.
train(
    model=model,
    dataset=dataset,
    dataloader=dataloader,
    device=device,
    model_dir=model_dir,
    learning_rate=args.learning_rate,
)

Epoch 0:   0%|          | 0/2 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 188.00 MiB. GPU 0 has a total capacty of 23.62 GiB of which 7.19 MiB is free. Including non-PyTorch memory, this process has 23.43 GiB memory in use. Of the allocated memory 23.09 GiB is allocated by PyTorch, and 138.26 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

Inference step 

1. Load `drums_rvq`, `midi`, and `chord`. `drums_rvq` is obtained by running `separate()` on the input wav and encoding the drums stem with `extract_rvq()`. Each control is then cropped according to `sample_sec` (the duration) and the frame resolution.
2. Build the condition mask.
3. Wrap everything into a batch, repeating each input 4 times because there are four output variants: chord-only, chord-drums, chord-midi, and chord-drums-midi.
4. Call `generate(model_path, batch)`.

In [1]:
import argparse
from torch.utils.tensorboard import SummaryWriter

import torch.distributed as dist
from torch.multiprocessing import spawn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
from coco_mulla.utilities.trainer_utils import Trainer

import torch
import torch.nn as nn
import os

import numpy as np

os.environ["TOKENIZERS_PARALLELISM"] = "false"

from tqdm import tqdm


from coco_mulla.models import CoCoMulla

device = "cuda"
N_GPUS = 4

## fixed
from coco_mulla.data_loader.dataset_sampler import Dataset, collate_fn
from config import TrainCfg

def _get_free_port():
    import socketserver
    with socketserver.TCPServer(('localhost', 0), None) as s:
        return s.server_address[1]


def get_dataset(rid, dataset_path, sampling_strategy, sampling_prob):
    """Create the training ``Dataset`` for replica ``rid`` and a loader over it.

    Args:
        rid: Replica id forwarded to ``Dataset``.
        dataset_path: Path of the list file; becomes the single entry of
            ``path_lst``.
        sampling_strategy: Forwarded to ``Dataset``.
        sampling_prob: Forwarded to ``Dataset``.

    Returns:
        ``(dataset, dataloader)``.
    """
    dataset = Dataset(rid=rid,
                      path_lst=[dataset_path],
                      sampling_prob=sampling_prob,
                      sampling_strategy=sampling_strategy,
                      cfg=TrainCfg)

    # Fixed order, main-process loading, incomplete final batch dropped.
    loader_options = dict(
        batch_size=TrainCfg.batch_size,
        collate_fn=collate_fn,
        shuffle=False,
        num_workers=0,
        pin_memory=True,
        drop_last=True,
    )
    dataloader = torch.utils.data.DataLoader(dataset, **loader_options)

    return dataset, dataloader
def train_dist(replica_id, replica_count, port, model_dir, args):
    """Per-process entry point for multi-GPU training (target of ``spawn``).

    Joins the NCCL process group, builds the model on GPU ``replica_id``,
    wraps it in ``DistributedDataParallel``, builds the data pipeline for
    this replica and runs ``train`` with ``replica_id`` as the rank.

    Args:
        replica_id: Rank of this process; also used as the CUDA device index.
        replica_count: World size.
        port: Rendezvous port on localhost shared by all replicas.
        model_dir: Directory handed to ``train`` for logs and checkpoints.
        args: Namespace providing ``num_layers``, ``latent_dim``, ``dataset``,
            ``sampling_strategy``, ``sampling_prob_a``, ``sampling_prob_b``
            and ``learning_rate``.
    """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = str(port)
    torch.distributed.init_process_group('nccl', rank=replica_id, world_size=replica_count)
    device = torch.device('cuda', replica_id)
    torch.cuda.set_device(device)
    model = CoCoMulla(TrainCfg.sample_sec, num_layers=args.num_layers, latent_dim=args.latent_dim).to(device)
    model.set_training()
    model = DDP(model, [replica_id])
    # get_dataset() names its path parameter `dataset_path`; the former
    # `dataset_split=` keyword raised TypeError.
    dataset, dataloader = get_dataset(rid=replica_id, dataset_path=args.dataset,
                                      sampling_strategy=args.sampling_strategy,
                                      sampling_prob=[args.sampling_prob_a, args.sampling_prob_b])

    train(replica_id, model, dataset, dataloader, device, model_dir,
          args.learning_rate)


def loss_fn(outputs, y):
    """Cross-entropy between the masked logits and the masked targets.

    Args:
        outputs: Object exposing ``logits`` (class scores on the last axis)
            and ``mask`` (boolean index selecting the positions to score).
        y: Integer class targets, indexable by the same ``mask``.

    Returns:
        Scalar loss tensor from ``nn.CrossEntropyLoss`` (mean reduction).
    """
    mask = outputs.mask
    logits = outputs.logits[mask]
    targets = y[mask]
    # Flatten to (N, num_classes). The class count is read from the logits
    # instead of being fixed at 2048, so other codebook sizes work as well.
    logits = logits.view(-1, logits.shape[-1])
    return nn.CrossEntropyLoss()(logits, targets)
def train(rank, model, dataset, dataloader, device, model_dir, learning_rate):
    """Train ``model`` for ``TrainCfg.epoch`` epochs; rank 0 logs and checkpoints.

    Only ``rank == 0`` creates the TensorBoard writer, shows the progress bar
    and saves ``diff_{epoch}_end.pth``; every other rank trains silently.

    Args:
        rank: Replica rank; also offsets the seed of the per-epoch RNG.
        model: Either a bare model or a ``DistributedDataParallel`` wrapper.
        dataset: Dataset exposing ``reset_random_seed(seed, epoch)``.
        dataloader: Iterable of batches with keys ``desc``, ``mix``,
            ``rgb_emb`` and ``cond_mask``.
        device: Device the model and batch tensors are moved to.
        model_dir: Directory for TensorBoard events and checkpoints.
        learning_rate: Learning rate handed to ``Trainer``.
    """
    # optimizer and lr scheduler
    num_steps = len(dataloader)
    epochs = TrainCfg.epoch
    rng = np.random.RandomState(569 + rank * 100)
    is_main = rank == 0
    writer = SummaryWriter(model_dir, flush_secs=20) if is_main else None

    trainer = Trainer(params=model.parameters(), lr=learning_rate, num_epochs=epochs, num_steps=num_steps)

    model = model.to(device)
    # Only a DDP wrapper exposes `.module`; fall back to the model itself so
    # single-process runs can save checkpoints too.
    checkpoint_model = getattr(model, "module", model)
    step = 0
    try:
        for e in range(0, epochs):
            mean_loss = 0
            n_element = 0
            model.train()

            dl = tqdm(dataloader, desc=f"Epoch {e}") if is_main else dataloader
            # Fresh sampling seed for the dataset every epoch.
            r = rng.randint(0, 233333)
            dataset.reset_random_seed(r, e)
            for batch in dl:
                desc = batch["desc"]
                mix = batch["mix"].to(device).long()
                # NOTE(review): rgb_emb is cast to integers here; if it holds
                # real-valued embeddings this truncates them — confirm dtype.
                rgb_emb = batch["rgb_emb"].to(device).long()
                cond_mask = batch["cond_mask"].to(device).long()

                batch_1 = {
                    "seq": mix,
                    "rgb_emb": rgb_emb,
                    "cond_mask": cond_mask,
                    "desc": desc,
                }
                if step == 0:
                    # Shape sanity check, printed for the first batch only
                    # instead of flooding the output on every step.
                    print('seq',batch_1['seq'].shape)
                    print('rgb_emb',batch_1['rgb_emb'].shape)
                    print('cond_mask',batch_1['cond_mask'].shape)
                    print('desc',batch_1['desc'])

                outputs = model(**batch_1)
                r_loss = loss_fn(outputs, mix)
                grad_1, lr_1 = trainer.step(r_loss, model.parameters())

                step += 1
                n_element += 1
                loss_value = r_loss.item()
                if is_main:
                    writer.add_scalar("r_loss", loss_value, step)
                    writer.add_scalar("grad_1", grad_1, step)
                    writer.add_scalar("lr_1", lr_1, step)

                mean_loss += loss_value

            if is_main:
                with torch.no_grad():
                    # drop_last=True on a small dataset can yield zero
                    # batches; skip the mean rather than dividing by zero.
                    if n_element > 0:
                        writer.add_scalar('train/mean_loss', mean_loss / n_element, step)
                    checkpoint_model.save_weights(os.path.join(model_dir, f"diff_{e}_end.pth"))
    finally:
        # Flush pending events even when training aborts.
        if writer is not None:
            writer.close()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Data pipeline for replica id 1, read from the test list file.
dataset, dataloader = get_dataset(
    1,
    '/l/users/fathinah.izzati/datasets/test.lst',
    'prob-based',
    None,
)

from types import SimpleNamespace

# Run configuration. NOTE(review): absolute, user-specific paths.
args = SimpleNamespace(
    num_layers=48,
    latent_dim=768,
    experiment_folder="/l/users/fathinah.izzati/coco-mulla-repo/expe",
    experiment_name="experiment_1",
    prompt_path="/l/users/fathinah.izzati/coco-mulla-repo/demo/input/let_it_be.prompt.txt",
    sampling_strategy='prob-based',
    sampling_prob_a=0.5,
    sampling_prob_b=0.5,
    dataset=None,
    learning_rate=0.1,
)

# The model is first placed on the notebook-level `device`, then `device` is
# re-resolved (first GPU if available, else CPU) and the model moved again.
# No DDP wrapper is applied in this cell.
model = CoCoMulla(
    TrainCfg.sample_sec,
    num_layers=args.num_layers,
    latent_dim=args.latent_dim,
).to(device)
model.set_training()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# rank=1: train() creates its writer, progress bar and checkpoints only for
# rank 0, so this call trains without logging or saving.
train(
    1,
    model,
    dataset,
    dataloader,
    device,
    '/l/users/fathinah.izzati/coco-mulla-repo/demo',
    args.learning_rate,
)

0 /l/users/fathinah.izzati/datasets/test.lst


FileNotFoundError: [Errno 2] No such file or directory: '/l/users/fathinah.izzati/datasets/test.lst'