### Inference

In [1]:
import argparse
import librosa

from coco_mulla.models import CoCoMulla ## Change this one!!
from coco_mulla.utilities import *
from coco_mulla.utilities.encodec_utils import save_rvq

from coco_mulla.utilities.sep_utils import separate
from config import TrainCfg  ## change this one!!
import torch.nn.functional as F
import numpy as np 

# Resolve the compute device once for every function in this cell.
# `get_device` is not defined here; presumably it is provided by the
# `coco_mulla.utilities` wildcard import above — TODO confirm.
device = get_device()


def generate(model_path, batch):
    """Load a CoCoMulla checkpoint and run a single forward pass on ``batch``.

    Args:
        model_path: Checkpoint path handed to ``model.load_weights``.
        batch: Dict of keyword arguments for the model call (built by
            ``wrap_batch``).

    Returns:
        Whatever ``model(**batch)`` returns; ``inference`` forwards it to
        ``save_pred`` as the generated tokens.

    Note:
        ``num_layers`` and ``latent_dim`` are read from the module-level
        ``args`` namespace, so ``args`` must exist before this is called.
    """
    model = CoCoMulla(TrainCfg.sample_sec,           ## Change this one!!
                      num_layers=args.num_layers,
                      latent_dim=args.latent_dim).to(device)
    model.load_weights(model_path)
    model.eval()
    # The batch is intentionally not printed here: `inference` already prints
    # it once, and dumping every conditioning tensor twice floods the output.
    with torch.no_grad():
        gen_tokens = model(**batch)

    return gen_tokens


def generate_mask(xlen):
    """Build the condition mask for the single "video-only" setting.

    Args:
        xlen: Size of the mask's last dimension.

    Returns:
        ``(mask, names)`` where ``mask`` is a ``[1, 1, xlen]`` tensor of ones
        on ``device`` and ``names`` lists one output tag per mask row.
    """
    tags = ["video-only"]
    cond_mask = torch.ones(1, 1, xlen, device=device)
    return cond_mask, tags


def load_data(video_path, offset):
    """Load saved video features and crop them to one sample window.

    Args:
        video_path: Path of a ``.npy`` file readable by ``np.load``.
        offset: Window start, forwarded to ``crop``.

    Returns:
        The cropped features as a ``long`` tensor on ``device`` with a leading
        batch axis of size 1.
    """
    res = TrainCfg.frame_res
    sample_sec = TrainCfg.sample_sec
    video_feats = np.load(video_path)
    print('video', video_feats.shape)
    # Add the batch axis before cropping; `crop` slices the last axis.
    video_feats = crop(video_feats[None, ...], "video_rvq", sample_sec, res, offset=offset)
    # NOTE(review): `.long()` truncates any fractional values; this assumes the
    # saved array holds integer codes — confirm against how video.npy is made.
    video = torch.from_numpy(video_feats).to(device).long()
    print('video', video.shape)
    return video


def crop(x, mode, sample_sec, res, offset=0):
    """Slice one sample window out of the last axis of ``x``.

    Args:
        x: 3-D array or tensor; the last axis is the one being cropped.
        mode: Unused label, kept so existing call sites keep working.
        sample_sec: Window length, in the same time unit as ``offset``.
        res: Number of frames per time unit.
        offset: Window start.

    Returns:
        ``x[:, :, start:end]`` with ``start = int(offset * res)`` and
        ``end = int((offset + sample_sec) * res) + 1``. Slicing clamps at the
        end of the axis, so the result may be shorter than a full window.
    """
    # Both bounds are cast to int: a float offset or resolution would
    # otherwise produce a float start index, which slicing rejects.
    start = int(offset * res)
    end = int((offset + sample_sec) * res) + 1
    return x[:, :, start:end]


def save_pred(output_folder, tags, pred):
    """Write the predicted tokens under ``output_folder``, one target per tag.

    Args:
        output_folder: Directory passed to ``mkdir`` before saving.
        tags: Names joined onto ``output_folder`` to form the output paths.
        pred: Tokens handed to ``save_rvq``.
    """
    mkdir(output_folder)
    targets = []
    for tag in tags:
        targets.append(os.path.join(output_folder, tag))
    save_rvq(output_list=targets, tokens=pred)


def wrap_batch(video, cond_mask, prompt):
    """Assemble the keyword arguments for one inference call of the model.

    The video tensor and the prompt are replicated once per row of
    ``cond_mask`` so every mask setting is generated in the same batch.

    Args:
        video: Tensor repeated along its first axis.
        cond_mask: Mask whose length determines the number of samples.
        prompt: Text description shared by all samples.

    Returns:
        Dict with keys ``music``, ``desc``, ``video``, ``num_samples``,
        ``cond_mask`` and ``mode``.
    """
    n = len(cond_mask)
    return dict(
        music=None,
        desc=[prompt for _ in range(n)],
        video=video.repeat(n, 1, 1),
        num_samples=n,
        cond_mask=cond_mask,
        mode="inference",
    )


def inference(args):
    """Run the full video-conditioned pipeline: load, batch, generate, save.

    Args:
        args: Namespace providing ``video_path``, ``offset``, ``prompt_path``,
            ``model_path`` and ``output_folder``.
    """
    video = load_data(video_path=args.video_path, offset=args.offset)
    cond_mask, names = generate_mask(video.shape[-1])
    # Only the first entry returned by read_lst is used as the prompt.
    prompt = read_lst(args.prompt_path)[0]
    batch = wrap_batch(video, cond_mask, prompt)
    print(batch)
    pred = generate(model_path=args.model_path, batch=batch)
    save_pred(output_folder=args.output_folder, tags=names, pred=pred)

from types import SimpleNamespace
# Settings for the video-conditioned inference run. `generate` reads
# `num_layers` / `latent_dim` from this module-level namespace.
args = SimpleNamespace(
    num_layers=48,
    latent_dim=14,
    output_folder="/l/users/fathinah.izzati/coco-mulla-repo/demo/output/expe_tnj_1",
    model_path="/l/users/fathinah.izzati/coco-mulla-repo/expe/experiment_tnj_1/diff_4_end.pth",
    prompt_path="/l/users/fathinah.izzati/coco-mulla-repo/demo/input/let_it_be/let_it_be.prompt.txt",
    video_path="/l/users/fathinah.izzati/coco-mulla-repo/demo/tom_and_jerry/segment_001/video.npy",
    offset=0,
)
inference(args)

  from .autonotebook import tqdm as notebook_tqdm


: 

In [None]:
import argparse
import librosa

from coco_mulla.models import CoCoMullaTmp ## Change this one!!
from coco_mulla.utilities import *
from coco_mulla.utilities.encodec_utils import extract_rvq, save_rvq
from coco_mulla.utilities.symbolic_utils import process_midi, process_chord

from coco_mulla.utilities.sep_utils import separate
from config_tmp import TrainCfg  ##change this one!!
import torch.nn.functional as F

# Resolve the compute device once for every function in this cell.
# `get_device` is not defined here; presumably it is provided by the
# `coco_mulla.utilities` wildcard import above — TODO confirm.
device = get_device()


def generate(model_path, batch):
    """Load a CoCoMullaTmp checkpoint and run one forward pass on ``batch``.

    Args:
        model_path: Checkpoint path handed to ``load_weights``.
        batch: Dict of keyword arguments for the model call.

    Returns:
        The result of calling the model with ``**batch`` under ``no_grad``.

    Note:
        ``num_layers`` and ``latent_dim`` come from the module-level ``args``.
    """
    net = CoCoMullaTmp(
        TrainCfg.sample_sec,           ## Change this one!!
        num_layers=args.num_layers,
        latent_dim=args.latent_dim,
    )
    net = net.to(device)
    net.load_weights(model_path)
    net.eval()
    with torch.no_grad():
        return net(**batch)


def generate_mask(xlen):
    """Build the four condition masks used for the demo generations.

    Args:
        xlen: Size of the mask's last dimension.

    Returns:
        ``(mask, names)`` where ``mask`` has shape ``[4, 2, xlen]`` on
        ``device`` and ``names`` gives the output tag of each mask row.
    """
    names = ["chord-only", "chord-drums", "chord-midi", "chord-drums-midi"]
    # One (channel 0, channel 1) on/off pair per tag, in the same order as
    # `names`; presumably channel 0 gates midi and channel 1 gates drums.
    pattern = torch.tensor([[0., 0.],
                            [0., 1.],
                            [1., 0.],
                            [1., 1.]])
    mask = pattern.unsqueeze(-1).repeat(1, 1, xlen).to(device)
    return mask, names


def load_data(audio_path, chord_path, midi_path, offset):
    """Prepare the drum, MIDI and chord conditions for one sample window.

    The audio is loaded as mono at ``TrainCfg.sample_rate``, passed through
    ``separate``, and the ``"drums"`` stem is encoded with ``extract_rvq``.
    Chord and MIDI features come from ``process_chord`` / ``process_midi``.
    All three conditions are cropped to the same window.

    Args:
        audio_path: Audio file read with ``librosa.load``.
        chord_path: Chord annotation file for ``process_chord``.
        midi_path: MIDI file for ``process_midi``.
        offset: Window start, forwarded to ``crop`` for every condition.

    Returns:
        ``(drums_rvq, midi, chord)`` on ``device``: drums as ``long``, MIDI and
        chord as ``float``. The chord features carry one extra trailing column
        that is set where the original chord row sums to zero.
    """
    sr = TrainCfg.sample_rate
    res = TrainCfg.frame_res
    sample_sec = TrainCfg.sample_sec

    wav, _ = librosa.load(audio_path, sr=sr, mono=True)
    wav = np2torch(wav).to(device)[None, None, ...]
    wavs = separate(wav, sr)
    drums_rvq = extract_rvq(wavs["drums"], sr=sr)
    chord, _ = process_chord(chord_path)
    midi, _ = process_midi(midi_path)

    # The chord crop must use the same offset as midi and drums; otherwise the
    # chords always come from the start of the song and drift out of sync with
    # the other conditions whenever offset != 0.
    chord = crop(chord[None, ...], "chord", sample_sec, res, offset=offset)
    # Extra column flagging frames whose chord row is all zeros.
    pad_chord = chord.sum(-1, keepdims=True) == 0
    chord = np.concatenate([chord, pad_chord], -1)

    midi = crop(midi[None, ...], "midi", sample_sec, res, offset=offset)
    drums_rvq = crop(drums_rvq[None, ...], "drums_rvq", sample_sec, res, offset=offset)

    chord = torch.from_numpy(chord).to(device).float()
    midi = torch.from_numpy(midi).to(device).float()
    drums_rvq = drums_rvq.to(device).long()

    return drums_rvq, midi, chord


def crop(x, mode, sample_sec, res, offset=0):
    """Crop (or zero-pad) ``x`` to one sample window along its time axis.

    Args:
        x: 3-D input. For ``mode`` ``"chord"`` / ``"midi"`` it is a NumPy array
            with time on axis 1; for any other mode it is a torch tensor with
            time on the last axis.
        mode: Selects the layout described above.
        sample_sec: Window length, in the same time unit as ``offset``.
        res: Number of frames per time unit.
        offset: Window start. Ignored when the input is shorter than one
            window, in which case the input is padded instead.

    Returns:
        The window ``[int(offset * res), int((offset + sample_sec) * res) + 1)``
        of ``x``, or ``x`` zero-padded at the end to ``int(sample_sec * res) + 1``
        frames when it is shorter than that.

    Raises:
        AssertionError: If the window starts past the end of a chord/midi
            input, or ends past the end of any other input.
    """
    time_major = mode == "chord" or mode == "midi"
    xlen = x.shape[1] if time_major else x.shape[-1]
    sample_len = int(sample_sec * res) + 1
    if xlen < sample_len:
        missing = sample_len - xlen
        if time_major:
            return np.pad(x, ((0, 0), (0, missing), (0, 0)))
        return F.pad(x, (0, missing), "constant", 0)

    # Cast both bounds: a float offset/res would give a float slice index.
    st = int(offset * res)
    ed = int((offset + sample_sec) * res) + 1
    if time_major:
        assert x.shape[1] > st
        return x[:, st: ed]
    # `>=` (not `>`): an input that is exactly one window long is valid,
    # because `ed` is an exclusive slice bound.
    assert x.shape[2] >= ed
    return x[:, :, st: ed]


def save_pred(output_folder, tags, pred):
    """Write the predicted tokens under ``output_folder``, one target per tag.

    Args:
        output_folder: Directory passed to ``mkdir`` before saving.
        tags: Names joined onto ``output_folder`` to form the output paths.
        pred: Tokens handed to ``save_rvq``.
    """
    mkdir(output_folder)
    targets = []
    for tag in tags:
        targets.append(os.path.join(output_folder, tag))
    save_rvq(output_list=targets, tokens=pred)


def wrap_batch(drums_rvq, midi, chord, cond_mask, prompt):
    """Assemble the keyword arguments for one inference call of the model.

    Every condition tensor and the prompt are replicated once per row of
    ``cond_mask`` so all mask settings are generated in a single batch.

    Args:
        drums_rvq: Drum tokens, repeated along the first axis.
        midi: Piano-roll features, repeated along the first axis.
        chord: Chord features, repeated along the first axis.
        cond_mask: Mask whose length determines the number of samples.
        prompt: Text description shared by all samples.

    Returns:
        Dict with keys ``seq``, ``desc``, ``chords``, ``num_samples``,
        ``cond_mask``, ``drums``, ``piano_roll`` and ``mode``.
    """
    n = len(cond_mask)

    def tile(t):
        # Replicate along the batch axis only.
        return t.repeat(n, 1, 1)

    return dict(
        seq=None,
        desc=[prompt for _ in range(n)],
        chords=tile(chord),
        num_samples=n,
        cond_mask=cond_mask,
        drums=tile(drums_rvq),
        piano_roll=tile(midi),
        mode="inference",
    )


def inference(args):
    """Run the chord/midi/drums pipeline: load, batch, generate, save.

    Args:
        args: Namespace providing ``audio_path``, ``chord_path``,
            ``midi_path``, ``offset``, ``prompt_path``, ``model_path`` and
            ``output_folder``.
    """
    drums_rvq, midi, chord = load_data(
        audio_path=args.audio_path,
        chord_path=args.chord_path,
        midi_path=args.midi_path,
        offset=args.offset,
    )
    cond_mask, names = generate_mask(drums_rvq.shape[-1])
    # Only the first entry returned by read_lst is used as the prompt.
    prompt = read_lst(args.prompt_path)[0]
    batch = wrap_batch(drums_rvq, midi, chord, cond_mask, prompt)
    pred = generate(model_path=args.model_path, batch=batch)
    save_pred(output_folder=args.output_folder, tags=names, pred=pred)

from types import SimpleNamespace
# Settings for the chord/midi/drums inference run. `generate` reads
# `num_layers` / `latent_dim` from this module-level namespace.
args = SimpleNamespace(
    num_layers=48,
    latent_dim=14,
    output_folder="/l/users/fathinah.izzati/coco-mulla-repo/demo/output",
    model_path="/l/users/fathinah.izzati/coco-mulla-repo/diff_9_end.pth",
    audio_path="/l/users/fathinah.izzati/coco-mulla-repo/demo/input/let_it_be/let_it_be.flac",
    prompt_path="/l/users/fathinah.izzati/coco-mulla-repo/demo/input/let_it_be/let_it_be.prompt.txt",
    chord_path="/l/users/fathinah.izzati/coco-mulla-repo/demo/input/let_it_be/let_it_be.flac.chord.lab",
    midi_path="/l/users/fathinah.izzati/coco-mulla-repo/demo/input/let_it_be/let_it_be.mid.piano.mid",
    drums_path=None,
    offset=0,
)
inference(args)

## Apply to training

In [None]:
import argparse
from torch.utils.tensorboard import SummaryWriter

import torch.distributed as dist
from torch.multiprocessing import spawn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
from coco_mulla.utilities.trainer_utils import Trainer

import torch
import torch.nn as nn
import os
from config import TrainCfg
import numpy as np

# Set before the model/data imports below so the variable is already in the
# environment when they load; presumably this silences the tokenizers
# library's parallelism warning — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from tqdm import tqdm

from coco_mulla.data_loader.dataset_sampler import Dataset, collate_fn
from coco_mulla.models import CoCoMulla

# Device used by the single-process training cell further down.
device = "cuda"
# Number of worker processes `main` spawns (used as the world size).
N_GPUS = 1


def _get_free_port():
    import socketserver
    with socketserver.TCPServer(('localhost', 0), None) as s:
        return s.server_address[1]



def get_dataset(dataset_split, sampling_strategy, sampling_prob, rid=0):
    """Create the training dataset and its DataLoader.

    Args:
        dataset_split: Path of the list file; passed to ``Dataset`` as the
            only entry of ``path_lst``.
        sampling_strategy: Forwarded to ``Dataset``.
        sampling_prob: Forwarded to ``Dataset``.
        rid: Replica/rank id forwarded to ``Dataset``. Defaults to 0 for
            single-process use; ``train_dist`` passes its replica id.

    Returns:
        ``(dataset, dataloader)``. The loader uses ``TrainCfg.batch_size``,
        does not shuffle, runs in the main process and drops the last
        incomplete batch.
    """
    dataset = Dataset(
        rid=rid,
        path_lst=[dataset_split],
        sampling_prob=sampling_prob,
        sampling_strategy=sampling_strategy,
        cfg=TrainCfg)

    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=TrainCfg.batch_size,
        collate_fn=collate_fn,
        shuffle=False,
        num_workers=0,
        # sampler=DistributedSampler(dataset),
        pin_memory=True,
        drop_last=True)

    return dataset, dataloader


def train_dist(replica_id, replica_count, port, model_dir, args):
    """Per-process entry point for distributed training.

    Joins the NCCL process group, binds this process to GPU ``replica_id``,
    wraps the model in DistributedDataParallel and builds the dataset. The
    actual ``train`` call is currently commented out.

    Args:
        replica_id: Rank of this process; also the CUDA device index.
        replica_count: World size.
        port: Rendezvous port on localhost.
        model_dir: Output directory (only used by the disabled train call).
        args: Namespace with ``num_layers``, ``latent_dim``, ``dataset``,
            ``sampling_strategy``, ``sampling_prob_a`` and ``sampling_prob_b``.
    """
    print('masuk sini')
    os.environ.update({'MASTER_ADDR': 'localhost', 'MASTER_PORT': str(port)})
    dist.init_process_group('nccl', rank=replica_id, world_size=replica_count)
    device = torch.device('cuda', replica_id)
    print(device)
    torch.cuda.set_device(device)
    net = CoCoMulla(
        TrainCfg.sample_sec,
        num_layers=args.num_layers,
        latent_dim=args.latent_dim,
    )
    net = net.to(device)
    net.set_training()
    model = DDP(net, device_ids=[replica_id])
    dataset, dataloader = get_dataset(
        rid=replica_id,
        dataset_split=args.dataset,
        sampling_strategy=args.sampling_strategy,
        sampling_prob=[args.sampling_prob_a, args.sampling_prob_b],
    )

    # train(replica_id, model, dataset, dataloader, device, model_dir,
    #       args.learning_rate)


def loss_fn(outputs, y):
    """Cross-entropy between the masked logits and the masked targets.

    Args:
        outputs: Object exposing ``logits`` (class scores on the last axis)
            and ``mask`` (boolean tensor selecting the positions to score).
        y: Integer class targets, indexed with the same mask.

    Returns:
        Scalar loss tensor from ``nn.CrossEntropyLoss`` (mean over the
        selected positions).
    """
    # Number of classes is taken from the logits instead of being hard-coded
    # to 2048, so other codebook sizes work as well.
    num_classes = outputs.logits.shape[-1]
    mask = outputs.mask
    logits = outputs.logits[mask].view(-1, num_classes)
    targets = y[mask]
    return nn.CrossEntropyLoss()(logits, targets)


def train(model, dataset, dataloader, device, model_dir, learning_rate):
    """Train ``model`` for ``TrainCfg.epoch`` epochs on video-conditioned data.

    Each step runs the model on ``music`` / ``video`` / ``cond_mask`` /
    ``desc``, scores it with ``loss_fn`` against the music tokens and calls
    ``trainer.step``. Per-step scalars and the per-epoch mean loss are written
    to TensorBoard in ``model_dir``; weights are saved there after every epoch
    as ``diff_{epoch}_end.pth``.

    Args:
        model: Model exposing ``save_weights``.
        dataset: Dataset exposing ``reset_random_seed(seed, epoch)``.
        dataloader: Loader yielding dict batches with keys ``desc``,
            ``music``, ``video`` and ``cond_mask``.
        device: Device the model and batches are moved to.
        model_dir: Directory for TensorBoard logs and checkpoints.
        learning_rate: Learning rate handed to ``Trainer``.

    Raises:
        ValueError: If ``dataloader`` yields no batches (previously this
            surfaced as a ZeroDivisionError at the end of the first epoch).
    """
    # optimizer and lr scheduler
    num_steps = len(dataloader)
    if num_steps == 0:
        raise ValueError(
            "dataloader yields no batches; check the dataset list and that "
            "batch_size does not exceed the dataset size (drop_last is on)")
    epochs = TrainCfg.epoch
    rng = np.random.RandomState(569)
    writer = SummaryWriter(model_dir, flush_secs=20)

    try:
        trainer = Trainer(params=model.parameters(), lr=learning_rate, num_epochs=epochs, num_steps=num_steps)

        model = model.to(device)
        step = 0
        for e in range(0, epochs):
            mean_loss = 0
            n_element = 0
            model.train()

            dl = tqdm(dataloader, desc=f"Epoch {e}")
            # Re-seed the dataset's sampling every epoch, reproducibly.
            r = rng.randint(0, 233333)
            dataset.reset_random_seed(r, e)
            for batch in dl:
                desc = batch["desc"]
                # Keep only the first 250 frames of the target tokens;
                # presumably to match the video feature length — TODO confirm.
                music = batch["music"].to(device).long()[:, :, :250]
                video = batch["video"].to(device).long()
                cond_mask = batch["cond_mask"].to(device).long()

                batch_1 = {
                    "music": music,
                    "video": video,
                    "cond_mask": cond_mask,
                    "desc": desc,
                }
                # with autocast:
                outputs = model(**batch_1)
                r_loss = loss_fn(outputs, music)

                grad_1, lr_1 = trainer.step(r_loss, model.parameters())

                step += 1
                n_element += 1
                writer.add_scalar("r_loss", r_loss.item(), step)
                writer.add_scalar("grad_1", grad_1, step)
                writer.add_scalar("lr_1", lr_1, step)

                mean_loss += r_loss.item()

            mean_loss = mean_loss / n_element
            with torch.no_grad():
                writer.add_scalar('train/mean_loss', mean_loss, step)
                model.save_weights(os.path.join(model_dir, f"diff_{e}_end.pth"))
    finally:
        # Flush and release the event file even if training aborts.
        writer.close()


def main(args):
    """Create the experiment directory and spawn the training processes.

    Args:
        args: Namespace with ``experiment_folder`` and ``experiment_name``
            plus everything ``train_dist`` reads.
    """
    experiment_folder = args.experiment_folder
    experiment_name = args.experiment_name

    # makedirs(exist_ok=True) also creates missing parents and does not fail
    # if another process creates the directory between a check and the mkdir.
    os.makedirs(experiment_folder, exist_ok=True)
    model_dir = os.path.join(experiment_folder, experiment_name)
    os.makedirs(model_dir, exist_ok=True)
    world_size = N_GPUS
    port = _get_free_port()
    # spawn passes the process index as the first argument of train_dist.
    spawn(train_dist, args=(world_size, port, model_dir, args), nprocs=world_size, join=True)


2024-12-09 16:29:36.257278: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-09 16:29:41.294273: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


ModuleNotFoundError: No module named 'flashy'

Training adoption steps

1. Change config
2. Modify the data structure: move each segment into a separate folder
3. Change test.lst


In [None]:
from types import SimpleNamespace
# Run configuration for the video-conditioned training experiment.
args = {
    "num_layers": 48,
    "latent_dim": 14,
    "experiment_folder": "/l/users/fathinah.izzati/coco-mulla-repo/expe",
    "experiment_name": "experiment_tnj_4",
    "prompt_path": "/l/users/fathinah.izzati/coco-mulla-repo/demo/input/tnj/tnj.prompt.txt",
    'sampling_strategy':'prob-based',
    "dataset": '/l/users/fathinah.izzati/coco-mulla-repo/train.lst',
    'learning_rate':0.05

}
args = SimpleNamespace(**args)

# Create <experiment_folder>/<experiment_name>. makedirs(exist_ok=True) also
# creates missing parent directories and tolerates an existing directory.
experiment_folder = args.experiment_folder
experiment_name = args.experiment_name
os.makedirs(experiment_folder, exist_ok=True)
model_dir = os.path.join(experiment_folder, experiment_name)
os.makedirs(model_dir, exist_ok=True)

# Single-process training: build the data pipeline and the model, then train.
dataset, dataloader = get_dataset(
        dataset_split=args.dataset,
        sampling_strategy=args.sampling_strategy,
        sampling_prob=None
    )
model = CoCoMulla(TrainCfg.sample_sec, num_layers=args.num_layers, latent_dim=args.latent_dim).to(device)
model.set_training()
train(model, dataset, dataloader, device, model_dir, args.learning_rate)


# target: music [1, 4, 500], video [1, 500, 512] like chords
### original
### music shape
### 1. drums before torch.Size([1, 4, 1001])
### drums after self.encodec_emb torch.Size([1, 1001, 12])
###  2. chords shape torch.Size([1, 1001, 37])
### cond concat on the three torch.Size([1, 1001, 61])
### mask embed per layer torch.Size([1, 1001, 61])
# cond_mask torch.Size([1, 1001, 61])
# Inside CPTransfoermer forward
# torch.Size([1, 4, 1000])


## adapter
## video video.shape b4 torch.Size([1, 14, 251])
## video after self.encodec_emb torch.Size([1, 251, 14])
# cond_mask.shape torch.Size([1, 251, 14])
# mask embedding per layer torch.Size([1, 251, 14])
## music torch.Size([1, 4, 251])



0 /l/users/fathinah.izzati/coco-mulla-repo/train.lst


FileNotFoundError: [Errno 2] No such file or directory: '/l/users/fathinah.izzati/coco-mulla-repo/demo/training_input/tom_and_jerry_train/segment_001/video_emb.npy'

In [1]:

# from types import SimpleNamespace
# from train import train_dist

# if __name__ == "__main__":
#     args = {
#     "num_layers": 48,
#     "latent_dim": 12,
#     "experiment_folder": "/l/users/fathinah.izzati/coco-mulla-repo/expe",
#     "experiment_name": "experiment_1",
#     "prompt_path": "/l/users/fathinah.izzati/coco-mulla-repo/demo/input/let_it_be.prompt.txt",
#     'sampling_strategy':'prob-based',
#     "dataset": 0,
#     'learning_rate':0.1,
#     'sampling_prob_a':0.2,
#     'sampling_prob_b':0.8

#     }
#     args = SimpleNamespace(**args)
#     main(args)

############

import argparse
from torch.utils.tensorboard import SummaryWriter

import torch.distributed as dist
from torch.multiprocessing import spawn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
from coco_mulla.utilities.trainer_utils import Trainer

import torch
import torch.nn as nn
import os
import numpy as np

# Set before the model/data imports below so the variable is already in the
# environment when they load; presumably this silences the tokenizers
# library's parallelism warning — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from tqdm import tqdm

from coco_mulla.models import CoCoMullaTmp
from coco_mulla.data_loader.dataset_sampler_tmp import Dataset, collate_fn
from types import SimpleNamespace
from config_tmp import TrainCfg


def _get_free_port():
    import socketserver
    with socketserver.TCPServer(('localhost', 0), None) as s:
        return s.server_address[1]



def get_dataset(dataset_split, sampling_strategy, sampling_prob, rid=0):
    """Create the training dataset and its DataLoader.

    Args:
        dataset_split: Path of the list file; passed to ``Dataset`` as the
            only entry of ``path_lst``.
        sampling_strategy: Forwarded to ``Dataset``.
        sampling_prob: Forwarded to ``Dataset``.
        rid: Replica/rank id forwarded to ``Dataset``. Defaults to 0 for
            single-process use; ``train_dist`` passes its replica id.

    Returns:
        ``(dataset, dataloader)``. The loader uses ``TrainCfg.batch_size``,
        does not shuffle, runs in the main process and drops the last
        incomplete batch.
    """
    dataset = Dataset(
        rid=rid,
        path_lst=[dataset_split],
        sampling_prob=sampling_prob,
        sampling_strategy=sampling_strategy,
        cfg=TrainCfg)

    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=TrainCfg.batch_size,
        collate_fn=collate_fn,
        shuffle=False,
        num_workers=0,
        # sampler=DistributedSampler(dataset),
        pin_memory=True,
        drop_last=True)

    return dataset, dataloader


def train_dist(replica_id, replica_count, port, model_dir, args):
    """Per-process entry point for distributed training.

    Joins the NCCL process group, binds this process to GPU ``replica_id``,
    wraps the model in DistributedDataParallel and builds the dataset. The
    actual ``train`` call is currently commented out.

    Args:
        replica_id: Rank of this process; also the CUDA device index.
        replica_count: World size.
        port: Rendezvous port on localhost.
        model_dir: Output directory (only used by the disabled train call).
        args: Namespace with ``num_layers``, ``latent_dim``, ``dataset``,
            ``sampling_strategy``, ``sampling_prob_a`` and ``sampling_prob_b``.
    """
    print('masuk sini')
    os.environ.update({'MASTER_ADDR': 'localhost', 'MASTER_PORT': str(port)})
    dist.init_process_group('nccl', rank=replica_id, world_size=replica_count)
    device = torch.device('cuda', replica_id)
    print(device)
    torch.cuda.set_device(device)
    net = CoCoMullaTmp(
        TrainCfg.sample_sec,
        num_layers=args.num_layers,
        latent_dim=args.latent_dim,
    )
    net = net.to(device)
    net.set_training()
    model = DDP(net, device_ids=[replica_id])
    dataset, dataloader = get_dataset(
        rid=replica_id,
        dataset_split=args.dataset,
        sampling_strategy=args.sampling_strategy,
        sampling_prob=[args.sampling_prob_a, args.sampling_prob_b],
    )

    # train(replica_id, model, dataset, dataloader, device, model_dir,
    #       args.learning_rate)


def loss_fn(outputs, y):
    """Cross-entropy between the masked logits and the masked targets.

    Args:
        outputs: Object exposing ``logits`` (class scores on the last axis)
            and ``mask`` (boolean tensor selecting the positions to score).
        y: Integer class targets, indexed with the same mask.

    Returns:
        Scalar loss tensor from ``nn.CrossEntropyLoss`` (mean over the
        selected positions).
    """
    # Number of classes is taken from the logits instead of being hard-coded
    # to 2048, so other codebook sizes work as well.
    num_classes = outputs.logits.shape[-1]
    mask = outputs.mask
    logits = outputs.logits[mask].view(-1, num_classes)
    targets = y[mask]
    return nn.CrossEntropyLoss()(logits, targets)


def train(model, dataset, dataloader, device, model_dir, learning_rate):
    """Train ``model`` for ``TrainCfg.epoch`` epochs on drums/chords/piano-roll data.

    Each step runs the model on ``seq`` / ``drums`` / ``chords`` /
    ``piano_roll`` / ``cond_mask`` / ``desc``, scores it with ``loss_fn``
    against the mix tokens and calls ``trainer.step``. Per-step scalars and
    the per-epoch mean loss are written to TensorBoard in ``model_dir``;
    weights are saved there after every epoch as ``diff_{epoch}_end.pth``.

    Args:
        model: Model exposing ``save_weights``.
        dataset: Dataset exposing ``reset_random_seed(seed, epoch)``.
        dataloader: Loader yielding dict batches with keys ``desc``, ``mix``,
            ``drums``, ``chords``, ``piano_roll`` and ``cond_mask``.
        device: Device the model and batches are moved to.
        model_dir: Directory for TensorBoard logs and checkpoints.
        learning_rate: Learning rate handed to ``Trainer``.

    Raises:
        ValueError: If ``dataloader`` yields no batches (previously this
            surfaced as a ZeroDivisionError at the end of the first epoch).
    """
    # optimizer and lr scheduler
    num_steps = len(dataloader)
    if num_steps == 0:
        raise ValueError(
            "dataloader yields no batches; check the dataset list and that "
            "batch_size does not exceed the dataset size (drop_last is on)")
    epochs = TrainCfg.epoch
    rng = np.random.RandomState(569)
    writer = SummaryWriter(model_dir, flush_secs=20)

    try:
        trainer = Trainer(params=model.parameters(), lr=learning_rate, num_epochs=epochs, num_steps=num_steps)

        model = model.to(device)
        step = 0
        for e in range(0, epochs):
            mean_loss = 0
            n_element = 0
            model.train()

            dl = tqdm(dataloader, desc=f"Epoch {e}")
            # Re-seed the dataset's sampling every epoch, reproducibly.
            r = rng.randint(0, 233333)
            dataset.reset_random_seed(r, e)
            for batch in dl:
                desc = batch["desc"]
                mix = batch["mix"].to(device).long()
                drums = batch["drums"].to(device).long()
                chords = batch["chords"].to(device).float()
                piano_roll = batch["piano_roll"].to(device).float()
                cond_mask = batch["cond_mask"].to(device).long()

                batch_1 = {
                    "seq": mix,
                    "drums": drums,
                    "chords": chords,
                    "piano_roll": piano_roll,
                    "cond_mask": cond_mask,
                    "desc": desc,
                }
                # with autocast:
                outputs = model(**batch_1)
                # `mix` is already a long tensor, so no further cast is needed.
                r_loss = loss_fn(outputs, mix)

                grad_1, lr_1 = trainer.step(r_loss, model.parameters())

                step += 1
                n_element += 1
                writer.add_scalar("r_loss", r_loss.item(), step)
                writer.add_scalar("grad_1", grad_1, step)
                writer.add_scalar("lr_1", lr_1, step)

                mean_loss += r_loss.item()

            mean_loss = mean_loss / n_element
            with torch.no_grad():
                writer.add_scalar('train/mean_loss', mean_loss, step)
                model.save_weights(os.path.join(model_dir, f"diff_{e}_end.pth"))
    finally:
        # Flush and release the event file even if training aborts.
        writer.close()


def main(args):
    """Create the experiment directory and spawn the training processes.

    Args:
        args: Namespace with ``experiment_folder`` and ``experiment_name``
            plus everything ``train_dist`` reads.
    """
    experiment_folder = args.experiment_folder
    experiment_name = args.experiment_name

    # makedirs(exist_ok=True) also creates missing parents and does not fail
    # if another process creates the directory between a check and the mkdir.
    os.makedirs(experiment_folder, exist_ok=True)
    model_dir = os.path.join(experiment_folder, experiment_name)
    os.makedirs(model_dir, exist_ok=True)
    # NOTE(review): N_GPUS is not assigned anywhere in this cell; it must
    # already exist in the kernel namespace when main() is called.
    world_size = N_GPUS
    port = _get_free_port()
    # spawn passes the process index as the first argument of train_dist.
    spawn(train_dist, args=(world_size, port, model_dir, args), nprocs=world_size, join=True)

# Run configuration for the original (drums/chords/piano-roll) training run.
args = {
    "num_layers": 48,
    "latent_dim": 12,
    "experiment_folder": "/l/users/fathinah.izzati/coco-mulla-repo/expe",
    "experiment_name": "experiment_5",
    "prompt_path": "/l/users/fathinah.izzati/coco-mulla-repo/demo/input/let_it_be.prompt.txt",
    'sampling_strategy':'prob-based',
    "dataset": '/l/users/fathinah.izzati/coco-mulla-repo/train_ori.lst',
    'learning_rate':0.1

}
args = SimpleNamespace(**args)

# Create <experiment_folder>/<experiment_name>. makedirs(exist_ok=True) also
# creates missing parent directories and tolerates an existing directory.
experiment_folder = args.experiment_folder
experiment_name = args.experiment_name
os.makedirs(experiment_folder, exist_ok=True)
model_dir = os.path.join(experiment_folder, experiment_name)
os.makedirs(model_dir, exist_ok=True)

# Build the data pipeline; the model and the training call live in later cells.
dataset, dataloader = get_dataset(
    dataset_split=args.dataset,
    sampling_strategy=args.sampling_strategy,
    sampling_prob=None
)


  from .autonotebook import tqdm as notebook_tqdm


num of files 104
samling strategy prob-based [0.0, 0.8]


In [2]:
# Sanity check: fetch one dataset item and print the shape of each field.
sample = dataset[1]
for field in ("mix", "chords", "piano_roll", "drums", "cond_mask"):
    print(sample[field].shape)

chords before (11718, 36)
chords after (11718, 36)
(4, 1000)
(1001, 37)
(1001, 128)
(4, 1001)
(2, 1001)


In [3]:
# Device used for the model and the training batches in the next cell.
device = "cuda"

In [4]:
# Build the model, switch it to its training configuration and run training.
model = CoCoMullaTmp(
    TrainCfg.sample_sec,
    num_layers=args.num_layers,
    latent_dim=args.latent_dim,
).to(device)
model.set_training()
train(
    model=model,
    dataset=dataset,
    dataloader=dataloader,
    device=device,
    model_dir=model_dir,
    learning_rate=args.learning_rate,
)

masuk cocomullatmp
CondMusicgen initiated




load....musicgen bk
lm_bk, here
CPTransformer Initiated, tbc stride
latent_dim 12
cond_dim 61
cp_transformer.masked_embedding
cp_transformer.pos_emb
cp_transformer.gates
cp_transformer.encodec_emb.weight
cp_transformer.merge_linear.0.weight
cp_transformer.merge_linear.1.weight
cp_transformer.merge_linear.2.weight
cp_transformer.merge_linear.3.weight
cp_transformer.merge_linear.4.weight
cp_transformer.merge_linear.5.weight
cp_transformer.merge_linear.6.weight
cp_transformer.merge_linear.7.weight
cp_transformer.merge_linear.8.weight
cp_transformer.merge_linear.9.weight
cp_transformer.merge_linear.10.weight
cp_transformer.merge_linear.11.weight
cp_transformer.merge_linear.12.weight
cp_transformer.merge_linear.13.weight
cp_transformer.merge_linear.14.weight
cp_transformer.merge_linear.15.weight
cp_transformer.merge_linear.16.weight
cp_transformer.merge_linear.17.weight
cp_transformer.merge_linear.18.weight
cp_transformer.merge_linear.19.weight
cp_transformer.merge_linear.20.weight
cp_trans

Epoch 0:   0%|          | 0/104 [00:00<?, ?it/s]

chords before (11718, 36)
chords after (11718, 36)
Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 10

Epoch 0:   1%|          | 1/104 [00:01<02:25,  1.41s/it]

chords before (11718, 36)
chords after (11718, 36)
Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 10

Epoch 0:   2%|▏         | 2/104 [00:02<01:41,  1.00it/s]

chords before (11718, 36)
chords after (11718, 36)
Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 10

Epoch 0:   3%|▎         | 3/104 [00:02<01:25,  1.18it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:   4%|▍         | 4/104 [00:03<01:17,  1.29it/s]

chords before (11718, 36)
chords after (11718, 36)
Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 10

Epoch 0:   5%|▍         | 5/104 [00:04<01:13,  1.36it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:   6%|▌         | 6/104 [00:04<01:09,  1.40it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:   7%|▋         | 7/104 [00:05<01:07,  1.43it/s]

chords before (11718, 36)
chords after (11718, 36)
Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 10

Epoch 0:   8%|▊         | 8/104 [00:06<01:06,  1.45it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:   9%|▊         | 9/104 [00:06<01:04,  1.47it/s]

chords before (11718, 36)
chords after (11718, 36)
Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 10

Epoch 0:  10%|▉         | 10/104 [00:07<01:03,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  11%|█         | 11/104 [00:08<01:02,  1.48it/s]

chords before (11718, 36)
chords after (11718, 36)
Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 10

Epoch 0:  12%|█▏        | 12/104 [00:08<01:01,  1.49it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  12%|█▎        | 13/104 [00:09<01:01,  1.49it/s]

chords before (11718, 36)
chords after (11718, 36)
Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 10

Epoch 0:  13%|█▎        | 14/104 [00:10<01:00,  1.49it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  14%|█▍        | 15/104 [00:10<00:59,  1.49it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  15%|█▌        | 16/104 [00:11<00:59,  1.49it/s]

chords before (11718, 36)
chords after (11718, 36)
Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 10

Epoch 0:  16%|█▋        | 17/104 [00:12<00:59,  1.45it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  17%|█▋        | 18/104 [00:12<00:59,  1.46it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  18%|█▊        | 19/104 [00:13<01:00,  1.40it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  19%|█▉        | 20/104 [00:14<01:01,  1.37it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  20%|██        | 21/104 [00:15<00:58,  1.41it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  21%|██        | 22/104 [00:15<00:59,  1.38it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  22%|██▏       | 23/104 [00:16<01:00,  1.34it/s]

chords before (11718, 36)
chords after (11718, 36)
Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 10

Epoch 0:  23%|██▎       | 24/104 [00:17<00:58,  1.38it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  24%|██▍       | 25/104 [00:18<00:56,  1.41it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  25%|██▌       | 26/104 [00:18<00:54,  1.43it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  26%|██▌       | 27/104 [00:19<00:53,  1.45it/s]

chords before (11718, 36)
chords after (11718, 36)
Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 10

Epoch 0:  27%|██▋       | 28/104 [00:19<00:51,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  28%|██▊       | 29/104 [00:20<00:50,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  29%|██▉       | 30/104 [00:21<00:49,  1.48it/s]

chords before (11718, 36)
chords after (11718, 36)
Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 10

Epoch 0:  30%|██▉       | 31/104 [00:22<00:49,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  31%|███       | 32/104 [00:22<00:48,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  32%|███▏      | 33/104 [00:23<00:47,  1.49it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  33%|███▎      | 34/104 [00:24<00:46,  1.49it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  34%|███▎      | 35/104 [00:24<00:45,  1.50it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  35%|███▍      | 36/104 [00:25<00:45,  1.50it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  36%|███▌      | 37/104 [00:26<00:44,  1.50it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  37%|███▋      | 38/104 [00:26<00:44,  1.49it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  38%|███▊      | 39/104 [00:27<00:43,  1.49it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  38%|███▊      | 40/104 [00:28<00:43,  1.49it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  39%|███▉      | 41/104 [00:28<00:42,  1.49it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  40%|████      | 42/104 [00:29<00:41,  1.50it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  41%|████▏     | 43/104 [00:30<00:40,  1.49it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  42%|████▏     | 44/104 [00:30<00:40,  1.49it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  43%|████▎     | 45/104 [00:31<00:39,  1.49it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  44%|████▍     | 46/104 [00:32<00:39,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  45%|████▌     | 47/104 [00:32<00:38,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  46%|████▌     | 48/104 [00:33<00:37,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  47%|████▋     | 49/104 [00:34<00:37,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  48%|████▊     | 50/104 [00:34<00:36,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  49%|████▉     | 51/104 [00:35<00:35,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  50%|█████     | 52/104 [00:36<00:35,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  51%|█████     | 53/104 [00:36<00:34,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  52%|█████▏    | 54/104 [00:37<00:33,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  53%|█████▎    | 55/104 [00:38<00:32,  1.49it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  54%|█████▍    | 56/104 [00:38<00:32,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  55%|█████▍    | 57/104 [00:39<00:31,  1.50it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  56%|█████▌    | 58/104 [00:40<00:30,  1.49it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  57%|█████▋    | 59/104 [00:40<00:30,  1.49it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  58%|█████▊    | 60/104 [00:41<00:29,  1.49it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  59%|█████▊    | 61/104 [00:42<00:28,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  60%|█████▉    | 62/104 [00:42<00:28,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  61%|██████    | 63/104 [00:43<00:27,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  62%|██████▏   | 64/104 [00:44<00:26,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  62%|██████▎   | 65/104 [00:44<00:26,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  63%|██████▎   | 66/104 [00:45<00:25,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  64%|██████▍   | 67/104 [00:46<00:25,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  65%|██████▌   | 68/104 [00:46<00:24,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  66%|██████▋   | 69/104 [00:47<00:23,  1.48it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  67%|██████▋   | 70/104 [00:48<00:23,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  68%|██████▊   | 71/104 [00:48<00:22,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  69%|██████▉   | 72/104 [00:49<00:21,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  70%|███████   | 73/104 [00:50<00:21,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  71%|███████   | 74/104 [00:50<00:20,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  72%|███████▏  | 75/104 [00:51<00:19,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  73%|███████▎  | 76/104 [00:52<00:19,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  74%|███████▍  | 77/104 [00:53<00:18,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  75%|███████▌  | 78/104 [00:53<00:17,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  76%|███████▌  | 79/104 [00:54<00:17,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  77%|███████▋  | 80/104 [00:55<00:16,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  78%|███████▊  | 81/104 [00:55<00:15,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  79%|███████▉  | 82/104 [00:56<00:14,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  80%|███████▉  | 83/104 [00:57<00:14,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  81%|████████  | 84/104 [00:57<00:13,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  82%|████████▏ | 85/104 [00:58<00:12,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  83%|████████▎ | 86/104 [00:59<00:12,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  84%|████████▎ | 87/104 [00:59<00:11,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  85%|████████▍ | 88/104 [01:00<00:10,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  86%|████████▌ | 89/104 [01:01<00:10,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  87%|████████▋ | 90/104 [01:01<00:09,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  88%|████████▊ | 91/104 [01:02<00:08,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  88%|████████▊ | 92/104 [01:03<00:08,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  89%|████████▉ | 93/104 [01:03<00:07,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  90%|█████████ | 94/104 [01:04<00:06,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  91%|█████████▏| 95/104 [01:05<00:06,  1.46it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  92%|█████████▏| 96/104 [01:05<00:05,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  93%|█████████▎| 97/104 [01:06<00:04,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  94%|█████████▍| 98/104 [01:07<00:04,  1.46it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  95%|█████████▌| 99/104 [01:08<00:03,  1.46it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  96%|█████████▌| 100/104 [01:08<00:02,  1.46it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  97%|█████████▋| 101/104 [01:09<00:02,  1.46it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  98%|█████████▊| 102/104 [01:10<00:01,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0:  99%|█████████▉| 103/104 [01:10<00:00,  1.47it/s]

Inside CPTransfoermer forward
drums before torch.Size([1, 4, 1001])
drums after self.encodec_emb torch.Size([1, 1001, 12])
r 12
chords shape torch.Size([1, 1001, 37])
torch.Size([1, 1001, 2048])
cond_mask torch.Size([1, 1001, 61])
self mask_embedding before torch.Size([48, 1001, 24])
mask_embedding after torch.Size([1, 48, 1001, 24])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
inside for
pr shape torch.Size([1, 1001, 12])
cond concat on the three torch.Size([1, 1001, 61])
mask embed per layer torch.Size([1, 1001, 61])
size cond_t torch.Size([1, 1001, 61])
embedding torch.Size([1, 1001, 2048])
insi

Epoch 0: 100%|██████████| 104/104 [01:11<00:00,  1.46it/s]
