In [1]:

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.utils.data
import src.utils as utils
import src.representation as representation
import src.dataset as dataset
import src.music_x_transformers as music_x_transformers
import src.advUtils as advUtils
from torch import nn
import copy


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#load configurations
train_args = utils.load_json(".\pre_trained_models\mmt_sod_ape_training_logs.json")
encoding = representation.load_encoding("encoding.json")

sos = encoding["type_code_map"]["start-of-song"]
eos = encoding["type_code_map"]["end-of-song"]
beat_0 = encoding["beat_code_map"][0]
beat_4 = encoding["beat_code_map"][4]
beat_16 = encoding["beat_code_map"][16]

# Load training/testing/demo Data
data_set = advUtils.convert_extract_load(train_args,encoding, json_dir = "./data/test/json",repr_dir="./data/test/repr")

data_loader = torch.utils.data.DataLoader(
    data_set,
    shuffle=True,
    num_workers=1,
    collate_fn=dataset.MusicDataset.collate,
)

test_loader = data_loader
train_loader = data_loader
valid_loader = data_loader

In [36]:
# Load model
device = torch.device("cpu")
print(f"Creating the model...")
model = music_x_transformers.MusicXTransformer(
    dim=train_args["dim"],
    encoding=encoding,
    depth=train_args["layers"],
    heads=train_args["heads"],
    max_seq_len=train_args["max_seq_len"],
    max_beat=train_args["max_beat"],
    rotary_pos_emb=train_args["rel_pos_emb"],
    use_abs_pos_emb=train_args["abs_pos_emb"],
    emb_dropout=train_args["dropout"],
    attn_dropout=train_args["dropout"],
    ff_dropout=train_args["dropout"],
).to(device)

model.load_state_dict(torch.load("../pre_trained_models/TranferLearned.pt", map_location=device))

Creating the model...


<All keys matched successfully>

In [35]:
model.eval()

MusicXTransformer(
  (decoder): MusicAutoregressiveWrapper(
    (net): MusicTransformerWrapper(
      (token_emb): ModuleList(
        (0): TokenEmbedding(
          (emb): Embedding(5, 512)
        )
        (1): TokenEmbedding(
          (emb): Embedding(257, 512)
        )
        (2): TokenEmbedding(
          (emb): Embedding(13, 512)
        )
        (3): TokenEmbedding(
          (emb): Embedding(129, 512)
        )
        (4): TokenEmbedding(
          (emb): Embedding(33, 512)
        )
        (5): TokenEmbedding(
          (emb): Embedding(65, 512)
        )
      )
      (pos_emb): AbsolutePositionalEmbedding(
        (emb): Embedding(1024, 512)
      )
      (emb_dropout): Dropout(p=0.2, inplace=False)
      (project_emb): Identity()
      (attn_layers): Decoder(
        (layers): ModuleList(
          (0): ModuleList(
            (0): ModuleList(
              (0): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
              (1): None
              (2): None
    

In [32]:

class Adapter(nn.Module):
    """
    The adapters first project the original
    d-dimensional features into a smaller dimension, m, apply
    a nonlinearity, then project back to d dimensions.
    """
    def __init__(self, size = 1, model_dim = 1):
        super().__init__()
        self.adapter_block = nn.Sequential(
            nn.Linear(model_dim, size),
            nn.ReLU(),
            nn.Linear(size, model_dim)
        )

    def forward(self, x):

        ff_out = self.adapter_block(x)
        # Skip connection
        adapter_out = ff_out + x

        return adapter_out
    
class Adaptered(nn.Module):
    def __init__(self, orig_layer):
        super().__init__()
        self.orig_layer = orig_layer
        self.adapter = Adapter()

    def forward(self, *x):
        orig_out = self.orig_layer(*x)
        output = self.adapter.forward(orig_out[0].unsqueeze(0))[0]

        return output


In [33]:
smodel = music_x_transformers.MusicXTransformer(
    dim=train_args["dim"],
    encoding=encoding,
    depth=1,
    heads=train_args["heads"],
    max_seq_len=50,
    max_beat=train_args["max_beat"],
    rotary_pos_emb=train_args["rel_pos_emb"],
    use_abs_pos_emb=train_args["abs_pos_emb"],
    emb_dropout=train_args["dropout"],
    attn_dropout=train_args["dropout"],
    ff_dropout=train_args["dropout"],
).to(device)


In [34]:
#Applying Adapters
# Adapter model
#smodel=copy.deepcopy(model)
for i in range (0,len(smodel.decoder.net.attn_layers.layers)):
    smodel.decoder.net.attn_layers.layers[1][1]=Adaptered(smodel.decoder.net.attn_layers.layers[1][1])
smodel.state_dict()        

OrderedDict([('decoder.net.token_emb.0.emb.weight',
              tensor([[-0.0439, -0.0196, -0.0295,  ..., -0.0066,  0.0447,  0.1069],
                      [ 0.0420, -0.0388,  0.0729,  ...,  0.0420, -0.0550, -0.0572],
                      [-0.0469, -0.0630, -0.0076,  ...,  0.0310,  0.0314,  0.0062],
                      [ 0.0256,  0.0755,  0.0219,  ..., -0.0900,  0.0364, -0.1459],
                      [ 0.1592,  0.1332,  0.0025,  ...,  0.0783,  0.1121, -0.0762]])),
             ('decoder.net.token_emb.1.emb.weight',
              tensor([[ 0.0095, -0.0110,  0.0819,  ...,  0.0710, -0.0171,  0.0040],
                      [ 0.0809, -0.0916,  0.0653,  ...,  0.0990,  0.0136,  0.0673],
                      [ 0.0736,  0.0889, -0.0930,  ..., -0.0009,  0.0398, -0.0549],
                      ...,
                      [ 0.0306,  0.0332, -0.0249,  ...,  0.0843,  0.0102, -0.0121],
                      [-0.0288,  0.0407, -0.0359,  ..., -0.0993, -0.0673,  0.0151],
                      [-0.

In [38]:
advUtils.generate(2,"./samples/TransferLearned",model,test_loader,encoding,"cpu",seq_len=50,modes=["4_beat","16_beat"])

  0%|                                                     | 0/2 [00:00<?, ?it/s]

Generating based on ['anglebert_fugue_3_(c)mccoy']


 50%|██████████████████████▌                      | 1/2 [01:09<01:09, 69.68s/it]

Generating based on ['albinoni_sonate_da_chiesa_6_(c)icking-archive']


100%|█████████████████████████████████████████████| 2/2 [01:35<00:00, 47.84s/it]


In [28]:
advUtils.generate(2,"./samples2",smodel,test_loader,encoding,"cpu",seq_len=50,modes=["unconditioned","instrument_informed"])

100%|█████████████████████████████████████████████| 2/2 [00:09<00:00,  4.99s/it]

Generating based on ['anglebert_fugue_3_(c)mccoy']
Generating based on ['albinoni_sonate_da_chiesa_6_(c)icking-archive']



