In [1]:
%load_ext autoreload
%autoreload 2

device = 'cuda'

In [2]:

import torch 
from torch import nn
import torch.nn.functional as F
from torch.optim import Adam
from local_attention import LocalAttention
from pianogen.pe import binary_positional_encoding, sinusoidal_positional_encoding

class BinaryPositionalEncoding(nn.Module):
    '''
    Input: B, L (long)
    Output: B, L, D
    '''
    def __init__(self, dim:int, max_len:int):
        super().__init__()
        self.register_buffer('pos_encoding', binary_positional_encoding(max_len, dim).unsqueeze(0))

    def forward(self, pos: torch.Tensor):
        return torch.gather(self.pos_encoding.expand(pos.shape[0], -1, -1), 1, pos.unsqueeze(-1).expand(-1, -1, self.pos_encoding.shape[-1]))
    
class SinusoidalPositionalEncoding(nn.Module):
    '''
    Input: B, L (long)
    Output: B, L, D
    '''
    def __init__(self, dim:int, max_len:int):
        super().__init__()
        self.register_buffer('pos_encoding', sinusoidal_positional_encoding(max_len, dim).unsqueeze(0))

    def forward(self, pos: torch.Tensor):
        return torch.gather(self.pos_encoding.expand(pos.shape[0], -1, -1), 1, pos.unsqueeze(-1).expand(-1, -1, self.pos_encoding.shape[-1]))
    
class LocalMultiHeadAttention(nn.Module):
    '''
    Input: B, L, D
    Output: B, L, D
    '''
    def __init__(self, heads, dim, window_size, causal = False, dropout = 0.):
        super().__init__()
        assert dim % heads == 0, 'dimension must be divisible by number of heads'
        self.heads = heads
        self.to_qkv = nn.Linear(dim, dim * 3, bias = False)
        self.local_attn = LocalAttention(dim = dim // heads, window_size = window_size, causal = causal, dropout = dropout, autopad=True)

    def forward(self, x, mask = None):
        B, L, D = x.shape
        H = self.heads
        E = D // H

        qkv = self.to_qkv(x).chunk(3, dim = -1) # B, L, 3 * H, E
        q, k, v = map(lambda t: t.view(B, L, H, E).transpose(1, 2), qkv)

        out = self.local_attn(q, k, v, mask = mask)
        out = out.transpose(1, 2).reshape(B, L, D)
        return out

class LMHATransformerBlock(nn.Module):
    '''
    Input: B, L, D
    Output: B, L, D
    '''
    def __init__(self, dim, heads, window_size, dim_feedforward, dropout = 0., causal = False):
        super().__init__()
        self.attn = LocalMultiHeadAttention(heads = heads, dim = dim, window_size = window_size, dropout = dropout, causal = causal)
        self.ff = nn.Sequential(
            nn.Linear(dim, dim_feedforward),
            nn.LeakyReLU(),
            nn.Linear(dim_feedforward, dim)
        )
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask = None):
        x = x + self.dropout(self.attn(self.norm1(x), mask = mask))
        x = x + self.dropout(self.ff(self.norm2(x)))
        return x

class SelectiveAttnTransformer(nn.Module):
    '''
    Token level attention is too expensive to apply on the whole sequence. This module instead learns a regular attention mask with
    a downsampled sequence (segment level), then transform it into the mask for the token level attention, by sparsely select the
    most important segments (the selection is not differentiable though).

    As such, token level attention is only applied on the selected segments, which is much faster.

    Input: B, n_token, n_feature
    '''

    def __init__(self, vocab_size, segment_len, dim = 256):
        super().__init__()

        self.segment_len = segment_len
        self._downsampled_path_enabled = True

        self.binary_pe_dim = 5
        self.sinusoidal_pe_dim = 123
        self.token_embedding = nn.Embedding(vocab_size, dim)
        self.binary_pos_encoding = BinaryPositionalEncoding(self.binary_pe_dim, 10240)
        self.sinusoidal_pos_encoding = SinusoidalPositionalEncoding(self.sinusoidal_pe_dim, 10240)

        self.in_local_attention = nn.Sequential(*[
            LMHATransformerBlock(heads=8, dim=dim, window_size=200, causal=True, dropout=0.1, dim_feedforward=1024) for _ in range(2)
        ])
        self.downsample = nn.AvgPool1d(segment_len, stride=segment_len)
        self.transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=dim, nhead=8, dim_feedforward=1024, batch_first=True), num_layers=4)
        self.upsample = nn.Upsample(scale_factor=segment_len, mode='nearest')
        self.out_local_attention = nn.Sequential(*[
            LMHATransformerBlock(heads=8, dim=dim, window_size=200, causal=True, dropout=0.1, dim_feedforward=1024) for _ in range(2)
        ])
        self.out_linear = nn.Linear(dim, vocab_size)

    def forward(self, x, pos):
        B, L = x.shape
        # x: B, L
        # pos: B, L+1
        x = self.token_embedding(x)

        pe = torch.cat([
            self.binary_pos_encoding(pos),
            self.sinusoidal_pos_encoding(pos),
        ], dim=-1) # B, L+1, D/2

        pe = torch.cat([
            pe[:, :-1], # pe of the input tokens
            pe[:, 1:]   # pe of the target tokens
        ], dim=2) # B, L, D

        x = x + pe
        
        
        x = self.in_local_attention(x)
        before_down = x

        # before entering the downsampled path, we need to make L % segment_len == 0

        if L >= self.segment_len and self._downsampled_path_enabled: # if L < segment_len, we don't need to downsample

            x = x[:, :L - (L % self.segment_len)] # B, L - (L % segment_len), D

            x = self.downsample(x.transpose(1, 2)).transpose(1, 2)
            x = self.transformer(x)
            x = self.upsample(x.transpose(1, 2)).transpose(1, 2) # B, L - (L % segment_len), D

            # to avoid information leak, shift the data from the downsampled path right by segment_len-1
            x = F.pad(x, (0,0,self.segment_len-1, 0), 'constant', 0) # B, L+self.segment_len-1-(L % self.segment_len), D
            # crop redundant right-most stuff due to the right shift
            x = x[:, :L] # B, L, D
            # skip connection
            x = x + before_down

        x = self.out_local_attention(x)
        x = F.leaky_relu(x)
        x = self.out_linear(x)
        return x
    
    def set_downsampled_path_enabled(self, enabled):
        self._downsampled_path_enabled = enabled

class PianoRollGenerator(nn.Module):
    def __init__(self):
        super().__init__()
        self.in_linear = nn.Linear(200, 256)
        self.transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=256, nhead=8, dim_feedforward=1024, batch_first=True), num_layers=6)
        self.out_linear = nn.Linear(256, 121)

    def forward(self, x):
        x = self.in_linear(x)
        x = self.transformer(x, mask = nn.Transformer.generate_square_subsequent_mask(x.shape[1]).to(x.device), is_causal = True)
        x = self.out_linear(x)
        return x
        


In [7]:
from pianogen.dataset.pianorolldataset import PianoRollDataset
from pianogen.dataset.tokenized import TokenizedPianoRollDataset
from pianogen.tokenizer import PianoRollTokenizer
from torch.utils.data import DataLoader

pr_ds = PianoRollDataset(r'W:\music\music-data-analysis\data', max_duration=32*150) # 150 bars
tokenizer = PianoRollTokenizer(n_pitch=88, n_velocity=32, token_seq_len=10240+1)
ds = TokenizedPianoRollDataset(pr_ds, tokenizer)
dl = DataLoader(ds,batch_size=8, shuffle=True, num_workers=8)


# pr_ds = PianoRollDataset('data', segment_len=32*4, hop_len=32*4) # 4 bars
# tokenizer = PianoRollTokenizer(n_pitch=88, n_velocity=32, token_seq_len=600+1)
# ds = TokenizedPianoRollDataset(pr_ds, tokenizer)
# dl = DataLoader(ds,batch_size=32, shuffle=True, num_workers=8)

Loaded 2368 samples from 2570 songs


In [10]:
sample = pr_ds.get_sample(500)
sample.song.read_json("chords")[sample.start // 16 : sample.end // 16]

['Em',
 'Em7',
 'C',
 'G',
 'F',
 'C',
 'C',
 'C',
 'C',
 'Dm',
 'Dm',
 'A#M7',
 'C',
 'Dm7',
 'C',
 'Am',
 'Gm7',
 'C',
 'Dm',
 'A#M7',
 'C',
 'Dm7',
 'C',
 'Am',
 'D7',
 'Em7',
 'Gm7',
 'C',
 'A#M7',
 'F7',
 'A#M7',
 'F',
 'A#M7',
 'F',
 'A#M7',
 'Dm7',
 'Dm',
 'G7',
 'Am',
 'G',
 'C',
 'C',
 'Dm',
 'E',
 'Am',
 'G',
 'F',
 'E',
 'AM7',
 'Am',
 'F',
 'Dm',
 'Em',
 'E7',
 'Am',
 'G',
 'C',
 'C',
 'Dm',
 'E',
 'Am',
 'C',
 'F',
 'E',
 'Am',
 'Am',
 'F',
 'Dm',
 'E7',
 'E7',
 'C',
 'G',
 'F',
 'C',
 'C',
 'C',
 'Em',
 'Em7',
 'C',
 'G',
 'F',
 'C']

In [4]:

from tqdm import tqdm
from music_data_analysis.data import PianoRoll
from pianogen.gpu_temp_control import GPUTempControl

gpu_control = GPUTempControl(64,3)

tokenizer: PianoRollTokenizer
model: SelectiveAttnTransformer
def inference(file_path:str, length:int=512, prompt:PianoRoll|None=None, batch_size:int|None=None):
    model.eval()
    if prompt is None:
        tokens = [{'type':'start'}]
    else:
        tokens = ds.tokenizer.tokenize(prompt, pad=False)
        print('prompt:', tokens[:10])

    indices = tokenizer.vocab.tokens_to_indices(tokens)
    pos = tokenizer.get_frame_indices(tokens, infer_next_frame=True)
    indices = indices.unsqueeze(0).to(device)
    pos = pos.unsqueeze(0).to(device)

    last_token = tokens[-1]

    for _ in tqdm(range(length - len(tokens))):
        gpu_control.cooldown()

        logits = model(indices,pos).squeeze(0)[-1].detach().cpu()
        new_token = tokenizer.sample_from_logits(logits, last_token)
        tokens.append(new_token)
        last_token = new_token

        # update indices and pos

        new_token_idx = tokenizer.vocab.get_idx(new_token)
        indices = torch.cat([indices, torch.tensor([[new_token_idx]]).to(device)], dim=-1)
        if new_token['type'] == 'next_frame':
            new_pos = pos[0,-1] + 1
        else:
            new_pos = pos[0,-1]
        pos = torch.cat([pos, torch.tensor([[new_pos]]).to(device)], dim=-1)

        if new_token['type'] == 'end':
            break

    tokenizer.detokenize(tokens).to_midi(file_path)
    print('result:', tokens[:10])


In [5]:
model = SelectiveAttnTransformer(len(tokenizer.vocab),128,256)
crit = nn.CrossEntropyLoss(ignore_index=0)
opt = Adam(model.parameters(), lr=1e-3)
print('number of parameters:', sum(p.numel() for p in model.parameters())/1e6, 'M')

number of parameters: 6.115452 M


In [6]:
model.set_downsampled_path_enabled(True)

In [7]:
# load from checkpoint
from_epoch = 40
if from_epoch > 0:
    checkpoint = torch.load(f'checkpoint/{from_epoch}.pt')
    model.load_state_dict(checkpoint['model'])
    opt.load_state_dict(checkpoint['opt'])

In [17]:
# train
import time
from tqdm import tqdm

from pianogen import gpu_temp_control

temp_control = gpu_temp_control.GPUTempControl(64,3)


model.to(device)
crit.to(device)

model.train()

for epoch in range(from_epoch+1,100):
    tq = tqdm(dl)
    for i, batch in enumerate(tq):
        batch = {k:v.to(device) for k,v in batch.items()}
        opt.zero_grad()
        out = model(batch['indices'][:,:-1], batch['pos'])
        loss = crit((out+batch['output_mask']).transpose(1,2), batch['indices'][:,1:])
        loss.backward()
        opt.step()
        temp_control.cooldown()
        if i % 10 == 0:
            # print the loss to tqdm
            tq.set_postfix(batch = i, loss= loss.item(), gpu_temp=temp_control.get_temp())
                    
        if torch.isnan(loss):
            raise ValueError("Loss is NaN")
    
    inference(f'./output/output_{epoch}_{i}.mid', 1024)

    if epoch % 10 == 0:
        torch.save({'model':model.state_dict(), 'opt':opt.state_dict()}, f'checkpoint/{epoch}.pt')
    



100%|██████████| 300/300 [01:56<00:00,  2.56it/s, batch=290, gpu_temp=62, loss=1.13] 
100%|██████████| 1023/1023 [00:11<00:00, 87.53it/s]


result: [{'type': 'start'}, {'value': 12, 'type': 'pitch'}, {'value': 20, 'type': 'velocity'}, {'value': 24, 'type': 'pitch'}, {'value': 20, 'type': 'velocity'}, {'value': 31, 'type': 'pitch'}, {'value': 21, 'type': 'velocity'}, {'value': 36, 'type': 'pitch'}, {'value': 20, 'type': 'velocity'}, {'type': 'next_frame'}]


100%|██████████| 300/300 [02:01<00:00,  2.46it/s, batch=290, gpu_temp=57, loss=0.802]
100%|██████████| 1023/1023 [00:10<00:00, 95.38it/s]


result: [{'type': 'start'}, {'value': 12, 'type': 'pitch'}, {'value': 20, 'type': 'velocity'}, {'value': 19, 'type': 'pitch'}, {'value': 20, 'type': 'velocity'}, {'value': 24, 'type': 'pitch'}, {'value': 20, 'type': 'velocity'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}]


100%|██████████| 300/300 [02:03<00:00,  2.42it/s, batch=290, gpu_temp=57, loss=0.845]
100%|██████████| 1023/1023 [00:10<00:00, 97.47it/s]


result: [{'type': 'start'}, {'value': 8, 'type': 'pitch'}, {'value': 16, 'type': 'velocity'}, {'value': 20, 'type': 'pitch'}, {'value': 16, 'type': 'velocity'}, {'value': 36, 'type': 'pitch'}, {'value': 18, 'type': 'velocity'}, {'value': 39, 'type': 'pitch'}, {'value': 16, 'type': 'velocity'}, {'value': 43, 'type': 'pitch'}]


100%|██████████| 300/300 [02:05<00:00,  2.40it/s, batch=290, gpu_temp=62, loss=0.881]
100%|██████████| 1023/1023 [00:10<00:00, 95.30it/s]


result: [{'type': 'start'}, {'value': 27, 'type': 'pitch'}, {'value': 14, 'type': 'velocity'}, {'value': 31, 'type': 'pitch'}, {'value': 15, 'type': 'velocity'}, {'value': 34, 'type': 'pitch'}, {'value': 16, 'type': 'velocity'}, {'value': 39, 'type': 'pitch'}, {'value': 15, 'type': 'velocity'}, {'value': 46, 'type': 'pitch'}]


100%|██████████| 300/300 [02:09<00:00,  2.32it/s, batch=290, gpu_temp=62, loss=0.967]
100%|██████████| 1023/1023 [00:10<00:00, 94.38it/s]


result: [{'type': 'start'}, {'value': 32, 'type': 'pitch'}, {'value': 14, 'type': 'velocity'}, {'type': 'next_frame'}, {'value': 44, 'type': 'pitch'}, {'value': 12, 'type': 'velocity'}, {'value': 48, 'type': 'pitch'}, {'value': 16, 'type': 'velocity'}, {'value': 55, 'type': 'pitch'}, {'value': 15, 'type': 'velocity'}]


100%|██████████| 300/300 [02:13<00:00,  2.25it/s, batch=290, gpu_temp=61, loss=1.03] 
100%|██████████| 1023/1023 [00:11<00:00, 90.71it/s]


result: [{'type': 'start'}, {'value': 27, 'type': 'pitch'}, {'value': 21, 'type': 'velocity'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'value': 34, 'type': 'pitch'}, {'value': 20, 'type': 'velocity'}, {'type': 'next_frame'}]


100%|██████████| 300/300 [02:15<00:00,  2.21it/s, batch=290, gpu_temp=63, loss=1.16] 
100%|██████████| 1023/1023 [00:13<00:00, 78.46it/s]


result: [{'type': 'start'}, {'value': 12, 'type': 'pitch'}, {'value': 19, 'type': 'velocity'}, {'value': 24, 'type': 'pitch'}, {'value': 20, 'type': 'velocity'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'value': 24, 'type': 'pitch'}]


100%|██████████| 300/300 [02:22<00:00,  2.10it/s, batch=290, gpu_temp=63, loss=0.883]
100%|██████████| 1023/1023 [00:12<00:00, 82.75it/s]


result: [{'type': 'start'}, {'value': 32, 'type': 'pitch'}, {'value': 21, 'type': 'velocity'}, {'value': 55, 'type': 'pitch'}, {'value': 22, 'type': 'velocity'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'value': 36, 'type': 'pitch'}]


100%|██████████| 300/300 [02:22<00:00,  2.10it/s, batch=290, gpu_temp=63, loss=1.1]  
100%|██████████| 1023/1023 [00:12<00:00, 81.09it/s]


result: [{'type': 'start'}, {'value': 24, 'type': 'pitch'}, {'value': 10, 'type': 'velocity'}, {'value': 36, 'type': 'pitch'}, {'value': 10, 'type': 'velocity'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'value': 24, 'type': 'pitch'}]


100%|██████████| 300/300 [02:25<00:00,  2.06it/s, batch=290, gpu_temp=58, loss=0.805]
100%|██████████| 1023/1023 [00:12<00:00, 83.38it/s]


result: [{'type': 'start'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}]


100%|██████████| 300/300 [02:20<00:00,  2.13it/s, batch=290, gpu_temp=64, loss=0.98] 
100%|██████████| 1023/1023 [00:12<00:00, 83.60it/s]


result: [{'type': 'start'}, {'value': 22, 'type': 'pitch'}, {'value': 16, 'type': 'velocity'}, {'value': 38, 'type': 'pitch'}, {'value': 17, 'type': 'velocity'}, {'value': 41, 'type': 'pitch'}, {'value': 15, 'type': 'velocity'}, {'value': 46, 'type': 'pitch'}, {'value': 19, 'type': 'velocity'}, {'value': 50, 'type': 'pitch'}]


100%|██████████| 300/300 [02:21<00:00,  2.12it/s, batch=290, gpu_temp=58, loss=0.797]
100%|██████████| 1023/1023 [00:12<00:00, 82.62it/s]


result: [{'type': 'start'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'value': 27, 'type': 'pitch'}]


100%|██████████| 300/300 [02:21<00:00,  2.12it/s, batch=290, gpu_temp=62, loss=1.06] 
100%|██████████| 1023/1023 [00:12<00:00, 82.70it/s]


result: [{'type': 'start'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}]


100%|██████████| 300/300 [02:12<00:00,  2.26it/s, batch=290, gpu_temp=62, loss=1.07] 
100%|██████████| 1023/1023 [00:11<00:00, 89.64it/s]


result: [{'type': 'start'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}, {'type': 'next_frame'}]


 22%|██▏       | 65/300 [00:34<02:03,  1.90it/s, batch=60, gpu_temp=60, loss=0.884]


KeyboardInterrupt: 

In [41]:
#inference('output/test.mid', 512,pr_ds.get_piano_roll(53548).slice(0, 32))
from pianogen.data.pianoroll import PianoRoll
model.to(device)
inference('output/test.mid', 2000,PianoRoll.from_midi(r"W:\music\piano-music-gen\output\output_35_299.mid").slice(32*0,32*8))
#inference('output/test.mid', 4000,pr_ds.get_piano_roll(2150).slice(32*0,32*12))

None
prompt: [{'type': 'start'}, {'type': 'pitch', 'value': 32}, {'type': 'velocity', 'value': 14}, {'type': 'next_frame'}, {'type': 'pitch', 'value': 44}, {'type': 'velocity', 'value': 12}, {'type': 'pitch', 'value': 48}, {'type': 'velocity', 'value': 16}, {'type': 'pitch', 'value': 55}, {'type': 'velocity', 'value': 15}]


100%|██████████| 1610/1610 [00:19<00:00, 84.64it/s]

result: [{'type': 'start'}, {'type': 'pitch', 'value': 32}, {'type': 'velocity', 'value': 14}, {'type': 'next_frame'}, {'type': 'pitch', 'value': 44}, {'type': 'velocity', 'value': 12}, {'type': 'pitch', 'value': 48}, {'type': 'velocity', 'value': 16}, {'type': 'pitch', 'value': 55}, {'type': 'velocity', 'value': 15}]





In [67]:
handle.remove()
handle = model.in_local_attention.register_backward_hook(lambda m, g_in, g_out: print('in_local_attention', g_in[0].norm(dim=(0,2))))

In [66]:
inp = torch.randint(0,10,(1, 220),dtype=torch.long).to(device)
model(inp, torch.arange(220+1).unsqueeze(0).to(device))[0,50].norm().backward()

in_local_attention tensor([ 2.8973,  0.8830,  0.7586,  0.9899,  1.1048,  1.2402,  0.3792,  0.6488,
         1.6528,  2.7425,  0.4552,  1.5836,  1.0128,  4.0604,  0.6707,  2.0891,
         0.5262,  0.5684,  2.6314,  0.4023,  1.3787,  0.5873,  0.9673,  0.5430,
         1.6225,  0.3887,  3.4039,  1.1779,  2.0503,  1.2313,  0.4384,  1.0173,
         0.5497,  0.2947,  1.5697,  0.3889,  3.1795,  0.6917,  0.8305,  2.3320,
         0.7165,  4.5779,  4.4148,  2.6414,  3.2237,  3.0840,  1.4059,  0.7477,
         6.2199,  6.4681, 18.2421,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0

In [62]:
len([ 0.0193,  0.0189,  0.0729,  0.0536,  0.1458,  0.0867,  0.1135,  0.7866,
         0.1798,  0.0767,  0.0629,  0.0755,  0.1229,  0.1874,  0.6108,  0.1180,
         0.1461,  0.0984,  0.0650,  0.1149,  0.3938,  0.2112,  0.1433,  0.3247,
         0.1211,  0.1318,  0.2227,  0.1505,  0.3462,  0.1255,  0.0516,  0.2467,
         0.2044,  0.4414,  0.1183,  0.4642,  0.1468,  0.0673,  0.0795,  0.6533,
         0.2189,  0.1100,  0.1983,  0.2572,  0.3179,  0.1389,  0.6460,  0.0662,
         0.1327,  0.1566,  0.5608,  0.0544,  0.3800,  0.3710,  0.1071,  0.0769,
         0.1528,  0.2218,  0.0826,  0.0729,  0.0932,  0.1366,  0.2292,  0.3016,
         0.3495,  0.9662,  0.0683,  0.1068,  0.3928,  0.4896,  0.2337,  0.1346,
         0.8395,  0.2339,  0.1197,  0.9997,  0.2210,  0.3909,  0.1135,  0.1822,
         0.3711,  0.2440,  0.0881,  0.1852,  0.1072,  0.0451,  0.0846,  0.1209,
         0.1635,  0.4193,  0.0510,  0.0800,  0.1212,  0.2231,  0.1077,  0.1427,
         0.1932,  0.1705,  0.3363,  0.3363,  0.2017,  0.1830,  0.2525,  0.4927,
         0.6330,  0.5120,  0.2838,  1.1671,  0.7589,  1.1129,  1.9647,  1.1941,
         0.7201,  0.3712,  0.3980,  0.5747,  0.9515,  2.6499,  0.3535,  2.9585,
         0.5410,  0.7637,  0.5093,  1.4005,  3.1254,  2.2679,  1.5576,  1.8914,
         2.9836, 13.3045])

130

In [22]:
temp(torch.randn(10,10)).sum().backward()

temp 10.0


In [None]:
import torch
torch.cuda.is_available()

True

In [5]:
ds.ds.get_piano_roll(10).to_midi('output/test.mid')

ticks per beat: 480
max tick: 0
tempo changes: 1
time sig: 0
key sig: 0
markers: 0
lyrics: False
instruments: 1

In [37]:
for i in range(1330,3000):
    print(i)
    pr = ds.ds.get_piano_roll(i)
    l = []
    for i in range(5):
        l.append(pr.notes[i].pitch)
    if len( {36, 48, 52, 55, 62} - set(l)) < 2:
        break


1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366


In [38]:
l

[36, 43, 48, 52, 55]

In [32]:
pr.to_midi('output/test.mid')

ticks per beat: 480
max tick: 0
tempo changes: 1
time sig: 0
key sig: 0
markers: 0
lyrics: False
instruments: 1

In [39]:
import random


random.choice(['隨單','戰旗'])

'戰旗'