In [1]:
import pathlib
import os

import random

import numpy as np

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision

In [2]:
def add_token(song, token, begin):
    """Attach one row filled with ``token`` to the start or the end of ``song``.

    Parameters
    ----------
    song : np.ndarray
        Array that gains one row along axis 0.
    token : scalar
        Value written into every cell of the new row.
    begin : bool
        ``True`` places the row before ``song``; anything that does not
        compare equal to ``True`` places it after.

    Returns
    -------
    np.ndarray
        A new array with one extra row. The token row is created as float64,
        so an integer ``song`` comes back as float64.
    """
    # Same trailing dimensions as the song, but a single row.
    row_shape = list(song.shape)
    row_shape[0] = 1
    token_row = np.ones(row_shape) * token

    pieces = (token_row, song) if begin == True else (song, token_row)
    return np.concatenate(pieces, axis=0)
    
def standardize_song(song, lenght):
    """Zero-pad ``song`` along axis 0 until it has exactly ``lenght`` rows.

    A song that already has more than ``lenght`` rows is handed back
    unchanged (the very same object; it is not truncated). Otherwise rows of
    zeros are appended. The padding is float64, so integer input comes back
    as float64.
    """
    n_rows = song.shape[0]
    if n_rows > lenght:
        return song

    # Padding block: the missing rows, with the song's trailing dimensions.
    pad_shape = list(song.shape)
    pad_shape[0] = lenght - n_rows
    padded = np.concatenate((song, np.zeros(pad_shape)), axis=0)
    assert padded.shape[0] == lenght
    return padded

def song_transform(song, max_lenght):
    """Encode one song as a fixed-length token array.

    Steps, in order:

    1. keep at most ``max_lenght`` rows of ``song``;
    2. add ``column_index * 100`` to every column (offsets 0, 100, 200, ...
       along the last axis) so each column gets its own value range;
    3. prepend a row of ``1`` (start token) and append a row of ``-1``
       (end token);
    4. zero-pad along axis 0 to ``max_lenght + 2`` rows.

    Parameters
    ----------
    song : array-like
        Pitch array; the first axis is the one that is truncated and padded.
    max_lenght : int
        Maximum number of rows kept from ``song``.

    Returns
    -------
    np.ndarray
        float64 array with ``max_lenght + 2`` rows.

    Notes
    -----
    The input is never modified. An in-place ``+=`` on the truncated slice
    (which is a view of the input) would write the offsets back into the
    caller's array, so a second call on the same song would add them again.
    """
    clipped = np.asarray(song)[:max_lenght]
    # Out-of-place addition: allocates a new array and leaves `song` intact.
    shifted = clipped + np.arange(clipped.shape[-1]) * 100
    shifted = add_token(shifted, 1, begin=True)
    shifted = add_token(shifted, -1, begin=False)
    return standardize_song(shifted, max_lenght + 2)

def batch_uniform(song_batch, max_lenght):
    """Encode every song of a batch and stack the results into one tensor.

    Each element of ``song_batch`` goes through ``song_transform`` with the
    given ``max_lenght``; the encoded songs are stacked along a new leading
    axis and wrapped as a ``torch.Tensor``.
    """
    encoded_songs = []
    for song in song_batch:
        encoded_songs.append(song_transform(song, max_lenght))
    stacked = np.stack(encoded_songs, axis=0)
    return torch.from_numpy(stacked)


def rand_song_generation(L_distribution, bs, max_lenght):
    """Build a batch of random songs encoded like the real ones.

    Parameters
    ----------
    L_distribution : array-like of int
        Pool of song lengths; one length is drawn uniformly per song.
    bs : int
        Number of songs in the batch.
    max_lenght : int
        Maximum song length forwarded to ``batch_uniform``.

    Returns
    -------
    torch.Tensor
        Whatever ``batch_uniform`` returns for ``bs`` random songs, each a
        ``(length, 4)`` integer array with values drawn from ``[36, 82)``.

    Notes
    -----
    The raw random songs are handed to ``batch_uniform``, which applies
    ``song_transform`` itself. Transforming them here as well would encode
    every song twice (second truncation, second offset, second pair of
    start/end tokens).
    """
    raw_songs = []
    for _ in range(bs):
        n_steps = np.random.choice(L_distribution)
        raw_songs.append(np.random.randint(36, 82, (n_steps, 4)))
    return batch_uniform(raw_songs, max_lenght)

In [3]:
class SongIterator():
    """Single-pass iterator that serves a song list as encoded batches.

    Parameters
    ----------
    song_list : iterable
        Songs to serve; copied into an internal list.
    batch_size : int
        Number of songs per batch; the last batch may be smaller.
    song_size : int
        Forwarded to ``batch_uniform`` as the maximum song length.
    shuffle : bool
        When truthy, the songs are served in a random order.

    Each ``next()`` call returns ``batch_uniform(batch, song_size)`` for the
    next slice of songs, starting with the first ``batch_size`` songs.
    Iteration stops once every song has been served or after ``MAX_STEPS``
    batches, whichever comes first.
    """

    # Hard upper bound on the number of batches served by one iterator.
    MAX_STEPS = 1000

    def __init__(self, song_list, batch_size, song_size, shuffle):
        self.step = 0
        self.batch_size = batch_size
        self.song_size = song_size
        songs = list(song_list)
        if shuffle:
            # random.sample returns a new list, leaving `songs` untouched.
            songs = random.sample(songs, len(songs))
        self.internal_song_list = songs

    def __iter__(self):
        return self

    def __next__(self):
        start = self.step * self.batch_size
        if self.step >= self.MAX_STEPS or start >= len(self.internal_song_list):
            raise StopIteration
        # Advance only after computing `start`, so step 0 (the first batch)
        # is served instead of being skipped.
        self.step += 1
        # Slicing past the end simply yields the shorter final batch.
        batch = self.internal_song_list[start:start + self.batch_size]
        return batch_uniform(batch, self.song_size)


In [4]:
# Load the real songs from ../data/js-fakes-16thSeparated.npz

# `_dh` is read from the notebook's globals (presumably IPython's directory
# history, whose first entry is the directory the kernel started in — confirm);
# the data folder is expected next to that directory's parent.
input_path = os.path.join(
    pathlib.Path(globals()['_dh'][0]).parent,
    "data",
    "js-fakes-16thSeparated.npz",
)
# NOTE(review): allow_pickle=True unpickles objects stored in the archive;
# only load files from a trusted source.
jsf = np.load(input_path, encoding='latin1', allow_pickle=True)

# One length per song, and the longest song in the data set.
len_seq = np.array(list(map(len, jsf['pitches'])))
max_song_length = len_seq.max()

#song_iterator = SongIterator(song_list=jsf['pitches'], batch_size=64, song_size=max_song_length)

# test iterator -> Now working
# for epoch in range(3):
#     for i, batch_song in enumerate(SongIterator(song_list=jsf['pitches'], batch_size=64, song_size=max_song_length, shuffle=True)):
#         print("Epoch = {}, step = {}, input type = {}, input shape = {}".format(epoch, i, type(batch_song), batch_song.shape))

In [5]:
# Smoke test: draw one batch of real songs and one batch of random songs.
real_batch_song = next(
    SongIterator(
        song_list=jsf['pitches'],
        batch_size=64,
        song_size=max_song_length,
        shuffle=True,
    )
)
fake_batch_song = rand_song_generation(
    L_distribution=len_seq,
    bs=64,
    max_lenght=max_song_length,
)


In [6]:
# TODO: Add key_padding_mask

class TransformerBlock(nn.Module):
    """Two self-attention stages followed by an MLP, applied to a 4-D tensor.

    ``forward`` takes ``x`` indexed as ``[L, bs, F, N]`` (layout taken from
    the notebook's own comments and smoke test) with
    ``F = num_heads_1 * emb_f`` and ``N = num_heads_2 * emb_notes``, and
    returns a tensor of the same shape.

    Parameters
    ----------
    num_heads_1, emb_f : int
        Head count and per-head width of the first attention (width ``F``).
    num_heads_2, emb_notes : int
        Head count and per-head width of the second attention (width ``N``).
    bs : int
        Batch size; it is part of every LayerNorm's ``normalized_shape``.

    NOTE(review): because ``bs`` is baked into the LayerNorm shapes, the
    block only accepts the batch size it was built with, and the
    normalisation statistics span the batch axis — confirm this is intended.
    """

    def __init__(self, num_heads_1, num_heads_2, bs, emb_notes, emb_f):
        super().__init__()
        feat_dim = num_heads_1 * emb_f      # F: width seen by the first attention
        note_dim = num_heads_2 * emb_notes  # N: width seen by the second attention

        # Stage 1 operates on [L, bs * N, F].
        self.ln1 = nn.LayerNorm([bs * note_dim, feat_dim])
        self.mha_f = nn.MultiheadAttention(feat_dim, num_heads_1, dropout=0.25)
        # Stage 2 operates on [L, bs * F, N].
        self.ln2 = nn.LayerNorm([bs * feat_dim, note_dim])
        self.mha_l = nn.MultiheadAttention(note_dim, num_heads_2, dropout=0.25)
        # Normalisation applied to the 4-D tensor before the MLP.
        self.ln3 = nn.LayerNorm([bs, feat_dim, note_dim])

        # Position-wise MLP over the last axis: Linear -> LayerNorm -> ELU -> Linear.
        self.mlp = nn.Sequential(
            nn.Linear(note_dim, note_dim),
            nn.LayerNorm([bs, feat_dim, note_dim]),
            nn.ELU(),
            nn.Linear(note_dim, note_dim),
        )

    def forward(self, x):
        seq_len, batch = x.shape[0], x.shape[1]
        feat_dim, note_dim = x.shape[2], x.shape[3]

        # Stage 1: fold the N axis into the batch axis -> [L, bs * N, F].
        by_feat = x.transpose(2, 3).reshape((seq_len, batch * note_dim, feat_dim))
        normed_feat = self.ln1(by_feat)
        # Only the attention output is used; the attention weights are dropped.
        attn_feat, _ = self.mha_f(normed_feat, normed_feat, normed_feat)
        by_feat = by_feat + attn_feat  # residual connection

        # Stage 2: undo the fold, then fold the F axis instead -> [L, bs * F, N].
        by_note = by_feat.reshape(seq_len, batch, note_dim, feat_dim).transpose(2, 3)
        by_note = by_note.reshape(seq_len, batch * feat_dim, note_dim)
        normed_note = self.ln2(by_note)
        attn_note, _ = self.mha_l(normed_note, normed_note, normed_note)

        # NOTE(review): both residuals below are taken from the block input `x`,
        # not from the stage-1 result or the pre-MLP value — confirm intended.
        hidden = x + attn_note.reshape(x.shape)
        hidden = self.mlp(self.ln3(hidden))
        return hidden + x

In [7]:
# Smoke test for TransformerBlock: the output shape must match the input shape.
bs, emb_notes, emb_f = 4, 32, 4
L = 150
num_heads_1 = num_heads_2 = 4
trans_block = TransformerBlock(num_heads_1, num_heads_2, bs, emb_notes, emb_f)

# Input layout: [L, bs, num_heads_1 * emb_f, num_heads_2 * emb_notes]
rand_input = torch.rand(L, bs, num_heads_1 * emb_f, num_heads_2 * emb_notes)
print(rand_input.shape)
print(trans_block(rand_input).shape)


torch.Size([150, 4, 16, 128])
torch.Size([150, 4, 16, 128])


In [8]:
class SinusoidalPosEmb(nn.Module):
    """Fixed sinusoidal position encoding, tiled to a 4-D tensor.

    For position ``pos`` along axis 0 of the input and encoding column ``j``:

    * even ``j``: ``sin(pos / 10000 ** (2 * (j // 2) / dim))``
    * odd  ``j``: ``cos(pos / 10000 ** (2 * (j // 2) / dim))``

    so each sin/cos pair (columns ``2i`` and ``2i + 1``) shares one frequency.
    Position 0 is kept as an all-zero vector (reserved for a padding/start
    position; what to do with it is still an open design question).

    Parameters
    ----------
    dim : int
        Size of the encoding (last axis of the output).
    """

    def __init__(self, dim):
        super(SinusoidalPosEmb, self).__init__()
        self.dim = dim

    def forward(self, x):
        """Return the encoding for ``x`` as a float32 tensor.

        Only the first three sizes of ``x`` are used: the output has shape
        ``(x.shape[0], x.shape[1], x.shape[2], dim)`` and the same encoding
        row is repeated across axes 1 and 2. When ``x`` is a tensor the
        result is placed on ``x``'s device.
        """
        n_pos, n_rep_1, n_rep_2 = x.shape[0], x.shape[1], x.shape[2]

        # angles[pos, j] = pos / 10000^(2 * (j // 2) / dim); `j // 2` makes the
        # sin column and the cos column of a pair use the same exponent.
        positions = np.arange(n_pos, dtype=np.float64)[:, np.newaxis]
        pair_index = np.arange(self.dim) // 2
        angles = positions / np.power(10000, 2 * pair_index / self.dim)

        # Row 0 stays zero; sin goes to even columns, cos to odd columns.
        position_enc = np.zeros((n_pos, self.dim))
        position_enc[1:, 0::2] = np.sin(angles[1:, 0::2])  # dim 2i
        position_enc[1:, 1::2] = np.cos(angles[1:, 1::2])  # dim 2i+1

        # (n_pos, dim) -> (n_pos, n_rep_1, n_rep_2, dim)
        position_enc = np.expand_dims(position_enc, axis=(1, 2))
        position_enc = np.repeat(position_enc, repeats=n_rep_1, axis=1)
        position_enc = np.repeat(position_enc, repeats=n_rep_2, axis=2)

        encoded = torch.from_numpy(position_enc).type(torch.FloatTensor)
        if isinstance(x, torch.Tensor):
            # Avoid a device mismatch when the caller adds this to x-derived data.
            encoded = encoded.to(x.device)
        return encoded

class Generator(nn.Module):
    """Token embedding + positional encoding followed by TransformerBlocks.

    Work in progress: ``fc_out`` is created but not used by ``forward``, and
    ``forward`` prints the shape of every intermediate tensor.

    NOTE(review): the token embedding has width ``num_heads_1 * emb_f`` while
    the positional encoding has width ``num_heads_2 * emb_notes``; their sum
    in ``forward`` only works when those widths broadcast — confirm the
    intended layout.
    """

    def __init__(self, num_layers, num_emb, num_heads_1, emb_f, num_heads_2, emb_notes, bs):
        super().__init__()
        feat_dim = num_heads_1 * emb_f
        note_dim = num_heads_2 * emb_notes

        # Token embedding, with its initial weights scaled down by 1e-3.
        self.embedding = nn.Embedding(num_emb, feat_dim)  # To be refined
        self.embedding.weight.data = 0.001 * self.embedding.weight.data
        self.pos_emb = SinusoidalPosEmb(note_dim)

        layers = []
        for _ in range(num_layers):
            layers.append(TransformerBlock(num_heads_1, num_heads_2, bs, emb_notes, emb_f))
        self.blocks = nn.ModuleList(layers)
        self.fc_out = nn.Linear(note_dim, note_dim)

    def forward(self, x):
        print("Gen Step 1, x shape = {}".format(x.shape))
        token_part = self.embedding(x)
        print("Gen Step 2, input_emb shape = {}".format(token_part.shape))
        position_part = self.pos_emb(x)
        print("Gen Step 3, pos_emb shape = {}".format(position_part.shape))
        hidden = token_part + position_part
        print("Gen Step 4, emb shape = {}".format(hidden.shape))
        # TODO: pass a key_padding_mask to the blocks.
        for layer in self.blocks:
            hidden = layer(hidden)
            print("Gen Step 5 loop, emb shape = {}".format(hidden.shape))
        return hidden
    
class Discriminator(nn.Module):
    """Placeholder discriminator: no layers yet, ``forward`` is the identity."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # Hand the input back untouched until a real architecture is added.
        return x

In [9]:
# Voices: S soprano (100), A alto (200), T tenor (300), B bass (400).
# NOTE(review): song_transform adds column_index * 100, i.e. 0/100/200/300 —
# confirm which numbering is the intended one.

# 0: padding, 1: start of signal, -1: end of signal, -2: unknown
tokens = [0, 1, -1, -2] + list(range(36, 82))
# Vocabulary size: the 4 special tokens plus the pitch range once per voice.
n_tokens = len([0, 1, -1, -2]) + 4 * len(range(36, 82))

batch_song = next(
    SongIterator(
        song_list=jsf['pitches'],
        batch_size=64,
        song_size=max_song_length,
        shuffle=True,
    )
)

In [18]:
# Generator smoke test (the Generator calls themselves are still commented out).
num_layers = 3
num_emb = 10  # to be adjusted
#Gen = Generator(num_layers, n_tokens, num_heads_1, emb_f, num_heads_2, emb_notes, bs)
noise_input = rand_song_generation(
    L_distribution=len_seq,
    bs=bs,
    max_lenght=max_song_length,
)
#Gen(noise_input.type(torch.int32))

# Stand-alone embedding table with the Generator's embedding sizes (to be
# refined); the cell displays its freshly initialised weights.
test_embedding = nn.Embedding(
    num_embeddings=n_tokens,
    embedding_dim=num_heads_1 * emb_f,
)
test_embedding.weight

Parameter containing:
tensor([[-0.7893, -1.0900, -0.7382,  ...,  0.3927, -0.0835, -1.1346],
        [ 1.2599,  0.3797,  0.1331,  ...,  0.5523, -0.5993, -0.2147],
        [ 0.9429,  0.1292, -0.7234,  ...,  0.0361,  0.8884,  0.1994],
        ...,
        [ 1.7880,  0.0824,  1.4844,  ..., -0.5106,  0.2426,  1.2985],
        [-0.0824, -0.8936, -0.4848,  ..., -0.5941, -1.3363,  0.8541],
        [ 1.1477, -0.5253,  1.3606,  ..., -1.4152,  0.8251, -0.3641]],
       requires_grad=True)

In [None]:
# The discriminator might realize that padding = real example; find out how to avoid this -> add an attention mask for padding

#Discriminator -> Some Transformer block followed by taking the first token + linear + tanh

# Create a transform operation that removes extra symbols, add 