In [None]:
import os
import shutil
import numpy as np

In [None]:
def get_file_paths(route):
    """Recursively collect the paths of all files located under ``route``.

    Args:
        route: Directory that is walked with ``os.walk``.

    Returns:
        list[str]: One path per file, built by joining the directory being
        visited with the file name.

    Raises:
        FileNotFoundError: If the walk yields no file at all.
    """
    name_list = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(route)
        for file in files
    ]
    if not name_list:
        # Name the offending location instead of raising an empty message.
        raise FileNotFoundError(f"No files found under '{route}'")
    return name_list


def merge_data(path):
    """Load every file found under ``path`` with ``np.load``.

    Returns a list with one loaded object per file, in the order produced by
    ``get_file_paths``.
    """
    loaded = []
    for npy_path in get_file_paths(path):
        loaded.append(np.load(npy_path))
    return loaded

0. Clean tabs: remove broken or unsupported files

In [None]:
import shutil
def dump_files(files_dict, dest_root="clean_tabs"):
    """Copy every file in ``files_dict`` into a per-extension folder.

    Each file ends up at ``<dest_root>/<extension without dot>/<file name>``.

    Args:
        files_dict: Mapping whose values are the source file paths; the keys
            are not used.
        dest_root: Root output folder. Defaults to ``"clean_tabs"``.
    """
    # exist_ok=True keeps the cell re-runnable: without it a second run
    # fails with FileExistsError as soon as the folders already exist.
    for fmt in ("gp3", "gp4", "gp5", "gtp"):
        os.makedirs(os.path.join(dest_root, fmt), exist_ok=True)

    for full_path in files_dict.values():
        file_name = os.path.basename(full_path)
        # Extension without the leading dot selects the destination folder.
        fmt = os.path.splitext(file_name)[1][1:]
        shutil.copy(full_path, os.path.join(dest_root, fmt, file_name))

def get_unique_file_dict(path_list):
    """Keep one acceptable file per base name.

    A file is acceptable when its extension is one of ``.gp3``, ``.gp4``,
    ``.gp5`` or ``.gtp`` and its size is larger than 1024 bytes.

    Args:
        path_list: Iterable of file paths to filter.

    Returns:
        dict: Base name (without extension) mapped to the first acceptable
        path seen with that base name.
    """
    supported = ('.gp3', '.gp4', '.gp5', ".gtp")

    def is_acceptable(song_path, fmt):
        # The size is only looked up for supported extensions.
        return fmt in supported and os.path.getsize(song_path) > 1024

    unique = {}
    for candidate in path_list:
        stem, ext = os.path.splitext(os.path.basename(candidate))
        if not is_acceptable(candidate, ext):
            continue
        # First path wins; later files with the same base name are dropped.
        if not unique.get(stem):
            unique[stem] = candidate
    return unique

In [None]:
# Collect every file under "dirty_tabs" and report how many were found.
filepaths = get_file_paths("dirty_tabs")
print("Files in folder: {}".format(len(filepaths)))
# Keep one acceptable file per base name and report how many remain.
files_dict = get_unique_file_dict(filepaths)
print("Unique files in folder: {}".format(len(files_dict)))

In [None]:
# Copy the retained files into the per-format folders created by dump_files().
dump_files(files_dict)

1. Split into chunks
2. Get chunk insight

In [None]:
# One entry per item listed in "inst_grouped": the entry name mapped to the
# list of arrays loaded from every file below that entry.
inst_chunk_dict = {
    inst_name: merge_data(os.path.join("inst_grouped", inst_name))
    for inst_name in os.listdir("inst_grouped")
}

In [None]:
import plotly.express as px

# NOTE(review): `data` was not assigned in any visible cell, so this cell
# failed with NameError on a fresh kernel. It is assumed here to be every
# chunk of every instrument in `inst_chunk_dict` — confirm this is the intent.
data = [chunk for chunks in inst_chunk_dict.values() for chunk in chunks]

# First-axis length (number of rows) of every chunk.
lengths = np.array([chunk.shape[0] for chunk in data])
fig = px.histogram(lengths)
fig.show()


In [None]:
# Total number of rows over all chunks, then the part of that total held by
# chunks longer than 1000 rows and by chunks shorter than 60 rows.
print(lengths.sum())
print(lengths[lengths > 1000].sum())
print(lengths[lengths < 60].sum())



In [None]:
# Join all arrays in `data` along their first axis into one array.
# `data` must already exist in the kernel (it is not assigned in this cell).
bulk=np.concatenate(data)

In [None]:
# Split the first four columns of `bulk` into one array per feature.
note, octave, duration, dotted = (bulk[:, column] for column in range(4))

In [None]:
# Count how often each distinct value occurs in `octave`.
unique, counts = np.unique(octave, return_counts=True)

# Show the result as rows of (value, count).
print(np.asarray((unique, counts)).T)

2. Shuffle and split into train and test

In [None]:
import os
# Every entry of the grouped-songs folder is treated as one song.
song_paths = list(os.listdir("song_chunks/song_grouped/"))
n_songs = len(song_paths)
print(n_songs)

In [None]:
import random

# Fixed seed plus a sorted population make the split reproducible:
# os.listdir() returns entries in arbitrary order and an unseeded shuffle
# produced a different train/test assignment on every run of this cell.
SPLIT_SEED = 42

# The first 90% of the shuffled songs form the training set, the rest the test set.
thr = int(0.9 * n_songs)
song_paths = random.Random(SPLIT_SEED).sample(sorted(song_paths), n_songs)
train_sp = song_paths[:thr]
test_sp = song_paths[thr:]

In [None]:
# Destination folders of the two dataset splits.
train_path, test_path = (
    os.path.join("split_dataset", split) for split in ("train", "test")
)

In [None]:
import shutil
# Create both split folders, then copy every song folder into its split.
os.makedirs(test_path, exist_ok=True)
os.makedirs(train_path, exist_ok=True)

songs_root = os.path.join("song_chunks", "song_grouped")
# One loop serves both splits; the per-split index is printed after each
# copy as a progress indicator (training songs first, then test songs).
for split_dir, split_songs in ((train_path, train_sp), (test_path, test_sp)):
    for idx, song in enumerate(split_songs):
        shutil.copytree(os.path.join(songs_root, song), os.path.join(split_dir, song))
        print(idx)

3. Apply window 

In [1]:
import numpy as np
import os
# Folders holding the train and test splits that the windowing step reads.
train_path, test_path = (
    os.path.join("split_dataset", split) for split in ("train", "test")
)
def get_file_paths(route):
    """Recursively collect the paths of all files located under ``route``.

    Args:
        route: Directory that is walked with ``os.walk``.

    Returns:
        list[str]: One path per file, built by joining the directory being
        visited with the file name.

    Raises:
        FileNotFoundError: If the walk yields no file at all.
    """
    name_list = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(route)
        for file in files
    ]
    if not name_list:
        # Name the offending location instead of raising an empty message.
        raise FileNotFoundError(f"No files found under '{route}'")
    return name_list


def merge_data(path):
    """Load every file found under ``path`` with ``np.load``.

    Returns a list with one loaded object per file, in the order produced by
    ``get_file_paths``.
    """
    loaded = []
    for npy_path in get_file_paths(path):
        loaded.append(np.load(npy_path))
    return loaded

In [14]:
def apply_song_window(window_size,song):
    """Slide a window of ``window_size`` rows over ``song`` with stride 1.

    Args:
        window_size: Number of consecutive rows per window.
        song: Array indexed by row along its first axis; the final reshape
            expects 4 values per row.

    Returns:
        ``np.int8`` array with one flattened window per row, i.e. shape
        ``(song.shape[0] - window_size + 1, window_size * 4)``.
    """
    n_windows = song.shape[0] - window_size + 1
    # Row i of the index matrix is [i, i + 1, ..., i + window_size - 1].
    starts = np.arange(n_windows)[:, None]
    offsets = np.arange(window_size)[None, :]
    row_index = offsets + starts
    windows = np.array(song[row_index], dtype=np.int8)
    return windows.reshape(-1, window_size * 4)

def apply_window(window_size, data):
    """Window every song in ``data`` and stack the results.

    Returns an array of shape ``(total_windows, window_size, 4)`` holding the
    windows of all songs, in the order the songs appear in ``data``.
    """
    per_song = []
    for song in data:
        per_song.append(apply_song_window(window_size, song))
    stacked = np.vstack(per_song)
    # Undo the per-window flattening done by apply_song_window().
    return stacked.reshape((-1, window_size, 4))

In [15]:
def window_and_shuffle(ws,dist_name):
    """Window one split, shuffle the windows and save them to disk.

    Args:
        ws: Window size passed to ``apply_window``.
        dist_name: ``"train"`` selects the global ``train_data``; any other
            value selects the global ``test_data``. Also used as the output
            sub-folder name.

    The result is written to ``windowed/<dist_name>/<ws>/windows.npy``.
    """
    if dist_name == "train":
        songs = train_data
    else:
        songs = test_data
    windows = apply_window(ws, songs)
    # In-place shuffle along the first axis (the order of the windows).
    np.random.shuffle(windows)
    out_dir = os.path.join("windowed", dist_name, str(ws))
    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, "windows.npy"), windows)

In [16]:
# Load every array stored under the training split folder.
train_data = merge_data(train_path)
# Number of arrays loaded.
print(len(train_data))

62429


In [17]:
# Build, shuffle and save the training windows for every window size.
# The original cell called `window_and_save` for sizes 21-51; no function of
# that name is defined in this notebook (NameError), and the test split uses
# `window_and_shuffle` for every size.
for window_size in (6, 11, 21, 31, 41, 51):
    window_and_shuffle(window_size, "train")

In [18]:
# Load every array stored under the test split folder.
test_data = merge_data(test_path)
# Number of arrays loaded.
print(len(test_data))

7165


In [19]:
# Build, shuffle and save the test windows for every window size.
for window_size in (6, 11, 21, 31, 41, 51):
    window_and_shuffle(window_size, "test")

4. Modest size

In [11]:
# Root folder of the windowed datasets that modest_size() reads from.
# NOTE(review): a later cell (section 5) rebinds `in_path` to another folder,
# so re-run this cell before calling modest_size() again.
in_path = "windowed"
import os
import numpy as np
def modest_size(ws, dist, n_ex, src_root=None, dst_root="modest"):
    """Save the first ``n_ex`` windows of a windowed dataset as a smaller copy.

    Args:
        ws: Window size; selects the ``<ws>`` sub-folder.
        dist: Split name; selects the ``<dist>`` sub-folder.
        n_ex: Maximum number of windows to keep; converted with ``int`` so
            floats such as ``2e6`` are accepted.
        src_root: Root folder of the input. Defaults to the module-level
            ``in_path``.
        dst_root: Root folder of the output. Defaults to ``"modest"``.

    Reads ``<src_root>/<dist>/<ws>/windows.npy``, prints its shape and writes
    its first ``n_ex`` entries to ``<dst_root>/<dist>/<ws>/windows.npy``.
    """
    n_ex = int(n_ex)
    if src_root is None:
        src_root = in_path
    inp = os.path.join(src_root, dist, str(ws), "windows.npy")
    outp = os.path.join(dst_root, dist, str(ws))
    os.makedirs(outp, exist_ok=True)
    # Memory-map the input: only the kept prefix is read from disk instead of
    # loading the whole array into RAM.
    in_data = np.load(inp, mmap_mode="r")
    print(in_data.shape)
    out_data = in_data[:n_ex, :, :]
    np.save(os.path.join(outp, "windows.npy"), out_data)
    
    
# Keep at most 2e6 training windows and 2e5 test windows per window size.
for split_name, max_examples in (("train", 2e6), ("test", 2e5)):
    for window_len in (6, 11, 21, 31, 41, 51):
        modest_size(window_len, split_name, max_examples)

(20877003, 6, 4)
(20564858, 11, 4)
(19940568, 21, 4)
(19316278, 31, 4)
(18691988, 41, 4)
(18067698, 51, 4)
(2366574, 6, 4)
(2330749, 11, 4)
(2259099, 21, 4)
(2187449, 31, 4)
(2115799, 41, 4)
(2044149, 51, 4)


5. Get weights

In [9]:
import os
# Folder of the size-limited training windows; the functions below read
# windows.npy from it and write the weight files into it.
# NOTE(review): this rebinds the `in_path` that modest_size() (section 4) reads.
in_path = os.path.join("modest","train")
import numpy as np

def single_feature_weights(data, ws, index, out_root=None):
    """Compute and save inverse-frequency weights for one feature.

    Args:
        data: 3-D array; the feature is taken from the last axis at ``index``.
        ws: Name of the sub-folder (converted with ``str``) that receives the file.
        index: Feature position: 0 ``semitone``, 1 ``octave``, 2 ``dur_log``,
            3 ``dotted``.
        out_root: Root output folder. Defaults to the module-level ``in_path``.

    Saves ``<out_root>/<ws>/<feature>_weights.npy``, a vector with one weight
    per distinct value present in the data, normalised so the weights sum to
    the number of distinct values.
    """
    if out_root is None:
        out_root = in_path
    feature = ["semitone", "octave", "dur_log", "dotted"][index]
    # np.unique sorts the distinct values, so weight i belongs to the i-th
    # smallest value present in the data; values that never occur get no entry.
    _, freqs = np.unique(data[:, :, index], return_counts=True)
    # Counts returned by np.unique are always >= 1, so the reciprocal needs
    # no zero guard.
    i_freqs = 1.0 / freqs
    weight_vector = freqs.shape[0] * i_freqs / np.sum(i_freqs)
    out_path = os.path.join(out_root, str(ws), f'{feature}_weights.npy')
    np.save(out_path, weight_vector)
    
def extract_all_weights(labels,ws):
    """Save the weight vector of each of the four features (indices 0 to 3)."""
    for feature_index in range(4):
        single_feature_weights(labels, ws, feature_index)
def get_and_save_weights(ex_size):
    """Load the windows of one example size and save their label weights.

    Reads ``<in_path>/<ex_size>/windows.npy``, prints its shape, selects
    positions ``ex_size - 1, 2 * ex_size - 1, ...`` along axis 1 (only the
    last position when axis 1 has length ``ex_size``) as labels and passes
    them to ``extract_all_weights``.
    """
    windows = np.load(os.path.join(in_path, str(ex_size), "windows.npy"))
    print(windows.shape)
    label_start = ex_size - 1
    labels = windows[:, label_start::ex_size, :]
    extract_all_weights(labels, ex_size)
# Compute and save the label weights for every example size.
for example_size in (6, 11, 21, 31, 41, 51):
    get_and_save_weights(example_size)

(2000000, 6, 4)
(2000000, 11, 4)
(2000000, 21, 4)
(2000000, 31, 4)
(2000000, 41, 4)
(2000000, 51, 4)


6. Check model weights

In [9]:
import h5py
import numpy as np
def traverse_datasets(hdf_file):
    """Yield the path of every dataset reachable from ``hdf_file``.

    Groups are descended depth-first in key order; each yielded path is the
    ``/``-joined chain of keys leading to a dataset, with a leading ``/``.
    """

    def _dataset_paths(group, prefix=''):
        for key in group.keys():
            child = group[key]
            child_path = f'{prefix}/{key}'
            if isinstance(child, h5py.Group):
                # Descend into the sub-group.
                yield from _dataset_paths(child, child_path)
            elif isinstance(child, h5py.Dataset):
                yield child_path

    yield from _dataset_paths(hdf_file)
        
import os
weight_path = os.path.join("..", "report", "1(2022-04-19-21-31-53)", "weights", "lstm.h5")

# The context manager closes the HDF5 file when the loop finishes (the
# original cell left the handle open for the lifetime of the kernel).
with h5py.File(weight_path, 'r') as f:
    for dset in traverse_datasets(f):
        dataset = f[dset]
        # Read the values once instead of once for min() and once for max().
        values = np.array(dataset)
        print('Path:', dset)
        print('Shape:', dataset.shape)
        print('Data type:', dataset.dtype)
        print(values.min(), values.max())

Path: /dotted/dotted/bias:0
Shape: (2,)
Data type: float32
-0.8876514 0.887647
Path: /dotted/dotted/kernel:0
Shape: (128, 2)
Data type: float32
-1.2200236 1.272265
Path: /dur_log/dur_log/bias:0
Shape: (7,)
Data type: float32
-2.9435296 1.1916081
Path: /dur_log/dur_log/kernel:0
Shape: (128, 7)
Data type: float32
-7.0105762 1.1719713
Path: /embedding_4/embedding_4/embeddings:0
Shape: (13, 8)
Data type: float32
-1.6156926 1.6451619
Path: /embedding_5/embedding_5/embeddings:0
Shape: (11, 8)
Data type: float32
-1.3608106 1.6911125
Path: /embedding_6/embedding_6/embeddings:0
Shape: (7, 8)
Data type: float32
-1.7391678 2.693332
Path: /embedding_7/embedding_7/embeddings:0
Shape: (2, 8)
Data type: float32
-0.70769805 1.7395235
Path: /lstm/lstm/lstm_cell/bias:0
Shape: (512,)
Data type: float32
-0.9392935 1.711859
Path: /lstm/lstm/lstm_cell/kernel:0
Shape: (32, 512)
Data type: float32
-3.134302 3.0694928
Path: /lstm/lstm/lstm_cell/recurrent_kernel:0
Shape: (128, 512)
Data type: float32
-2.740366 

-2.740366 2.5098338
