In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def read_dataset(path_):
    
    '''
    Read analysis(dataframes) dataset stored as parquet files.
    
    path_ : path to dataset directory
    
    return : a dict with key/value pairs for all playlists in the folder.
             Key : Name of the playlist (string)
             Value : a dict of track analysis dataframes of all tracks from the playlist
    '''
    dataset = []
    for pl in path_.iterdir():
        
        tempo_list = [(re.sub('_tempo.parquet','',t.name), pd.read_parquet(t)) for t in pl.glob('*_tempo.parquet')]
        segments_list = [ (re.sub('_segments.parquet','',s.name), pd.read_parquet(s)) for s in pl.glob('*_segments.parquet')]
        dataset.append((pl.name, tempo_list, segments_list))
                       
    return dataset

In [3]:
def transform_dataset(dataset, min_tracks = 10, segments = 50):
    
    '''
    Create input arrays to be fed into a model with label provided in tuples).
    
    dataset : list of tuples - (playlist name,  tempo list of name and df tuple, segments list of name and df tuple)).
    min_tracks : minimum number of tracks in a playlist
    segments : Default : 50 - Number of segments to be taken for input.
    
    returns : data - list of tuples of input data arrays and (one-hot)encoded labels.
              categories - array  of unique playlist name/labels in the data.
              num_tracks -  number of tracks in each category/playlist
              Currently only using segment arrays consisting of sequences pitch array and timbre values flattened for input.
              Future edit should account for other features, i.e tempo and/or audio features from spotify.
    '''
    data_array = []
    data_label = []
    data_trackname = []
    num_tracks= []
    
    for pl in dataset:
        
        if len(pl[2]) > min_tracks :
            num_tracks.append(len(pl[2]))
            for track in pl[2]:   
                data_trackname.append(track[0])
                segments =  track[1][['pitches', 'timbre']].to_numpy()
                segments = np.hstack(np.hstack(segments))
                data_array.append(segments)
                data_label.append(pl[0])
                
    
    X = np.array(data_label).reshape(-1,1)
    data_encode = OneHotEncoder().fit(X)
    categories = data_encode.categories_
    data_encoded = data_encode.transform(X).toarray()
    
    data = [i for i in zip(data_trackname, data_array, data_encoded)]
    
    return data, categories, num_tracks
    

In [4]:
p = Path.cwd().parent.joinpath('Dataset')
pl = read_dataset(p)

In [5]:
print('Total number of playlists : ',len(pl))
print(' \n sample playlist name : ',pl[0][0])
print(' \n sample Tempos in  playlist: ', pl[0][1][:3] )
print('\n segments of sample track: ',pl[0][2][0][0] )
pl[0][2][0][1].head()

Total number of playlists :  18
 
 sample playlist name :  Chill out
 
 sample Tempos in  playlist:  [('Alicante',      tempo
0  219.839), ('Beautiful Day - Kiwamu Remix',      tempo
0  121.012), ('Chanunpa - Chillout Mix - Mixed',     tempo
0  99.893)]

 segments of sample track:  Alicante


Unnamed: 0,start,start_minute,duration,confidence,pitches,timbre
4,0.54045,00:00:54,0.2678,1.0,"[0.749, 1.0, 0.208, 0.118, 0.087, 0.053, 0.036...","[42.647, 51.306, -108.983, 117.669, 26.065, -1..."
9,1.35306,00:01:35,0.27238,0.904,"[0.88, 0.282, 0.215, 0.328, 1.0, 0.255, 0.222,...","[33.762, 206.043, -85.068, 19.343, 48.513, 10...."
10,1.62544,00:01:63,0.27855,1.0,"[0.762, 1.0, 0.243, 0.143, 0.11, 0.068, 0.047,...","[41.21, 62.28, -121.38, 121.816, 24.943, -69.0..."
11,1.90399,00:01:90,0.26717,1.0,"[0.253, 0.081, 0.105, 0.281, 1.0, 0.752, 0.216...","[35.63, 213.17, -69.972, 31.139, 67.143, -13.2..."
14,2.44308,00:02:44,0.26871,0.977,"[0.821, 0.214, 0.326, 0.312, 1.0, 0.184, 0.177...","[34.401, 209.175, -82.835, 38.543, 55.049, -6...."


In [6]:
data = create_dataset(pl)
print(data[1])
print('number of tracks in the above playlists : \n', data[2])
data[0][:3]

[array(['Chill out', 'Deep house', 'Deeper House', 'Mo House lo Trance',
       'Our old school trance', 'Our old school trance 138',
       'Our old school trance 3', 'Progressive 5', 'Progressive 5.2',
       'Progressive 6', 'Progressive 8', 'Progressive House',
       'That familiar trance'], dtype='<U25')]
number of tracks in the above playlists : 
 [11, 16, 11, 41, 27, 14, 20, 19, 18, 28, 18, 15, 61]


[('Alicante',
  array([  0.749,   1.   ,   0.208, ...,   0.776, -30.258,   4.948]),
  array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])),
 ('Beautiful Day - Kiwamu Remix',
  array([ 0.252,  0.266,  0.19 , ..., 12.148, -9.44 , 23.983]),
  array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])),
 ('Chanunpa - Chillout Mix - Mixed',
  array([  0.6  ,   0.09 ,   0.174, ..., -39.236,   6.351,   5.214]),
  array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))]

In [7]:
X = [(i[0],i[1]) for i in data[0]]
Y = [i[2] for i in data[0]]

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.33, random_state = 17)

In [9]:
print('\n X_train: \n', X_train[:3])
print('\n X_test: \n',X_test[:2])
print('\n Y_train: \n',Y_train[:3])
print('\n X_test: \n',Y_test[:2])


 X_train: 
 [('Solidus', array([  0.625,   0.039,   0.704, ...,  34.995, -33.891,  32.598])), ('Oasis - Original Mix', array([  1.   ,   0.893,   0.708, ...,  10.675, -35.065,  18.925])), ('Frontier', array([ 0.616,  1.   ,  0.829, ..., 17.32 ,  2.126,  4.621]))]

 X_test: 
 [('Burning Skylines - Original Mix', array([ 0.952,  0.699,  0.459, ..., 28.282, 16.522, -1.571])), ('Rise Above The World - Ultimate Remix', array([  0.824,   0.948,   0.809, ..., -17.213,  -0.277,  12.228]))]

 Y_train: 
 [array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.])]

 X_test: 
 [array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.])]
