In [1]:
%load_ext autoreload

In [2]:
#from cap_package import SpotipyCollectPub as scp
from cap_package import ReadTransform as rt
from collections import OrderedDict
from datetime import datetime
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import os
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder

In [3]:
%autoreload 2

In [4]:
from IPython.utils.text import columnize
def disp_col(list_):
    """Print the items of ``list_`` arranged in columns.

    Each item is rendered with ``repr`` followed by a comma; the resulting
    strings are laid out by ``columnize`` using a display width of 120.

    Requires ``from IPython.utils.text import columnize``.
    """
    entries = [repr(item) + ',' for item in list_]
    layout = columnize(entries, displaywidth=120)
    print(layout)
def timer(start_time=None):
    """Start a timer, or report the time elapsed since one was started.

    Parameters
    ----------
    start_time : datetime, optional
        A start mark previously returned by this function.

    Returns
    -------
    datetime or float
        The current ``datetime`` when ``start_time`` is None; otherwise the
        number of seconds elapsed since ``start_time``.
    """
    # No start mark given: hand back "now" so the caller can use it as one.
    if start_time is None:
        return datetime.now()

    elapsed = datetime.now() - start_time
    return elapsed.total_seconds()

In [5]:
# Load the dataset root folder from the 'PATH_DATASET1.2' environment variable
# (load_dotenv() is called first so the variable may come from a dotenv file).
load_dotenv()
dataset_root = os.getenv('PATH_DATASET1.2')
if dataset_root is None:
    # Path(None) would raise an unhelpful TypeError; fail with a clear message instead.
    raise KeyError(
        "Environment variable 'PATH_DATASET1.2' is not set; "
        "define it before running this notebook."
    )
path = Path(dataset_root)

In [6]:
# create timbre and pitch column name lists for splitting
# (twelve zero-padded names each: *_01 ... *_12)
timbre_ = list(map('timbre_{:0>2d}'.format, range(1, 13)))
pitch_ = list(map('pitch_{:0>2d}'.format, range(1, 13)))

In [7]:
# Read the user playlists dataset, requesting both segments and sections
path_ = path / 'user_playlists'
pl_dataset = rt.read_dataset(path_, segments=True, sections=True)

n_playlists = len(pl_dataset)
print('Number of playlists:', n_playlists)

Number of playlists: 13


In [8]:
# Build a dataframe pairing every track name with the name of its playlist.
# For each playlist entry, pl[0] is the playlist name and each item of pl[1]
# carries the track name in position 0.
tr_pl = [
    [track[0], pl[0]]
    for pl in pl_dataset
    for track in pl[1]
]

tr_label_df = pd.DataFrame(tr_pl, columns=['track_name', 'pl_name'])

In [9]:
# Check for duplicate track names and rename them so that every name is unique.
# cumcount() numbers the occurrences of each name (0 = first occurrence), so the
# second occurrence gets '_dup', the third '_dup2', the fourth '_dup3', ...
# A single fixed '_dup' suffix would leave tracks that occur three or more
# times still sharing the same name.
dup_rank = tr_label_df.groupby('track_name').cumcount()
ind = tr_label_df.index[dup_rank > 0]
dup_suffix = dup_rank.map(
    lambda k: '' if k == 0 else ('_dup' if k == 1 else '_dup{}'.format(k))
)
# Label-based assignment: does not rely on the index being positional.
tr_label_df['track_name'] = tr_label_df['track_name'] + dup_suffix

In [10]:
# Preview the first five track/playlist label rows
tr_label_df.head(n=5)

Unnamed: 0,track_name,pl_name
0,2 Roads [Mix Cut] - Blood Groove & Kikis Remix...,Classic progressive
1,After The Rain - Club Mix_The,Classic progressive
2,Always A Stranger - Dub_The,Classic progressive
3,Colors Of The Night - Dub_Haz,Classic progressive
4,Eclipse - Original Mix_Haz,Classic progressive


In [11]:
# Map each playlist label to the sub-dataframe holding that playlist's tracks
grpdf_dict = {
    label: label_df
    for label, label_df in tr_label_df.groupby('pl_name')
}

# The playlist labels, in the order of the dictionary keys
categories = list(grpdf_dict)
disp_col(categories)

'Classic progressive',  'Our old school trance',      'Progressive 1',  'Progressive 4',      'That familiar trance',
'Deep house',           'Our old school trance 138',  'Progressive 2',  'Progressive 5',    
'House-Trance',         'Our old school trance 2',    'Progressive 3',  'Progressive House',



In [12]:
# split the pitch and timbre vectors of the segment dataframes into separate columns
# keep only the 'start', 'duration', 'loudness', 'key' columns of the section dataframes

section_cols = ['start', 'duration', 'loudness', 'key']
pl_segsec = []

for pl in pl_dataset:

    tracks_seg, tracks_sec = [], []

    # pl[1] holds the per-track segment entries and pl[2] the per-track section
    # entries; the same position in both refers to the same track
    for idx, seg_entry in enumerate(pl[1]):
        seg_df = seg_entry[1]
        start_col = seg_df['start']
        split_df = rt.split_columns(seg_df, pitch_cols=pitch_, timbre_cols=timbre_)
        tracks_seg.append(pd.concat([start_col, split_df], axis=1))

        sec_df = pl[2][idx][1]
        tracks_sec.append(sec_df[section_cols])

    # one tuple per playlist: (playlist name, segment frames, section frames)
    pl_segsec.append((pl[0], tracks_seg, tracks_sec))

In [13]:
# Preview: segments dataframe of the first track in the first playlist
print('\nSample track segments dataframe after splitting of columns:\n')
sample_segments = pl_segsec[0][1][0]
sample_segments


Sample track segments dataframe after splitting of columns:



Unnamed: 0,start,pitch_01,pitch_02,pitch_03,pitch_04,pitch_05,pitch_06,pitch_07,pitch_08,pitch_09,...,timbre_03,timbre_04,timbre_05,timbre_06,timbre_07,timbre_08,timbre_09,timbre_10,timbre_11,timbre_12
10,1.62544,0.295,1.000,0.585,0.390,0.078,0.051,0.087,0.045,0.039,...,16.626,-71.006,-20.437,-7.096,3.518,6.664,-30.561,-8.857,-24.678,-12.109
16,2.63533,0.301,1.000,0.630,0.377,0.066,0.026,0.058,0.028,0.049,...,-18.425,-9.383,-13.917,12.891,5.099,-31.578,-29.472,-6.842,20.759,-21.670
22,3.57029,0.291,1.000,0.628,0.380,0.061,0.036,0.071,0.036,0.047,...,15.169,-58.960,-25.921,3.635,-6.813,5.275,-26.416,-19.053,-18.672,-10.533
28,4.55138,0.279,1.000,0.676,0.439,0.068,0.032,0.066,0.032,0.060,...,9.771,-45.580,-16.500,38.646,4.630,-7.225,-30.034,-21.961,-12.211,-7.807
57,9.40390,0.285,1.000,0.499,0.309,0.057,0.023,0.056,0.025,0.042,...,-41.286,-9.563,8.900,39.028,-2.487,-24.125,9.302,-22.652,-12.984,-17.392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1657,331.79560,0.301,1.000,0.669,0.442,0.090,0.074,0.096,0.041,0.064,...,7.756,6.566,43.438,-8.002,19.392,-11.726,-5.749,-15.792,4.831,-9.427
1664,332.79388,0.988,1.000,0.261,0.132,0.126,0.132,0.197,0.318,0.207,...,15.456,1.247,25.350,-61.933,23.849,-6.019,-28.627,20.800,-4.471,1.743
1670,333.74008,0.183,0.486,1.000,0.975,0.518,0.267,0.306,0.209,0.228,...,30.090,28.861,37.424,-9.041,19.000,-12.378,-20.167,-2.300,18.475,4.498
1676,334.72180,0.221,0.647,1.000,0.980,0.190,0.268,0.287,0.200,0.356,...,19.006,26.857,38.898,-46.612,10.738,-33.803,-31.841,6.538,9.014,8.731


In [14]:
# Preview: sections dataframe of the first track in the first playlist
print('\nSample track sections dataframe after filtering:\n')
sample_sections = pl_segsec[0][2][0]
sample_sections


Sample track sections dataframe after filtering:



Unnamed: 0,start,duration,loudness,key
0,0.0,91.15762,-8.346,6
1,91.15762,14.04401,-9.694,4
2,105.20163,21.87251,-15.955,10
3,127.07414,20.84346,-19.916,10
4,147.9176,40.60797,-8.795,10
5,188.52556,28.48634,-9.197,3
6,217.01192,30.99096,-7.827,6
7,248.00287,60.10378,-7.091,1
8,308.10663,32.67463,-6.312,10


We will get statistical moments of timbre values and use them as features.
Two sets of moments are calculated to get these:
- Moments calculated over all the segments in a track
- Means calculated for the segments falling under the top 5 sections with the longest durations

For the second set, two scenarios must be taken care of in the following order:
1. A section may not contain any segments (especially the last sections), since the segments here have been filtered by certain criteria. If such a section is in the top 5, we use the mean of the remaining top-5 sections for it.
2. A track may have fewer than 5 sections, in which case we use the average over all the sections, i.e., the whole track.


Flatten the stats dataframes to act as one entry per track in a playlist

In [15]:
# Row labels given to each section stats dataframe so that the flattened
# column names carry the top-section number.
TOPSEC_LABELS = ['topsec0', 'topsec1', 'topsec2', 'topsec3', 'topsec4']


def _flatten_stats(track_stats, name_col, pl_col, row_labels=None):
    """Flatten per-track stats dataframes into one dataframe with a row per track.

    Parameters
    ----------
    track_stats : iterable of pandas.DataFrame
        One stats dataframe per track.
    name_col : pandas.Series
        Track names, inserted as the 'track_name' column.
    pl_col : pandas.Series
        Playlist labels, inserted as the 'playlist' column.
    row_labels : list of str, optional
        When given, replaces the index of every stats dataframe before
        flattening so the labels end up in the flattened column names.

    Returns
    -------
    pandas.DataFrame
        Columns 'playlist', 'track_name', then the flattened stats columns.
    """
    flat_rows = []
    for stats_df in track_stats:
        if row_labels is not None:
            stats_df = stats_df.set_index([pd.Index(row_labels)])
        # unstack -> single column frame -> transpose into a single row
        flat_rows.append(stats_df.unstack().to_frame().sort_index(level=1).T)

    # concat to form one dataframe per playlist
    pl_flat = pd.concat(flat_rows, ignore_index=True)
    # join the tuples of column names produced by the flattening
    pl_flat.columns = pl_flat.columns.map('_'.join)
    # add the track name and playlist columns at the front
    pl_flat.insert(loc=0, column='track_name', value=name_col)
    pl_flat.insert(loc=0, column='playlist', value=pl_col)
    return pl_flat


pl_stats = []
for pl in pl_segsec:

    # get segment and section stats for playlist
    seg_stat, sec_stat = rt.get_segsec_stats(tracks_seg=pl[1], tracks_sec=pl[2])
    # get updated track name column and playlist label column
    name_col = grpdf_dict[pl[0]].track_name.reset_index(drop=True)
    pl_col = grpdf_dict[pl[0]].pl_name.reset_index(drop=True)

    # Flatten the segment and section stats with the same procedure
    pl_seg = _flatten_stats(seg_stat, name_col, pl_col)
    pl_sec = _flatten_stats(sec_stat, name_col, pl_col, row_labels=TOPSEC_LABELS)

    pl_stats.append((pl[0], pl_seg, pl_sec))


In [16]:
# Preview: flattened segment stats of the first playlist
print('\nSample playlist segments stats dataframe:\n')
sample_seg_stats = pl_stats[0][1]
sample_seg_stats.head()


Sample playlist segments stats dataframe:



Unnamed: 0,playlist,track_name,timbre_01_kurtosis,timbre_02_kurtosis,timbre_03_kurtosis,timbre_04_kurtosis,timbre_05_kurtosis,timbre_06_kurtosis,timbre_07_kurtosis,timbre_08_kurtosis,...,timbre_03_std,timbre_04_std,timbre_05_std,timbre_06_std,timbre_07_std,timbre_08_std,timbre_09_std,timbre_10_std,timbre_11_std,timbre_12_std
0,Classic progressive,2 Roads [Mix Cut] - Blood Groove & Kikis Remix...,2.607578,5.769883,-0.054055,-0.71882,1.933854,0.284112,-0.02706,-0.24764,...,38.454921,31.135436,24.726457,29.32537,20.464225,16.824545,15.602014,16.39692,16.011653,12.440938
1,Classic progressive,After The Rain - Club Mix_The,0.032042,-0.037483,0.940155,0.448693,0.353088,0.332408,-0.110257,-0.2533,...,24.121851,28.267504,16.776601,28.043682,18.368293,18.797308,10.532395,17.077417,14.417215,10.014669
2,Classic progressive,Always A Stranger - Dub_The,1.667161,1.627691,4.120893,0.062973,4.441805,3.931294,0.077389,1.107477,...,37.809674,40.844812,17.892933,35.980012,16.540434,19.789902,12.013537,15.536084,21.290878,11.61474
3,Classic progressive,Colors Of The Night - Dub_Haz,1.317984,1.235949,0.579231,0.890771,-0.728799,1.944975,-0.410863,2.804291,...,47.994029,62.364249,31.759329,37.864532,23.708468,28.05743,14.543709,18.776696,24.333183,10.934214
4,Classic progressive,Eclipse - Original Mix_Haz,-1.190126,0.820478,-0.63119,0.03734,-0.106807,-0.594247,0.250968,1.128378,...,58.263083,73.442255,31.218061,58.979114,23.446888,33.841101,19.89467,42.286601,50.565395,14.422039


In [17]:
# Preview: flattened section stats of the first playlist
print('\nSample playlist sections stats dataframe:\n')
sample_sec_stats = pl_stats[0][2]
sample_sec_stats.head()


Sample playlist sections stats dataframe:



Unnamed: 0,playlist,track_name,key_01_topsec0,key_02_topsec0,key_03_topsec0,key_04_topsec0,key_05_topsec0,key_06_topsec0,key_07_topsec0,key_08_topsec0,...,timbre_03_topsec4,timbre_04_topsec4,timbre_05_topsec4,timbre_06_topsec4,timbre_07_topsec4,timbre_08_topsec4,timbre_09_topsec4,timbre_10_topsec4,timbre_11_topsec4,timbre_12_topsec4
0,Classic progressive,2 Roads [Mix Cut] - Blood Groove & Kikis Remix...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,29.080792,13.802042,25.277,-32.948583,17.312542,-12.906875,-21.60875,-0.966958,0.269958,8.368042
1,Classic progressive,After The Rain - Club Mix_The,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-3.381045,5.244773,14.047773,-33.0015,0.778386,-15.752159,2.761523,4.371068,-13.471977,-0.787523
2,Classic progressive,Always A Stranger - Dub_The,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-32.6083,6.164125,17.0603,-26.6983,17.009175,11.365875,-9.59155,12.6958,-18.8818,2.7099
3,Classic progressive,Colors Of The Night - Dub_Haz,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,45.584944,7.923778,80.423861,-21.019139,-12.866694,14.524917,-14.065833,-0.770167,-11.91975,-2.52725
4,Classic progressive,Eclipse - Original Mix_Haz,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,13.269545,-52.955636,-4.268364,-14.068545,25.495636,0.540727,-12.415091,11.890091,-8.753455,0.035364


In [18]:
# Check for null values in the segment and section stats dataframes.
# Each list collects the names of the playlists whose dataframe has any null.
nulls_seg = [pl[0] for pl in pl_stats if pl[1].isnull().sum().sum() > 0]
nulls_sec = [pl[0] for pl in pl_stats if pl[2].isnull().sum().sum() > 0]

# Report when EITHER list is non-empty: requiring both would wrongly print
# "No null values are found" when nulls exist in only one of the two sets.
if nulls_seg or nulls_sec:
    print('Some null values are present')
    print('In segments list:', nulls_seg)
    print('In sections list:', nulls_sec)
else:
    print('No null values are found')

No null values are found


In [None]:
# Output locations for the per-playlist feature statistics
p = path.joinpath('user_pl_featstats')
p1 = p.joinpath('user_pl_segstat')
p2 = p.joinpath('user_pl_secstat')

# Create the output folders up front (parents=True also creates `p`), so the
# writes below do not fail when the folders do not exist yet.
for out_dir in (p1, p2):
    out_dir.mkdir(parents=True, exist_ok=True)

# Save the playlist labels as text
np.savetxt(p.joinpath('enc_categories.csv'), categories, fmt='%s', delimiter=',')

# Save the segment and section stats of every playlist as parquet files.
# NOTE(review): playlist names are used verbatim in the file names - confirm
# none of them contains a path separator.
for pl in pl_stats:
    pl[1].to_parquet(p1.joinpath('{}_segstat.parquet'.format(pl[0])), engine='pyarrow')
    pl[2].to_parquet(p2.joinpath('{}_secstat.parquet'.format(pl[0])), engine='pyarrow')
    