In [1]:
from cap_package import ReadTransform as rt
from datetime import datetime
from dotenv import load_dotenv
from hyperas import optim
from hyperas.distributions import choice, uniform
from hyperopt import Trials, STATUS_OK, tpe
import numpy as np
import os
import pandas as pd
from pathlib import Path
import random
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, BatchNormalization, Activation, Dropout


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
from IPython.utils.text import columnize
def disp_col(list_):
    
    # import -> from IPython.utils.text import columnize
    l = list(map(lambda x:repr(x)+ ',', list_))
    print(columnize(l, displaywidth=120))
def timer(start_time=None):
    if start_time is None:
        start_time = datetime.now()
        return start_time
    else:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [3]:
#tf.keras.backend.clear_session()

In [4]:
load_dotenv()
path = Path(os.getenv('PATH_DATASET1.2')).joinpath('user_pl_featstats')

In [5]:
path_ = path.joinpath('user_pl_segstat')
dfs = []
for f in path_.iterdir():
    dfs.append(pd.read_parquet(f))

segstat_df = pd.concat(dfs, ignore_index=True)

In [6]:
path_ = path.joinpath('user_pl_secstat')
dfs = []
for f in path_.iterdir():
    dfs.append(pd.read_parquet(f))

secstat_df = pd.concat(dfs, ignore_index=True)

In [7]:
segsecstat_df = pd.concat([segstat_df, secstat_df.iloc[:, 2:]], axis=1)
segsecstat_df.head()

Unnamed: 0,playlist,track_name,timbre_01_kurtosis,timbre_02_kurtosis,timbre_03_kurtosis,timbre_04_kurtosis,timbre_05_kurtosis,timbre_06_kurtosis,timbre_07_kurtosis,timbre_08_kurtosis,...,timbre_03_topsec4,timbre_04_topsec4,timbre_05_topsec4,timbre_06_topsec4,timbre_07_topsec4,timbre_08_topsec4,timbre_09_topsec4,timbre_10_topsec4,timbre_11_topsec4,timbre_12_topsec4
0,Classic progressive,2 Roads [Mix Cut] - Blood Groove & Kikis Remix...,2.607578,5.769883,-0.054055,-0.71882,1.933854,0.284112,-0.02706,-0.24764,...,29.080792,13.802042,25.277,-32.948583,17.312542,-12.906875,-21.60875,-0.966958,0.269958,8.368042
1,Classic progressive,After The Rain - Club Mix_The,0.032042,-0.037483,0.940155,0.448693,0.353088,0.332408,-0.110257,-0.2533,...,-3.381045,5.244773,14.047773,-33.0015,0.778386,-15.752159,2.761523,4.371068,-13.471977,-0.787523
2,Classic progressive,Always A Stranger - Dub_The,1.667161,1.627691,4.120893,0.062973,4.441805,3.931294,0.077389,1.107477,...,-32.6083,6.164125,17.0603,-26.6983,17.009175,11.365875,-9.59155,12.6958,-18.8818,2.7099
3,Classic progressive,Colors Of The Night - Dub_Haz,1.317984,1.235949,0.579231,0.890771,-0.728799,1.944975,-0.410863,2.804291,...,45.584944,7.923778,80.423861,-21.019139,-12.866694,14.524917,-14.065833,-0.770167,-11.91975,-2.52725
4,Classic progressive,Eclipse - Original Mix_Haz,-1.190126,0.820478,-0.63119,0.03734,-0.106807,-0.594247,0.250968,1.128378,...,13.269545,-52.955636,-4.268364,-14.068545,25.495636,0.540727,-12.415091,11.890091,-8.753455,0.035364


In [8]:
path_ = path.joinpath('user_pl_feat')
dfs = []
for f in path_.iterdir():
    
    df = pd.read_parquet(f)
    pl_col = [f.name.replace('_features.parquet', '')] * len(df)
    df.insert(loc=0, column='playlist', value=pl_col)
    dfs.append(df)
    

full_feat_df = pd.concat(dfs, ignore_index=True)
feat_df = full_feat_df.loc[:, : 'artists_name']

In [9]:
# rename 'name' column to match 'track name' column in segstat_df
feat_df = feat_df.rename(columns={'name': 'track_name'})

# update track name column values to match that of in segstat_df

# add first 3 characters from artists_name
upd_trname = feat_df.track_name + '_' + feat_df.artists_name.apply(lambda x: x[:3])
# remove any special characters
upd_trname.replace(regex=r'[*|><:"?/]|\\', value='', inplace=True)
# find duplicates and add 'dup' to those track names
ind = upd_trname[upd_trname.duplicated()].index
upd_trname.iloc[ind] = upd_trname.iloc[ind].apply(lambda x: x + '_dup')

feat_df['track_name'] = upd_trname

In [10]:
new_df = pd.merge(feat_df, segsecstat_df, how='outer', on=['track_name', 'playlist'])
disp_col(new_df.columns)

'playlist',            'timbre_10_max',       'timbre_06_std',      'loudness_topsec1',   'key_07_topsec3',   
'danceability',        'timbre_11_max',       'timbre_07_std',      'timbre_01_topsec1',  'key_08_topsec3',   
'energy',              'timbre_12_max',       'timbre_08_std',      'timbre_02_topsec1',  'key_09_topsec3',   
'loudness',            'timbre_01_mean',      'timbre_09_std',      'timbre_03_topsec1',  'key_10_topsec3',   
'speechiness',         'timbre_02_mean',      'timbre_10_std',      'timbre_04_topsec1',  'key_11_topsec3',   
'acousticness',        'timbre_03_mean',      'timbre_11_std',      'timbre_05_topsec1',  'key_12_topsec3',   
'instrumentalness',    'timbre_04_mean',      'timbre_12_std',      'timbre_06_topsec1',  'loudness_topsec3', 
'valence',             'timbre_05_mean',      'key_01_topsec0',     'timbre_07_topsec1',  'timbre_01_topsec3',
'tempo',               'timbre_06_mean',      'key_02_topsec0',     'timbre_08_topsec1',  'timbre_02_topsec3',
'

In [11]:
data_df = new_df.drop(['track_name', 'artists_name', 'playlist'], axis=1).values
labels = new_df.loc[:, 'playlist'].values
enc_labels, categories = rt.encode_label(labels)

In [12]:
tot_num = len(data_df)
categ = len(categories)
print(' Total number of tracks:', tot_num)
print(' Total number of categories/playlists:', categ)

 Total number of tracks: 372
 Total number of categories/playlists: 1


In [13]:
SEED=17

In [14]:
x_train, x_test, y_train, y_test = train_test_split( \
            data_df, enc_labels, test_size = 0.25, stratify=enc_labels, random_state=SEED)

In [15]:
def create_classifier(base_model, nodes, dropout=True, new_idx=True):
    
    def dense_layer(x, nodes_, activation=tf.nn.leaky_relu, dropout=dropout, name_idx=None):
    
        # define a dense layer section
        x = Dense(nodes_, name='dense_{}'.format(name_idx))(x)
        x = Activation(activation, name='act_{}'.format(name_idx))(x)
        x = BatchNormalization(name='bn_{}'.format(name_idx))(x)
        if dropout:
            x = Dropout(0.25, name='dropout_{}'.format(name_idx))(x)
            
        return x
    
    # classes/playlists in the dataset
    classes = len(y_train[0])
    
    if new_idx:
        # create new index for naming layers
        idx = int(base_model.layers[-1].name[-1]) + 1
    else:
        idx = 0
    # Create classifier by adding dense layers to the base model
    x = dense_layer(base_model.layers[-1].output, nodes[0], activation='tanh', name_idx=idx + 0 )
   
    for i in range(1, len(nodes)):
        x = dense_layer(x, nodes[i], name_idx=idx + i)
    
    # Set nodes of last dense layer as number of classes
    outputs = Dense(classes, activation='softmax', name='dense_{}'.format(idx + len(nodes)))(x)
    
    # create model
    model = Model(inputs=base_model.input, outputs=outputs, name='classifier')
    
    return  model

In [16]:
# Load encoder model
loaded_enc = keras.models.load_model(filepath=Path(os.getenv('PATH_MODELS')).joinpath('encoder'))
    
# Freeze and rename all the layers
for layer in loaded_enc.layers[:]:
    layer.trainable = False
    layer._name = str('enc_') + layer.name
    


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [25]:

inputlength = len(x_train[0])
print(inputlength)
inputs = Input(shape=(inputlength,), name='base_input')
x = Dense(50, name='base_dense')(inputs)
outputs = Activation(tf.nn.leaky_relu, name='base_act')(x)
#outputs = BatchNormalization(name='base_bn')(x)

basemodel = Model(inputs, outputs, name='base_model')

217


In [33]:
# Set number of units/nodes for the dense layers added in the classifier
nodes = [150, 120, 100, 80, 60, 50, 40, 25]
# Create model
cls_nn = create_classifier(basemodel, nodes, dropout=True, new_idx=False)

In [34]:
#cls_nn.summary()

In [35]:
opt = tf.keras.optimizers.Adam(0.01)
cls_nn.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=opt)

In [36]:
BATCH_SIZE = 16
TRAIN_STEPS_PER_EPOCH = np.ceil(len(x_train)/ BATCH_SIZE)       #np.ceil(TRAIN_COUNT/BATCH_SIZE)
#VAL_STEPS_PER_EPOCH = np.ceil(len(x_val)/BATCH_SIZE)        #np.ceil(VAL_COUNT/BATCH_SIZE)
TEST_STEPS = len(x_test)
EPOCHS = 1000

In [37]:
callback = tf.keras.callbacks.EarlyStopping(monitor='acc', patience=100)
result = cls_nn.fit(x_train, y_train,
              batch_size=BATCH_SIZE,
              verbose=0,
              epochs=EPOCHS,
              validation_split=0.12,
              callbacks=[callback])

In [38]:
len(result.history['acc'])

134

In [39]:
result.history['acc'][-5:]

[0.12653062, 0.14693877, 0.118367344, 0.13469388, 0.14693877]

In [42]:
result.history['val_acc'][-5:]

[0.11764706, 0.11764706, 0.11764706, 0.1764706, 0.1764706]