In [200]:
import tensorflow as tf
import numpy as np
import keras
from keras.models import Model
from keras.layers import Conv1D, MaxPooling1D, Dense, Lambda, concatenate, Input, GlobalMaxPooling1D, GlobalAveragePooling1D
import keras.backend as K
import pandas as pd
import json
import pickle
import random
import sklearn
from sklearn.model_selection import train_test_split


In [3]:
# Geometry of the placeholder input: time frames x frequency bins.
num_frames, num_freq_bins = 599, 128

# Hyperparameters of the first convolutional stage.
num_conv_filters_1 = 256
kernel_size = 4

# Uniform-random stand-in input, used only to check that the network wires up.
dummy_data = np.random.random((num_frames, num_freq_bins))

In [4]:
def l2_norm(x):
    """Return the L2 norm of ``x`` taken along axis 1.

    Squares ``x`` element-wise, sums the squares over axis 1 and takes the
    square root, so axis 1 is removed from the result. In AudioCNNModel this
    is applied to the output of the last pooling layer.
    """
    return K.sqrt(K.sum(x ** 2, axis=1))

In [144]:
class AudioCNNModel():
    """1-D convolutional network mapping a (num_frames, num_freq_bins) input to 50 outputs.

    Layout: three Conv1D + MaxPooling1D stages, then three global poolings
    over the conv output (max, mean, L2 norm) concatenated together, then two
    2048-unit ReLU dense layers and a final linear Dense(50) layer.

    The assembled Keras model is exposed as ``self.net`` and its input tensor
    as ``self.model_input``.
    """

    def __init__(self, num_frames, num_freq_bins, num_conv_filters1, pool_size_1, kernel_size):
        """Build the network.

        Args:
            num_frames: length of the first (per-sample) input axis.
            num_freq_bins: length of the second (per-sample) input axis.
            num_conv_filters1: number of filters in the first Conv1D layer.
            pool_size_1: pool size of the first MaxPooling1D layer.
            kernel_size: kernel size shared by all three Conv1D layers.
        """
        self.num_frames = num_frames
        self.num_freq_bins = num_freq_bins
        self.num_conv_filters1 = num_conv_filters1
        self.pool_size1 = pool_size_1
        self.kernel_size = kernel_size
        self.model_input = Input(shape=(num_frames, num_freq_bins))

        # NOTE(review): none of the Conv1D layers is given an activation, so
        # Keras' default applies -- confirm that is intended.

        # Stage 1: filter count and pool size come from the constructor.
        features = Conv1D(
            filters=self.num_conv_filters1,
            kernel_size=self.kernel_size,
            input_shape=(self.num_frames, self.num_freq_bins),
        )(self.model_input)
        features = MaxPooling1D(pool_size=self.pool_size1)(features)

        # Stages 2 and 3: fixed filter counts, pool size 2.
        for n_filters in (256, 512):
            features = Conv1D(filters=n_filters, kernel_size=self.kernel_size)(features)
            features = MaxPooling1D(pool_size=2)(features)

        # Three global summaries of the conv output (max, mean, L2 norm),
        # joined into a single vector per sample.
        pooled = [
            GlobalMaxPooling1D(data_format='channels_last')(features),
            GlobalAveragePooling1D(data_format='channels_last')(features),
            Lambda(lambda x: l2_norm(x))(features),
        ]
        hidden = concatenate(pooled)

        # Fully connected head ending in a linear 50-unit output.
        for _ in range(2):
            hidden = Dense(2048, activation='relu')(hidden)
        latent_factors = Dense(50)(hidden)

        self.net = Model(inputs=self.model_input, outputs=latent_factors)
        
        


In [206]:
# Smoke test: build the network for the placeholder geometry and confirm that
# a batch containing one sample yields one 50-value output row.
model = AudioCNNModel(
    num_frames=num_frames,
    num_freq_bins=num_freq_bins,
    num_conv_filters1=num_conv_filters_1,
    pool_size_1=4,
    kernel_size=kernel_size,
)
model.net.predict(dummy_data[np.newaxis, ...]).shape

(1, 50)

In [93]:
# Load the precomputed chroma feature array. The context manager guarantees
# the file handle is released even if np.load raises.
with open("data/chroma.npy", "rb") as f:
    features = np.load(f)
print(features.shape)


(7457, 300, 12)


In [207]:
# Rebuild the network for the chroma input size (300 x 12) and push the first
# feature matrix through the untrained model as a batch of one.
model = AudioCNNModel(
    num_frames=300,
    num_freq_bins=12,
    num_conv_filters1=num_conv_filters_1,
    pool_size_1=4,
    kernel_size=kernel_size,
)
model.net.predict(features[:1])

array([[  4.6740026 ,  -9.249433  ,  -1.3980534 ,  26.877184  ,
         -0.83539367, -16.137814  , -13.544751  ,  -0.743586  ,
        -13.395603  ,  -1.2896507 , -12.627663  ,  -3.1189494 ,
        -16.375254  ,   9.278057  ,  11.357858  ,  -6.0458646 ,
        -13.366302  , -28.689785  ,   9.980033  ,   1.921149  ,
         -0.8705852 ,  14.232031  ,  24.4798    ,  -5.9997387 ,
        -10.810645  ,  -4.9090867 ,  17.26567   , -33.375454  ,
         -6.5144606 ,   2.1467118 ,   9.165983  ,  27.146532  ,
          0.09802441, -18.815805  ,   4.780112  ,   6.260415  ,
         10.078434  ,   5.7494774 ,  11.292911  ,  -6.909471  ,
        -17.60566   , -16.109322  , -15.617443  ,   4.06311   ,
        -18.56289   , -35.05837   ,  17.217773  ,  13.385312  ,
         43.127274  ,  -0.63544846]], dtype=float32)

In [108]:
# Lookup table from song id to track id.
with open('data/song-track-mapping.json', 'rb') as song_to_track_file:
    song_track_mapping = json.load(song_to_track_file)


In [109]:
# Lookup table from track id to song id.
with open('data/track-song-mapping.json', 'rb') as track_to_song_file:
    track_song_mapping = json.load(track_to_song_file)

In [110]:
# Spot check: look up one known song id and one known track id; each mapping
# should print the other id of the pair.
print(
    song_track_mapping['SOSIANM12AB018CC80'],
    track_song_mapping['TRBBQGV12903CB5CD3'],
)

TRBBQGV12903CB5CD3 SOSIANM12AB018CC80


In [102]:
#df.loc[df['song'] == 'SOSIANM12AB018CC80']['features'].values

In [111]:
# Per-song factor vectors, keyed by song id.
# NOTE(review): pickle.load executes arbitrary code from the file -- only use
# with a trusted data/song_factors.pkl.
with open('data/song_factors.pkl', 'rb') as factors_file:
    song_factors_dict = pickle.load(factors_file)

# Display the factor vector of one song as an example.
np.array(song_factors_dict['SOSIANM12AB018CC80'])

array([-0.16932662, -0.08086438,  0.06148606, -0.04873373,  0.04393521,
        0.10009863, -0.01554874, -0.12346692, -0.04547898,  0.13980697,
        0.08787304, -0.18935911,  0.03744032,  0.12680109, -0.00610406,
       -0.0684809 , -0.04503168,  0.14103816,  0.01458193, -0.0455489 ,
        0.10926478, -0.03594866,  0.16459042,  0.08418246, -0.00514472,
        0.1974538 ,  0.15211657,  0.01583795, -0.01534751, -0.08645362,
       -0.04650037,  0.00193394,  0.02213042, -0.10820573, -0.01231078,
       -0.10624152, -0.1368499 ,  0.10162297, -0.15660124,  0.28276092,
        0.04303405,  0.06791999,  0.09406953,  0.01736024, -0.19946253,
        0.01771   , -0.04061035,  0.23670618, -0.02108839,  0.05780403])

In [125]:
# Map each track id to its feature entry: the i-th line of the id file is
# paired with the i-th element of `new_l`.
# NOTE(review): `new_l` is not defined in any visible cell -- it must be
# created before this cell runs (presumably derived from `features`; confirm).
track_features_dict = {}
with open("data/track_ids_for_chroma.txt", "r") as f:
    # The context manager closes the file; enumerate replaces the manual counter.
    for row_idx, line in enumerate(f):
        track_features_dict[line.strip()] = new_l[row_idx]

In [227]:
# Train as a regression: mean squared error loss, Adam with learning rate 1e-5.
model.net.compile(
    optimizer=keras.optimizers.Adam(lr=1e-5),
    loss=keras.losses.mean_squared_error,
)

In [131]:
# Remove from song_factors_dict every song whose mapped track id has no entry
# in track_features_dict. `count` tallies how many songs were removed.
track_id_set = track_features_dict.keys()
# Snapshot the keys first so entries can be removed while looping.
song_id_set = list(song_factors_dict.keys())
count = 0
for song_id_key in song_id_set:
    if song_track_mapping[str(song_id_key)] in track_id_set:
        continue
    count += 1
#     print(song_track_mapping[song_id_key])
    song_factors_dict.pop(song_id_key)



In [209]:
# song_factors_dict
keys = song_factors_dict.keys()
track_id_list = [song_track_mapping[key] for key in keys]
y = np.array([np.array(song_factors_dict[song_id]) for song_id in keys])
x = np.array([track_features_dict[track_id] for track_id in track_id_list])
# x = sklearn.preprocessing.normalize(x, norm='l2', axis=1)

In [210]:

# Hold out 20% of the samples for testing; random_state=0 fixes the split.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)
print(xTrain.shape, yTrain.shape)


(2188, 300, 12) (2188, 50)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f3c3767a1d0>

In [228]:
# Fit on the training split: 20 epochs, mini-batches of 50 samples.
model.net.fit(x=xTrain, y=yTrain, batch_size=50, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f3c32afcac8>

In [229]:
# Model outputs on both splits.
pred = model.net.predict(xTest)
train_pred = model.net.predict(xTrain)

# Squared Frobenius norm of each array divided by its number of rows.
norm_y = (np.linalg.norm(yTest) ** 2) / len(yTest)
norm_pred = (np.linalg.norm(pred) ** 2) / len(pred)
norm_y_train = (np.linalg.norm(yTrain) ** 2) / len(yTrain)
# norm_pred_train = (np.linalg.norm(train_pred)**2)/train_pred.shape[0]
# avg_loss_test = (np.linalg.norm(yTest-pred)**2)/yTest.shape[0]
# loss_train = np.linalg.norm(yTrain - train_pred)**2
# avg_loss_train = loss_train/yTrain.shape[0]

    

In [232]:
# Element-wise mean squared error of the predictions on each split.
avg_loss_train = np.mean((train_pred - yTrain) ** 2)
avg_loss_test = np.mean((pred - yTest) ** 2)

# Element-wise mean square of the targets and of the training predictions,
# for comparing the error against the scale of the data.
norm_y_train = np.mean(yTrain ** 2)
norm_y = np.mean(yTest ** 2)
avg_pred_norm = np.mean(train_pred ** 2)

print("average train loss", avg_loss_train)
print("average test loss", avg_loss_test)
print("average train norm", norm_y_train)
print("average test norm", norm_y)
print("average norm of predictions", avg_pred_norm)

average train loss 0.013648288253064815
average test loss 0.0902450124428803
average train norm 0.0017976001065020996
average test norm 0.0013229732227013138
average norm of predictions 0.013672692


In [169]:
# Number of songs currently held in song_factors_dict.
print(len(song_factors_dict))

2735


In [174]:
# Sum of the squared L2 norms of all factor vectors in song_factors_dict.
# Loop-local names are used so the dataset arrays `x` and `y` are not
# overwritten by a single sample.
norm_sum = 0.0
for song_id in song_factors_dict.keys():
    song_factors = np.array(song_factors_dict[song_id])
    # Index with the loop's own song_id (not a variable left over from another
    # cell) so each song is paired with its own track features.
    track_features = track_features_dict[song_track_mapping[str(song_id)]]
#     y_pred = model.net.predict(np.array([track_features]))
    norm_sum += np.linalg.norm(song_factors)**2
print(norm_sum)

232.8407692922114


In [234]:
# Persist the trained network: architecture as JSON, weights as HDF5.
model_json = model.net.to_json()
with open('params/model_params_v0.json', "w") as architecture_file:
    architecture_file.write(model_json)
model.net.save_weights("params/model_params_v0.h5")