In [None]:
import datetime
import itertools
import math

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn import preprocessing
from tensorboard.plugins.hparams import api as hp
from tensorflow.keras import layers

In [None]:
def process_data():
    """Load the cleaned train/test CSV files and return them as NumPy arrays.

    Reads ``../train_data_clean`` and ``../test_data_clean``, drops the first
    column of each, separates the target column from the feature columns,
    round-trips both sets through ``.npz`` files in the working directory and
    standardizes the test inputs with ``preprocessing.scale``.

    The training inputs are returned unscaled; ``crossFold`` scales them
    per fold.

    Returns:
        tuple: ``(train_inputs, train_targets, test_inputs, test_targets)``
        where the inputs are float arrays and the targets are integer arrays.
    """
    df_test = pd.read_csv(r'../test_data_clean', sep = ",", engine='python')
    df_train = pd.read_csv(r'../train_data_clean', sep = ",", engine='python')

    #Removing unwanted column (the first column of each file is not used)
    test = df_test.drop(df_test.columns[0],axis=1)
    train = df_train.drop(df_train.columns[0],axis=1)

    #Separating "target" into its own dataframe.
    #NOTE(review): targets are picked by position (13) while inputs exclude the
    #column by name ('target') - presumably the same column; confirm.
    test_targets = test.iloc[:,[13]]
    test_inputs = test.loc[:,test.columns != 'target']
    train_targets = train.iloc[:,[13]]
    train_inputs = train.loc[:,train.columns != 'target']

    #Saving the data, not necessary to do this every time, but it is implemented in the method to show the entire process
    np.savez('Spotify_data_train', inputs=train_inputs, targets=train_targets)
    np.savez('Spotify_data_test', inputs=test_inputs, targets=test_targets)

    # np.float / np.int were removed in NumPy 1.24; the builtins are what those
    # aliases pointed to, so the resulting dtypes are unchanged.
    # The context manager closes the underlying .npz file handle.
    with np.load('Spotify_data_train.npz') as npz:
        train_inputs = npz['inputs'].astype(float)
        train_targets = npz['targets'].astype(int)

    with np.load('Spotify_data_test.npz') as npz:
        test_inputs = npz['inputs'].astype(float)
        test_targets = npz['targets'].astype(int)

    #Preprocessing using the sklearn method (test set only; see docstring)
    test_inputs = preprocessing.scale(test_inputs)

    return train_inputs,train_targets,test_inputs,test_targets

# Load the data once; every later cell reads these module-level arrays.
(train_inputs, train_targets, test_inputs, test_targets) = process_data()

# k-fold cross-validation setup: `split` is the size of one fold, rounded up
# so that k folds always cover every training sample.
k = 5
total_samples = len(train_inputs)
split = math.ceil(total_samples / k)

In [None]:
#Method implementing cross-validation. Takes the training inputs, the training targets and the fold index i as input.
#Returns the training set and the validation set for the requested fold.

def crossFold(train_inputs,train_targets,i):
    """Split the training data into a training part and a validation part for fold ``i``.

    The fold size comes from the module-level ``split``. Fold 0 validates on
    the first ``split`` rows, fold 4 validates on everything from ``4*split``
    to the end, and any other ``i`` validates on rows
    ``[i*split, (i+1)*split)``. All remaining rows form the training part.

    The inputs of each part are standardized separately with
    ``preprocessing.scale`` after the split, so that no statistics are shared
    between the two parts.

    Returns:
        tuple: ``(inputs, targets, validation_inputs, validation_targets)``.
    """
    fold_start = i * split
    fold_end = (i + 1) * split

    if i == 0:
        held_out = slice(None, split)
        raw_inputs = train_inputs[split:]
        targets = train_targets[split:]
    elif i == 4:
        # Last fold: open-ended so the remainder rows are included.
        held_out = slice(fold_start, None)
        raw_inputs = train_inputs[:fold_start]
        targets = train_targets[:fold_start]
    else:
        # Middle fold: training rows are the pieces before and after the fold.
        held_out = slice(fold_start, fold_end)
        raw_inputs = np.concatenate((train_inputs[:fold_start], train_inputs[fold_end:]))
        targets = np.concatenate((train_targets[:fold_start], train_targets[fold_end:]))

    inputs = preprocessing.scale(raw_inputs)
    validation_inputs = preprocessing.scale(train_inputs[held_out])
    validation_targets = train_targets[held_out]

    return inputs,targets,validation_inputs,validation_targets

In [None]:
# Candidate values for every hyperparameter explored by the grid search.
number_of_layers = [1, 2, 5]
number_of_nodes = [10, 17, 30]
optimizers = ['adam', 'RMSprop', 'sgd']
batch_sizes = [10, 50, 100]
max_epochs = [5, 10, 25]
regulizer_param = [0.001, 0.01]  # [lower bound, upper bound] of the L2 factor

# HParams descriptors: one per hyperparameter, used both as dictionary keys
# for a trial's settings and to declare the search space to TensorBoard.
HP_NUM_LAYERS = hp.HParam('num_layers', hp.Discrete(number_of_layers))
HP_NUM_NODES = hp.HParam('num_nodes', hp.Discrete(number_of_nodes))
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(optimizers))
HP_BATCH_SIZE = hp.HParam('batch_size', hp.Discrete(batch_sizes))
HP_MAX_EPOCHS = hp.HParam('max_epochs', hp.Discrete(max_epochs))
HP_L2 = hp.HParam(
    'l2 regularizer',
    hp.RealInterval(regulizer_param[0], regulizer_param[1]),
)

# Tag under which each trial's accuracy is written.
METRIC_ACCURACY = 'accuracy'

# Write the experiment-level HParams configuration to the top-level log directory.
with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
    hp.hparams_config(
        hparams=[
            HP_NUM_LAYERS,
            HP_NUM_NODES,
            HP_OPTIMIZER,
            HP_BATCH_SIZE,
            HP_MAX_EPOCHS,
            HP_L2,
        ],
        metrics=[hp.Metric(METRIC_ACCURACY, display_name='Accuracy')],
    )
   
        
def train_test_model(run_dir,hparams,t_inputs,t_targets,v_inputs,v_targets):
    """Build, train and evaluate one dense binary classifier.

    The network has ``hparams[HP_NUM_LAYERS]`` hidden ReLU layers of
    ``hparams[HP_NUM_NODES]`` units each, followed by a single sigmoid output.
    Every hidden layer, including the first, carries an L2 kernel regularizer
    with factor ``hparams[HP_L2]``, so the L2 setting also has an effect when
    only one hidden layer is requested.

    Args:
        run_dir: Directory the TensorBoard callback writes its logs to.
        hparams: Mapping from the ``HP_*`` descriptors to the values for this trial.
        t_inputs, t_targets: Training features and labels.
        v_inputs, v_targets: Validation features and labels, passed to
            ``model.fit`` as ``validation_data`` (monitored by early stopping).

    Returns:
        tuple: ``(test_loss, test_accuracy)`` from ``model.evaluate`` on the
        module-level ``test_inputs`` / ``test_targets``.
    """
    # Take the feature count from the data instead of hard-coding it, so the
    # model keeps matching the inputs if the set of feature columns changes.
    input_size = np.shape(t_inputs)[-1]
    output_size = 1

    num_nodes = hparams[HP_NUM_NODES]
    l2_factor = hparams[HP_L2]

    #Builds and compiles the model with the specified metrics.
    model = tf.keras.Sequential()
    model.add(layers.Dense(num_nodes,
                           kernel_regularizer=tf.keras.regularizers.l2(l2_factor),
                           activation='relu',
                           input_shape=(input_size,)))
    for _ in range(hparams[HP_NUM_LAYERS]-1):
        model.add(layers.Dense(num_nodes,
                               kernel_regularizer=tf.keras.regularizers.l2(l2_factor),
                               activation='relu'))
    model.add(layers.Dense(output_size, activation='sigmoid'))

    model.compile(optimizer=hparams[HP_OPTIMIZER], loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])

    batch_size = hparams[HP_BATCH_SIZE]
    max_epochs = hparams[HP_MAX_EPOCHS]

    #Callbacks for logging of hyperparameters, metrics and early stopping.
    #Training stops when either monitored validation quantity fails to improve
    #by min_delta for `patience` epochs.
    callbacksList = [
        tf.keras.callbacks.TensorBoard(run_dir,histogram_freq=1,update_freq=1,embeddings_freq=1),
        tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=0.001, patience=8, mode='max',baseline=None),
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.001, patience=8, verbose=0, mode='min',baseline=0.6)
    ]

    #Training the model.
    model.fit(t_inputs,t_targets,
              batch_size=batch_size,
              epochs=max_epochs,
              validation_data=(v_inputs, v_targets),
              callbacks=callbacksList,
              verbose = 0)

    # NOTE(review): test_inputs / test_targets are notebook-level globals, and
    # every hyperparameter configuration is scored on them - consider scoring
    # on the validation fold if the test set should stay untouched during tuning.
    test_loss,test_accuracy = model.evaluate(test_inputs, test_targets)

    return test_loss, test_accuracy
    
#Writes the loss and accuracy returned by model.evaluate for each run so that they are logged in TensorBoard.
def run(run_dir,hparams,t_inputs,t_targets,v_inputs,v_targets):
    """Train one fold of a trial and log its hyperparameters, loss and accuracy.

    The fold is logged under ``run_dir + "/Kfold" + str(i)``. Both ``i`` (the
    current fold index) and ``session_num`` (used as the summary step) are
    read from the notebook's global scope, so they must be set by the calling
    loop before this function is invoked.

    Returns:
        The accuracy reported by ``train_test_model`` for this fold.
    """
    fold_dir = run_dir+"/Kfold"+str(i)
    fold_writer = tf.summary.create_file_writer(fold_dir)

    with fold_writer.as_default():
        # Record the hyperparameter values used in this trial.
        hp.hparams(hparams)
        fold_loss, fold_accuracy = train_test_model(
            fold_dir, hparams, t_inputs, t_targets, v_inputs, v_targets
        )

        tf.summary.scalar("loss", fold_loss, step=session_num)
        tf.summary.scalar(METRIC_ACCURACY, fold_accuracy, step=session_num)

    # The caller averages this value over all folds of the trial.
    return fold_accuracy
        

In [None]:
# Grid search: every hyperparameter combination is trained once per
# cross-validation fold, and the per-fold accuracies are averaged.
session_num = 0
start = datetime.datetime.now()

# itertools.product yields the combinations in the same order as nested
# for-loops over these sequences (the last one varies fastest).
hparam_grid = itertools.product(
    HP_NUM_LAYERS.domain.values,
    HP_NUM_NODES.domain.values,
    HP_OPTIMIZER.domain.values,
    HP_BATCH_SIZE.domain.values,
    HP_MAX_EPOCHS.domain.values,
    (HP_L2.domain.min_value, HP_L2.domain.max_value),
)

for num_layers, num_nodes, optimizer, batch_size, max_epoch, l2 in hparam_grid:

    #Defines dictionary of hyperparameters.
    hparams = {
        HP_NUM_LAYERS: num_layers,
        HP_NUM_NODES: num_nodes,
        HP_OPTIMIZER: optimizer,
        HP_BATCH_SIZE: batch_size,
        HP_MAX_EPOCHS: max_epoch,
        HP_L2: l2,
    }
    combined_acc = 0

    run_name = "run-%d" % session_num
    run_dir = 'logs/hparam_tuning/' + run_name

    start_time_run = datetime.datetime.now()

    # The fold index must stay named `i`: run() reads it (and session_num)
    # from the global scope to build the per-fold log directory.
    for i in range(k):
        start_time_fold = datetime.datetime.now()
        t_inputs,t_targets,v_inputs,v_targets = crossFold(train_inputs,train_targets,i)
        print('----------- Starting trial: %s' % run_name + '-----------')
        print({h.name: hparams[h] for h in hparams})
        print("Fold nr "+str(i+1))

        accuracy = run(run_dir, hparams,t_inputs,t_targets,v_inputs,v_targets)
        combined_acc += accuracy

        finished_time = datetime.datetime.now()

        print('Total time elapsed: ', finished_time-start,'\n')
        print('Time elapsed for fold: ', finished_time-start_time_fold ,'\n')

    combined_acc = combined_acc/k

    # tf.summary.scalar is a no-op without a default writer, so the averaged
    # accuracy is written inside a writer for this trial's log directory.
    with tf.summary.create_file_writer(run_dir).as_default():
        tf.summary.scalar('combined_accuracy', combined_acc, step=session_num)
    print("Combined accuray: ",combined_acc,'\n')

    print('Time elapsed for run: ',finished_time-start_time_run)

    session_num += 1

print("----- Session finished -----")
