In [25]:
import pandas as pd
import numpy as np


import xgboost as xgb
import lightgbm as lgb
import tensorflow as tf
import tensorflow.keras as keras 

from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, Activation
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier

from keras.constraints import maxnorm

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import fbeta_score, make_scorer
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

In [26]:
# Read the train/test CSVs; the first row of each file holds the column names.
# (File names suggest these are already scaled and PCA-reduced -- confirm upstream.)
train_df, test_df = (
    pd.read_csv(csv_path, header=0)
    for csv_path in ("music_scaled_train_pca_75.csv", "music_scaled_test_pca_25.csv")
)

In [69]:
# Pull the 'genre' target column out of each frame; every remaining column
# is kept as a feature.
y_train_df, y_test_df = train_df['genre'], test_df['genre']
X_train_df, X_test_df = (
    frame.drop(columns=['genre']) for frame in (train_df, test_df)
)

In [87]:
# Number of distinct genres in the training data. create_model() reads this
# global to size the network's output layer.
label_num = train_df['genre'].nunique()
label_num

121

In [61]:
# Sanity check: number of distinct genres in the test data (compare with label_num).
test_df['genre'].nunique()

121

In [44]:
#functions

def loss_scorer(y_true, y_pred, eps=1e-15, normalize=True, sample_weight=None, labels=None):
    """Log-loss wrapper with a fixed label set, meant to be wrapped by ``make_scorer``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth (encoded) class labels.
    y_pred : array-like
        Predicted class probabilities.
    eps, normalize, sample_weight :
        Forwarded unchanged to ``sklearn.metrics.log_loss``.
        NOTE(review): recent scikit-learn releases deprecate ``eps`` -- confirm
        against the installed version.
    labels : array-like, optional
        Complete set of class labels. When omitted, the notebook-level global
        ``__LABELS__`` is used, so the loss can still be computed when
        ``y_true`` does not contain every class.

    Returns
    -------
    float
        The log loss returned by ``sklearn.metrics.log_loss``.
    """
    if labels is None:
        # Fall back to the full label list published by the label-encoding cell.
        labels = __LABELS__
    # Everything after y_pred is passed by keyword: these arguments are
    # keyword-only in current scikit-learn, so positional passing breaks.
    return log_loss(
        y_true,
        y_pred,
        eps=eps,
        normalize=normalize,
        sample_weight=sample_weight,
        labels=labels,
    )

def search_pipeline(X_train_data, X_test_data, y_train_data, y_test_data,
                    model, param_grid, cv=10, scoring_fit='neg_mean_squared_error',
                    do_probabilities = False, search_mode = 'GridSearchCV', n_iterations = 0):
    """Run a hyper-parameter search on the training data and predict on the test data.

    Parameters
    ----------
    X_train_data, y_train_data :
        Features and targets the search is fitted on.
    X_test_data :
        Features the fitted search object predicts on.
    y_test_data :
        Accepted for call-site symmetry; not used inside this function.
    model :
        Estimator handed to the search object.
    param_grid : dict
        Passed as ``param_grid`` to ``GridSearchCV`` or as
        ``param_distributions`` to ``RandomizedSearchCV``.
    cv :
        Cross-validation spec forwarded to the search object.
    scoring_fit :
        Scoring forwarded to the search object.
    do_probabilities : bool
        If true, return ``predict_proba`` output instead of ``predict`` output.
    search_mode : {'GridSearchCV', 'RandomizedSearchCV'}
        Which search strategy to use.
    n_iterations : int
        ``n_iter`` for ``RandomizedSearchCV``; ignored for ``GridSearchCV``.
        The default of 0 is not a usable value for a randomized search, so
        callers using that mode should pass it explicitly.

    Returns
    -------
    (fitted_search, pred)
        The fitted search object and its predictions for ``X_test_data``.

    Raises
    ------
    ValueError
        If ``search_mode`` is not one of the supported names. (Previously the
        function silently returned ``None``, which surfaced as an unpacking
        error at the call site.)
    """
    # Settings shared by both search strategies.
    shared_kwargs = dict(
        estimator=model,
        cv=cv,
        n_jobs=-1,
        scoring=scoring_fit,
        verbose=2,
    )

    if search_mode == 'GridSearchCV':
        search = GridSearchCV(param_grid=param_grid, **shared_kwargs)
    elif search_mode == 'RandomizedSearchCV':
        search = RandomizedSearchCV(
            param_distributions=param_grid,
            n_iter=n_iterations,
            **shared_kwargs
        )
    else:
        raise ValueError(
            "Unsupported search_mode %r; expected 'GridSearchCV' or 'RandomizedSearchCV'"
            % (search_mode,)
        )

    fitted_model = search.fit(X_train_data, y_train_data)

    if do_probabilities:
        pred = fitted_model.predict_proba(X_test_data)
    else:
        pred = fitted_model.predict(X_test_data)

    return fitted_model, pred

In [109]:
# Deep Learning model base
# Function to create model, required for KerasClassifier
def create_model(
    activation='relu',
    dropout_rate=0.0,
    init_mode='uniform',
    weight_constraint=0,
    optimizer='adam',
    lr=0.01,
    momentum=0):
    """Build and compile the dense classifier used as ``build_fn`` for ``KerasClassifier``.

    Reads two notebook-level globals that must be defined before the function
    is called: ``input_dim`` (number of input features) and ``label_num``
    (number of output classes).

    Parameters
    ----------
    activation : str
        Activation used by the hidden layers.
    dropout_rate : float
        Rate of the single Dropout layer placed before the output layer.
    init_mode : str
        Kernel initializer for the first hidden layer and the output layer.
    weight_constraint : number
        Max-norm limit on the first hidden layer's kernel. A value <= 0 means
        "no constraint".
    optimizer : str
        Optimizer passed to ``model.compile``.
    lr, momentum :
        Accepted for grid compatibility but currently unused: the optimizer is
        passed by name, so its default settings apply.

    Returns
    -------
    A compiled ``Sequential`` model.
    """
    # maxnorm(0) would rescale every incoming weight vector to zero norm and
    # wipe out the layer, so only build a constraint for a positive limit.
    if weight_constraint is not None and weight_constraint > 0:
        first_layer_constraint = maxnorm(weight_constraint)
    else:
        first_layer_constraint = None

    # create model
    model = Sequential()
    model.add(Dense(1028, input_dim=input_dim,
                    kernel_initializer=init_mode,
                    activation=activation,
                    kernel_constraint=first_layer_constraint))
    model.add(Dense(512, activation=activation))
    model.add(Dense(256, activation=activation))
    model.add(Dense(128, activation=activation))
    model.add(Dropout(dropout_rate))
    # The output layer must emit a probability distribution over the classes:
    # 'sparse_categorical_crossentropy' (with its default settings) and the
    # probability-based scorer both expect that, so softmax is used here
    # regardless of the hidden-layer activation.
    model.add(Dense(label_num,
                    kernel_initializer=init_mode,
                    activation='softmax'))
    # Compile model
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

# fix random seed for reproducibility
seed = 7
# Seed NumPy's global generator so components that fall back to it when no
# random_state is given become repeatable within a single process.
# NOTE(review): TensorFlow has its own RNG and is not seeded here.
np.random.seed(seed)


#gridsearch area
activation = ['relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear', 'softmax']
momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]  # defined but not part of param_grid below
dropout_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
weight_constraint = [1, 2, 3, 4, 5]
neurons = [1, 5, 10, 15, 20, 25, 30]  # defined but not part of param_grid below
init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']

epochs = [1, 10, 50, 100]
batch_size = [128, 512, 1024, 2048, 4096]
#param_grid = dict(epochs=epochs, batch_size=batch_size)

#reduce or expand this based on need and runtime
# (the stray double comma after weight_constraint was a SyntaxError)
param_grid = dict(
    epochs=epochs,
    batch_size=batch_size,
    activation=activation,
    dropout_rate=dropout_rate,
    weight_constraint=weight_constraint,
    init_mode=init_mode,
    optimizer=optimizer,
)

In [110]:
#Encode labels since tensorflow expects integers
encoder = LabelEncoder()
y = encoder.fit_transform(y_train_df)

# Complete, sorted list of encoded class ids as plain Python ints.
# loss_scorer() reads this module-level name; a `global` statement is not
# needed (and has no effect) at notebook top level.
__LABELS__ = np.unique(y).tolist()

In [None]:
# Split the training frame into a fitting part and a 20% hold-out, stratified
# by the encoded label; seeded so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_df,
    y,
    test_size=0.2,
    stratify=y,
    random_state=seed,
)

#use RepeatedStratifiedKFold to roughly ensure that each 
#fold is representative of all strata of the data
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=seed)

# create Keras model
# create_model() reads input_dim as a global, so it must be set before the search runs.
input_dim = X_train.shape[1]
base_estimator = KerasClassifier(build_fn=create_model)

# greater_is_better=False makes the scorer negate the loss, so the scores
# reported below are negative log-loss values (closer to 0 is better).
log_loss_scorer = make_scorer(loss_scorer,
                              greater_is_better=False,
                              needs_proba=True)

# `model` is the fitted search object; `pred` holds its predictions for X_test.
model, pred = search_pipeline(
    X_train,
    X_test,
    y_train,
    y_test,
    base_estimator,
    param_grid,
    cv=cv,
    scoring_fit=log_loss_scorer,
    search_mode='RandomizedSearchCV',
    n_iterations=5,
)

# summarize results
print("Best: %f using %s" % (model.best_score_, model.best_params_))
means = model.cv_results_['mean_test_score']
stds = model.cv_results_['std_test_score']
params = model.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 9 folds for each of 56 candidates, totalling 504 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed: 10.6min
