In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

from tqdm import tqdm

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
def accuracy(a, b):
    """Return the fraction of positions at which ``a`` and ``b`` hold equal values.

    Both inputs are converted with ``np.asarray`` and compared position by
    position, so plain lists, tuples, NumPy arrays and pandas Series are all
    accepted (the index labels of a Series play no role in the comparison).

    Parameters
    ----------
    a : array-like
        Predicted labels.
    b : array-like
        Reference labels; its length is the denominator of the ratio.

    Returns
    -------
    numpy.float64
        Number of matching positions divided by the number of entries in ``b``.

    Raises
    ------
    ValueError
        If ``a`` and ``b`` do not have the same shape. Without this check NumPy
        would broadcast (e.g. a single prediction against many labels) and
        report a misleading score.
    """
    predicted = np.asarray(a)
    actual = np.asarray(b)

    if predicted.shape != actual.shape:
        raise ValueError(
            "accuracy() needs inputs of identical shape, got {} and {}".format(
                predicted.shape, actual.shape
            )
        )

    return (predicted == actual).sum() / actual.shape[0]

In [None]:
# --- Notebook configuration -------------------------------------------------

# CSV holding the model input (read with pd.read_csv).
FILENAME = "/kaggle/input/lyrics-covariates-final/model_input_covariates_bow.csv"

# Destination CSV for the table of experiment results.
RESULTS_FILENAME = "results_NN_covariates_bow_genre.csv"

# strptime pattern used to parse the 'song_runtime' column (hours:minutes:seconds).
DATETIME_FORMAT = "%H:%M:%S"

In [None]:
# Read the complete model-input table from FILENAME into a DataFrame.
df = pd.read_csv(filepath_or_buffer=FILENAME)

# Show the first five rows as a quick sanity check of the loaded columns.
df.head(n=5)

In [None]:
# Parse the 'song_runtime' strings (pattern DATETIME_FORMAT) into datetime values.
df['song_runtime'] = pd.to_datetime(df['song_runtime'], format=DATETIME_FORMAT)

# Total runtime in seconds. The hour component has to be included: with only
# minutes and seconds, any track lasting an hour or more would be under-counted
# by a multiple of 3600 seconds.
df = df.assign(
    song_runtime_secs=lambda x: (
        x['song_runtime'].dt.hour * 3600
        + x['song_runtime'].dt.minute * 60
        + x['song_runtime'].dt.second
    )
)

# Features / target with the artist name as the label.
# NOTE(review): the following cell rebinds X and y (genre as target) before
# either is used, so these two assignments look like leftovers — confirm and remove.
X = df.drop(columns = ['song_genre', 'artist_name', 'song_title', 'song_runtime'])
y = df['artist_name']

In [None]:
# Drop the rows where genre is NaN: the genre is the prediction target below.
df_genre = df.dropna(subset=['song_genre'])

print("Rows in full dataframe: {}".format(df.shape[0]))
print("Rows in dataframe after removing rows where genre is NaN: {}".format(df_genre.shape[0]))

# Derive the runtime columns with .assign instead of writing into df_genre:
# df_genre comes from dropna(), and assigning a column on it can raise pandas'
# SettingWithCopyWarning. The later keyword can read the column produced by the
# earlier one because assign evaluates keywords in order.
df_genre = df_genre.assign(
    # Parse with DATETIME_FORMAT (a no-op if the column is already datetime).
    song_runtime=lambda x: pd.to_datetime(x['song_runtime'], format=DATETIME_FORMAT),
    # Total seconds including the hour component; omitting it under-counts
    # any track that lasts an hour or more.
    song_runtime_secs=lambda x: (
        x['song_runtime'].dt.hour * 3600
        + x['song_runtime'].dt.minute * 60
        + x['song_runtime'].dt.second
    ),
)

# Features: every column except the identifiers, the target and the raw runtime.
# Converting to a plain array prevents the warning
# "X does not have valid feature names, but MLPClassifier was fitted with feature names".
X = df_genre.drop(columns = ['song_genre', 'artist_name', 'song_title', 'song_runtime']).values
y = df_genre['song_genre']

# 80 % training / 20 % validation, with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

In [None]:
# Result columns, filled in lock-step: one entry per fitted model.
static_parameters = []
net_size_parameters = []
hidden_layer_parameter = []  # not filled or read by any visible cell; kept so the name stays defined
batch_size_parameters = []

# dynamic values
size_values = [10, 100]       # neurons per hidden layer
batch_sizes = [32, 64, 128]   # minibatch sizes for the SGD solver
train_perf = []               # accuracy on the training split
val_perf = []                 # accuracy on the validation split

# Network depths compared for every (net_size, batch_size) combination.
hidden_layer_counts = (1, 5)

# Background on choosing the size of the network:
# https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
# Here the widths are simply the fixed candidates in size_values.
for net_size in tqdm(size_values):

    for batch_size in tqdm(batch_sizes):

        # Order matters for the results table: the 1-layer model is recorded
        # first, then the 5-layer model, for each (net_size, batch_size) pair.
        for n_hidden_layers in hidden_layer_counts:

            static_parameters.append('Epochs: 6000; Hidden layers: {}'.format(n_hidden_layers))
            net_size_parameters.append(net_size)
            batch_size_parameters.append(batch_size)

            nnclf = MLPClassifier(
                # n_hidden_layers hidden layers, each with net_size neurons
                hidden_layer_sizes=(net_size,) * n_hidden_layers,
                activation='relu',
                solver='sgd',  # optimization algorithm
                alpha=0.3,  # strength of the L2 regularization term
                # The batch size being evaluated. It was previously recorded in
                # the results but never handed to the classifier, so every run
                # silently trained with scikit-learn's default batch size.
                batch_size=batch_size,
                learning_rate_init=0.1,  # the initial learning rate used
                random_state=1234,
                max_iter=6000,  # maximum number of epochs for the sgd solver
                tol=1e-7,  # minimum improvement that counts as progress
                early_stopping=True,  # stop when the internal validation score stalls
            )

            nnclf.fit(X_train, y_train)

            # check training set performance
            y_hat_train = nnclf.predict(X_train)
            train_perf.append(accuracy(y_hat_train, y_train))

            # check validation set performance
            y_hat_val = nnclf.predict(X_val)
            val_perf.append(accuracy(y_hat_val, y_val))

In [None]:
# Assemble the per-model records into one table, one row per fitted model.
# NOTE(review): the two performance columns are labelled "(MSE)" but the values
# appended to train_perf / val_perf come from accuracy(); the labels are left
# untouched here because they define the header of the exported CSV — confirm
# with downstream consumers before renaming.
column_labels = [
    'Static parameters',
    'Number of neurons',
    'Batch size',
    'Training set performance (MSE)',
    'Validation set performance (MSE)',
]
column_values = [
    static_parameters,
    net_size_parameters,
    batch_size_parameters,
    train_perf,
    val_perf,
]
d = dict(zip(column_labels, column_values))

results = pd.DataFrame(d)

results

In [None]:
# Persist the results table to RESULTS_FILENAME, omitting the row index.
results.to_csv(path_or_buf=RESULTS_FILENAME, index=False)
