# Preprocessing

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [None]:
# Read the Spotify track-features table from the working directory.
data = pd.read_csv(filepath_or_buffer=r"./SpotifyFeatures.csv")

In [None]:
# Replace the 'mode' column with one indicator column per distinct mode value
# (the new columns are appended at the end of the frame).
one_hot_mode = pd.get_dummies(data['mode'])
data = pd.concat([data.drop(columns=['mode']), one_hot_mode], axis=1)

# Integer-encode 'key' and then 'genre'. The same encoder object is refit for
# each column, so afterwards `le` only remembers the genre classes.
le = LabelEncoder()
data = data.assign(key=le.fit_transform(data['key']))
data = data.assign(genre=le.fit_transform(data['genre']))

In [None]:
# Columns left out of the outlier screening: identifiers, the popularity score,
# the mode indicator columns and the time signature.
excluded_columns = ['artist_name', 'track_name', 'track_id', 'popularity', 'Major', 'Minor', 'time_signature']
# errors='ignore' mirrors the original mask-based selection, which simply
# skipped any listed name that is not present in the frame.
outliers = data.drop(columns=excluded_columns, errors='ignore')
outliers

In [None]:
# Distinct genre codes, in order of first appearance.
genres = outliers['genre'].unique()

In [None]:
# One sub-frame per genre, in the same order as `genres`.
new_pd = [outliers.loc[outliers['genre'] == genre] for genre in genres]

In [None]:
# Remove outliers genre by genre using the 1.5 * IQR rule.
# The previous version built the filtered frame but never stored it, so
# `new_pd` was left untouched; here each filtered frame is written back.
for idx, genre_frame in enumerate(new_pd):
    # Columns 1..11 are the features that were screened before (column 0 is
    # skipped, as in the original loop over j + 1 for j in range(11)).
    features = genre_frame.iloc[:, 1:12]
    q1 = features.quantile(.25)
    q3 = features.quantile(.75)
    iqr = q3 - q1
    lb = q1 - 1.5 * iqr
    ub = q3 + 1.5 * iqr
    # A row is kept only if every screened feature lies strictly inside its
    # own (lb, ub) interval; the bounds are computed once per genre so the
    # result does not depend on the order in which features are checked.
    # NOTE(review): with strict inequalities a feature whose IQR is 0 removes
    # every row of that genre — confirm whether inclusive bounds are wanted.
    inside = ((features > lb) & (features < ub)).all(axis=1)
    new_pd[idx] = genre_frame[inside]


In [None]:
# Stack the per-genre frames back into a single table (displayed, not stored).
pd.concat(new_pd, axis=0)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# `data` still contains text columns (artist_name, track_name, track_id).
# pandas 2.x no longer skips non-numeric columns in corr() and raises instead,
# so the correlation is computed on the numeric/boolean columns only.
numeric_data = data.select_dtypes(include=['number', 'bool'])

fig, ax = plt.subplots(figsize=(15, 10))
# Fixed colour range [-1, 1] centred on 0 so the colours are comparable.
sns.heatmap(numeric_data.corr(), annot=True, cmap='YlGnBu', vmin=-1, vmax=1, center=0, ax=ax)
plt.title('LINEAR CORRELATION MATRIX - CLASS_TRAIN')
plt.show()

In [None]:
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# MLP Classifier

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

# NOTE(review): x_train / y_train are not defined above this cell; the only
# visible definition is the train/test split cell in the "Neural Network"
# section, so that cell presumably has to be run first — confirm.

# Base estimator whose hyper-parameters are searched below.
nn = MLPClassifier(max_iter=100)

# Candidate settings. Each architecture is listed once: a repeated entry only
# makes the grid search fit the same configuration again.
parameter_space = {
    'hidden_layer_sizes': [(100, 100, 100), (50, 100, 50), (50, 100, 100), (100, 150, 100), (100,)],
    'activation': ['tanh', 'relu'],
    'alpha': [.001, .05],
}

# Exhaustive search with 3-fold cross-validation on all available cores.
clf = GridSearchCV(nn, parameter_space, n_jobs=-1, cv=3)
clf.fit(x_train, y_train)
print(clf.best_params_)

In [None]:
# Apply the hyper-parameters chosen by the grid search before the final fit;
# otherwise `nn` is trained with its constructor settings and the search
# result goes unused.
nn.set_params(**clf.best_params_)
nn.fit(x_train, y_train)

In [None]:
# Predictions of the fitted MLP on the held-out features.
y2pred = nn.predict(X=x_test)

In [None]:
from sklearn.metrics import accuracy_score

# Share of test samples predicted correctly. Arguments are given by name in
# the documented (y_true, y_pred) roles; the score itself is symmetric.
accuracy_score(y_true=y_test, y_pred=y2pred)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# argmax collapses each indicator row to a single class index.
# nn.predict returns a NumPy array, which has no `.values` attribute, so both
# inputs are converted with np.asarray (this works for DataFrames and arrays).
# NOTE(review): a predicted row with no positive label maps to class 0 here.
cm = confusion_matrix(np.asarray(y_test).argmax(axis=1), np.asarray(y2pred).argmax(axis=1))


In [None]:
import seaborn as sns

# Genre names used as tick labels on both axes of the confusion matrix.
# The two "Children's Music" entries differ only in the apostrophe character.
labels = [
    'A Capella', 'Alternative', 'Anime', 'Blues', "Children's Music",
    'Children’s Music', 'Classical', 'Comedy', 'Country', 'Dance',
    'Electronic', 'Folk', 'Hip-Hop', 'Indie', 'Jazz', 'Movie', 'Opera',
    'Pop', 'R&B', 'Rap', 'Reggae', 'Reggaeton', 'Rock', 'Ska', 'Soul',
    'Soundtrack', 'World',
]

# Rows of `cm` are true classes, columns are predicted classes.
sns.heatmap(data=cm, annot=False, cmap="Blues", xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title("Confusion Matrix for Model B")

In [None]:
# roc_auc_score is not imported anywhere else in this notebook.
from sklearn.metrics import roc_auc_score

# Score with the predicted class probabilities rather than the hard 0/1
# predictions: ROC AUC measures ranking quality, which thresholded outputs
# cannot express.
roc_auc = roc_auc_score(y_test, nn.predict_proba(x_test), multi_class='ovr')

In [None]:
# Display the loss value stored on the fitted classifier.
nn.loss_

In [None]:
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred); passing them the other way
# round swaps the precision and recall columns of the report.
print(classification_report(y_test, y2pred))

# Neural Network

In [None]:
def circle_of_fifths_encoding(keys):
    """Encode musical key names as points on the unit circle.

    Each key name is mapped to one of 12 positions and that position is
    converted to an angle of ``2 * pi * position / 12``.

    Args:
        keys: Iterable of key names (for example a pandas Series). Every
            value must be one of the 12 names in the mapping below.

    Returns:
        A tuple ``(cos_values, sin_values)`` of two 1-D NumPy arrays with one
        entry per input key. Note the order: cosine first, sine second.
        Empty input yields two empty arrays.

    Raises:
        KeyError: If a key name is not in the mapping.
    """
    # NOTE(review): these positions do not follow the conventional
    # circle-of-fifths order (C, G, D, A, E, B, F#, ...) — confirm the
    # mapping is intentional before relying on the geometric interpretation.
    key_encoding = {'C#': 7, 'F#': 2, 'C': 9, 'F': 4, 'G': 11, 'E': 6,
                    'D#': 1, 'G#': 3, 'D': 8, 'A#': 0, 'A': 5, 'B': 10}
    # Building a flat float array first keeps the empty-input case valid; the
    # previous 2-D column slicing failed on an empty list.
    positions = np.array([key_encoding[key] for key in keys], dtype=float)
    angles = 2 * np.pi * positions / 12
    return np.cos(angles), np.sin(angles)

In [None]:
# Start again from the raw CSV for the neural-network section.
data = pd.read_csv(r"./SpotifyFeatures.csv")
# Keep a single row per track id (the last occurrence).
data = data.drop_duplicates(subset=['track_id'], keep='last')
features_to_drop = ['track_name', 'track_id', 'popularity', 'artist_name']
data = data.drop(features_to_drop, axis=1)

# circle_of_fifths_encoding returns (cosine, sine) in that order; unpacking
# them the other way round stored the cosine under 'sin_key' and vice versa.
cos_key, sin_key = circle_of_fifths_encoding(data['key'])
data['sin_key'] = sin_key
data['cos_key'] = cos_key
data.drop(['key'], axis=1, inplace=True)

# Integer-encode the remaining categorical columns (the encoder is refit for
# each column).
le = LabelEncoder()
data['mode'] = le.fit_transform(data['mode'])
data['time_signature'] = le.fit_transform(data['time_signature'])

# Targets: one indicator column per genre. Features: everything else.
y_one_hot = pd.get_dummies(data['genre'])
x = data.drop(['genre'], axis=1)



In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y_one_hot,
    test_size=0.2,
    random_state=44,
)

In [None]:
# Fit the scaler on the training split only, then apply it to both splits.
scaler = RobustScaler().fit(x_train)
# Carry the original row index over to the scaled frames. Without it they get
# a fresh 0..n-1 index while y_train / y_test keep the original labels, so any
# later index-aligned operation between features and targets would mismatch.
x_train = pd.DataFrame(scaler.transform(x_train), columns=x_train.columns, index=x_train.index)
x_test = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns, index=x_test.index)

In [None]:
# Input width and number of output classes are read from the data instead of
# being hard-coded, so the network stays valid if columns or genres change.
n_features = x_train.shape[1]
n_classes = y_train.shape[1]

model = Sequential()
model.add(Dense(8, input_dim=n_features, activation='relu'))
model.add(Dense(8, activation='tanh'))
# Softmax output: one probability per genre column of the one-hot target.
model.add(Dense(n_classes, activation='softmax'))
# Compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', 'Precision', 'AUC'])
# NOTE(review): the test split is also used as validation data during
# training — confirm that no model selection is based on these curves.
model.fit(x_train.values, y_train.values, epochs=100, batch_size=5, validation_data=(x_test.values, y_test.values))
# Evaluate on plain arrays, consistent with how the model was fitted.
loss, accuracy, precision, auc = model.evaluate(x_test.values, y_test.values)
model_stats = pd.DataFrame([loss, accuracy, precision, auc], index=['Loss', 'Accuracy', 'Precision', 'AUC'], columns=['Model Statistics'])


In [None]:
# Network outputs for the held-out features (one row per test sample).
ypred = model.predict(x=x_test)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# argmax turns each one-hot target row / softmax output row into a class index.
# model.predict returns a NumPy array, which has no `.values` attribute, so
# both inputs are converted with np.asarray.
cm = confusion_matrix(np.asarray(y_test).argmax(axis=1), np.asarray(ypred).argmax(axis=1))


In [None]:
import seaborn as sns

# Genre names used as tick labels on both axes of the confusion matrix.
# The two "Children's Music" entries differ only in the apostrophe character.
labels = [
    'A Capella', 'Alternative', 'Anime', 'Blues', "Children's Music",
    'Children’s Music', 'Classical', 'Comedy', 'Country', 'Dance',
    'Electronic', 'Folk', 'Hip-Hop', 'Indie', 'Jazz', 'Movie', 'Opera',
    'Pop', 'R&B', 'Rap', 'Reggae', 'Reggaeton', 'Rock', 'Ska', 'Soul',
    'Soundtrack', 'World',
]

# Rows of `cm` are true classes, columns are predicted classes.
sns.heatmap(data=cm, annot=False, cmap="Blues", xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# Display the loss / accuracy / precision / AUC table built after evaluation.
model_stats