In [5]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# Read in data
df = pd.read_csv('msd_genre_dataset.txt')

# Remove excess data
df_relevant_features = df.drop(columns=["track_id", "artist_name", "title"])

X = df_relevant_features.drop(columns=['genre'])
y = df_relevant_features['genre']

# Encode nominal genres
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)


# Upsample data
smote = SMOTE()

sampled_X, sampled_y = smote.fit_resample(X,y)

# Standardize upsampled data
standardizer = StandardScaler()
std_X = standardizer.fit_transform(sampled_X)

# Change y to be the same np.shape as std_X
expanded_y = sampled_y[:, np.newaxis]

# Add the sampled y values to the beginning of std_X
std_X = np.concatenate([expanded_y, std_X], axis=1)

# `sweetviz` likes having the whole dataframe for train and test
sampled_X = pd.DataFrame(std_X, columns=df_relevant_features.columns)
sampled_X['genre'] = sampled_y

# Create a new data frame that fits all this criteria
std_dev_limit = 5

new_dfs = sampled_X.query(f"-{std_dev_limit} <= loudness <= {std_dev_limit} &\
                          -{std_dev_limit} <= tempo <= {std_dev_limit} &\
                          -{std_dev_limit} <= time_signature <= {std_dev_limit} &\
                          -{std_dev_limit} <= key <= {std_dev_limit} &\
                          -{std_dev_limit} <= mode <= {std_dev_limit} &\
                          -{std_dev_limit} <= duration <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre1 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre2 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre3 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre4 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre5 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre6 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre7 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre8 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre9 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre10 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre11 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre12 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre1 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre2 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre3 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre4 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre5 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre6 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre7 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre8 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre9 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre10 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre11 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre12 <= {std_dev_limit}")

In [8]:
new_dfs.to_csv('msd_genre_dataset_no_outliers.csv', index=False)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(new_dfs, new_dfs['genre'], test_size=0.25, random_state=16, shuffle=True)

In [None]:
import sweetviz as sv

#analyzing the dataset
comparison_report = sv.compare([X_train, "Train"], [X_test, "Test"], target_feat="genre", feat_cfg=sv.FeatureConfig(force_num=['genre']))
comparison_report.show_notebook()

In [12]:
new_dfs

Unnamed: 0,genre,loudness,tempo,time_signature,key,mode,duration,avg_timbre1,avg_timbre2,avg_timbre3,...,var_timbre3,var_timbre4,var_timbre5,var_timbre6,var_timbre7,var_timbre8,var_timbre9,var_timbre10,var_timbre11,var_timbre12
0,0,0.392249,0.969191,-2.018614,1.280456,1.037697,-0.092377,0.708216,0.339749,0.166139,...,-0.671229,-0.956615,-0.771387,-0.589754,-1.023299,-0.679297,-0.873599,-0.853991,-0.646093,-1.014796
1,0,0.069256,0.769769,-2.018614,-0.327381,-0.963673,-0.529173,0.284845,-1.570108,0.924810,...,-0.127071,-0.542417,-0.799008,-0.609014,-0.442474,-0.431178,-0.261611,-0.554438,-0.562723,-0.404307
2,0,-0.397455,-0.313507,-2.018614,1.602023,-0.963673,-0.773407,-0.564198,-1.080516,0.571426,...,-0.707879,0.970724,-0.486880,-0.436911,-0.363434,-0.307441,0.300854,-0.395394,0.735976,-0.402457
3,0,-0.280900,-0.175786,0.551144,0.637321,1.037697,-0.062301,0.133999,0.388534,1.282264,...,-0.993727,-0.971025,-0.999604,-0.924734,-0.866423,-0.963177,-0.810687,-0.961475,-1.149886,0.134501
4,0,-0.496065,0.558738,0.551144,1.280456,-0.963673,0.376917,-0.088351,-0.188965,0.882102,...,-0.788310,-1.167962,-0.143674,-1.197322,-1.076425,-1.132618,0.086670,-1.111740,-1.171887,-0.667991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238945,9,0.411107,0.980001,0.551144,-0.970516,1.037697,-0.548378,0.461532,0.014236,1.159716,...,-0.204143,-0.134065,-0.698809,-0.156559,-0.118219,-0.163229,-0.318083,-0.170879,0.105931,-0.306526
238946,9,0.100077,-0.874637,-0.305442,-1.613650,-0.963673,0.144666,-0.168575,-0.702234,-1.578658,...,0.249781,0.076993,-0.121499,-0.054975,-0.189713,-0.085345,-0.327967,-0.196044,-0.009917,-0.234851
238947,9,-0.466884,-0.263144,1.407730,0.637321,-0.963673,-0.309784,-0.584958,0.570261,0.516075,...,0.698236,1.383794,0.635681,1.647119,1.170684,1.109566,0.955771,1.882051,1.615547,0.306175
238948,9,0.590161,2.927770,3.120902,0.637321,1.037697,-0.460635,0.704200,1.826370,-1.471552,...,-0.649961,-0.549869,-0.951161,0.041977,-0.663911,0.491304,-1.002011,0.912752,-0.240591,-1.182213


In [15]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.ensemble import VotingClassifier

X_train, X_test, y_train, y_test = train_test_split(new_dfs.drop(columns=['genre']), new_dfs['genre'], test_size=0.2, random_state=None)

mlp_clf = MLPClassifier(hidden_layer_sizes=(100,100), max_iter=1000)
random_forest_clf = RandomForestClassifier()
xgboost_clf = xgb.XGBClassifier()
knn_clf = KNeighborsClassifier(n_neighbors=4, weights="distance")

ensemble_clf = VotingClassifier(estimators=[
    ('mlp', mlp_clf),
    ('rf', random_forest_clf),
    ('xgb', xgboost_clf),
    ('knn', knn_clf)
], voting='soft', weights=[1, 2, 1, 1])

In [16]:
ensemble_clf.fit(X_train, y_train)

Accuracy: 0.8971131215857319


In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Calculate accuracy, I know there's some better metrics we can use right?
y_pred = ensemble_clf.predict(X_test)
accuracy, precision, recall, f1 = accuracy_score(y_test, y_pred), precision_score(y_test, y_pred, average='weighted'), recall_score(y_test, y_pred, average='weighted'), f1_score(y_test, y_pred, average='weighted')

In [21]:
print(f"Accuracy: {accuracy:.5f} Precision: {precision:.5f} Recall: {recall:.5f} F1: {f1:.5f}")

Accuracy: 0.89711 Precision: 0.89391 Recall: 0.89711 F1: 0.89009
