In [7]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np

# Read in data
df = pd.read_csv('msd_genre_dataset.txt')

# Remove excess data
df_relevant_features = df.drop(columns=["track_id", "artist_name", "title"])

X = df_relevant_features.drop(columns=['genre'])
y = df_relevant_features['genre']

# Encode nominal genres
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)


# Upsample data
smote = SMOTE()

sampled_X, sampled_y = smote.fit_resample(X,y)

# Standardize upsampled data
standardizer = StandardScaler()
std_X = standardizer.fit_transform(sampled_X)

# Change y to be the same np.shape as std_X
expanded_y = sampled_y[:, np.newaxis]

# Add the sampled y values to the beginning of std_X
std_X = np.concatenate([expanded_y, std_X], axis=1)

# `sweetviz` likes having the whole dataframe for train and test
sampled_X = pd.DataFrame(std_X, columns=df_relevant_features.columns)
sampled_X['genre'] = sampled_y

# Create a new data frame that fits all this criteria
std_dev_limit = 4

new_dfs = sampled_X.query(f"-{std_dev_limit} <= loudness <= {std_dev_limit} &\
                          -{std_dev_limit} <= tempo <= {std_dev_limit} &\
                          -{std_dev_limit} <= time_signature <= {std_dev_limit} &\
                          -{std_dev_limit} <= key <= {std_dev_limit} &\
                          -{std_dev_limit} <= mode <= {std_dev_limit} &\
                          -{std_dev_limit} <= duration <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre1 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre2 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre3 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre4 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre5 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre6 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre7 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre8 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre9 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre10 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre11 <= {std_dev_limit} &\
                          -{std_dev_limit} <= avg_timbre12 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre1 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre2 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre3 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre4 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre5 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre6 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre7 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre8 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre9 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre10 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre11 <= {std_dev_limit} &\
                          -{std_dev_limit} <= var_timbre12 <= {std_dev_limit}")

In [8]:
new_dfs.to_csv('msd_genre_dataset_no_outliers.csv', index=False)

In [10]:
new_dfs

Unnamed: 0,genre,loudness,tempo,time_signature,key,mode,duration,avg_timbre1,avg_timbre2,avg_timbre3,...,var_timbre3,var_timbre4,var_timbre5,var_timbre6,var_timbre7,var_timbre8,var_timbre9,var_timbre10,var_timbre11,var_timbre12
0,0,0.391093,0.968229,-2.021633,1.278134,1.036836,-0.092204,0.706542,0.337522,0.166171,...,-0.670465,-0.953212,-0.772777,-0.590343,-1.019202,-0.677465,-0.872431,-0.853128,-0.648090,-1.008959
1,0,0.067727,0.769004,-2.021633,-0.327221,-0.964472,-0.527255,0.283058,-1.575915,0.925619,...,-0.127040,-0.540785,-0.800487,-0.609616,-0.441246,-0.430021,-0.261125,-0.553659,-0.564626,-0.403099
2,0,-0.399522,-0.313201,-2.021633,1.599205,-0.964472,-0.770514,-0.566211,-1.085405,0.571874,...,-0.707066,0.965886,-0.487349,-0.437396,-0.362597,-0.306621,0.300713,-0.394659,0.735537,-0.401263
3,0,-0.282834,-0.175615,0.553482,0.635992,1.036836,-0.062249,0.132172,0.386399,1.283439,...,-0.992529,-0.967559,-1.001732,-0.925551,-0.863101,-0.960571,-0.809588,-0.960582,-1.152451,0.131623
4,0,-0.498246,0.558183,0.553482,1.278134,-0.964472,0.375215,-0.090237,-0.192182,0.882868,...,-0.787388,-1.163654,-0.143032,-1.198324,-1.072066,-1.129550,0.086768,-1.110805,-1.174478,-0.664784
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238945,9,0.821801,0.586032,-0.304890,-0.006150,-0.964472,-0.571064,0.771634,0.404815,0.665006,...,-0.215885,-0.494123,-0.456492,-0.554272,1.256895,-0.301505,-0.235143,-0.264253,-0.447619,-0.140158
238946,9,0.274628,0.237364,-0.304890,-1.290434,-0.964472,-0.670092,0.303954,-0.178984,0.612516,...,-0.655155,0.008202,-0.951877,0.591690,-0.603978,-0.098356,-0.543758,-0.190337,0.186655,-0.582933
238947,9,0.822571,-0.593622,-0.304890,-0.006150,1.036836,-0.049943,0.249061,0.637380,-1.040370,...,-0.116651,0.778525,0.762113,-0.143123,1.263102,0.695323,0.873561,0.159311,0.061518,0.798395
238948,9,0.519686,0.633914,-0.304890,0.314921,-0.964472,-0.116874,0.423780,-0.403219,-0.438014,...,0.382976,0.838231,0.016881,0.421255,0.178878,0.737198,0.747555,0.931791,1.055834,0.459351


In [25]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(new_dfs.drop(columns=['genre']), new_dfs['genre'], test_size=0.2, random_state=None)

mlp_clf = MLPClassifier(hidden_layer_sizes=(100,100), max_iter=1000)
random_forest_clf = RandomForestClassifier()
xgboost_clf = xgb.XGBClassifier()
knn_clf = KNeighborsClassifier(n_neighbors=4, weights="distance")

ensemble_clf = VotingClassifier(estimators=[
    ('mlp', mlp_clf),
    ('rf', random_forest_clf),
    ('xgb', xgboost_clf),
    ('knn', knn_clf)
], voting='soft', weights=[1, 2, 1, 1])

In [26]:
ensemble_clf.fit(X_train, y_train)

y_pred = ensemble_clf.predict(X_test)

# Calculate accuracy, I know there's some better metrics we can use right?
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8971801309064825
