In [49]:
import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [31]:
# It makes sense to drop track_id artist_name and title

# TODO
# - Normalization/Standardization of most if not all of these features
# - Figure out what to do with time_signature, key and mode
# - Pop and Classic rock have 15,000 entries while Rap/Hip hop only has 500 or so, we will either need to downsample
#   or else make use of synthetic minority oversampling
# - Note that loudness has some negative values. This may affect our approach for normalization. We may also need to keep in mind that
#   this is probably measured in decibels and is thus logarithmic. 
#   I believe this won't be a problem as the network can learn non linear relationships

# - I will try twice: once dropping the key attribute, once keeping it. On a purely musical level the key doesn't matter,
#   but it's possible that certain genres use one key more often, and it's possible that the timbre measurements could have some synergy
#   with the key in play.

# Preprocessing: load the genre dataset, drop identifier columns, balance the
# classes with SMOTE, one-hot encode the categorical columns, label-encode the
# target and standardize the features.
#
# Produces (for later cells): df, df_relevant_features, X_resampled,
# y_resampled, X, y, label_encoder, scaler, X_normalized, X_normalized_df.

# Fixed seed so SMOTE generates the same synthetic rows on every run
# (Restart Kernel -> Run All reproducibility).
RESAMPLE_SEED = 42
smote = SMOTE(random_state=RESAMPLE_SEED)

df = pd.read_csv('msd_genre_dataset.txt')
# track_id, artist_name and title are identifiers, not predictive features.
df_relevant_features = df.drop(columns=["track_id", "artist_name", "title"])

# NOTE(review): resampling (and the scaling below) happens on the full dataset,
# i.e. before any train/test split. Any split made afterwards will contain
# synthetic rows in its test part and the scaler will have seen test data, so
# downstream scores are optimistic. Consider splitting first and fitting
# SMOTE/StandardScaler on the training part only.
# NOTE(review): SMOTE interpolates every column numerically, including the
# categorical codes time_signature / key / mode -- SMOTENC may be a better fit;
# TODO confirm.
X_resampled, y_resampled = smote.fit_resample(
    df_relevant_features.drop(columns=['genre']),
    df_relevant_features['genre'],
)

# X = df_relevant_features.drop(columns=['key'])

# One-hot encode the categorical columns ('key' is currently left as a plain
# numeric column; the commented line is the variant that encodes it too).
#X = pd.get_dummies(X, columns=['time_signature', 'key', 'mode'])
X = pd.get_dummies(X_resampled, columns=['time_signature', 'mode'])

# Map genre names to integer class ids; label_encoder.classes_ maps them back.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_resampled)

# Now the data is properly encoded and irrelevant variables are gone

# Standardize the features (StandardScaler: zero mean, unit variance per column).
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

# Convert normalized features back to a DataFrame for easier inspection
X_normalized_df = pd.DataFrame(X_normalized, columns=X.columns)





In [17]:
##### PCA #####
# Project the standardized features onto their leading principal components
# and print how much variance each component explains.
# NOTE(review): the training cell fits on X_normalized_df, not on X_pca_df, so
# this projection is exploratory only as the notebook stands.

# Number of principal components to keep; adjust as needed.
N_PCA_COMPONENTS = 20

# random_state only has an effect when PCA selects a randomized/arpack solver;
# fixing it keeps the components identical across reruns in that case.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=42)

# Apply PCA to the normalized features
X_pca = pca.fit_transform(X_normalized_df)

# Convert PCA results back to a DataFrame (columns PC1..PCn) for easier inspection
pc_column_names = [f'PC{i}' for i in range(1, pca.n_components_ + 1)]
X_pca_df = pd.DataFrame(X_pca, columns=pc_column_names)

# Print the explained variance ratio
print("Explained Variance Ratio:", pca.explained_variance_ratio_)

# Print the DataFrame with PCA results
print(X_pca_df.head())

Explained Variance Ratio: [0.19534464 0.09989111 0.06036111 0.04878384 0.04765565 0.04408339
 0.03981409 0.03823336 0.03373249 0.03047064 0.02983635 0.02920269
 0.0285015  0.02645565 0.02597008 0.02582799 0.02433396 0.02336032
 0.0210196  0.01897077]
        PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
0 -2.998347  0.335644 -0.680494  1.516816  1.756516 -0.017049 -1.190089   
1 -1.776102 -0.457462  1.475422  0.208960  2.258398  1.263633 -0.724537   
2  0.874894 -1.013989  1.047119 -0.650188  3.498022 -0.049266  0.464714   
3 -3.099636 -0.648593 -0.531658  0.432725 -0.551784  0.379651 -0.080109   
4 -2.931069 -1.168187  2.664846 -0.119262  0.007438 -0.096471  0.106467   

        PC8       PC9      PC10      PC11      PC12      PC13      PC14  \
0  0.403937  0.523576  0.433837 -1.092615  0.887452 -0.637790 -0.415363   
1 -0.336276 -0.282811  0.515409 -1.386272  1.160628 -0.723605  1.037168   
2 -0.392122  1.381407  0.831990  0.176645  1.236679 -0.144145 -1.815785  

In [50]:
### Let's Train This Thing ###
# Train a soft-voting ensemble (MLP, random forest, XGBoost, kNN) on the
# standardized features and report hold-out metrics.

# Fixed seed for the split and every stochastic estimator so the reported
# numbers are repeatable under Restart Kernel -> Run All.
TRAIN_SEED = 42

# NOTE(review): X_normalized_df was oversampled with SMOTE and scaled before
# this split, so X_test contains synthetic rows and the scaler has seen test
# data. Treat the scores below as optimistic until the split is moved ahead of
# resampling/scaling.
# stratify=y keeps every genre's share identical in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_normalized_df, y, test_size=0.2, random_state=TRAIN_SEED, stratify=y
)

mlp_clf = MLPClassifier(random_state=TRAIN_SEED)
random_forest_clf = RandomForestClassifier(random_state=TRAIN_SEED)
xgboost_clf = xgb.XGBClassifier(random_state=TRAIN_SEED)
knn_clf = KNeighborsClassifier(n_neighbors=4, weights="distance")

# Soft voting averages predicted class probabilities; the random forest's
# probabilities count double (weights follow the order of `estimators`).
ensemble_clf = VotingClassifier(
    estimators=[
        ('mlp', mlp_clf),
        ('rf', random_forest_clf),
        ('xgb', xgboost_clf),
        ('knn', knn_clf),
    ],
    voting='soft',
    weights=[1, 2, 1, 1],
)

ensemble_clf.fit(X_train, y_train)

y_pred = ensemble_clf.predict(X_test)

# Overall accuracy, kept for comparison with earlier runs.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-genre precision / recall / F1: accuracy alone can hide genres the model
# handles poorly, which matters given how skewed the raw genre counts are.
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

In [6]:
### Takeaways ###

# With PCA reduced to ten PCs, and an MLP with hidden layers of 100 and 50 units, we got 50% test accuracy
# Without PCA we got nearly 60%
# I bet the low accuracy has something to do with the skewed data, that's probably our next step!