In [31]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



In [32]:
# It makes sense to drop track_id artist_name and title

# TODO
# - Normalization/Standardization of most if not all of these features
# - Figure out what do do with time_signature, key and mode
# - Pop and Classic rock have 15,000 entries while Rap/Hip hop only has 500 or so, we will either need to downsample
#   or else make use of synthetic minority oversampling
# - Note that loudness has some negative values. This may affect our approach for normalization. We may also need to keep in mind that
#   this is probably measured in decibels and is thus logarithmic. 
#   I believe this won't be a problem as the network can learn non linear relationships

# - I will try twice. One dropping the key attribute, one time with it. On a purely musical level the key doesn't mattert
#   but it's possible that certain genres use one key more often, and it's possible that the timbre measurements could have some synergy
#   with the key in play.


df = pd.read_csv('msd_genre_dataset.txt')
df_relevant_features = df.drop(columns=["track_id", "artist_name", "title"])
df_relevant_features.head()

X = df_relevant_features.drop(columns=['genre'])
# X = df_relevent_features.drop(columns=['key'])

X = pd.get_dummies(X, columns=['time_signature', 'key', 'mode'])
# X = X.get_dummies(X, columns=['time_signature', 'mode'])

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_relevant_features['genre'])

# Now the data is properly encoded and irrelevant variables are gone

# Normalize the features
# We should experiment with a few types of normalization/standardization, some features are more uniform than others

# scaler = MinMaxScaler()
scaler = StandardScaler()

X_normalized = scaler.fit_transform(X)

# Convert normalized features back to a DataFrame for easier inspection
X_normalized_df = pd.DataFrame(X_normalized, columns=X.columns)


In [33]:
##### PCA #####
# Initialize PCA with the desired number of components
pca = PCA(n_components=10)  # Adjust the number of components as needed

# Apply PCA to the normalized features
X_pca = pca.fit_transform(X_normalized)

# Convert PCA results back to a DataFrame for easier inspection
X_pca_df = pd.DataFrame(X_pca, columns=[f'PC{i}' for i in range(1, pca.n_components_ + 1)])

# Print the explained variance ratio
print("Explained Variance Ratio:", pca.explained_variance_ratio_)

# Print the DataFrame with PCA results
print(X_pca_df.head())

Explained Variance Ratio: [0.14998559 0.07696145 0.04822348 0.03780027 0.03670179 0.03365263
 0.03039367 0.02920828 0.02461011 0.02408281]
        PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
0 -3.061145  0.299252 -0.727100  0.814719  2.128445  0.111168 -1.344552   
1 -1.822265 -0.423075  1.995744 -0.788603  2.403267  1.409285 -0.385317   
2  0.952322 -1.000054  1.322135 -1.506617  2.967055 -0.219142  0.386567   
3 -3.126369 -0.695175 -0.981982  0.641821 -0.508449  0.157359 -0.187894   
4 -2.993877 -1.167106  2.528458 -0.087107 -0.150061 -0.105525 -0.063881   

        PC8       PC9      PC10  
0  0.201114  0.677558  0.568962  
1 -1.008940  0.100014 -0.432697  
2 -0.269162  2.449830 -0.086100  
3 -0.454608 -0.254903 -2.093109  
4 -0.670802  0.586568  0.484102  


In [35]:
### Let's Train This Thing ###
X_train, X_test, y_train, y_test = train_test_split(X_normalized_df, y, test_size=0.2, random_state=42)

mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)

mlp.fit(X_train, y_train)

y_pred = mlp.predict(X_test)

# Calculate accuracy, I know there's some better metrics we can use right?
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5874161073825503


In [None]:
### Takeaways ###

# With a PCA of ten PCs, and an MLP with 100 and 50 layers, we got 50% test accuracy
# W/o PCA we got nearly 60%
# I bet the low accuracy has something to do with the skewed data, that's probably our next step!