# Start with a multiclass classifier

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

## Read in data

In [2]:
# Load the pickled feature table (presumably librosa features extracted from a local music library).
# NOTE(review): unpickling can execute arbitrary code — only load files produced by a trusted source.
music_features_path = 'extracted data/2025_01_07_local_music_librosa_features.pkl'
df_music = pd.read_pickle(music_features_path)

## Modeling

In [3]:
# Features are the contiguous run of columns from 'mfcc_0' through 'tempo', inclusive
first_feature_pos = df_music.columns.get_loc('mfcc_0')
last_feature_pos = df_music.columns.get_loc('tempo')
feature_columns = df_music.columns[first_feature_pos:last_feature_pos + 1]
feature_columns

Index(['mfcc_0', 'mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6',
       'mfcc_7', 'mfcc_8', 'mfcc_9', 'mfcc_10', 'mfcc_11', 'mfcc_12',
       'mfcc_13', 'mfcc_14', 'mfcc_15', 'mfcc_16', 'mfcc_17', 'mfcc_18',
       'mfcc_19', 'spectral_centroid', 'chroma_0', 'chroma_1', 'chroma_2',
       'chroma_3', 'chroma_4', 'chroma_5', 'chroma_6', 'chroma_7', 'chroma_8',
       'chroma_9', 'chroma_10', 'chroma_11', 'tempo'],
      dtype='object')

In [4]:
# Build the model inputs: y is the artist label per row, X the selected feature columns
y = df_music.loc[:, 'artist'].values
X = df_music.loc[:, feature_columns].values

In [9]:
# Split data
# The row positions are split in the SAME call as X and y, so `test_indices`
# is guaranteed to describe exactly the rows (and order) of X_test / y_test.
# A separate second split would only stay aligned as long as its test_size and
# random_state were kept identical by hand.
row_positions = np.arange(len(df_music))
X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(
    X, y, row_positions, test_size=0.2, random_state=42
)

# For track names: test_indices holds the df_music row positions of the held-out tracks
assert len(test_indices) == len(X_test), "track indices must match the test split"

### RF

In [6]:
# Fit a 100-tree random forest on the training split (fixed seed so reruns match)
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
rf_clf.fit(X=X_train, y=y_train)

In [7]:
# Score the random forest on the held-out split
y_pred_rf = rf_clf.predict(X=X_test)
accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Accuracy: {accuracy_rf}")

Accuracy: 0.8409090909090909


#### Dive deeper into RF performance

In [8]:
# Per-track RF results: true artist, predicted artist, and whether the two match
is_correct_rf = y_test == y_pred_rf
results_df_rf = pd.DataFrame(
    {'Artist': y_test, 'Predicted': y_pred_rf, 'Correct': is_correct_rf}
)

In [10]:
# Attach track names by looking up the held-out row positions in df_music
# (relies on test_indices describing the same rows, in the same order, as X_test)
test_track_names = df_music['track_name'].values[test_indices]
results_df_rf['Track'] = test_track_names

In [11]:
# Per-artist breakdown: number of test tracks, number predicted correctly,
# and the hit rate rendered as a percentage string
artist_summary_rf = (
    results_df_rf
    .groupby('Artist')['Correct']
    .agg(Total='count', Correct='sum', Accuracy=lambda hits: f"{hits.mean():.2%}")
    .reset_index()
)
print("Summary by Artist:")
print(artist_summary_rf)

Summary by Artist:
                     Artist  Total  Correct Accuracy
0         Animal Collective      8        7   87.50%
1                      Pogo     15       15  100.00%
2               The Strokes     13        9   69.23%
3  The Tallest Man on Earth      8        6   75.00%


In [14]:
# List every held-out track whose artist the RF got wrong
mislabeled_rf = results_df_rf.loc[~results_df_rf['Correct']]
print("\nMislabeled Songs - RF:")
for miss in mislabeled_rf.itertuples(index=False):
    print(f"Artist: {miss.Artist}, Predicted: {miss.Predicted}, Track: {miss.Track}")


Mislabeled Songs - RF:
Artist: The Strokes, Predicted: Animal Collective, Track: 04 12_51.m4a
Artist: The Strokes, Predicted: Animal Collective, Track: 05 You Talk Way Too Much.m4a
Artist: Animal Collective, Predicted: The Tallest Man on Earth, Track: 05 Bees.m4a
Artist: The Strokes, Predicted: Animal Collective, Track: 01 Is This It.m4a
Artist: The Strokes, Predicted: Animal Collective, Track: 11 Take It or Leave It.m4a
Artist: The Tallest Man on Earth, Predicted: Animal Collective, Track: 07 Timothy.mp3
Artist: The Tallest Man on Earth, Predicted: Animal Collective, Track: 02 Darkness of the Dream.mp3


### KNN

In [15]:
# Fit a k-nearest-neighbours classifier; k=5 is only a starting value and can be tuned
# NOTE(review): features are passed in unscaled; KNN is distance-based, so if the
# columns live on very different scales, standardizing first may help — TODO confirm
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=X_train, y=y_train)

In [16]:
# Predict an artist for every held-out track
y_pred_knn = knn.predict(X=X_test)

In [17]:
# Share of held-out tracks whose artist the KNN predicted correctly
accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
print(f"Accuracy: {accuracy_knn:.2%}")

Accuracy: 45.45%


#### Dive deeper into KNN performance

In [18]:
# Per-track KNN results: true artist, predicted artist, and whether the two match
is_correct_knn = y_test == y_pred_knn
results_df_knn = pd.DataFrame(
    {'Artist': y_test, 'Predicted': y_pred_knn, 'Correct': is_correct_knn}
)

# Attach track names by looking up the held-out row positions in df_music
# (relies on test_indices describing the same rows, in the same order, as X_test)
test_track_names_knn = df_music['track_name'].values[test_indices]
results_df_knn['Track'] = test_track_names_knn

# Per-artist breakdown: number of test tracks, number predicted correctly,
# and the hit rate rendered as a percentage string
artist_summary_knn = (
    results_df_knn
    .groupby('Artist')['Correct']
    .agg(Total='count', Correct='sum', Accuracy=lambda hits: f"{hits.mean():.2%}")
    .reset_index()
)
print("\nSummary by Artist:")
print(artist_summary_knn)


Summary by Artist:
                     Artist  Total  Correct Accuracy
0         Animal Collective      8        3   37.50%
1                      Pogo     15       10   66.67%
2               The Strokes     13        5   38.46%
3  The Tallest Man on Earth      8        2   25.00%


In [19]:
# List every held-out track whose artist the KNN got wrong
mislabeled_knn = results_df_knn.loc[~results_df_knn['Correct']]
print("\nMislabeled Songs - KNN:")
for miss in mislabeled_knn.itertuples(index=False):
    print(f"Artist: {miss.Artist}, Predicted: {miss.Predicted}, Track: {miss.Track}")


Mislabeled Songs - KNN:
Artist: The Strokes, Predicted: Animal Collective, Track: 04 12_51.m4a
Artist: Pogo, Predicted: The Tallest Man on Earth, Track: Pogo - Star Charts - 10 The Climb.mp3
Artist: Animal Collective, Predicted: The Tallest Man on Earth, Track: 01-02 - Man Of Oil.mp3
Artist: The Tallest Man on Earth, Predicted: Pogo, Track: TheTallestManOnEarth-Shallowgrave-02-PistolDreams.mp3
Artist: Animal Collective, Predicted: The Tallest Man on Earth, Track: 05 Bees.m4a
Artist: The Tallest Man on Earth, Predicted: Pogo, Track: 06 King of Spain.mp3
Artist: Pogo, Predicted: The Tallest Man on Earth, Track: Pogo - Star Charts - 09 Do Something Rhythmic.mp3
Artist: Pogo, Predicted: Animal Collective, Track: 精神 _ Pogo_284667071_soundcloud.mp3
Artist: The Tallest Man on Earth, Predicted: Animal Collective, Track: 05 The Drying of the Lawns.mp3
Artist: Pogo, Predicted: Animal Collective, Track: Buzzwing.mp3
Artist: The Strokes, Predicted: Animal Collective, Track: 11 I Can't Win.m4a
Art

## Report

In [None]:
"""
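Without any model tuning, the RF does very well (about 84% accuracy on the 44-track test set), even with little data to work with.
The KNN did not do very well (about 45%), but it still beat the 25% expected from a uniform random guess across the four artists in the test set.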

These results, to me, suggest there is signal in all of these librosa features to distinguish one artist from another. 
I believe that's enough to support the original hypothesis: 
 - It's possible to build a recommender based on librosa-extracted features rather than relying on collaborative filtering.
 - i.e., it's possible to suggest songs to a listener based on what they like without relying on what other people like.
   - In other words, there's signal in the music itself for affinity.
"""