In [21]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [22]:
# Loading data
file_path = Path("top_recommendations_features.csv")
df_song_recs = pd.read_csv(file_path)
df_song_recs.head()

Unnamed: 0.1,Unnamed: 0,name,artist,id,popularity,danceability,energy,key,loudness,mode,...,liveness,valence,tempo,type,uri,track_href,analysis_url,duration_ms,time_signature,duration_s
0,0,Dont Worry (its on me) - Radio Edit,Sir William,0FcjlhzxLTUnNseW0aNjrg,24,0.818,0.837,9.0,-5.915,1.0,...,0.0582,0.737,125.01,audio_features,spotify:track:0FcjlhzxLTUnNseW0aNjrg,https://api.spotify.com/v1/tracks/0FcjlhzxLTUn...,https://api.spotify.com/v1/audio-analysis/0Fcj...,201360.0,4.0,201.36
1,1,Slow It Down,Short Dawg,6GgjPqWh25IVIPNTp1soPd,32,0.426,0.713,7.0,-5.328,1.0,...,0.24,0.641,77.827,audio_features,spotify:track:6GgjPqWh25IVIPNTp1soPd,https://api.spotify.com/v1/tracks/6GgjPqWh25IV...,https://api.spotify.com/v1/audio-analysis/6Ggj...,213814.0,4.0,213.814
2,2,Club Paradise - Remix,Kigity K,30XUZHuO3Zn79Kpe5ylkXI,21,0.611,0.284,2.0,-19.339,1.0,...,0.147,0.712,92.342,audio_features,spotify:track:30XUZHuO3Zn79Kpe5ylkXI,https://api.spotify.com/v1/tracks/30XUZHuO3Zn7...,https://api.spotify.com/v1/audio-analysis/30XU...,181368.0,4.0,181.368
3,3,"Chanel Vintage (feat. Future, Young Thug)",Metro Boomin,6y7mf352StjAm4bMDkeMyH,31,0.823,0.704,1.0,-4.74,1.0,...,0.116,0.114,135.012,audio_features,spotify:track:6y7mf352StjAm4bMDkeMyH,https://api.spotify.com/v1/tracks/6y7mf352StjA...,https://api.spotify.com/v1/audio-analysis/6y7m...,261992.0,4.0,261.992
4,4,Dedicated,The Game,4HLElWheO0EfBtWQFkeOcG,31,0.703,0.54,9.0,-7.336,1.0,...,0.183,0.296,119.988,audio_features,spotify:track:4HLElWheO0EfBtWQFkeOcG,https://api.spotify.com/v1/tracks/4HLElWheO0Ef...,https://api.spotify.com/v1/audio-analysis/4HLE...,345960.0,4.0,345.96


In [23]:
df_song_recs.columns

Index(['Unnamed: 0', 'name', 'artist', 'id', 'popularity', 'danceability',
       'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'uri',
       'track_href', 'analysis_url', 'duration_ms', 'time_signature',
       'duration_s'],
      dtype='object')

In [24]:
df_song_recs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237650 entries, 0 to 237649
Data columns (total 23 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        237650 non-null  int64  
 1   name              237650 non-null  object 
 2   artist            237650 non-null  object 
 3   id                237650 non-null  object 
 4   popularity        237650 non-null  int64  
 5   danceability      237650 non-null  float64
 6   energy            237650 non-null  float64
 7   key               237650 non-null  float64
 8   loudness          237650 non-null  float64
 9   mode              237650 non-null  float64
 10  speechiness       237650 non-null  float64
 11  acousticness      237650 non-null  float64
 12  instrumentalness  237650 non-null  float64
 13  liveness          237650 non-null  float64
 14  valence           237650 non-null  float64
 15  tempo             237650 non-null  float64
 16  type              23

In [25]:
df_song_recs['popularity_bin'] = pd.cut(df_song_recs['popularity'], bins = [0,33,67,100], labels = [1,2,3])

In [26]:
df_song_recs = df_song_recs.drop(['Unnamed: 0','name', 'artist', 'id', 'type', 'uri', 'track_href',
                                  'analysis_url', 'duration_ms', 'popularity'], axis=1)

In [27]:
df_song_recs = df_song_recs.dropna()

In [28]:
df_song_recs

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_s,popularity_bin
0,0.818,0.837,9.0,-5.915,1.0,0.0376,0.00956,0.000412,0.0582,0.737,125.010,4.0,201.360,1
1,0.426,0.713,7.0,-5.328,1.0,0.3600,0.66600,0.000000,0.2400,0.641,77.827,4.0,213.814,1
2,0.611,0.284,2.0,-19.339,1.0,0.0800,0.79300,0.000006,0.1470,0.712,92.342,4.0,181.368,1
3,0.823,0.704,1.0,-4.740,1.0,0.1460,0.02190,0.000000,0.1160,0.114,135.012,4.0,261.992,1
4,0.703,0.540,9.0,-7.336,1.0,0.1980,0.20200,0.000000,0.1830,0.296,119.988,4.0,345.960,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237645,0.728,0.339,11.0,-9.890,0.0,0.2140,0.31800,0.000000,0.1120,0.574,138.015,4.0,163.585,2
237646,0.469,0.685,11.0,-9.643,0.0,0.0374,0.00037,0.911000,0.0998,0.559,160.086,4.0,77.298,2
237647,0.419,0.953,0.0,-5.524,1.0,0.0502,0.05100,0.000007,0.3500,0.683,171.074,4.0,157.193,2
237648,0.698,0.651,0.0,-9.147,1.0,0.0793,0.05000,0.000000,0.1160,0.673,143.937,4.0,143.333,2


In [29]:
X = df_song_recs.copy()
X.drop('popularity_bin', axis=1, inplace=True)
X

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,duration_s
0,0.818,0.837,9.0,-5.915,1.0,0.0376,0.00956,0.000412,0.0582,0.737,125.010,4.0,201.360
1,0.426,0.713,7.0,-5.328,1.0,0.3600,0.66600,0.000000,0.2400,0.641,77.827,4.0,213.814
2,0.611,0.284,2.0,-19.339,1.0,0.0800,0.79300,0.000006,0.1470,0.712,92.342,4.0,181.368
3,0.823,0.704,1.0,-4.740,1.0,0.1460,0.02190,0.000000,0.1160,0.114,135.012,4.0,261.992
4,0.703,0.540,9.0,-7.336,1.0,0.1980,0.20200,0.000000,0.1830,0.296,119.988,4.0,345.960
...,...,...,...,...,...,...,...,...,...,...,...,...,...
237645,0.728,0.339,11.0,-9.890,0.0,0.2140,0.31800,0.000000,0.1120,0.574,138.015,4.0,163.585
237646,0.469,0.685,11.0,-9.643,0.0,0.0374,0.00037,0.911000,0.0998,0.559,160.086,4.0,77.298
237647,0.419,0.953,0.0,-5.524,1.0,0.0502,0.05100,0.000007,0.3500,0.683,171.074,4.0,157.193
237648,0.698,0.651,0.0,-9.147,1.0,0.0793,0.05000,0.000000,0.1160,0.673,143.937,4.0,143.333


In [30]:
y = df_song_recs['popularity_bin']
len(y)

236656

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [32]:
# Creating StandardScaler instance
scaler = StandardScaler()


In [33]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [34]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [35]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [36]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)


In [37]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

In [38]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Low Pop", "Actual Med Pop","Actual High Pop"], columns=["Predicted Low Pop", "Predicted Med Pop", "Predicted High Pop"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [39]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted Low Pop,Predicted Med Pop,Predicted High Pop
Actual Low Pop,11239,12478,1376
Actual Med Pop,12614,16499,1886
Actual High Pop,1207,1655,210


Accuracy Score : 0.4723818538300318
Classification Report
              precision    recall  f1-score   support

           1       0.45      0.45      0.45     25093
           2       0.54      0.53      0.54     30999
           3       0.06      0.07      0.06      3072

    accuracy                           0.47     59164
   macro avg       0.35      0.35      0.35     59164
weighted avg       0.48      0.47      0.47     59164



In [40]:
importances = model.feature_importances_
# List the most important features
importances_sorted = sorted(zip(model.feature_importances_, X.columns), reverse=True)
importances_sorted

[(0.10517953859940693, 'duration_s'),
 (0.1005729993512144, 'tempo'),
 (0.09987670132390762, 'loudness'),
 (0.09969630279398198, 'acousticness'),
 (0.09560233095083275, 'liveness'),
 (0.09378043621161587, 'danceability'),
 (0.09300200167132353, 'valence'),
 (0.09166385102791716, 'speechiness'),
 (0.09043804788385709, 'energy'),
 (0.06286474915183998, 'instrumentalness'),
 (0.049399755674669255, 'key'),
 (0.011063446057871828, 'mode'),
 (0.0068598393015615305, 'time_signature')]