# V1: Classifying Music Taste with Classifier Algorithms
___

### Goal: Create a model that classifies, with accuracy above 90%, whether or not a user would choose to save a song based on the track's audio features.

### Step 1: Load and Inspect Data

In [1]:
import pandas as pd
import numpy as np
import time

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import warnings

import model_methods
import sklearn

In [2]:
# Load the saved track table; the CSV's first column holds the saved row index.
my_tracks = pd.read_csv(
    '../data/allmy_tracks_with_features.csv',
    index_col=0,
)

In [3]:
my_tracks.shape

(2543, 24)

In [4]:
my_tracks.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2543 entries, 0 to 3617
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                2543 non-null   object 
 1   track_name        2543 non-null   object 
 2   artist            2543 non-null   object 
 3   artist_id         2543 non-null   object 
 4   album_id          2543 non-null   object 
 5   album             2543 non-null   object 
 6   release_date      2543 non-null   object 
 7   playlist_name     2543 non-null   object 
 8   popularity        2543 non-null   int64  
 9   explicit          2543 non-null   bool   
 10  user_liked        2543 non-null   int64  
 11  danceability      2543 non-null   float64
 12  energy            2543 non-null   float64
 13  key               2543 non-null   int64  
 14  loudness          2543 non-null   float64
 15  mode              2543 non-null   int64  
 16  speechiness       2543 non-null   float64


In [5]:
my_tracks.head()

Unnamed: 0,id,track_name,artist,artist_id,album_id,album,release_date,playlist_name,popularity,explicit,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,2Nw6tjb0euV6LApzN4fU0a,Good for You,Spacey Jane,6V70yeZQCoSR2M3fyW8qiA,3zZi1vy6CnNZX7lbcRJtXo,Sunlight,2020-06-12,Liked_Songs,60,True,...,-4.741,1,0.0385,5.6e-05,0.0106,0.165,0.931,174.962,174760,4
427,276zciJ7Fg7Jk6Ta6QuLkp,Come Down,Anderson .Paak,3jK9MiCrA42lLAdMGUZpwa,4VFG1DOuTeDMBjBLZT7hCK,Malibu,2016-01-15,Liked_Songs,66,True,...,-7.135,1,0.104,0.271,0.0151,0.304,0.931,98.401,169727,4
420,2gZUPNdnz5Y45eiGxpHGSc,POWER,Kanye West,5K4W6rqBFWDnAN6FQUkS6x,20r762YmB5HeofjMCiPMLv,My Beautiful Dark Twisted Fantasy,2010-11-22,Liked_Songs,79,True,...,-4.747,0,0.113,0.0161,0.0,0.744,0.576,153.993,292093,4
421,4qikXelSRKvoCqFcHLB2H2,Mercy,Kanye West,5K4W6rqBFWDnAN6FQUkS6x,0hmFRR0pDSZIAvoJqEFSKv,Mercy,2012-01-01,Liked_Songs,73,True,...,-9.381,0,0.406,0.0685,5.8e-05,0.173,0.426,139.993,329320,4
422,3CcvahnsiArpTHYQEWV2Au,Bring Em Out,T.I.,4OBJLual30L7gRl5UkeRcT,1oFucub5OjyG4XPsDUzhil,Urban Legend,2004-11-28,Liked_Songs,66,True,...,-2.983,1,0.257,0.0298,0.0,0.141,0.587,98.579,216707,4


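Let's check which playlists the tracks came from, then reset our index (it currently runs from 0 to 3617 for only 2,543 rows).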

In [6]:
my_tracks['playlist_name'].value_counts()

Fleet Foxes    1909
Liked_Songs     634
Name: playlist_name, dtype: int64

In [7]:
# Replace the gappy saved index with a clean 0..n-1 range; the old labels are dropped.
my_tracks = my_tracks.reset_index(drop=True)

**TODO:** Fix the function that pulls tracks from playlists so that `playlist_name` holds each track's actual playlist instead of labelling every playlist track `Fleet Foxes`.

In [8]:
my_tracks.columns

Index(['id', 'track_name', 'artist', 'artist_id', 'album_id', 'album',
       'release_date', 'playlist_name', 'popularity', 'explicit', 'user_liked',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms', 'time_signature'],
      dtype='object')

___
### Data looks good and clean! Let's drop columns that won't be of use for classifying:

In [9]:
# Identifier, name, date and playlist-label columns that are not used for classification.
non_feature_cols = [
    'id', 'track_name', 'artist', 'artist_id',
    'album_id', 'album', 'release_date', 'playlist_name',
]
to_class_tracks = my_tracks.drop(columns=non_feature_cols).copy()

In [10]:
to_class_tracks.head()

Unnamed: 0,popularity,explicit,user_liked,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,60,True,1,0.436,0.896,0,-4.741,1,0.0385,5.6e-05,0.0106,0.165,0.931,174.962,174760,4
1,66,True,1,0.841,0.898,8,-7.135,1,0.104,0.271,0.0151,0.304,0.931,98.401,169727,4
2,79,True,1,0.542,0.914,0,-4.747,0,0.113,0.0161,0.0,0.744,0.576,153.993,292093,4
3,73,True,1,0.563,0.496,6,-9.381,0,0.406,0.0685,5.8e-05,0.173,0.426,139.993,329320,4
4,66,True,1,0.759,0.891,11,-2.983,1,0.257,0.0298,0.0,0.141,0.587,98.579,216707,4


To include the 'explicit' column let's translate it from Bool to Integer

In [11]:
# Encode the boolean flag as 0/1 so it can sit alongside the other numeric features.
explicit_flag = to_class_tracks['explicit']
to_class_tracks['explicit'] = explicit_flag.astype(int)

In [12]:
to_class_tracks.head()

Unnamed: 0,popularity,explicit,user_liked,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,60,1,1,0.436,0.896,0,-4.741,1,0.0385,5.6e-05,0.0106,0.165,0.931,174.962,174760,4
1,66,1,1,0.841,0.898,8,-7.135,1,0.104,0.271,0.0151,0.304,0.931,98.401,169727,4
2,79,1,1,0.542,0.914,0,-4.747,0,0.113,0.0161,0.0,0.744,0.576,153.993,292093,4
3,73,1,1,0.563,0.496,6,-9.381,0,0.406,0.0685,5.8e-05,0.173,0.426,139.993,329320,4
4,66,1,1,0.759,0.891,11,-2.983,1,0.257,0.0298,0.0,0.141,0.587,98.579,216707,4


___
### Step 2: Feature Preprocessing and Baseline Score

Now that all the audio features and the selected track-metadata features are numerical, we can:
- Identify our baseline model performance (the split between songs I have liked and songs I haven't)
- Preprocess the features to prepare for the Pipeline and modeling

In [13]:
# Target: 1 if the track was liked, 0 otherwise. Every remaining column is a feature.
y = to_class_tracks['user_liked']
X = to_class_tracks.drop('user_liked', axis=1)

In [14]:
X.shape, y.shape

((2543, 15), (2543,))

### Baseline Model Score: ~75%

In [34]:
y.value_counts(normalize=True)


0    0.750688
1    0.249312
Name: user_liked, dtype: float64

In [33]:
# Baseline accuracy = share of the majority class (what "always predict the
# most common label" would score).
class_shares = y.value_counts(normalize=True)
baseline = class_shares.max()

In [35]:
baseline

0.7506881635863154

#### Train & Test Split for Pipeline

In [16]:
# Stratify on y so both splits keep the same liked/not-liked ratio; the fixed
# seed makes the split reproducible across runs.
split_parts = train_test_split(X, y, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [17]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1907, 15), (636, 15), (1907,), (636,))

### StandardScaler:
- For our models to classify liked songs as accurately as possible from the audio features, the numerical features need to be on the same scale, which is where StandardScaler comes in. The scaler is fit on the training split only and then applied unchanged to the test split.

In [18]:
# Single scaler instance shared by the train (fit_transform) and test (transform) cells below.
ss = StandardScaler()

In [19]:
# Fit the scaler on the training split only, so no test-set statistics feed into the scaling.
X_train_sc = ss.fit_transform(X_train)

In [20]:
# Re-wrap the scaled values as a DataFrame with the original column names.
# Keep X_train's row labels so the features stay index-aligned with y_train
# (which still carries the labels from the split); np.asarray keeps the
# wrap purely positional even if this cell is re-run on an existing frame.
X_train_sc = pd.DataFrame(
    np.asarray(X_train_sc),
    columns=ss.get_feature_names_out(),
    index=X_train.index,
)

In [21]:
# Apply the scaler fitted on the training split; transform() does not refit on the test data.
X_test_sc = ss.transform(X_test)

In [22]:
# Re-wrap the scaled values as a DataFrame with the original column names.
# Keep X_test's row labels so the features stay index-aligned with y_test;
# np.asarray keeps the wrap purely positional even if this cell is re-run
# on an existing frame.
X_test_sc = pd.DataFrame(
    np.asarray(X_test_sc),
    columns=ss.get_feature_names_out(),
    index=X_test.index,
)

In [23]:
X_train_sc.shape, X_test_sc.shape

((1907, 15), (636, 15))

In [24]:
X_train_sc.head()

Unnamed: 0,popularity,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,-0.901759,-0.380339,-1.070703,-0.292769,-0.391082,0.110952,0.691588,-0.588591,1.022104,-0.631962,0.348507,-0.679761,-0.613813,1.077363,0.20944
1,1.045942,2.629232,1.531592,0.136118,0.72705,-0.190957,-1.445947,2.082018,-0.938589,-0.633153,0.257829,1.672818,-0.777062,0.41334,0.20944
2,-1.564806,-0.380339,-0.496171,-1.752817,-0.950148,-1.951334,0.691588,-0.441211,1.536185,-0.625476,0.523818,-0.645884,-0.810115,-0.818667,0.20944
3,-0.943199,-0.380339,0.038933,0.051807,0.167984,0.168296,-1.445947,0.169214,-0.881654,-0.633153,-0.584269,1.292642,2.006364,-0.820611,0.20944
4,-1.274723,-0.380339,-0.710212,0.488025,1.006583,-0.163614,-1.445947,-0.215437,0.472093,-0.618025,1.345967,1.187246,1.009524,0.805443,0.20944


### Polynomial Features:
- Currently we have 15 feature columns for our X data to feed our models in an effort to classify whether or not a user will like a given song.
- 15 features might not be enough for our models to beat our baseline score of 75% and so we are going to experiment with Polynomial Features
- Polynomial Features gives us the ability to create new features that are either:
    - individual features multiplied by themselves, i.e. $a^2$
    - pairs of features multiplied together, i.e. $ab$
- In order to differentiate which set is the best fit for which model, we will run each set through the RandomizedSearchCV and compare results:
    - StandardScaled Data Only
    - StandardScaled Data and Polynomial Features

In [25]:
# Degree-2 expansion; interaction_only=False keeps the squared terms as well as
# the pairwise products.
poly = PolynomialFeatures(degree=2, interaction_only=False)

In [26]:
# Fit the expansion on the scaled training frame; the result is re-wrapped as a
# DataFrame in the next cell.
X_train_poly = poly.fit_transform(X_train_sc)

In [27]:
# Re-wrap the expanded values as a DataFrame with the generated feature names.
# Keep X_train's row labels so the features stay index-aligned with y_train;
# np.asarray keeps the wrap purely positional even if this cell is re-run
# on an existing frame.
X_train_poly = pd.DataFrame(
    np.asarray(X_train_poly),
    columns=poly.get_feature_names_out(),
    index=X_train.index,
)

In [28]:
# Apply the expansion fitted on the training frame to the scaled test frame.
X_test_poly = poly.transform(X_test_sc)

In [29]:
# Re-wrap the expanded values as a DataFrame with the generated feature names.
# Keep X_test's row labels so the features stay index-aligned with y_test;
# np.asarray keeps the wrap purely positional even if this cell is re-run
# on an existing frame.
X_test_poly = pd.DataFrame(
    np.asarray(X_test_poly),
    columns=poly.get_feature_names_out(),
    index=X_test.index,
)

In [30]:
X_train_poly.shape, X_test_poly.shape

((1907, 136), (636, 136))

- As you can see from the shapes, the expansion took us from 15 features to 136 columns: a constant bias column, the 15 original features, 15 squared terms, and 105 pairwise interaction terms.
- Next step is to feed our different datasets through our RandomizedSearchCV Pipeline to identify the best model for both datasets, capture their scores, compare, and continue to look for the best model we can make for this classification problem.

___

### Step 3: Identify Best Model for this dataset of Audio Track Features 

- In order for us to identify the best classification model for the task of classifying a user's taste based on audio features, we are going to run our data through multiple pipelines, each with a different estimator using `Randomized Search CV`.
- `Randomized Search CV` takes `model tuples` and their associated `params_grids` as parameters when finding the best hyperparameters.
- We have created a dictionary called model_dict to store various models and their associated param grids for future use, saved in the model_methods.py file.
- To best prepare our data we tried a couple of preprocessing and feature engineering combinations to find the best orientation of parameters for classifying our user's music taste
- Estimators used in this project are:
    - `MultinomialNB`
    - `BernoulliNB`
    - `RandomForestClassifier`
    - `LogisticRegression`
    - `AdaBoostClassifier`
    - `BaggingClassifier`
    - `SVC`
- Once each respective best version has been found we can run it through our evaluation function to capture:
    - `Train and Test Accuracy`
    - `Balanced Accuracy`
    - `Recall`
    - `Precision`
    - `F1`
- Then plot the confusion matrix and analyze performance

In [39]:
with warnings.catch_warnings():
    # Silence convergence/user warnings raised while get_best_models runs;
    # the previous warning filters are restored when this block exits.
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    warnings.filterwarnings("ignore", category=UserWarning)

    # perf_counter() is monotonic, so the elapsed time of this long-running
    # cell cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()
    best_models = model_methods.get_best_models(X_train_poly, y_train)
    end_time = time.perf_counter()
    print("total time taken: ", end_time-start_time)

Pipeline(steps=[('rfc', RandomForestClassifier())])
Pipeline(steps=[('logreg', LogisticRegression())])




Pipeline(steps=[('abc', AdaBoostClassifier())])


10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/christopherjoyce/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/christopherjoyce/opt/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/christopherjoyce/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_weight_boosting.py", line 486, in fit
    return super().fit(X, y, sample_weight)
  File "/Users/christopherjoyce/opt/anaconda3

Pipeline(steps=[('bgc', BaggingClassifier())])
Pipeline(steps=[('svc', SVC())])
total time taken:  2580.889941930771


In [37]:
# Score the models on the scaled-only features and label the rows 'first run'.
# NOTE(review): best_models comes from the search cell, which is fed
# X_train_poly (136 columns), while this call uses the 15-column scaled
# frames. Confirm that record_scores refits each pipeline on the data it is
# given; otherwise a top-to-bottom run may hit a feature-count mismatch.
# perf_counter() is monotonic, so the measured duration is unaffected by
# system clock adjustments.
start_time = time.perf_counter()
initial_scores = model_methods.record_scores(
    baseline,
    X_train_sc,
    y_train,
    X_test_sc,
    y_test,
    best_models,
    model_name='first run',
)
end_time = time.perf_counter()
print("total time taken: ", end_time-start_time)

total time taken:  28.89018702507019


  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
# Save the scores from the scaled-features ('first run') evaluation to disk.
initial_scores.to_csv('../data/model_scores.csv')

In [40]:
# Score the same models on the polynomial-feature frames, labelled 'og_poly',
# passing the first-run table in through df_scores.
v2_scores = model_methods.record_scores(
    baseline,
    X_train_poly,
    y_train,
    X_test_poly,
    y_test,
    best_models,
    model_name='og_poly',
    df_scores=initial_scores,
)

  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
# Same path as the earlier initial_scores export, so that file is overwritten
# with v2_scores. NOTE(review): presumably v2_scores already contains the
# first-run rows via df_scores=initial_scores; confirm nothing is lost.
v2_scores.to_csv('../data/model_scores.csv')