# Spotify Modeling

In [1]:
# Load Python libraries
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Directory that holds the Spotify CSV exports. Set the SPOTIFY_DATA_DIR
# environment variable to run this notebook on another machine; the fallback
# is the original author's local folder so existing setups keep working.
DATA_DIR = os.environ.get(
    "SPOTIFY_DATA_DIR",
    "C:/Users/ekwaning/Documents/Flatiron/Capstone/Capstone-Project/SpotifyData",
)

# `data` is the frame used for modeling in the cells below.
data = pd.read_csv(os.path.join(DATA_DIR, "data.csv"))
# The genre- and year-level files are loaded here but not used by the
# modeling cells visible in this notebook.
genre_data = pd.read_csv(os.path.join(DATA_DIR, "data_by_genres.csv"))
year_data = pd.read_csv(os.path.join(DATA_DIR, "data_by_year.csv"))

In [26]:
# Label-encode every text column: cast it to pandas' categorical dtype and keep
# only the integer category codes. Columns that are already categorical are
# encoded the same way.
for column in data.select_dtypes(include=['object', 'category']).columns:
    data[column] = data[column].astype('category').cat.codes

# Bucket rows into four equal-frequency bins of `popularity`. labels=False
# makes qcut return the integer bin index (0 = lowest bin, 3 = highest), which
# is stored as the classification target.
data['Quantile_rank'] = pd.qcut(x=data['popularity'], q=4, labels=False)

In [27]:
# Separate the class label (popularity quartile index) from the frame. pop()
# removes the column from `data`, so the cell that creates 'Quantile_rank'
# must be re-run before this cell can be executed a second time.
target = data.pop('Quantile_rank')

# Columns used as predictors. `popularity` itself is deliberately absent,
# because the target was derived from it.
feature_names = [
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'liveness', 'loudness', 'speechiness', 'tempo', 'valence',
    'duration_ms', 'explicit', 'key', 'mode', 'year',
]

# Hold out 30% of the rows for evaluation. random_state pins the shuffle so
# the train/test partition is the same on every run.
train_data, test_data, train_labels, test_labels = train_test_split(
    data[feature_names], target, test_size=0.3, random_state=42
)

## Random Forest

In [28]:
# Random forest of 250 trees, each capped at depth 25. random_state fixes the
# forest's internal randomness so refitting on the same training data gives
# the same model.
model = RandomForestClassifier(n_estimators=250, max_depth=25, random_state=42)
model.fit(train_data, train_labels)

In [29]:
# Predict the popularity-quartile label of every held-out test row with the
# fitted random forest.
predict_labels = model.predict(test_data)

In [30]:
# Per-class precision, recall and F1 of the random forest on the test split.
print(classification_report(y_true=test_labels, y_pred=predict_labels))

              precision    recall  f1-score   support

           0       0.91      0.91      0.91     12787
           1       0.60      0.75      0.67     12769
           2       0.55      0.52      0.54     13178
           3       0.76      0.61      0.68     12462

    accuracy                           0.70     51196
   macro avg       0.70      0.70      0.70     51196
weighted avg       0.70      0.70      0.70     51196



## XGBoost

In [31]:
# Gradient-boosted tree classifier. Note that this rebinds `model`, replacing
# the random forest fitted earlier in the notebook.
model = xgb.XGBClassifier(
    n_estimators=250,
    max_depth=15,
    min_child_weight=5,
    learning_rate=0.1,
)
model.fit(train_data, train_labels)

In [32]:
# Predict the popularity-quartile label of every held-out test row with the
# fitted XGBoost model. This overwrites the random forest's predictions stored
# under the same name.
predict_labels = model.predict(test_data)

In [33]:
# Per-class precision, recall and F1 of the XGBoost model on the test split.
print(classification_report(y_true=test_labels, y_pred=predict_labels))

              precision    recall  f1-score   support

           0       0.91      0.91      0.91     12787
           1       0.60      0.72      0.65     12769
           2       0.54      0.53      0.53     13178
           3       0.75      0.60      0.67     12462

    accuracy                           0.69     51196
   macro avg       0.70      0.69      0.69     51196
weighted avg       0.70      0.69      0.69     51196

