# Modeling and Evaluation

In [7]:
#Importing libraries

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [8]:
# Load the cleaned Spotify feature table and preview its first five rows
spotify = pd.read_csv(filepath_or_buffer='../Data/Cleaned Data/Cleaned_SpotifyFeatures.csv')
spotify.head(n=5)

Unnamed: 0,popularity,acousticness,danceability,energy,instrumentalness,liveness,loudness,mode,speechiness,tempo,...,key_B,key_C,key_C#,key_D,key_D#,key_E,key_F,key_F#,key_G,key_G#
0,13,0.234,0.617,0.862,0.976,0.141,-12.855,1,0.0514,129.578,...,0,0,0,0,0,0,0,0,1,0
1,5,0.249,0.518,0.805,0.0,0.333,-6.248,1,0.0407,79.124,...,0,0,0,0,0,0,1,0,0,0
2,30,0.366,0.631,0.513,4e-06,0.109,-6.376,1,0.0293,120.365,...,0,0,0,1,0,0,0,0,0,0
3,39,0.815,0.768,0.137,0.922,0.113,-13.284,0,0.0747,76.43,...,0,0,1,0,0,0,0,0,0,0
4,70,0.131,0.748,0.627,0.0,0.0852,-6.029,1,0.0644,120.963,...,0,0,0,0,0,0,0,0,1,0


# KMeans Clusters

In [None]:
#Creating X (raw popularity is included here as a clustering feature)
X = spotify[['popularity', 'loudness', 'energy', 'danceability', 'time_signature', 'tempo', 
             'valence', 'acousticness', 'instrumentalness', 'liveness', 'mode', 'speechiness']]

#Scaling X so every feature contributes on the same scale to the distances
sc = StandardScaler()
X_sc = sc.fit_transform(X)

# Finding the best clusters/silhouette score in range 2-10
# Collects one (k, inertia, silhouette) tuple per candidate k.
# The list was previously created as `row` but appended to as `rows`,
# which raised a NameError on the first iteration.
rows = []

for k in range(2, 11):
    km = KMeans(n_clusters=k, random_state=2024)
    km.fit(X_sc)

    inert = km.inertia_
    # NOTE: the silhouette score is computed on every row; if this is too slow,
    # silhouette_score accepts a `sample_size` argument to score a random subset.
    sil = silhouette_score(X_sc, km.labels_)

    rows.append((k, inert, sil))



In [None]:
# Tabulate every candidate k next to its inertia and silhouette score
k_df = pd.DataFrame.from_records(rows, columns=['k', 'inertia', 'sil'])
k_df.head()

# 3-Class Classification

In [None]:
# Bucket popularity into three classes in a single pass:
#   0 -> 33 and below, 1 -> 34 through 66, 2 -> 67 and above
# Any value matching none of the conditions is left unchanged.
spotify['popularity'] = np.select(
    condlist=[
        spotify['popularity'] <= 33,
        spotify['popularity'].between(34, 66),
        spotify['popularity'] >= 67,
    ],
    choicelist=[0, 1, 2],
    default=spotify['popularity'],
)

spotify.head()

In [None]:
#Trying a DecisionTreeClassifier

# Creating X and y (audio features only; popularity class is the target)
X = spotify[['loudness', 'energy', 'danceability', 'time_signature', 'tempo', 'valence', 'acousticness', 'instrumentalness', 'liveness', 'mode', 'speechiness']]
y = spotify['popularity']

#TTS on X and y, stratified so both splits keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=2024)

#Instantiating decision tree model
# random_state matches the split seed so the scores are reproducible on re-run
tree = DecisionTreeClassifier(max_depth=10, min_samples_split=5, random_state=2024)

#Fitting X_train and y_train
tree.fit(X_train, y_train)

#Getting the score (train accuracy, test accuracy)
tree.score(X_train, y_train), tree.score(X_test, y_test)

In [None]:
#Trying a RandomForest

#Instantiating the rf
# Seeded so bootstrap samples and feature subsets, and therefore the scores, are reproducible
rf = RandomForestClassifier(oob_score=True, max_features='sqrt', random_state=2024)

#Fitting X_train and y_train
rf.fit(X_train, y_train)

#Getting the scores (train accuracy, test accuracy, out-of-bag accuracy)
rf.score(X_train, y_train), rf.score(X_test, y_test), rf.oob_score_

In [None]:
#Trying ExtraTreesClassifier

#Instantiating the extra-trees model
# Seeded so the random split thresholds, and therefore the scores, are reproducible
et = ExtraTreesClassifier(n_estimators=100, max_depth=35, min_samples_leaf=3, random_state=2024)

#Fitting the model to X_train and y_train
et.fit(X_train, y_train)

#Getting the scores (train accuracy, test accuracy)
et.score(X_train, y_train), et.score(X_test, y_test)

In [None]:
#Trying DNN

#Creating X and y
# 'Unnamed: 0' appears to be the row index that was saved into the CSV
# (0, 1, 2, ... in the preview); it carries no signal and must not be a feature.
# errors='ignore' keeps this cell working if the column is absent.
X = spotify.drop(columns=['popularity', 'Unnamed: 0'], errors='ignore')
y = spotify['popularity']

#TTS on X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

#Scaling: fit on the training split only, then apply the same transform to the test split
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [None]:
#Checking the shape of X_train: (number of training rows, number of feature columns)
X_train.shape

In [None]:
#Creating dnn1
col = X_train.shape[1]
# Size the output layer from the integer labels (0..k-1) instead of hard-coding 5:
# the target has three classes at this point in the notebook.
n_classes = int(y_train.max()) + 1

dnn1 = Sequential()
dnn1.add(Dense(100, activation='relu', input_shape=(col,)))
dnn1.add(Dense(50, activation='relu'))
dnn1.add(Dense(n_classes, activation='softmax'))

#Compiling Model
# Labels are integer class ids, so name the sparse categorical loss explicitly;
# the bare 'crossentropy' alias is not recognised by every Keras version.
dnn1.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

#Fitting the model
hist1 = dnn1.fit(
    X_train_sc, y_train,
    validation_data=(X_test_sc, y_test),
    batch_size=32,
    epochs=10
)

In [None]:
#Trying DNN with different X's

#Assigning New X and y (audio features plus four genre indicator columns)
X_dnn2 = spotify[['loudness', 'genre_Rap', 'genre_Rock', 'genre_Children\'s Music','genre_Pop', 'energy', 'danceability', 'time_signature', 'tempo', 'valence', 'acousticness', 'instrumentalness', 'liveness', 'mode', 'speechiness']]
y_dnn2 = spotify['popularity']

#Train test splitting
X_dnn2_train, X_dnn2_test, y_dnn2_train, y_dnn2_test = train_test_split(X_dnn2, y_dnn2, random_state=123)

#Scaling the data: fit on the training split only, then transform the matching test split
sc = StandardScaler()
X_dnn2_train_sc = sc.fit_transform(X_dnn2_train)
# Previously transformed `X2_test`, which does not exist at this point in the notebook
X_dnn2_test_sc = sc.transform(X_dnn2_test)

In [None]:
#Creating dnn2 with regularization technique "Dropout"

col = X_dnn2_train.shape[1]
# Size the output layer from the integer labels (0..k-1) instead of hard-coding 5
n_classes = int(y_dnn2_train.max()) + 1

dnn2 = Sequential()
dnn2.add(Dense(100, activation='relu', input_shape=(col,)))
dnn2.add(Dropout(0.5))
dnn2.add(Dense(50, activation='relu'))
dnn2.add(Dropout(0.5))
dnn2.add(Dense(50, activation='relu'))
dnn2.add(Dropout(0.5))
dnn2.add(Dense(65, activation='relu'))
dnn2.add(Dropout(0.5))
dnn2.add(Dense(n_classes, activation='softmax'))

#Compiling dnn2
# Integer class labels -> sparse categorical cross-entropy, named explicitly
# because the bare 'crossentropy' alias is not recognised by every Keras version.
dnn2.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])

#Fitting the model
# Train and validate on the dnn2 feature set. The model's input width comes from
# X_dnn2_train, so fitting on X_train_sc from the earlier DNN (as before) fed it
# data with a different number of columns.
hist2 = dnn2.fit(
    X_dnn2_train_sc, y_dnn2_train,
    validation_data=(X_dnn2_test_sc, y_dnn2_test),
    batch_size=32,
    epochs=10
)

In [None]:
#Plotting to see the loss

# Per-epoch loss recorded by Keras for the training data and the validation (test) data
train_loss = hist2.history['loss']
test_loss = hist2.history['val_loss']

plt.figure(figsize=(12, 8))
plt.plot(train_loss, label='Training loss', color='black')
plt.plot(test_loss, label='Testing loss', color='red')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('dnn2: training vs. testing loss')
# The line labels above are only drawn once legend() is called
plt.legend();

In [None]:
# Baseline: fraction of songs in each popularity class, ordered by class label
popularity_share = spotify['popularity'].value_counts(normalize=True)
popularity_share.sort_index()

# 5-Class Classification

In [None]:
# The 3-class target is very unbalanced, so reload the raw table and
# bucket popularity into five classes instead:
#   0 -> 20 and below, 1 -> 21-40, 2 -> 41-60, 3 -> 61-80, 4 -> 81 and above
spotify2 = pd.read_csv('../Data/Cleaned Data/Cleaned_SpotifyFeatures.csv')

# All bands are evaluated against the raw values in one pass;
# any value matching none of them is left unchanged.
spotify2['popularity'] = np.select(
    condlist=[
        spotify2['popularity'] <= 20,
        spotify2['popularity'].between(21, 40),
        spotify2['popularity'].between(41, 60),
        spotify2['popularity'].between(61, 80),
        spotify2['popularity'] >= 81,
    ],
    choicelist=[0, 1, 2, 3, 4],
    default=spotify2['popularity'],
)

spotify2.head()

In [None]:
# Fraction of songs falling in each of the five popularity classes
class_balance = spotify2['popularity'].value_counts(normalize=True)
class_balance

In [None]:
# Creating X2 and y2
X2 = spotify2[['loudness', 'energy', 'danceability', 'time_signature', 'tempo', 'valence']]
y2 = spotify2['popularity']

# TTS on X2 and y2. The tree-based cells that follow use X2_train / X2_test /
# y2_train / y2_test, which were never created anywhere in the notebook.
# Stratified and seeded the same way as the 3-class tree split.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, stratify=y2, random_state=2024)

In [None]:
#Trying RandomForest again

#Instantiating rf2 (seeded for reproducibility)
rf2 = RandomForestClassifier(n_estimators=150, oob_score=True, max_features='sqrt', random_state=2024)

#Creating parameters
# p = number of feature columns; max_features is searched over 1..p
p = X2.shape[1]
params2 = {
    'max_depth': np.append(np.arange(1, 21), None),   # depths 1-20 plus unlimited (None)
    'max_features': np.arange(1, p + 1),               # was `p2`, which is undefined
    'min_samples_leaf': np.arange(1, 31)
}

#Using a RandomSearchCV
# Searches over rf2 (was `rf3`, which is undefined); random_state makes the
# 50 sampled parameter combinations repeatable.
rs = RandomizedSearchCV(rf2, params2, n_iter=50, cv=5, n_jobs=4, random_state=2024)

#Fitting the model
rs.fit(X2_train, y2_train)

#Getting the score of the best estimator found, on the held-out test split
rs.score(X2_test, y2_test)

In [None]:
#Trying ExtraTrees again

#Instantiating (seeded so the scores are reproducible on re-run)
et2 = ExtraTreesClassifier(n_estimators=200, random_state=2024)

#Fitting the model
et2.fit(X2_train, y2_train)

#Getting the score (train accuracy, test accuracy)
et2.score(X2_train, y2_train), et2.score(X2_test, y2_test)

In [None]:
# Trying DNN again

In [None]:
#Assigning New X and y (audio features plus four genre indicator columns, 5-class target)
X3 = spotify2[['loudness', 'genre_Rap', 'genre_Rock', 'genre_Children\'s Music','genre_Pop', 'energy', 'danceability', 'time_signature', 'tempo', 'valence', 'acousticness', 'instrumentalness', 'liveness', 'mode', 'speechiness']]
y3 = spotify2['popularity']

#Train test splitting
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, random_state=123)

#Scaling the data: fit on the training split only, then transform the test split
sc = StandardScaler()
X3_train_sc = sc.fit_transform(X3_train)
X3_test_sc = sc.transform(X3_test)

#Creating dnn3 (same dropout architecture as dnn2; 5 output units for the 5 classes)
col3 = X3_train.shape[1]
dnn3 = Sequential()
dnn3.add(Dense(100, activation='relu', input_shape=(col3,)))
dnn3.add(Dropout(0.5))
dnn3.add(Dense(50, activation='relu'))
dnn3.add(Dropout(0.5))
dnn3.add(Dense(50, activation='relu'))
dnn3.add(Dropout(0.5))
dnn3.add(Dense(65, activation='relu'))
dnn3.add(Dropout(0.5))
dnn3.add(Dense(5, activation='softmax'))

#Compiling dnn3
# Integer class labels -> sparse categorical cross-entropy, named explicitly
# because the bare 'crossentropy' alias is not recognised by every Keras version.
dnn3.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])

#Fitting the model
hist5 = dnn3.fit(
    X3_train_sc, y3_train,
    validation_data=(X3_test_sc, y3_test),
    batch_size=32,
    epochs=10
)

In [None]:
#Trying DNN once more with original DNN model

#Creating X and y
# 'Unnamed: 0' appears to be the row index that was saved into the CSV;
# it carries no signal and must not be a feature.
# errors='ignore' keeps this cell working if the column is absent.
X = spotify2.drop(columns=['popularity', 'Unnamed: 0'], errors='ignore')
y = spotify2['popularity']

#TTS on X and y
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

#Scaling: fit on the training split only, then apply the same transform to the test split
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [None]:
#Creating dnn1 (rebuilt for the 5-class target; replaces the earlier dnn1/hist1)
col = X_train.shape[1]
dnn1 = Sequential()
dnn1.add(Dense(100, activation='relu', input_shape=(col,)))
dnn1.add(Dense(50, activation='relu'))
dnn1.add(Dense(5, activation='softmax'))   # one unit per popularity class (0-4)

#Compiling Model
# Integer class labels -> sparse categorical cross-entropy, named explicitly
# because the bare 'crossentropy' alias is not recognised by every Keras version.
dnn1.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

#Fitting the model
hist1 = dnn1.fit(
    X_train_sc, y_train,
    validation_data=(X_test_sc, y_test),
    batch_size=32,
    epochs=10
)