In [1]:
# Import Pandas
import pandas as pd

# Import preprocessing methods and train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Import Counter
from collections import Counter

# Import Sampling Methods
from imblearn.over_sampling import RandomOverSampler

# Import Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
import tensorflow as tf

# Import Methods for Metric Reporting
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [2]:
# Read the merged Spotify track table from the Resources folder and preview it
spotify_df = pd.read_csv(filepath_or_buffer="../Resources/merged_spotify_songs.csv")
spotify_df.head(5)

Unnamed: 0,id,name,artists,release_date,year,duration_ms,acousticness,danceability,energy,explicit,...,key,liveness,loudness,loudness_scaled,mode,popularity,speechiness,tempo,tempo_scaled,valence
0,02GDntOXexBFUvSgaXLPkd,Morceaux de fantaisie Op. 3: No. 2 Prélude in ...,['Sergei Rachmaninoff'],1921-01-01,1921,218773,0.993,0,0.088,0,...,1,0.363,-21.091,0.609334,0,0.02,0.0456,92.867,0.380461,0.0731
1,08zfJvRLp7pjAb94MA9JmF,Il Etait Syndiqué,['Fortugé'],1921-01-01,1921,196560,0.982,1,0.257,0,...,8,0.504,-16.415,0.682562,1,0.0,0.399,109.378,0.448103,0.771
2,0BMkRpQtDoKjcgzCpnqLNa,Dans La Vie Faut Pas S'en Faire,['Maurice Chevalier'],1921-01-01,1921,147133,0.995,0,0.26,0,...,9,0.258,-16.894,0.675061,1,0.0,0.0557,85.146,0.348829,0.826
3,0eQsdik7GTEy7M3UytCbSN,Morceaux de fantaisie Op. 3: No. 2 Prélude in ...,['Sergei Rachmaninoff'],1921-01-01,1921,218773,0.993,0,0.088,0,...,1,0.363,-21.091,0.609334,0,0.0,0.0456,92.867,0.380461,0.0731
4,0H3k2CvJvHULnWChlbeFgx,La Vipère,['Georgel'],1921-01-01,1921,190800,0.99,0,0.363,0,...,5,0.292,-12.562,0.742902,0,0.0,0.0546,174.532,0.715028,0.493


In [3]:
# Remove the non-numerical columns (id, name, artists, release_date) and the
# pre-scaled copies of loudness and tempo from the main DataFrame
spotify_df = spotify_df.drop(
    labels=["id", "name", "artists", "release_date", "loudness_scaled", "tempo_scaled"],
    axis="columns",
)
spotify_df.head(5)

Unnamed: 0,year,duration_ms,acousticness,danceability,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
0,1921,218773,0.993,0,0.088,0,0.527,1,0.363,-21.091,0,0.02,0.0456,92.867,0.0731
1,1921,196560,0.982,1,0.257,0,0.0,8,0.504,-16.415,1,0.0,0.399,109.378,0.771
2,1921,147133,0.995,0,0.26,0,0.0,9,0.258,-16.894,1,0.0,0.0557,85.146,0.826
3,1921,218773,0.993,0,0.088,0,0.527,1,0.363,-21.091,0,0.0,0.0456,92.867,0.0731
4,1921,190800,0.99,0,0.363,0,0.0,5,0.292,-12.562,0,0.0,0.0546,174.532,0.493


In [4]:
# key column represents the key the track is in (a category, not a magnitude):
# i.e 0 = C, 1 = C#/D♭, 2 = D, 3 = D#/E♭, ... , 11 = B
# Therefore, key column should be one-hot encoded before modelling

# Create a dense OneHotEncoder instance. scikit-learn 1.2 renamed the `sparse`
# argument to `sparse_output` and 1.4 removed the old spelling, so try the new
# name first and fall back to the old one for older installations.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit the encoder and produce the encoded DataFrame. Reusing spotify_df's index
# keeps the rows aligned for the index-based merge in the following cell even
# if the source frame does not carry a default 0..n-1 index.
encode_df = pd.DataFrame(
    enc.fit_transform(spotify_df.key.values.reshape(-1, 1)),
    index=spotify_df.index,
)

# Rename encoded columns to key_<category>
encode_df.columns = enc.get_feature_names_out(['key'])
encode_df.head()

Unnamed: 0,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Attach the one-hot key columns by row index, then discard the original key column
spotify_df = pd.merge(
    spotify_df,
    encode_df,
    left_index=True,
    right_index=True,
).drop("key", axis=1)
spotify_df.head(5)

Unnamed: 0,year,duration_ms,acousticness,danceability,energy,explicit,instrumentalness,liveness,loudness,mode,...,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
0,1921,218773,0.993,0,0.088,0,0.527,0.363,-21.091,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1921,196560,0.982,1,0.257,0,0.0,0.504,-16.415,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1921,147133,0.995,0,0.26,0,0.0,0.258,-16.894,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1921,218773,0.993,0,0.088,0,0.527,0.363,-21.091,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1921,190800,0.99,0,0.363,0,0.0,0.292,-12.562,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Target (y) is the danceability column; features (X) are every other column
X = spotify_df.loc[:, spotify_df.columns != "danceability"]
y = spotify_df.loc[:, "danceability"]

In [7]:
# Hold out 25% of the rows for testing; stratify on y so both partitions keep
# the same class proportions, and fix the seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    stratify=y,
    random_state=1,
)

In [8]:
# Build the scaler and learn its statistics from the training rows only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training and testing partitions alike
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [9]:
# Balanced Random Forest with 500 trees and a fixed seed
rf_model = BalancedRandomForestClassifier(random_state=1, n_estimators=500)

# Fit on the scaled training data, then predict labels for the scaled test data
# (fit returns the fitted estimator, so the two calls can be chained)
y_pred = rf_model.fit(X_train_scaled, y_train).predict(X_test_scaled)

In [10]:
# Confusion matrix of actual vs. predicted labels on the test set
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Wrap the matrix in a DataFrame with readable row/column labels
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [11]:
# Balanced accuracy of the test-set predictions
acc_score = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [12]:
# Displaying results for the Balanced Random Forest model:
# a title line, the labelled confusion matrix, the balanced accuracy score,
# and finally the per-class classification report.
print("Model: Balanced Random Forest Classifier\n")
print("Confusion Matrix")
# display() gives the DataFrame its rich table rendering instead of plain text
display(cm_df)
print(f"Balanced Accuracy Score: {acc_score}\n")
print("Classification Report:")
# classification_report returns a formatted string, so it is printed as text
print(classification_report(y_test, y_pred))

Model: Balanced Random Forest Classifier

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,13954,2982
Actual 1,4505,21009


Balanced Accuracy Score: 0.8236778198296866

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.82      0.79     16936
           1       0.88      0.82      0.85     25514

    accuracy                           0.82     42450
   macro avg       0.82      0.82      0.82     42450
weighted avg       0.83      0.82      0.82     42450



In [16]:
# Pair each test-set prediction with its true label; the resulting frame takes
# its row index from y_test
d = dict(y_pred=y_pred, y_actual=y_test)
df = pd.DataFrame(d)
df

Unnamed: 0,y_pred,y_actual
74471,0,0
111692,1,1
25632,0,0
59694,0,0
157590,0,0
...,...,...
133804,1,1
85883,1,1
15110,0,0
12840,0,0


In [18]:
# Release year of every track, indexed like the full dataset
df2 = spotify_df.loc[:, "year"]
df2

0         1921
1         1921
2         1921
3         1921
4         1921
          ... 
169792    2020
169793    2020
169794    2020
169795    2020
169796    2020
Name: year, Length: 169797, dtype: int64

In [19]:
# Look up the release year for each test-set row by matching on the row index
df3 = df.join(df2, how="inner")
df3

Unnamed: 0,y_pred,y_actual,year
74471,0,0,1973
111692,1,1,1991
25632,0,0,1948
59694,0,0,1965
157590,0,0,2014
...,...,...,...
133804,1,1,2002
85883,1,1,1978
15110,0,0,1940
12840,0,0,1939


In [21]:
# Persist the predictions, true labels and years to the Resources folder
df3.to_csv(path_or_buf="../Resources/machine_learning_data.csv")