In [1]:
# Import Pandas
import pandas as pd

# Import preprocessing methods and train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Import Machine Learning Models
from imblearn.ensemble import BalancedRandomForestClassifier

# Import Methods for Metric Reporting
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

# Import matplotlib
import matplotlib.pyplot as plt

# Import Statistical Analysis metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
from sklearn.feature_selection import chi2

In [2]:
# Load and inspect the merged_spotify_songs.csv dataset
spotify_df = pd.read_csv("../Resources/merged_spotify_songs.csv")
spotify_df.head()

Unnamed: 0,id,name,artists,release_date,year,duration_ms,acousticness,danceability,energy,explicit,...,key,liveness,loudness,loudness_scaled,mode,popularity,speechiness,tempo,tempo_scaled,valence
0,02GDntOXexBFUvSgaXLPkd,Morceaux de fantaisie Op. 3: No. 2 Prélude in ...,['Sergei Rachmaninoff'],1921-01-01,1921,218773,0.993,0,0.088,0,...,1,0.363,-21.091,0.609334,0,0.02,0.0456,92.867,0.380461,0.0731
1,08zfJvRLp7pjAb94MA9JmF,Il Etait Syndiqué,['Fortugé'],1921-01-01,1921,196560,0.982,1,0.257,0,...,8,0.504,-16.415,0.682562,1,0.0,0.399,109.378,0.448103,0.771
2,0BMkRpQtDoKjcgzCpnqLNa,Dans La Vie Faut Pas S'en Faire,['Maurice Chevalier'],1921-01-01,1921,147133,0.995,0,0.26,0,...,9,0.258,-16.894,0.675061,1,0.0,0.0557,85.146,0.348829,0.826
3,0eQsdik7GTEy7M3UytCbSN,Morceaux de fantaisie Op. 3: No. 2 Prélude in ...,['Sergei Rachmaninoff'],1921-01-01,1921,218773,0.993,0,0.088,0,...,1,0.363,-21.091,0.609334,0,0.0,0.0456,92.867,0.380461,0.0731
4,0H3k2CvJvHULnWChlbeFgx,La Vipère,['Georgel'],1921-01-01,1921,190800,0.99,0,0.363,0,...,5,0.292,-12.562,0.742902,0,0.0,0.0546,174.532,0.715028,0.493


## Drop Non-Numerical Columns

In [3]:
# Drop non numberical and repeat columns from main DataFrame
spotify_df = spotify_df.drop(columns=["id", "name", "artists", "loudness_scaled", "tempo_scaled"])
spotify_df.head()

Unnamed: 0,release_date,year,duration_ms,acousticness,danceability,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence
0,1921-01-01,1921,218773,0.993,0,0.088,0,0.527,1,0.363,-21.091,0,0.02,0.0456,92.867,0.0731
1,1921-01-01,1921,196560,0.982,1,0.257,0,0.0,8,0.504,-16.415,1,0.0,0.399,109.378,0.771
2,1921-01-01,1921,147133,0.995,0,0.26,0,0.0,9,0.258,-16.894,1,0.0,0.0557,85.146,0.826
3,1921-01-01,1921,218773,0.993,0,0.088,0,0.527,1,0.363,-21.091,0,0.0,0.0456,92.867,0.0731
4,1921-01-01,1921,190800,0.99,0,0.363,0,0.0,5,0.292,-12.562,0,0.0,0.0546,174.532,0.493


## Feature Engineering

In [4]:
# Extract month from release_date
spotify_df["release_date"] = pd.to_datetime(spotify_df["release_date"])
spotify_df["month"] = spotify_df["release_date"].dt.month
spotify_df = spotify_df.drop(columns=["release_date"])

# Removing outliers in duration_ms with Percentiles - remove top 5% and bottom 1% of duration
upper_lim = spotify_df['duration_ms'].quantile(.95)
lower_lim = spotify_df['duration_ms'].quantile(.01)

spotify_df = spotify_df[(spotify_df['duration_ms'] < upper_lim) & (spotify_df['duration_ms'] > lower_lim)]
spotify_df.head()

Unnamed: 0,year,duration_ms,acousticness,danceability,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo,valence,month
0,1921,218773,0.993,0,0.088,0,0.527,1,0.363,-21.091,0,0.02,0.0456,92.867,0.0731,1
1,1921,196560,0.982,1,0.257,0,0.0,8,0.504,-16.415,1,0.0,0.399,109.378,0.771,1
2,1921,147133,0.995,0,0.26,0,0.0,9,0.258,-16.894,1,0.0,0.0557,85.146,0.826,1
3,1921,218773,0.993,0,0.088,0,0.527,1,0.363,-21.091,0,0.0,0.0456,92.867,0.0731,1
4,1921,190800,0.99,0,0.363,0,0.0,5,0.292,-12.562,0,0.0,0.0546,174.532,0.493,1


## Encoded key and month column

In [5]:
# key column represents the key the track is in:
# i.e 0 = C, 1 = C#/D♭, 2 = D, 3 = D#/E♭, ... , 11 = B
# Therefore,## Drop Non-Numerical Columns key column should be encoded

# Create OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit the encoder and produce encoded Dataframe
encode_df = pd.DataFrame(enc.fit_transform(spotify_df.key.values.reshape(-1,1)))

# Rename encoded columns
encode_df.columns = enc.get_feature_names_out(['key'])

# Merge the two DataFrames together and drop the key column
spotify_df = spotify_df.merge(encode_df,left_index=True,right_index=True).drop(columns=["key"])
spotify_df.head()

Unnamed: 0,year,duration_ms,acousticness,danceability,energy,explicit,instrumentalness,liveness,loudness,mode,...,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
0,1921,218773,0.993,0,0.088,0,0.527,0.363,-21.091,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1921,196560,0.982,1,0.257,0,0.0,0.504,-16.415,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1921,147133,0.995,0,0.26,0,0.0,0.258,-16.894,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1921,218773,0.993,0,0.088,0,0.527,0.363,-21.091,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1921,190800,0.99,0,0.363,0,0.0,0.292,-12.562,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Encoding the month column

# Fit the encoder and produce encoded Dataframe
encode_df = pd.DataFrame(enc.fit_transform(spotify_df.month.values.reshape(-1,1)))

# Rename encoded columns
encode_df.columns = enc.get_feature_names_out(['month'])

# Merge the two DataFrames together and drop the month column
spotify_df = spotify_df.merge(encode_df,left_index=True,right_index=True).drop(columns=["month"])
spotify_df.head()

Unnamed: 0,year,duration_ms,acousticness,danceability,energy,explicit,instrumentalness,liveness,loudness,mode,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,1921,218773,0.993,0,0.088,0,0.527,0.363,-21.091,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1921,196560,0.982,1,0.257,0,0.0,0.504,-16.415,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1921,147133,0.995,0,0.26,0,0.0,0.258,-16.894,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1921,218773,0.993,0,0.088,0,0.527,0.363,-21.091,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1921,190800,0.99,0,0.363,0,0.0,0.292,-12.562,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Split data into input (X) and output (y)

In [7]:
# Separate the dataset into features (X) and target (y)
y = spotify_df["danceability"]
X = spotify_df.drop(columns=["danceability"])

## Split the Data into Training and Testing (75%/25%)

In [8]:
## # Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y, train_size=0.75)

## Scale input (X) data

In [9]:
## # Create a StandardScaler instance
scaler = StandardScaler()

# Fit the Standard Scaler
X_scaler = scaler.fit(X_train)

# Scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Define, Train, and Make Predictions for model

In [10]:
# ## Define the Balanced Random Forest Classifier Model
rf_model = BalancedRandomForestClassifier(n_estimators=256, random_state=1)

# Resample the training data with BalancedRandomForestClassifier
rf_model.fit(X_train_scaled, y_train)

# Make prediction
y_pred = rf_model.predict(X_test_scaled)

## Validate model

In [11]:
## # Create a Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [12]:
# Calculating the accuracy score.
acc_score = balanced_accuracy_score(y_test, y_pred)

In [13]:
# Displaying results
print("Model: Balanced Random Forest Classifier\n")
print("Confusion Matrix")
display(cm_df)
print(f"Balanced Accuracy Score: {acc_score}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Model: Balanced Random Forest Classifier

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,11593,2482
Actual 1,3831,16997


Balanced Accuracy Score: 0.8198619412111241

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.82      0.79     14075
           1       0.87      0.82      0.84     20828

    accuracy                           0.82     34903
   macro avg       0.81      0.82      0.81     34903
weighted avg       0.82      0.82      0.82     34903



## Creating a dataframe with y_pred, y_test, and year

In [14]:
# Creating a dataframe with y_pred and y_test
d = {'y_pred': y_pred,
     'y_actual': y_test}
y_df = pd.DataFrame(data=d)
y_df

Unnamed: 0,y_pred,y_actual
130598,1,1
114444,1,0
28802,0,0
68804,1,1
71072,0,0
...,...,...
113170,0,1
91519,0,0
47272,0,0
17747,1,0


In [15]:
# Extracting year from spotify_df
year_df = spotify_df["year"]
year_df

0         1921
1         1921
2         1921
3         1921
4         1921
          ... 
149489    2010
149490    2010
149491    2010
149492    2010
149493    2010
Name: year, Length: 139611, dtype: int64

In [16]:
# Merging year_df and y_df
viz_df = y_df.merge(year_df, left_index=True, right_index=True)
viz_df

Unnamed: 0,y_pred,y_actual,year
130598,1,1,2001
114444,1,0,1993
28802,0,0,1950
68804,1,1,1970
71072,0,0,1971
...,...,...,...
113170,0,1,1992
91519,0,0,1981
47272,0,0,1959
17747,1,0,1942


## Save viz_df for visualization on Tableau

In [17]:
viz_df.to_csv("../Resources/machine_learning_data.csv", index=False)