In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

df = pd.read_csv("spotify_top_music.csv")

# Quick look at the data

In [6]:
df.head()

Unnamed: 0,title,artist,top genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
0,"Hey, Soul Sister",Train,neo mellow,2010,97,89,67,-4,8,80,217,19,4,83
1,Love The Way You Lie,Eminem,detroit hip hop,2010,87,93,75,-5,52,64,263,24,23,82
2,TiK ToK,Kesha,dance pop,2010,120,84,76,-3,29,71,200,10,14,80
3,Bad Romance,Lady Gaga,dance pop,2010,119,92,70,-4,8,71,295,0,4,79
4,Just the Way You Are,Bruno Mars,pop,2010,109,84,64,-5,9,43,221,2,4,78


In [7]:
df.describe()

Unnamed: 0,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
count,603.0,603.0,603.0,603.0,603.0,603.0,603.0,603.0,603.0,603.0,603.0
mean,2014.59204,118.545605,70.504146,64.379768,-5.578773,17.774461,52.225539,224.674959,14.3267,8.358209,66.52073
std,2.607057,24.795358,16.310664,13.378718,2.79802,13.102543,22.51302,34.130059,20.766165,7.483162,14.517746
min,2010.0,0.0,0.0,0.0,-60.0,0.0,0.0,134.0,0.0,0.0,0.0
25%,2013.0,100.0,61.0,57.0,-6.0,9.0,35.0,202.0,2.0,4.0,60.0
50%,2015.0,120.0,74.0,66.0,-5.0,12.0,52.0,221.0,6.0,5.0,69.0
75%,2017.0,129.0,82.0,73.0,-4.0,24.0,69.0,239.5,17.0,9.0,76.0
max,2019.0,206.0,98.0,97.0,-2.0,74.0,98.0,424.0,99.0,48.0,99.0


# Initial Data Analysis

Firstly, we'll look at the top 5 genres in our dataset:

In [12]:
# Load the data
df = pd.read_csv("spotify_top_music.csv")

# Count the number of songs in each genre
genre_counts = df["top genre"].value_counts()

# Get the top 5 genres as a DataFrame
top_genres = genre_counts[:5].reset_index()
top_genres.columns = ['Genre', 'Number of Songs']

# Create a bar plot
fig = px.bar(top_genres, x='Genre', y='Number of Songs', title='Top 5 Most Popular Genres')
fig.update_layout(xaxis_title="Genre", yaxis_title="Number of Songs")

# Show the plot
fig.show()


As we can see, the top 4 genres are subgenres of pop music. We can deduce that the pop music is indeed the most popular, as seen below:

In [14]:
# Load the data
df = pd.read_csv("spotify_top_music.csv")

df['is_pop'] = df['top genre'].apply(lambda x: 'pop' in x.lower())

# Count the number of pop and non-pop genres
genre_counts = df['is_pop'].value_counts().reset_index()

# Rename the columns for the plot
genre_counts.columns = ['Genre', 'Count']

# Create the interactive pie chart
fig = px.pie(genre_counts, values='Count', names='Genre', title='Music Genres')
fig.show()

We can also look at the most popular artists:

In [23]:
# Load the data
df = pd.read_csv("spotify_top_music.csv")

# Count the number of songs for each artist
artist_counts = df["artist"].value_counts()

# Get the top 5 artists as a DataFrame
top_artists = artist_counts[:5].reset_index()
top_artists.columns = ['Artist', 'Number of Songs']

# Create a bar plot
fig = px.bar(top_artists, x='Artist', y='Number of Songs', title='Top 5 Most Popular Artists')
fig.update_layout(xaxis_title="Artist", yaxis_title="Number of Songs")

# Show the plot
fig.show()


Let's analyse the data in another way (data variation over the years):

In [24]:
# Load the data
df = pd.read_csv("spotify_top_music.csv")

# Convert year column to datetime format
df['year'] = pd.to_datetime(df['year'], format='%Y')

# Group by year and calculate mean of numeric columns
yearly_data = df.groupby('year').mean(numeric_only=True)

yearly_data['dur'] = yearly_data['dur']

# Reset index so that year is a column (needed for Plotly)
yearly_data.reset_index(inplace=True)

# Create a line plot
fig = px.line(yearly_data, x='year', y='dur', title='Duration Time-Series by Year')
fig.update_layout(xaxis_title="Year", yaxis_title="Duration (minutes)")

# Show the plot
fig.show()


In [17]:
import pandas as pd
import plotly.express as px

# Load the data
df = pd.read_csv("spotify_top_music.csv")

# Convert year column to datetime format
df['year'] = pd.to_datetime(df['year'], format='%Y')

# Group by year and calculate mean of numeric columns
yearly_data = df.groupby('year').mean(numeric_only=True)

# Reset index so that year is a column (needed for Plotly)
yearly_data.reset_index(inplace=True)

# Create a line plot
fig = px.line(yearly_data, x='year', y='dnce', title='Danceability Time-Series by Year')
fig.update_layout(xaxis_title="Year", yaxis_title="Danceability")

# Show the plot
fig.show()


As we can see in the 2 graphs, the song average duration is lower in recent times. This is due to the fact that all songs in 2018 and 2019 are pop subgenres and those songs have a lower duration on average.

In [19]:
# Load the data
df = pd.read_csv("spotify_top_music.csv")

df[df["year"] >= 2018]

Unnamed: 0,title,artist,top genre,year,bpm,nrgy,dnce,dB,live,val,dur,acous,spch,pop
508,One Kiss (with Dua Lipa),Calvin Harris,dance pop,2018,124,86,79,-3,8,59,215,4,11,86
509,Havana (feat. Young Thug),Camila Cabello,dance pop,2018,105,52,77,-4,13,39,217,18,3,85
510,I Like It,Cardi B,pop,2018,136,73,82,-4,37,65,253,10,13,85
511,New Rules,Dua Lipa,dance pop,2018,116,70,76,-6,15,61,209,0,7,84
512,There's Nothing Holdin' Me Back,Shawn Mendes,canadian pop,2018,122,81,87,-4,8,97,199,38,6,84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
598,Find U Again (feat. Camila Cabello),Mark Ronson,dance pop,2019,104,66,61,-7,20,16,176,1,3,75
599,Cross Me (feat. Chance the Rapper & PnB Rock),Ed Sheeran,pop,2019,95,79,75,-6,7,61,206,21,12,75
600,"No Brainer (feat. Justin Bieber, Chance the Ra...",DJ Khaled,dance pop,2019,136,76,53,-5,9,65,260,7,34,70
601,Nothing Breaks Like a Heart (feat. Miley Cyrus),Mark Ronson,dance pop,2019,114,79,60,-6,42,24,217,1,7,69


Another thing we can observe is that danceability had also increased in recent times, because most songs in 2018-2019 are easy to dance to (dance pop genre / electropop genre)

# Data correlation

We can start with a simple correlation matrix to gather some insights on how some variables are correlated to one another

In [26]:
# Load the data
df = pd.read_csv("spotify_top_music.csv")

# Calculate correlation matrix
corr_matrix = df.corr(numeric_only=True)

# Generate a heatmap
heatmap = ff.create_annotated_heatmap(z=corr_matrix.values, x=list(corr_matrix.columns), y=list(corr_matrix.index), 
                                      annotation_text=corr_matrix.round(2).values, colorscale='Viridis')

heatmap.update_layout(title='Correlation Heatmap', xaxis_title='Variable', yaxis_title='Variable')
heatmap.show()


We can spot some high correlations between:

- loudness (db) and energy (nrgy) - positive correlation
- valence (val) and energy (nrgy) - positive correlation
- valence (val) and danceability (dnce) - positive correlation
- acousticness (acous) and energy (nrgy) - negative correlation

In [30]:
# Load the data
df = pd.read_csv("spotify_top_music.csv")

fig = px.scatter(df, x='val', y='nrgy', trendline='ols', title='Valence vs Energy')

# Show the plot
fig.show()


We can say that a positive song is also energetic

In [29]:
# Load the data
df = pd.read_csv("spotify_top_music.csv")

fig = px.scatter(df, x='val', y='dnce', trendline='ols', title='Valence vs Danceability')

# Show the plot
fig.show()

We can say that if a song is positive, it's also easier to dance to

In [33]:
# Load the data
df = pd.read_csv("spotify_top_music.csv")

fig = px.scatter(df, x='nrgy', y='dB', trendline='ols', title='Energy vs Loudness')

# Show the plot
fig.show()

We can say that if a song has a high energy, it's louder

In [34]:
# Load the data
df = pd.read_csv("spotify_top_music.csv")

fig = px.scatter(df, x='acous', y='nrgy', trendline='ols', title='Acousticness vs Energy')

# Show the plot
fig.show()

We can say that if a song is acoustic (high value) it also has low energy levels

# ML model to predict the genre

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

df = pd.read_csv("spotify_top_music.csv")

features = df.drop(['top genre', 'title'], axis=1)
target = df['top genre']

# the categorical feature to be encoded
categorical_features = ['artist']

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Create the pipeline
rfc_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', RandomForestClassifier())])

# Fit the model
rfc_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = rfc_pipeline.predict(X_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("F1 Score: ", f1_score(y_test, y_pred, average='weighted'))

Accuracy:  0.9090909090909091
F1 Score:  0.8808035220027483


In [52]:
import plotly.express as px

# Get the feature importances from the random forest classifier
feature_importances = rfc_pipeline.named_steps['classifier'].feature_importances_

# Get feature names from the OneHotEncoder
ohe_feature_names = rfc_pipeline.named_steps['preprocessor'].transformers_[0][1].get_feature_names_out(categorical_features)

# Combine the OneHotEncoded feature names with the remaining feature names
all_feature_names = np.concatenate([ohe_feature_names, features.drop(categorical_features, axis=1).columns])

# Sort features and their importances
sorted_idx = np.argsort(feature_importances)[::-1]
sorted_feature_importances = feature_importances[sorted_idx]
sorted_feature_names = all_feature_names[sorted_idx]

# Create a dataframe to plot
df_plot = pd.DataFrame({'Feature': sorted_feature_names[:10], 'Importance': sorted_feature_importances[:10]})

# Create the interactive bar chart
fig = px.bar(df_plot, y='Feature', x='Importance', orientation='h', title="Feature Importance in RandomForest Classifier")

fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    yaxis={'categoryorder':'total ascending'}
)

fig.show()

In [54]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

df = pd.read_csv("spotify_top_music.csv")

features = df.drop(['top genre', 'artist', 'title'], axis=1)
target = df['top genre']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Fit the RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

# Make predictions
y_pred = classifier.predict(X_test)

# Print the metrics
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("F1 Score: ", f1_score(y_test, y_pred, average='weighted'))

Accuracy:  0.5619834710743802
F1 Score:  0.44664830119375576


In [55]:
import plotly.express as px

# Get feature importances from the RandomForestClassifier
feature_importances = classifier.feature_importances_

# Sort features and their importances
sorted_idx = np.argsort(feature_importances)[::-1]
sorted_feature_importances = feature_importances[sorted_idx]
sorted_feature_names = features.columns[sorted_idx]

# Create a dataframe to plot
df_plot = pd.DataFrame({'Feature': sorted_feature_names, 'Importance': sorted_feature_importances})

# Create the interactive bar chart
fig = px.bar(df_plot, y='Feature', x='Importance', orientation='h', title="Feature Importance in RandomForest Classifier")

fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    yaxis={'categoryorder':'total ascending'}
)

fig.show()
