# Writing the next hit song
An exploratory analysis on maximising my chances of making the next hit song.  
Explore the Spotify dataset for features that may give me the edge on writing a popular song.  
  
Dataset: https://www.kaggle.com/yamaerenay/spotify-dataset-19212020-160k-tracks

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set up folder hierarchy and load in data

In [2]:
# Folder that holds the raw, externally sourced data files
DIR_DATA_RAW = os.path.join(".", "data", "external")

# List every file under the raw-data folder.
# Join each filename with the directory os.walk is currently visiting (not the
# root), so files inside nested subfolders are printed with their real path.
for dirpath, _dirnames, filenames in os.walk(DIR_DATA_RAW):
    for filename in filenames:
        print(os.path.join(dirpath, filename))

In [3]:
# Read the tracks table: first CSV column becomes the index and
# 'release_date' is parsed into datetimes
tracks_csv = os.path.join(DIR_DATA_RAW, 'tracks.csv')
df = pd.read_csv(tracks_csv, index_col=0, parse_dates=['release_date'])

FileNotFoundError: [Errno 2] File b'.\\data\\external\\tracks.csv' does not exist: b'.\\data\\external\\tracks.csv'

# Get a feel for the dataset

In [None]:
# Show the loaded DataFrame as the cell output for a first look at its rows and columns
df

In [None]:
# Keep only tracks released within the last YEARS_OF_DATA years of the newest
# release in the dataset, ordered newest first
YEARS_OF_DATA = 100
release_cutoff = df.release_date.max() - pd.DateOffset(years=YEARS_OF_DATA)
is_recent = df.release_date > release_cutoff
df = df[is_recent].sort_values(by='release_date', ascending=False)

In [None]:
# Summary statistics for the DataFrame's columns
df.describe()

In [None]:
# Print column dtypes and non-null counts
df.info()  # 587k rows, all numerical except name, artists, id_artists and release_date

In [None]:
# count the missing (NaN) entries in each column
df.isna().sum()

In [None]:
# inspect the rows whose 'name' is missing, in case they point to a wider
# problem with the dataset
missing_name = df['name'].isna()
df[missing_name]

In [None]:
# no larger issues identified - drop rows containing NaN
df = df.dropna()

# drop rows with time_signature as 0; .copy() makes the filtered frame
# independent so the column assignment below does not act on a slice
df = df[df['time_signature'] != 0].copy()
# sanity check: an expression in the middle of a cell is never displayed,
# so print the minimum explicitly to confirm the zeros are gone
print('min time_signature:', df['time_signature'].min())

# duration in seconds: convert the values first, then rename the column so it
# keeps its original position in the frame
df['duration_ms'] = df['duration_ms'] / 1.0e3
df = df.rename(columns={'duration_ms': 'duration_s'})

# number of columns per dtype
dtypes = df.dtypes.value_counts()

In [None]:
# separate categorical and numerical entries as they are analysed differently
y = ['popularity']  # column of interest
cols_numerical = ['duration_s', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
cols_categorical = ['explicit', 'release_date', 'key', 'mode', 'time_signature']


# plot distributions of each numerical feature on a grid of `cols` columns
cols = 3
# derive the row count from `cols` (not a repeated literal) so changing the
# grid width only needs one edit
rows = int(np.ceil(len(cols_numerical) / float(cols)))
# squeeze=False keeps `axes` two-dimensional even when there is only one row
figure, axes = plt.subplots(rows, cols, figsize=(14, 8), squeeze=False)

# walk the grid row by row, one feature per panel
panels = axes.ravel()
for ax, col in zip(panels, cols_numerical):
    sns.distplot(df[col], ax=ax)

# hide the spare panels in the final row instead of leaving empty axes
for ax in panels[len(cols_numerical):]:
    ax.set_visible(False)

figure.tight_layout()  # fix spacing issues

# Top 10 tracks in key of C minor

In [None]:
# rows with key 0 and mode 0, ranked by popularity; show the ten highest
in_c_minor = (df['key'] == 0) & (df['mode'] == 0)
shown_columns = ['name', 'artists', 'popularity', 'key', 'mode']
df[in_c_minor].sort_values('popularity', ascending=False)[shown_columns].head(10)

# Top 3 most popular solo artists
### does not account for solo + collaborations; accounts for solo releases only

In [None]:
# Group tracks by their 'artists' entry and rank the groups by mean popularity.
# numeric_only=True restricts the mean to numeric columns; without it recent
# pandas versions raise on the string columns (e.g. name) instead of dropping them.
a = df.groupby('artists').mean(numeric_only=True).sort_values(by='popularity', ascending=False)
# count the artists in each group's label by splitting it on ', '
a['num_artists'] = [len(x.split(', ')) for x in a.index]
# display top 3 solo artists (rows are already sorted by popularity, descending)
a[a['num_artists'] == 1].head(3)

# What makes a song popular?

In [None]:
# correlation matrix over the numerical features plus the target column;
# correlation coefficients don't make sense for categorical columns
corr_columns = cols_numerical + y
df_corr = df[corr_columns].corr()

## Explore correlation coefficients

In [None]:
# arbitrary correlation threshold (0 to 1) used to pick out the most correlated entries
CORRELATION_THRESH = 0.2
abs_corr = df_corr.abs()  # .abs() to account for negatively-correlated entries
# != 1 removes the main diagonal (entries are, of course, perfectly correlated with themselves)
highly_correlated_indices = (abs_corr > CORRELATION_THRESH) & (abs_corr != 1)

# heatmap of the full correlation matrix, drawn on an explicit axes
f, a = plt.subplots(figsize=(10, 8))
sns.heatmap(df_corr, ax=a)

# Only the final expression of a cell is rendered, so the styled table goes
# last: coefficients at or below the threshold are replaced with 0
df_corr.where(highly_correlated_indices, 0).style.background_gradient(cmap='viridis')

In [None]:
# arbitrary correlation threshold (0 to 1) to highlight most correlated entries on right-hand side
CORRELATION_THRESH = 0.15
corr_magnitude = df_corr.abs()  # magnitude, so negative correlations are kept
# the "!= 1" term blanks the main diagonal (each entry vs itself)
highly_correlated_indices = np.multiply(corr_magnitude > CORRELATION_THRESH, corr_magnitude != 1)

f, a = plt.subplots(1, 2, figsize=(20, 8))
f.suptitle('Correlation coefficients of numerical data', fontsize=26)

# left panel: the full correlation matrix
a[0].set_title('Correlation coefficients', fontsize=16)
sns.heatmap(df_corr, ax=a[0])

# right panel: only the coefficients that pass the threshold (others become 0)
a[1].set_title('Correlation coefficients above threshold', fontsize=16)
sns.heatmap(np.multiply(highly_correlated_indices, df_corr), ax=a[1]);

# energy and loudness are highly correlated
# drop loudness or energy to avoid highly correlated columns disproportionately affecting results

In [None]:
# correlation of every feature with popularity, strongest positive first;
# the entry of popularity with itself is removed
popularity_corr = df_corr['popularity']
popularity_corr.drop('popularity').sort_values(ascending=False)

In [None]:
# energy is highly correlated with loudness, but is less correlated with popularity than loudness, so drop energy
# Guarded so that re-running this cell is harmless: list.remove and
# DataFrame.drop would otherwise raise once 'energy' is already gone.
if 'energy' in cols_numerical:
    cols_numerical.remove('energy')
df = df.drop(columns=['energy'], errors='ignore')

## Explore attributes that contain discrete data

In [None]:
rows = 1
cols = 3
plt.subplots(rows, cols, figsize=(13, 4))
sns.set_palette('colorblind')

# one bar chart of mean popularity per discrete attribute;
# musical key only makes sense plotted with the mode (major or minor), hence the hue
bar_panels = [
    ('Musical key vs mean popularity', {'x': 'key', 'hue': 'mode'}),
    ('Time signature vs mean popularity', {'x': 'time_signature'}),
    ('Explicit vs mean popularity', {'x': 'explicit'}),
]
for position, (panel_title, bar_kwargs) in enumerate(bar_panels, start=1):
    plt.subplot(rows, cols, position)  # select the next panel in the grid
    plt.title(panel_title)
    sns.barplot(y='popularity', data=df, **bar_kwargs)

plt.tight_layout()

## Findings so far
- Most popular keys are 1, 6 and 7, with mode 0, which corresponds to C# minor, F# minor and G minor (pitch-class notation: 0 = C, 1 = C#, ..., 6 = F#, 7 = G)
- 4/4 time signature is most popular on average
- Explicit songs are more popular than non-explicit songs

# Explore trends over time

In [None]:
# add column for year of release, using the vectorised datetime accessor
# rather than a Python-level loop over every row
df['year'] = df['release_date'].dt.year
df[['year', 'release_date']]

In [None]:
# Mean of every numeric column per release year.
# numeric_only=True keeps the string columns out of the mean; recent pandas
# versions raise on them instead of silently dropping them.
tracks_by_year = df.groupby('year').mean(numeric_only=True)
# 'year' is the index after the groupby, so order by the index
tracks_by_year = tracks_by_year.sort_index()
# doesn't make sense to use mean value of categorical fields, so drop categorical columns
tracks_by_year = tracks_by_year.drop(columns=['explicit', 'key', 'mode', 'time_signature'])
tracks_by_year

In [None]:
# filter out low correlation noise: only features whose |correlation| with
# popularity exceeds this threshold are drawn
CORRELATION_THRESH = 0.15

# normalise variables (z-score per column) to improve readability of the graph:
# every yearly-mean series ends up with mean 0 and standard deviation 1
tracks_by_year_normalised = (tracks_by_year - tracks_by_year.mean()) / tracks_by_year.std()

# create plot
figure, axes = plt.subplots(figsize=(15, 6))
popularity_corr = df_corr['popularity'].abs()
for col in tracks_by_year_normalised.columns:
    # plot if correlated enough with popularity; .get() skips any column that
    # has no entry in the correlation matrix instead of raising KeyError
    if popularity_corr.get(col, 0) > CORRELATION_THRESH:
        # label each line directly so the legend cannot drift out of step
        # with the order in which artists were added to the axes
        sns.lineplot(x=tracks_by_year_normalised.index, y=tracks_by_year_normalised[col], label=col, ax=axes)

#  looks like at least 4 local minima and maxima on popularity curve
#  choose 5th degree polynomial or greater
d = 6  # polynomial degree
# Polynomial.fit rescales the years to a small window before fitting, which
# keeps a high-degree fit on values around 2000 numerically well-conditioned;
# the fitted object is still called with plain years.
curve = np.polynomial.Polynomial.fit(
    tracks_by_year_normalised.index.values.astype(float),
    tracks_by_year_normalised['popularity'].values,
    d,
)
# every year from the first to the last, inclusive (+1 so the final year is not cut off)
x = np.arange(tracks_by_year.index.min(), tracks_by_year.index.max() + 1)
axes.plot(x, curve(x), color='black', label='popularity line of best fit')  # overlay best fit curve

axes.legend()
axes.set_ylabel('magnitude', fontsize=15)
axes.set_xlabel('year', fontsize=15)
axes.set_title('variables more highly correlated with popularity (normalised)', fontsize=20);


# Conclusion

## To maximise chances at writing a pop hit, write a song:
- in one of the following keys: C#, F# or G minor
- in 4/4 time
- that you can dance to (turn up the volume)
- include some explicit language
- must not be purely instrumental
- must not be acoustic
