In [None]:
# !pip install plotly

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import scipy.stats as sp
import plotly.express as px
%matplotlib inline

In [None]:
# Load the audio-features table, indexed by its `track_id` column.
df = pd.read_csv(
    "stats/stat_audio_features.txt",
    index_col="track_id",
)
df.head()

Valence, energy and danceability are interesting because they are somewhat subjective criteria defined by the Spotify team.
Key and mode can be easily determined. The question is: is Spotify right?
Loudness, tempo and duration are objective criteria that can be easily calculated.
Let's add a combined score of danceability, energy and valence to get an overall idea of the mood of the music.

In [None]:
# Composite "mood" score: danceability x energy x valence, multiplied row by row.
df["mood"] = df["danceability"].mul(df["energy"]).mul(df["valence"])

In [None]:
# Show column dtypes and non-null counts of the feature table.
df.info()

Transforming key and mode into plain text, and converting the duration from milliseconds to seconds.

In [None]:
def minute_seconds(x):
    """Format a duration given in milliseconds as a ``minutes'seconds`` string.

    The duration is converted to seconds and rounded with the built-in
    ``round``, then split into whole minutes and leftover seconds.
    The seconds part is not zero-padded, so 65000 ms gives ``"1'5"``.

    Args:
        x: Duration in milliseconds.

    Returns:
        The formatted duration as a string.
    """
    total_seconds = round(x / 1000)
    whole_minutes, leftover_seconds = divmod(total_seconds, 60)
    return f"{whole_minutes}'{leftover_seconds}"

# Duration in whole seconds (numeric), plus a readable minutes'seconds label.
duration_ms = df["duration_ms"]
df["duration_seconds"] = (duration_ms / 1000).round()
df["minute_seconds"] = duration_ms.apply(minute_seconds)

In [None]:
# Readable label for each numeric mode value.
dico_mode = {
    0: "minor",
    1: "major",
}
df["major_minor"] = df["mode"].map(dico_mode)

In [None]:
# Pitch-class number -> key name (0 is C, 1 is C#, ... 11 is B).
# Key 11 must be exactly "B": a stray comma inside the label would leak into
# every derived column, plot axis and statistical group.
dico_key = {
    0: "C",
    1: "C#",
    2: "D",
    3: "D#",
    4: "E",
    5: "F",
    6: "F#",
    7: "G",
    8: "G#",
    9: "A",
    10: "A#",
    11: "B",
}
# Translate the numeric key into its note name; values absent from dico_key become NaN.
df["key_name"]= df["key"].map(dico_key)

In [None]:

# Combine key name and mode into a single space-separated label (e.g. "C major").
df['key_mode'] = df['key_name'].str.cat(df['major_minor'],sep=' ')

We can add more information by adding track popularity and track name (for readability).

In [None]:
# Load the semicolon-separated track table, indexed by its `track_id` column.
df_tracks = pd.read_csv(
    "stats/stat_tracks_popularity.txt",
    sep=";",
    index_col="track_id",
)
df_tracks.head()

In [None]:
# Attach the track table to the audio features, matching rows on track_id.
# pandas' default join type (inner) applies: only ids present in both tables remain.
df = pd.merge(df, df_tracks, on="track_id")
df.head()

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include="all")

Let's analyse valence, danceability and energy

In [None]:
# Histograms of valence, danceability and energy, side by side in one row.
plt.figure(figsize=(50, 10))
for panel, feature in enumerate(("valence", "danceability", "energy"), start=1):
    plt.subplot(1, 3, panel)
    sns.histplot(data=df, x=feature)
    plt.title(feature)

In [None]:
# Box plots of valence, danceability and energy, side by side in one row.
plt.figure(figsize=(30, 10))
for panel, feature in enumerate(("valence", "danceability", "energy"), start=1):
    plt.subplot(1, 3, panel)
    sns.boxplot(data=df, x=feature)
    plt.title(feature)

In [None]:
# Q-Q plot of valence with a 45-degree reference line; fit=True lets statsmodels
# estimate the reference distribution's parameters from the data.
sm.qqplot(df['valence'], fit = True, line = '45');

In [None]:
# Q-Q plot of danceability with a 45-degree reference line; fit=True lets statsmodels
# estimate the reference distribution's parameters from the data.
sm.qqplot(df['danceability'], fit = True, line = '45');

In [None]:
# Q-Q plot of energy with a 45-degree reference line; fit=True lets statsmodels
# estimate the reference distribution's parameters from the data.
sm.qqplot(df['energy'], fit = True, line = '45');

In [None]:
# Pairwise correlation matrix of the numeric columns, annotated to two decimals.
sns.heatmap(data=df.select_dtypes(include=['number']).corr(),annot=True,fmt='.2f')

In [None]:
# Number of tracks per key.
sns.countplot(data=df,x='key_name')

Key is defined using pitch class notation: https://en.wikipedia.org/wiki/Pitch_class
where 0 is the key of C, 1 is C#, and so on.
Interestingly, the most frequently occurring key is not C but C#.
The least frequently occurring key is D# / E flat.

The key is not everything: one needs to know the mode to get a better grasp of the mood of the music.

In [None]:
# Number of tracks per mode (major / minor).
sns.countplot(data=df,x='major_minor')

In [None]:
# Number of tracks per key, split by mode.
sns.countplot(data=df,x='key_name',hue='major_minor')

So it seems that some keys are mostly associated with a specific mode: G, D, C and C#. Others are mainly played in minor, although not by a big margin: D#, F, F#, A#, B.

Is the higher occurrence of the minor mode in the key of B, compared with the other keys, statistically significant?
The chi-squared test of independence can help assess whether two categorical variables are independent or not.

In [None]:
# Contingency table of observed track counts: one row per key, one column per mode.
# pd.crosstab records a key/mode combination that never occurs as 0, whereas a
# groupby + pivot leaves it as NaN, which would then propagate into the
# chi-square test run on this table.
df_key_mode = pd.crosstab(index=df["key_name"], columns=df["major_minor"])
df_key_mode.head(20)

In [None]:
# Chi-square test of independence on the key x mode contingency table.
chi2, p, dof, expected = sp.chi2_contingency(df_key_mode)

# Report the scalar results first, then the table of expected counts.
for label, value in (
    ("Chi-square statistic:", chi2),
    ("P-value:", p),
    ("Degrees of freedom:", dof),
):
    print(label, value)
print("Expected frequencies:")
print(expected)

Based on these results, we can conclude that there is a significant relationship between key and mode.
This means that some keys are used with one mode significantly more often than with the other.

Is louder music more energetic? We can see that the correlation is quite strong at 0.75. But is it statistically significant?
We can use a Spearman rank correlation test to analyse that, as these are two continuous numeric variables.

In [None]:
# Spearman rank correlation between loudness and energy.
# NOTE(review): the second value is the Spearman test's p-value, despite the "pearson" in its name.
correl,p_value_pearson = sp.spearmanr(df["loudness"],df["energy"])
print(correl,p_value_pearson)

Looking at the p-value, we can conclude that the correlation between loudness and energy is statistically significant.

What about energy and valence? Is happy music more energetic than sad music?

In [None]:
# Spearman rank correlation between energy and valence: is happier music
# rated as more energetic? The first operand must be energy, not loudness,
# for the result to answer that question.
# The second value is the Spearman test's p-value, despite the "pearson" in its name.
correl, p_value_pearson = sp.spearmanr(df["energy"], df["valence"])
print(correl, p_value_pearson)

Once again, the correlation is statistically significant. Happier music seems to be rated as more energetic.

What about valence vs danceability? Is dance music happier music?

In [None]:
# Spearman rank correlation between danceability and valence.
# NOTE(review): the second value is the Spearman test's p-value, despite the "pearson" in its name.
correl,p_value_pearson = sp.spearmanr(df["danceability"],df["valence"])
print(correl,p_value_pearson)

looks like it is.

So the more energetic the music, the more likely it is to be of a positive mood and the more likely it is to be danceable

Let's cross-check key/mode against the mood score (danceability x energy x valence).

In [None]:
# Fit an OLS model of mood on the key/mode label, then display its ANOVA table.
result=smf.ols('mood ~ key_mode', data=df).fit()
sm.stats.anova_lm(result)

In [None]:
# Tukey HSD post-hoc test: pairwise comparison of mood between key/mode groups.
tukey_results = pairwise_tukeyhsd(df['mood'], df['key_mode'])



In [None]:
# Convert the Tukey HSD summary table into a DataFrame: the first row holds the
# column names, the remaining rows hold one pairwise comparison each.
header, *rows = tukey_results.summary().data
df_results = pd.DataFrame(data=rows, columns=header)

In [None]:
# Inspect the columns, dtypes and row count of the Tukey results table.
df_results.info()

In [None]:
# Keep only the pairs whose adjusted p-value is below 0.05, then summarise them.
df_turkey_significant = df_results.loc[df_results["p-adj"].lt(0.05)]
df_turkey_significant.info()

In [None]:
# Number of tracks per time signature.
sns.countplot(data=df,x='time_signature')

Which genres are associated with 5/4? Is it accurate?
Same question for 3/4.

In [None]:
# Distribution of track duration in seconds.
sns.boxplot(data=df,x='duration_seconds')

In [None]:
# Preview the tracks whose time_signature value is 5.
df_temp = df.loc[df["time_signature"].eq(5)]
df_temp.head()

In [None]:
# Plotly box plot of track duration in seconds.
px.box(df,x='duration_seconds') 

Track duration does not appear to follow a normal distribution.

In [None]:
# Q-Q plot of track duration with a 45-degree reference line; fit=True lets statsmodels
# estimate the reference distribution's parameters from the data.
sm.qqplot(df['duration_seconds'], fit = True, line = '45') ;

Tempo appears to follow a normal distribution.

In [None]:
# Q-Q plot of tempo with a 45-degree reference line; fit=True lets statsmodels
# estimate the reference distribution's parameters from the data.
sm.qqplot(df['tempo'], fit = True, line = '45');

In [None]:
# Pearson correlation between loudness and danceability, with its p-value.
correl,p_value_pearson = sp.pearsonr(df['loudness'],df['danceability'])
print(correl,p_value_pearson)