In [127]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import ast
import json

# Computation of aggregated audio features by genre

In [128]:
# Relative directory that holds the input CSV files.
path = "../data/"
# Two tables: one row per artist and one row per track.
artists = pd.read_csv(f"{path}artists.csv")
tracks = pd.read_csv(f"{path}tracks.csv")

In [129]:
# Preview the first rows of the artists table.
artists.head()

Unnamed: 0,id,followers,genres,name,popularity
0,0DheY5irMjBUeLybbCUEZ2,0.0,[],Armid & Amir Zare Pashai feat. Sara Rouzbehani,0
1,0DlhY15l3wsrnlfGio2bjU,5.0,[],ปูนา ภาวิณี,0
2,0DmRESX2JknGPQyO15yxg7,0.0,[],Sadaa,0
3,0DmhnbHjm1qw6NCYPeZNgJ,0.0,[],Tra'gruda,0
4,0Dn11fWM7vHQ3rinvWEl4E,2.0,[],Ioannis Panoutsopoulos,0


In [130]:
# Preview the first rows of the tracks table.
tracks.head()

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,35iwgR4jXetI318WEWsa1Q,Carve,6,126903,0,['Uli'],['45tIt06XoI0Iio4LBEVpls'],1922-02-22,0.645,0.445,0,-13.338,1,0.451,0.674,0.744,0.151,0.127,104.851,3
1,021ht4sdgPcrDgSk7JTbKY,Capítulo 2.16 - Banquero Anarquista,0,98200,0,['Fernando Pessoa'],['14jtPCOoNZwquk5wd9DxrY'],1922-06-01,0.695,0.263,0,-22.136,1,0.957,0.797,0.0,0.148,0.655,102.009,1
2,07A5yehtSnoedViJAZkNnc,Vivo para Quererte - Remasterizado,0,181640,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.434,0.177,1,-21.18,1,0.0512,0.994,0.0218,0.212,0.457,130.418,5
3,08FmqUhxtyLTn6pAh6bk45,El Prisionero - Remasterizado,0,176907,0,['Ignacio Corsini'],['5LiOoJbxVSAMkBS2fUm3X2'],1922-03-21,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918,0.104,0.397,169.98,3
4,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,['Dick Haymes'],['3BiJGZsyX9sJchTqcSA7Su'],1922,0.402,0.158,3,-16.9,0,0.039,0.989,0.13,0.311,0.196,103.22,4


In [131]:
# Convert release date to only the year (kept as a 4-character string).
# The column mixes full dates ('1922-02-22') and bare years ('1922'). Recent
# pandas versions infer a single datetime format from the first value and
# raise on the others, so the year is extracted textually instead of going
# through pd.to_datetime.
release_dates = tracks['release_date']
release_year = release_dates.astype(str).str.extract(
    r'^(\d{4})(?:-\d{2}(?:-\d{2})?)?$', expand=False
)
# Keep the strictness of errors='raise': any non-null value that is not
# YYYY, YYYY-MM or YYYY-MM-DD aborts the cell instead of being silently lost.
unparsable = release_dates[release_dates.notna() & release_year.isna()]
if not unparsable.empty:
    raise ValueError(
        f"Unexpected release_date values: {unparsable.unique()[:5].tolist()}"
    )
tracks['release_date'] = release_year

## All genres

In [132]:
# Audio-feature columns of `tracks` that get averaged per release year.
features = ["danceability", "energy", "acousticness", "instrumentalness", "liveness", "valence"]

In [133]:
# Number of tracks with a non-null id, stored next to the "all genres" statistics.
count = tracks['id'].count()
# Mean of every audio feature per release year, over all tracks.
stats = tracks.groupby('release_date')[features].mean()
stats

Unnamed: 0_level_0,danceability,energy,acousticness,instrumentalness,liveness,valence
release_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1900,0.659000,0.791000,0.139000,0.000002,0.161000,0.956000
1922,0.533320,0.324054,0.898857,0.324971,0.250670,0.563605
1923,0.637332,0.266977,0.859965,0.157659,0.225396,0.671967
1924,0.593344,0.356725,0.866266,0.339628,0.203447,0.554935
1925,0.617391,0.263749,0.912170,0.275384,0.255157,0.635196
...,...,...,...,...,...,...
2017,0.624972,0.659730,0.278547,0.089228,0.200736,0.493965
2018,0.634510,0.652202,0.278446,0.098398,0.198216,0.483251
2019,0.649724,0.637066,0.293902,0.081428,0.189971,0.494212
2020,0.657129,0.639669,0.271750,0.113089,0.192968,0.501712


In [134]:
# Year 1900 shouldn't be here (presumably a placeholder release date - TODO confirm),
# so it is removed from the yearly averages.
# errors='ignore' keeps the cell re-runnable: a second execution no longer
# raises KeyError, and it matches how the per-genre statistics drop this year.
stats = stats.drop(index='1900', errors='ignore')

In [135]:
# results maps a genre label to (track count, {feature: {year: mean}});
# the 'all' entry is computed over every track regardless of genre.
results = {'all': (count, stats.to_dict())}

## By genre

In [None]:
def str_to_list(x):
    """Parse a Python-literal string such as "['a', 'b']" into the object it denotes.

    Parameters
    ----------
    x : any
        A string containing a Python literal, or an already-parsed value.

    Returns
    -------
    The value produced by ``ast.literal_eval`` when ``x`` is a string
    (including str subclasses); otherwise ``x`` itself, unchanged, so the
    function can safely be applied twice to the same column.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when the string is not a valid literal.
    """
    if isinstance(x, str):
        return ast.literal_eval(x)
    return x

# Turn the stringified lists read from the CSV files into real Python lists.
tracks['id_artists'] = tracks['id_artists'].apply(str_to_list)
# Artists without any genre cannot contribute to per-genre statistics.
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the original `artists` (avoids SettingWithCopyWarning).
artists = artists[artists['genres'] != "[]"].copy()
artists['genres'] = artists['genres'].apply(str_to_list)

In [None]:
# One row per (track, artist) and one row per (artist, genre), so the inner
# join below produces one row per (track, artist, genre).
tracks_exploded = tracks.explode('id_artists')
artists_exploded = artists.explode('genres')

# Both tables have an `id` column; after the merge `id_x` is the track id.
merged = pd.merge(tracks_exploded, artists_exploded, left_on='id_artists', right_on='id')
# Remove duplicated tracks due to multiple artists. Still keep one duplicate per genre
merged = merged[['id_x', 'genres', 'release_date'] + features].drop_duplicates()
genres = merged.groupby('genres')
# Loop-specific names (genre_count / genre_stats) are used on purpose: reusing
# `count` and `stats` would overwrite the notebook-level values that the
# "all genres" cells rely on when they are re-run.
for genre, genre_tracks in genres:
    genre_count = genre_tracks['id_x'].count()
    by_year = genre_tracks.groupby('release_date')[features]
    # Remove years that have only one song, it's most likely incorrect (e.g. hip hop in the 30's)
    genre_stats = by_year.mean().loc[by_year.size() > 1]
    genre_stats = genre_stats.drop(index='1900', errors='ignore')
    results[genre] = (genre_count, genre_stats.to_dict())

In [None]:
# Some genres have strong overlaps in the songs
def compare_genres(g1, g2):
    """Print how many distinct tracks each genre has and how many they share.

    Reads the notebook-level ``merged`` frame (columns ``genres`` and ``id_x``)
    and prints three lines: the number of unique track ids tagged ``g1``, the
    number tagged ``g2``, and ``Overlap`` followed by the number of unique
    track ids tagged with both.
    """
    ids_g1 = merged.loc[merged.genres == g1, 'id_x']
    ids_g2 = merged.loc[merged.genres == g2, 'id_x']
    # Track ids of g1 that also appear under g2.
    shared_ids = ids_g1[ids_g1.isin(ids_g2)]
    print(ids_g1.nunique())
    print(ids_g2.nunique())
    print("Overlap", shared_ids.nunique())
    
# Example: print the track counts of "rock" and "metal" and how many tracks they share.
compare_genres("rock", "metal")

In [None]:
# Keep only a few genres; most genres are under-represented anyway

In [None]:
#results_best = dict(sorted(results.items(), key = lambda x : x[1][0], reverse=True))
#list(results_best.keys())[:50]

In [None]:
# Genre labels (plus the special 'all' entry) kept for the exported data.
popular_genres = ["all", "pop", "rap", "rock", "jazz", "adult standards", "classical", "folk", "soul", 
                  "blues", "r&b", "indie", "latino"]

# Keep only the per-feature yearly means (second tuple element) of the selected
# genres; labels that are absent from `results` are simply not included.
results_popular = {
    genre: values[1]
    for genre, values in results.items()
    if genre in popular_genres
}

## Example plot

In [None]:
# Yearly mean danceability: rock versus the whole catalogue, one column each.
all_genres = pd.DataFrame.from_dict(
    results_popular['all']['danceability'], orient='index', columns=['All genres']
)
rock = pd.DataFrame.from_dict(
    results_popular['rock']['danceability'], orient='index', columns=['Rock']
)
# Align both series on the year index before plotting them on a fixed 0-1 scale.
comparison = pd.concat((all_genres, rock), axis=1)
plot = comparison.plot(title="Danceability", ylim=(0,1))
fig = plot.get_figure()
#fig.savefig("../img/audio_features_example.png")

## Save results

In [None]:
# Change the hierarchy : feature-genre-year
# (results_popular is genre -> feature -> year; the export is feature -> genre -> year)
results_final = {feature: {} for feature in features}
for genre, per_feature in results_popular.items():
    for feature, yearly_means in per_feature.items():
        results_final[feature][genre] = yearly_means

In [None]:
# Write the feature -> genre -> year means to disk as a single JSON document.
with open("../data/audio_features_by_genre.json", "w") as outfile:
    outfile.write(json.dumps(results_final))