Since version, codec and bitrate all seem to have an effect, it is difficult to identify a single clear 'best case'. Instead, we calculate the correlations for every slice of the dataset whose sample size is large enough to allow this:

In [1]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import numpy as np
from scipy import stats
# Use seaborn style defaults and set the default figure size
sns.set(rc={'figure.figsize':(15, 15)})

from tqdm.notebook import tqdm
# Enable tqdm's pandas integration. `pandas` is called on the class itself:
# instantiating `tqdm()` first would create a stray, empty progress bar in the output.
tqdm.pandas()

# Define correlations to study.
# Each entry is a pair of column keys, (first, second); the correlation helpers in this
# notebook look both keys up in the data frame and correlate the two columns.
# NOTE(review): the first key presumably names a Spotify audio feature and the second
# (itself a tuple) an AcousticBrainz classifier and one of its classes — confirm
# against the datasets.
# 'valence' is listed twice because it is compared against two different columns.
cols = [
    ('acousticness', ('mood_acoustic', 'acoustic')),
    ('danceability', ('danceability', 'danceable')),
    ('energy', ('mood_relaxed', 'not_relaxed')),
    ('instrumentalness', ('voice_instrumental', 'instrumental')),
    ('valence', ('mood_happy', 'happy')),
    ('valence', ('mood_sad', 'not_sad'))
]

def pearsonr_ci(x,y,alpha=0.05):
    ''' Calculate the Pearson correlation along with its confidence interval.

    The interval is computed with the Fisher z-transformation: r is mapped to
    arctanh(r), a normal interval with standard error 1/sqrt(n-3) is built
    around it, and the bounds are mapped back with tanh.

    Parameters
    ----------
    x, y : iterable object such as a list, np.array or pd.Series
      Input for correlation calculation
    alpha : float
      Significance level. 0.05 by default
    Returns
    -------
    r : float
      Pearson's correlation coefficient
    pval : float
      The corresponding p value
    lo, hi : float
      The lower and upper bound of the confidence interval. Both are NaN when
      there are 3 or fewer observations, because the standard error
      1/sqrt(n-3) is undefined in that case.
    '''
    # Coerce up front so that plain lists work too (lists have no `.size`).
    x = np.asarray(x)
    y = np.asarray(y)

    r, p = stats.pearsonr(x, y)

    n = x.size
    if n <= 3:
        # n == 3 would divide by zero, n < 3 would take the root of a negative number.
        return r, p, np.nan, np.nan

    # arctanh(+-1) is +-inf; that is the intended limit, so silence the warning.
    with np.errstate(divide='ignore'):
        r_z = np.arctanh(r)
    se = 1/np.sqrt(n-3)
    z = stats.norm.ppf(1-alpha/2)
    lo_z, hi_z = r_z-z*se, r_z+z*se
    lo, hi = np.tanh((lo_z, hi_z))
    return r, p, lo, hi

def get_correlations(data, columns):
    """Correlate each pair of columns in `columns` and collect the full statistics.

    Parameters
    ----------
    data : mapping of column key -> values (e.g. a DataFrame)
    columns : iterable of pairs; element 0 and element 1 are the two column keys

    Returns
    -------
    dict with the keys 'Pearson r', 'p', 'low' and 'high'; each value is a list
    holding one entry per pair, in the order of `columns`.
    """
    # One (r, p, lo, hi) tuple per requested pair.
    per_pair = [pearsonr_ci(data[pair[0]], data[pair[1]]) for pair in columns]

    # Transpose the list of tuples into one list per statistic.
    labels = ('Pearson r', 'p', 'low', 'high')
    return {
        label: [result[position] for result in per_pair]
        for position, label in enumerate(labels)
    }

def get_correlations_2(data, columns):
    """Return only Pearson's r for each pair of columns in `columns`.

    Parameters
    ----------
    data : mapping of column key -> values (e.g. a DataFrame)
    columns : iterable of pairs; element 0 and element 1 are the two column keys

    Returns
    -------
    dict mapping each pair (used as the key itself) to its correlation coefficient.
    """
    return {
        pair: pearsonr_ci(data[pair[0]], data[pair[1]])[0]
        for pair in columns
    }

# Pooled variance metric
def get_variance(df):
    """Return each group's variance, repeated once per row of that group.

    Rows are grouped by the first index level. The result has one row per input
    row (ordered by group key) and a fresh 0..n-1 index.
    """
    by_first_level = df.groupby(level=0)
    per_group_var = by_first_level.var()
    group_sizes = by_first_level.size()

    # Repeat every group label as often as the group has rows, then look up its variance.
    repeated_labels = per_group_var.index.repeat(group_sizes)
    return per_group_var.loc[repeated_labels].reset_index(drop=True)

def filter_submissions(df):
    """Keep only the rows whose first-index-level label occurs more than once in `df`."""
    first_level = df.index.get_level_values(level=0)
    occurs_repeatedly = df.groupby(level=0).size() > 1
    # Expand the per-label flag to one boolean per row of `df`.
    row_mask = occurs_repeatedly[first_level].values
    return df[row_mask]


# Load the acousticbrainz dataset into the variable 'acousticbrainz'. The V3 file also includes
# the build SHA, to make sure that the software version is the same on a commit basis.
# The spotify data and the mapping between the two are read from the same folder.
datasets_dir = Path.cwd() / 'datasets'
acousticbrainz = pd.read_hdf(datasets_dir / 'acousticbrainzV3.h5')
spotify = pd.read_hdf(datasets_dir / 'spotify.h5')
mapping = pd.read_hdf(datasets_dir / 'mapping.h5')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  from pandas import Panel





In [2]:
# First, match the spotify and acousticbrainz datasets.
# Flatten the index into the columns 'mbid' and 'submission'. This rebinds `acousticbrainz`,
# so the step is guarded: re-running the cell would otherwise reset the index again and
# pile up extra index columns.
if 'mbid' not in acousticbrainz.columns:
    acousticbrainz = acousticbrainz.reset_index().rename(columns={"level_0": "mbid", "level_1": "submission"})
# Join on 'mbid' with `mapping`, then match the 'spotify' column against spotify's 'id'.
merged = acousticbrainz.merge(mapping, on='mbid').merge(spotify, left_on='spotify', right_on='id')

In [4]:
# Then, group by bitrate / codec / version combinations and save the sample sizes,
# largest group first.
version_keys = ['bit_rate', 'codec', 'essentia_low', 'essentia_git_sha_low', 'essentia_build_sha_low']
groups = merged.groupby(version_keys)
counts = groups.size().sort_values(ascending=False)

In [5]:
# Only check groups with more than `min_group_size` submissions
min_group_size = 1000
tocheck = counts[counts > min_group_size].keys()

# One {column pair: Pearson r} dict per sufficiently large group, keyed by the group label.
corrs = {
    name: get_correlations_2(group, cols)
    for name, group in groups
    if name in tocheck
}

In [6]:
# Turn the dict of dicts into a table with one row per subset and one column per column pair.
corrs = pd.DataFrame(corrs).T

In [7]:
# Display the correlation table: one row per subset, one column per column pair.
corrs

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,acousticness,danceability,energy,instrumentalness,valence,valence
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,"(mood_acoustic, acoustic)","(danceability, danceable)","(mood_relaxed, not_relaxed)","(voice_instrumental, instrumental)","(mood_happy, happy)","(mood_sad, not_sad)"
0,flac,2.1-beta1,v2.1_beta1-28-g21ef5f4,5c9e65ec81695c916afb4cba055c226bfc528ee6,0.641988,0.400917,0.548273,0.426673,0.376957,0.318614
0,flac,2.1-beta1,v2.1_beta1-28-g21ef5f4-dirty,ca57ba49d9b1854bd80e60cf9ccf267278fb7d6b,0.729157,0.31115,0.563573,0.354332,0.312667,0.167524
0,flac,2.1-beta1,v2.1_beta1-6-g5578087,7598a69e7b456135ce4c070efb04a3d0ebb71fdd,0.727883,0.265258,0.531152,0.436154,0.313758,0.230211
0,flac,2.1-beta1,v2.1_beta1-6-g5578087,7f154e5731828a63663f0144680ef47bd640f88b,0.646839,0.552018,0.571878,0.392111,0.476783,0.481604
0,flac,2.1-beta1,v2.1_beta1-6-g5578087,85931cb11b241c07ac43bfc3827c398894034a25,0.69477,0.220256,0.672652,0.176731,0.379398,0.014595
0,flac,2.1-beta1,v2.1_beta1-7-ge0e83e8,ed039d7e5a6e3dec64cf0be7c36ef885fb432306,0.734092,0.312348,0.55407,0.34581,0.24209,0.292984
0,flac,2.1-beta1,v2.1_beta1-7-ge0e83e8-dirty,12879b851c9f4b0d899e1af4e221aa110f283a84,0.729716,0.324401,0.509615,0.452542,0.308493,0.391356
0,flac,2.1-beta1,v2.1_beta1-7-ge0e83e8-dirty,50a0fbec89d6a9cedea3d45b6611406f7e8c7b1a,0.735877,0.304956,0.520386,0.385283,0.219627,0.264905
0,flac,2.1-beta2,v2.1_beta2,70f2e5ece6736b2c40cc944ad0e695b16b925413,0.734669,0.294265,0.573405,0.42834,0.3047,0.229682
0,flac,2.1-beta2,v2.1_beta2-1-ge3940c0,2d9f1f26377add8aeb1075a9c2973f962c4f09fd,0.74586,0.416328,0.556963,0.522287,0.273433,0.330536


In [8]:
# Summary statistics of the correlations across subsets, rendered as LaTeX table source.
print(corrs.describe().to_latex())

\begin{tabular}{lrrrrrr}
\toprule
{} &              acousticness &              danceability &                      energy &                   instrumentalness & \multicolumn{2}{l}{valence} \\
{} & (mood\_acoustic, acoustic) & (danceability, danceable) & (mood\_relaxed, not\_relaxed) & (voice\_instrumental, instrumental) & (mood\_happy, happy) & (mood\_sad, not\_sad) \\
\midrule
count &                 26.000000 &                 26.000000 &                   26.000000 &                          26.000000 &           26.000000 &           26.000000 \\
mean  &                  0.597379 &                  0.265644 &                    0.468363 &                           0.297532 &            0.235755 &            0.187254 \\
std   &                  0.207730 &                  0.148028 &                    0.180931 &                           0.170126 &            0.136819 &            0.147677 \\
min   &                  0.082376 &                 -0.041901 &                    0.04314

### Taking the best case for each classifier

In [9]:
# Highest correlation reached by any subset, per column pair.
corrs.max()

acousticness      (mood_acoustic, acoustic)             0.761782
danceability      (danceability, danceable)             0.552018
energy            (mood_relaxed, not_relaxed)           0.672652
instrumentalness  (voice_instrumental, instrumental)    0.522287
valence           (mood_happy, happy)                   0.476783
                  (mood_sad, not_sad)                   0.481604
dtype: float64

### Which subsets work best for which classifier?

In [10]:
# Label of the subset (row) that reaches the highest correlation, per column pair.
corrs.idxmax()

acousticness      (mood_acoustic, acoustic)             (320000, vorbis, 2.1-beta2, v2.1_beta2-1-ge394...
danceability      (danceability, danceable)             (0, flac, 2.1-beta1, v2.1_beta1-6-g5578087, 7f...
energy            (mood_relaxed, not_relaxed)           (0, flac, 2.1-beta1, v2.1_beta1-6-g5578087, 85...
instrumentalness  (voice_instrumental, instrumental)    (0, flac, 2.1-beta2, v2.1_beta2-1-ge3940c0, 2d...
valence           (mood_happy, happy)                   (0, flac, 2.1-beta1, v2.1_beta1-6-g5578087, 7f...
                  (mood_sad, not_sad)                   (0, flac, 2.1-beta1, v2.1_beta1-6-g5578087, 7f...
dtype: object

In [11]:
# Same as above, but as a raw array so the subset labels are shown in full instead of truncated.
corrs.idxmax().values

array([(320000, 'vorbis', '2.1-beta2', 'v2.1_beta2-1-ge3940c0', 'cead25079874084f62182a551b7393616cd33d87'),
       (0, 'flac', '2.1-beta1', 'v2.1_beta1-6-g5578087', '7f154e5731828a63663f0144680ef47bd640f88b'),
       (0, 'flac', '2.1-beta1', 'v2.1_beta1-6-g5578087', '85931cb11b241c07ac43bfc3827c398894034a25'),
       (0, 'flac', '2.1-beta2', 'v2.1_beta2-1-ge3940c0', '2d9f1f26377add8aeb1075a9c2973f962c4f09fd'),
       (0, 'flac', '2.1-beta1', 'v2.1_beta1-6-g5578087', '7f154e5731828a63663f0144680ef47bd640f88b'),
       (0, 'flac', '2.1-beta1', 'v2.1_beta1-6-g5578087', '7f154e5731828a63663f0144680ef47bd640f88b')],
      dtype=object)

Clear winners: 
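- **FLAC (lossless)** seems to result in the maximum correlation between spotify and acousticbrainz for every classifier except mood_acoustic, where the 320000 bit-rate Vorbis subset scores highest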
- **v2.1_beta1-6-g5578087** seems to perform well in most cases, being beaten by the recommended version (the one currently supplied by acousticbrainz) only on mood_acoustic and instrumentalness.

### Which classifiers have the most variance in correlation with spotify when using different version subsets?

In [9]:
# Variance of the correlation across subsets, per column pair, sorted from smallest to largest.
corrs.var().sort_values()

valence           (mood_happy, happy)                   0.018719
                  (mood_sad, not_sad)                   0.021809
danceability      (danceability, danceable)             0.021912
instrumentalness  (voice_instrumental, instrumental)    0.028943
energy            (mood_relaxed, not_relaxed)           0.032736
acousticness      (mood_acoustic, acoustic)             0.043152
dtype: float64

# Can we get an even higher correlation if we only group by lossless FLAC, pooling all versions?

In [10]:
# Regroup by bitrate and codec only, so that all software versions are pooled together.
codec_keys = ['bit_rate', 'codec']
groups = merged.groupby(codec_keys)
counts = groups.size().sort_values(ascending=False)

In [11]:
# Sample size per (bit_rate, codec) group, largest first.
counts

bit_rate  codec
0         flac     71637
192000    mp3      34634
128000    mp3      20607
320000    mp3      17181
160000    mp3       7915
                   ...  
222335    mp3          1
222331    mp3          1
222329    mp3          1
222326    mp3          1
206518    mp3          1
Length: 45682, dtype: int64

In [12]:
# Correlations for the pooled lossless FLAC group (bit_rate 0, codec 'flac').
# Fetch that single group directly instead of iterating over (and materialising) every
# group just to find one key. As before, nothing is shown if the group does not exist.
flac_key = (0, 'flac')
if flac_key in groups.groups:
    display(pd.Series(get_correlations_2(groups.get_group(flac_key), cols)))

acousticness      (mood_acoustic, acoustic)             0.743794
danceability      (danceability, danceable)             0.336953
energy            (mood_relaxed, not_relaxed)           0.591516
instrumentalness  (voice_instrumental, instrumental)    0.443026
valence           (mood_happy, happy)                   0.319518
                  (mood_sad, not_sad)                   0.253815
dtype: float64

##### No!
Every pooled FLAC correlation is lower than the best per-subset value found above, so the combination of codec *and* version does matter.