In [1]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import numpy as np
# Use seaborn style defaults and set the default figure size
sns.set(rc={'figure.figsize':(15, 15)})

from tqdm.notebook import tqdm
# Register the `progress_apply` / `progress_map` helpers on pandas objects.
# `pandas` is a classmethod, so it is called on the class itself: going
# through `tqdm()` first would create a progress bar instance that is never
# iterated or closed and only shows up as an empty widget under this cell.
tqdm.pandas()

from scipy.stats import ttest_rel

# Every input is an HDF5 file inside the `datasets` folder of the current
# working directory, so resolve that folder once.
DATA_DIR = Path.cwd() / 'datasets'

acousticbrainz = pd.read_hdf(DATA_DIR / 'acousticbrainzV3.h5')
spotify = pd.read_hdf(DATA_DIR / 'spotify.h5')
mapping = pd.read_hdf(DATA_DIR / 'mapping.h5')

def findpairs(merged, versions):
    """Collect pairs of rows that share an ``mbid`` but differ in version.

    Rows of ``merged`` are grouped by their ``mbid`` value. Inside each group
    every combination of two rows (in positional order) is inspected; when
    both rows carry an ``essentia_low`` value listed in ``versions`` and the
    two values differ, the first row is appended to the list of its version
    and the second row to the list of its version.

    Parameters
    ----------
    merged : pandas.DataFrame
        Frame providing at least the columns ``mbid`` and ``essentia_low``.
    versions : list
        The ``essentia_low`` values to pair up; each one becomes a key of the
        returned dict.

    Returns
    -------
    dict
        Maps every entry of ``versions`` to a list of row ``Series``. With
        exactly two versions both lists have the same length and element
        ``k`` of one list is the partner of element ``k`` of the other.
        NOTE: with more than two versions each pair only touches two of the
        lists, so the lists are no longer aligned with each other.
    """
    # Resulting row lists, one per requested version
    res = {version: [] for version in versions}

    # Must have at least two submissions
    grouped = merged.groupby('mbid').filter(lambda x: len(x) > 1)

    for _, group in tqdm(grouped.groupby('mbid')):
        # Read the version column once per group. Looking it up through
        # `group.iloc[k]` would build a full row Series on every access
        # inside the nested loops just to read a single value.
        group_versions = group['essentia_low'].tolist()
        candidates = [
            (position, version)
            for position, version in enumerate(group_versions)
            if version in versions
        ]

        for offset, (i, version_a) in enumerate(candidates):
            for j, version_b in candidates[offset + 1:]:
                # Pair found!
                if version_a != version_b:
                    # Rows are materialised only here, once per stored pair.
                    res[version_a].append(group.iloc[i])
                    res[version_b].append(group.iloc[j])

    return res

def significancetest(series_a, series_b, col_1, col_2):
    """Paired t-test on the absolute gap between two columns.

    For every position ``k`` the absolute difference between the values
    stored under ``col_1`` and ``col_2`` is computed, once for
    ``series_a[k]`` and once for ``series_b[k]``. The two resulting samples
    are compared with ``scipy.stats.ttest_rel``.

    Parameters
    ----------
    series_a, series_b : sequence
        Equally long, positionally indexable collections whose elements
        support lookup by ``col_1`` and ``col_2``.
    col_1, col_2 : hashable
        Keys of the two values to subtract from each other.

    Returns
    -------
    tuple
        ``(mean gap of series_a, mean gap of series_b, t statistic, p value)``.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    """
    assert len(series_a) == len(series_b)

    positions = range(len(series_a))
    gaps_a = [abs(series_a[k][col_1] - series_a[k][col_2]) for k in positions]
    gaps_b = [abs(series_b[k][col_1] - series_b[k][col_2]) for k in positions]

    statistic, p = ttest_rel(gaps_a, gaps_b)
    return (np.mean(gaps_a), np.mean(gaps_b), statistic, p)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  from pandas import Panel





In [2]:
# Expose the first index level as a regular `mbid` column so that it can be
# used as a join key below.
acousticbrainz['mbid'] = acousticbrainz.index.get_level_values(0)

# Two default joins in a row: first attach `mapping` through the shared
# `mbid` column, then attach `spotify` by matching the `spotify` column of
# the intermediate result against `spotify`'s `id` column.
merged = pd.merge(
    pd.merge(acousticbrainz, mapping, on='mbid'),
    spotify,
    left_on='spotify',
    right_on='id',
)

In [3]:
# The two `essentia_low` version labels whose rows are paired up.
compared_versions = ['2.1-beta1', '2.1-beta2']
versions = findpairs(merged, compared_versions)

# Number of rows collected for the first version label.
n = len(versions[compared_versions[0]])

HBox(children=(FloatProgress(value=0.0, max=50820.0), HTML(value='')))




In [4]:
# Column pairs handed to `significancetest`: the first entry is a plain
# column label, the second a tuple-labelled column.
# NOTE(review): the plain labels look like Spotify audio features and the
# tuples like AcousticBrainz classifier outputs -- confirm against the data.
cols = [
    ('acousticness', ('mood_acoustic', 'acoustic')),
    ('danceability', ('danceability', 'danceable')),
    ('energy', ('mood_relaxed', 'not_relaxed')),
    ('instrumentalness', ('voice_instrumental', 'instrumental')),
    ('valence', ('mood_happy', 'happy')),
    ('valence', ('mood_sad', 'not_sad'))
]

# Collect every result row first and build the frame in one go.
# `DataFrame.append` no longer exists in pandas 2.0 and, where it does
# exist, copies the whole frame on each iteration.
rows = [
    significancetest(versions['2.1-beta1'], versions['2.1-beta2'], col[0], col[1])
    for col in cols
]

# `tupleize_cols=False` keeps each (label, (label, label)) tuple as one row
# label instead of letting pandas expand the tuples into a MultiIndex.
res = pd.DataFrame(rows, index=pd.Index(cols, tupleize_cols=False))

display(res)

Unnamed: 0,0,1,2,3
"(acousticness, (mood_acoustic, acoustic))",0.174532,0.148726,54.231864,0.0
"(danceability, (danceability, danceable))",0.430322,0.311341,242.415456,0.0
"(energy, (mood_relaxed, not_relaxed))",0.424959,0.259122,267.787354,0.0
"(instrumentalness, (voice_instrumental, instrumental))",0.603558,0.302066,333.656219,0.0
"(valence, (mood_happy, happy))",0.303454,0.265384,81.741517,0.0
"(valence, (mood_sad, not_sad))",0.353758,0.310109,123.898038,0.0


In [5]:
# Two-level header for the result table: the first two columns share the
# top-level label `MAE` and are told apart by version on the second level;
# `t` and `p` get an empty second-level label.
columns = pd.MultiIndex.from_arrays([
    ['MAE', 'MAE', 't', 'p'],
    ['2.1-beta1', '2.1-beta2', '', ''],
])

In [6]:
# Swap the positional column labels of `res` for the two-level header held in
# `columns`. This relabels `res` in place.
res.columns = columns
# Print `n`, the length of the '2.1-beta1' row list, next to the table.
print(f"n={n}")
# Bare expression: the notebook renders the frame as this cell's output.
res

n=229569


Unnamed: 0_level_0,MAE,MAE,t,p
Unnamed: 0_level_1,2.1-beta1,2.1-beta2,Unnamed: 3_level_1,Unnamed: 4_level_1
"(acousticness, (mood_acoustic, acoustic))",0.174532,0.148726,54.231864,0.0
"(danceability, (danceability, danceable))",0.430322,0.311341,242.415456,0.0
"(energy, (mood_relaxed, not_relaxed))",0.424959,0.259122,267.787354,0.0
"(instrumentalness, (voice_instrumental, instrumental))",0.603558,0.302066,333.656219,0.0
"(valence, (mood_happy, happy))",0.303454,0.265384,81.741517,0.0
"(valence, (mood_sad, not_sad))",0.353758,0.310109,123.898038,0.0


In [8]:
# Render the result table as LaTeX source and write it to the cell output.
latex_table = res.to_latex()
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
{} & \multicolumn{2}{l}{MAE} &           t &    p \\
{} & 2.1-beta1 & \multicolumn{3}{l}{2.1-beta2} \\
\midrule
(acousticness, (mood\_acoustic, acoustic))          &  0.174532 &  0.148726 &   54.231864 &  0.0 \\
(danceability, (danceability, danceable))          &  0.430322 &  0.311341 &  242.415456 &  0.0 \\
(energy, (mood\_relaxed, not\_relaxed))              &  0.424959 &  0.259122 &  267.787354 &  0.0 \\
(instrumentalness, (voice\_instrumental, instrum... &  0.603558 &  0.302066 &  333.656219 &  0.0 \\
(valence, (mood\_happy, happy))                     &  0.303454 &  0.265384 &   81.741517 &  0.0 \\
(valence, (mood\_sad, not\_sad))                     &  0.353758 &  0.310109 &  123.898038 &  0.0 \\
\bottomrule
\end{tabular}

