# Dataset Statistics: MSD, BB, MBID

In [None]:
import pandas as pd

# Keyword arguments shared by every match table below: the first row is the
# header and the first column is used as the index.
_indexed_csv_kwargs = {'header': 0, 'index_col': 0}

# MSD id -> MBID mapping. Column names are supplied explicitly, so pandas
# treats every row of the file as data.
msd_mbid_map = pd.read_csv(
    '../data/hit_song_prediction_ismir2020/raw/msd-mbid-2016-01-results-ab.csv',
    names=['msd_id', 'mbid', 'title', 'artist'],
)

# ISMIR 2019 matches, joined with the MBID mapping on the shared MSD id.
# Both sides have `artist`/`title` columns, so the merged frame carries them
# as `artist_x`/`title_x` (matches) and `artist_y`/`title_y` (mapping).
msd_bb_ismir2019 = pd.merge(
    pd.read_csv(
        '../data/hit_song_prediction_ismir2019/msd_bb_matches.csv',
        **_indexed_csv_kwargs,
    ),
    msd_mbid_map,
    on='msd_id',
)

# Exact and cleaned MSD/BB matches.
msd_bb_exact = pd.read_csv(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_exact_matches.csv',
    **_indexed_csv_kwargs,
)
msd_bb_cleaned = pd.read_csv(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_cleaned_matches.csv',
    **_indexed_csv_kwargs,
)

# The same matchings (plus the non-matches) with MBIDs attached.
msd_bb_mbid_exact_matches = pd.read_csv(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_exact_matches.csv',
    **_indexed_csv_kwargs,
)
msd_bb_mbid_cleaned_matches = pd.read_csv(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_cleaned_matches.csv',
    **_indexed_csv_kwargs,
)
msd_bb_mbid_non_matches = pd.read_csv(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_non_matches.csv',
    **_indexed_csv_kwargs,
)

# Size summary of every match dataset, one row per dataset after the
# transpose:
#   msd_id_count - number of distinct MSD ids
#   mbid_count   - number of distinct MBIDs (only for frames that carry MBIDs)
#   song_count   - number of distinct (artist, title) pairs
counts = pd.DataFrame({
    'msd_bb_ismir2019': {
        'msd_id_count': msd_bb_ismir2019['msd_id'].nunique(),
        'mbid_count': msd_bb_ismir2019['mbid'].nunique(),
        # `_x` columns are the artist/title of the left-hand frame of the merge.
        'song_count': len(msd_bb_ismir2019.drop_duplicates(subset=['artist_x', 'title_x'])),
    },
    'msd_bb_exact': {
        'msd_id_count': msd_bb_exact['msd_id'].nunique(),
        'song_count': len(msd_bb_exact.drop_duplicates(subset=['artist', 'title'])),
    },
    'msd_bb_mbid_exact': {
        'msd_id_count': msd_bb_mbid_exact_matches['msd_id'].nunique(),
        'mbid_count': msd_bb_mbid_exact_matches['mbid'].nunique(),
        'song_count': len(msd_bb_mbid_exact_matches.drop_duplicates(subset=['artist_msd_bb', 'title_msd_bb'])),
    },
    'msd_bb_cleaned': {
        'msd_id_count': msd_bb_cleaned['msd_id'].nunique(),
        'song_count': len(msd_bb_cleaned.drop_duplicates(subset=['artist_clean', 'title_clean'])),
    },
    'msd_bb_mbid_cleaned': {
        'msd_id_count': msd_bb_mbid_cleaned_matches['msd_id'].nunique(),
        'mbid_count': msd_bb_mbid_cleaned_matches['mbid'].nunique(),
        # Songs are counted on the MBID-joined frame itself, so all three
        # counts (and the ratios derived from them) describe the same dataset.
        'song_count': len(msd_bb_mbid_cleaned_matches.drop_duplicates(subset=['artist_clean', 'title_clean'])),
    },
    'msd_bb_mbid_non': {
        'msd_id_count': msd_bb_mbid_non_matches['msd_id'].nunique(),
        'mbid_count': msd_bb_mbid_non_matches['mbid'].nunique(),
        'song_count': len(msd_bb_mbid_non_matches.drop_duplicates(subset=['artist_msd_bb', 'title_msd_bb'])),
    },
}).T

# Ratios between the three counts. Rows without an `mbid_count` yield NaN
# for the MBID-based ratios.
counts['mbid_msd_ratio'] = counts['mbid_count'] / counts['msd_id_count']
counts['mbid_song_ratio'] = counts['mbid_count'] / counts['song_count']
counts['msd_song_ratio'] = counts['msd_id_count'] / counts['song_count']

display(counts)

In [None]:
# Show the rows of the cleaned matching whose MSD id does not occur in the
# exact matching, with the MSD, BB and cleaned artist/title side by side.
_only_in_cleaned = ~msd_bb_cleaned['msd_id'].isin(msd_bb_exact['msd_id'])
_comparison_columns = [
    'artist_msd', 'artist_bb', 'artist_clean',
    'title_msd', 'title_bb', 'title_clean',
]
display(msd_bb_cleaned.loc[_only_in_cleaned, _comparison_columns])

## Merged dataset sizes for non-hits

In [None]:
def _non_hit_songs_by_mbid(parquet_path):
    """Join the MBIDs found in a feature file with the non-matches table.

    Only the `mbid` column of the parquet file is used; the result keeps the
    artist, title, MSD id and MBID of every joined row.
    """
    mbids = pd.read_parquet(parquet_path)[['mbid']]
    joined = mbids.merge(msd_bb_mbid_non_matches, on=['mbid'])
    return joined[['artist_msd_bb', 'title_msd_bb', 'msd_id', 'mbid']]


def _non_hit_songs_by_msd_id(parquet_path):
    """Join the MSD ids found in a feature file with the non-matches table.

    The non-matches table is reduced to one row per MSD id before joining.
    """
    msd_ids = pd.read_parquet(parquet_path)[['msd_id']]
    songs = msd_bb_mbid_non_matches[['artist_msd_bb', 'title_msd_bb', 'msd_id']]
    return msd_ids.merge(songs.drop_duplicates(subset=['msd_id']), on=['msd_id'])


# High-level features.
ab_hl_features = _non_hit_songs_by_mbid(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_non_matches_ab_hl_features.parquet'
)
essentia_hl_features = _non_hit_songs_by_msd_id(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_non_matches_essentia_hl_features.parquet'
)

# Low-level features.
ab_ll_features = _non_hit_songs_by_mbid(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_non_matches_ab_ll_features.parquet'
)
essentia_ll_features = _non_hit_songs_by_msd_id(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_non_matches_essentia_ll_features.parquet'
)

# Songs present in both sources, per feature level.
hl_features = pd.merge(ab_hl_features, essentia_hl_features, on='msd_id')
ll_features = pd.merge(ab_ll_features, essentia_ll_features, on='msd_id')

# Songs present at both feature levels: across both sources, and within
# each source on its own.
all_features = pd.merge(ll_features, hl_features, on='msd_id')
all_ab_features = pd.merge(ab_hl_features, ab_ll_features, on=['msd_id', 'mbid'])
all_essentia_features = pd.merge(essentia_hl_features, essentia_ll_features, on='msd_id')


def _count_ids_and_songs(frame, song_columns, with_mbid=True):
    """Return the unique-id and unique-song counts of one frame.

    `song_columns` names the artist/title columns whose distinct combinations
    are counted as songs. The MBID count is left out when `with_mbid` is false.
    """
    summary = {'msd_id_count': frame['msd_id'].nunique()}
    if with_mbid:
        summary['mbid_count'] = frame['mbid'].nunique()
    summary['song_count'] = len(frame.drop_duplicates(subset=song_columns))
    return summary


# Artist/title column names. Every merge of two frames that both carry these
# columns appends one more `_x` suffix to the left-hand copies.
_song_cols = ['artist_msd_bb', 'title_msd_bb']
_song_cols_merged = ['artist_msd_bb_x', 'title_msd_bb_x']
_song_cols_merged_twice = ['artist_msd_bb_x_x', 'title_msd_bb_x_x']

# One column per feature set.
counts = pd.DataFrame({
    'ab_hl': _count_ids_and_songs(ab_hl_features, _song_cols),
    'essentia_hl': _count_ids_and_songs(essentia_hl_features, _song_cols, with_mbid=False),
    'hl': _count_ids_and_songs(hl_features, _song_cols_merged),
    'ab_ll': _count_ids_and_songs(ab_ll_features, _song_cols),
    'essentia_ll': _count_ids_and_songs(essentia_ll_features, _song_cols, with_mbid=False),
    'll': _count_ids_and_songs(ll_features, _song_cols_merged),
    'all': _count_ids_and_songs(all_features, _song_cols_merged_twice, with_mbid=False),
    'all_ab': _count_ids_and_songs(all_ab_features, _song_cols_merged),
    'all_essentia': _count_ids_and_songs(all_essentia_features, _song_cols_merged, with_mbid=False),
})
display(counts)

# Release the non-hit frames: per-source frames first, then the
# per-level merges, then the combined merges.
del ab_hl_features, essentia_hl_features, ab_ll_features, essentia_ll_features
del hl_features, ll_features
del all_features, all_ab_features, all_essentia_features

## Merged dataset sizes for hits

In [None]:
def _hit_songs_by_mbid(parquet_path):
    """Join the MBIDs found in a feature file with the cleaned-matches table.

    Only the `mbid` column of the parquet file is used; the result keeps the
    cleaned artist and title, MSD id and MBID of every joined row.
    """
    mbids = pd.read_parquet(parquet_path)[['mbid']]
    joined = mbids.merge(msd_bb_mbid_cleaned_matches, on=['mbid'])
    return joined[['artist_clean', 'title_clean', 'msd_id', 'mbid']]


def _hit_songs_by_msd_id(parquet_path):
    """Join the MSD ids found in a feature file with the cleaned-matches table.

    The cleaned-matches table is reduced to one row per MSD id before joining.
    """
    msd_ids = pd.read_parquet(parquet_path)[['msd_id']]
    songs = msd_bb_mbid_cleaned_matches[['artist_clean', 'title_clean', 'msd_id']]
    return msd_ids.merge(songs.drop_duplicates(subset=['msd_id']), on=['msd_id'])


# High-level features.
ab_hl_features = _hit_songs_by_mbid(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_cleaned_matches_ab_hl_features.parquet'
)
essentia_hl_features = _hit_songs_by_msd_id(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_cleaned_matches_essentia_hl_features.parquet'
)

# Low-level features.
ab_ll_features = _hit_songs_by_mbid(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_cleaned_matches_ab_ll_features.parquet'
)
essentia_ll_features = _hit_songs_by_msd_id(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_cleaned_matches_essentia_ll_features.parquet'
)

# Songs present in both sources, per feature level.
hl_features = pd.merge(ab_hl_features, essentia_hl_features, on='msd_id')
ll_features = pd.merge(ab_ll_features, essentia_ll_features, on='msd_id')

# Songs present at both feature levels: across both sources, and within
# each source on its own.
all_features = pd.merge(ll_features, hl_features, on='msd_id')
all_ab_features = pd.merge(ab_hl_features, ab_ll_features, on=['msd_id', 'mbid'])
all_essentia_features = pd.merge(essentia_hl_features, essentia_ll_features, on='msd_id')

def _count_ids_and_songs(frame, song_columns, with_mbid=True):
    """Return the unique-id and unique-song counts of one frame.

    `song_columns` names the artist/title columns whose distinct combinations
    are counted as songs. The MBID count is left out when `with_mbid` is false.
    """
    summary = {'msd_id_count': frame['msd_id'].nunique()}
    if with_mbid:
        summary['mbid_count'] = frame['mbid'].nunique()
    summary['song_count'] = len(frame.drop_duplicates(subset=song_columns))
    return summary


# Cleaned artist/title column names. Every merge of two frames that both
# carry these columns appends one more `_x` suffix to the left-hand copies.
_clean_cols = ['artist_clean', 'title_clean']
_clean_cols_merged = ['artist_clean_x', 'title_clean_x']
_clean_cols_merged_twice = ['artist_clean_x_x', 'title_clean_x_x']

# One column per feature set.
counts = pd.DataFrame({
    'ab_hl': _count_ids_and_songs(ab_hl_features, _clean_cols),
    'essentia_hl': _count_ids_and_songs(essentia_hl_features, _clean_cols, with_mbid=False),
    'hl': _count_ids_and_songs(hl_features, _clean_cols_merged),
    'ab_ll': _count_ids_and_songs(ab_ll_features, _clean_cols),
    'essentia_ll': _count_ids_and_songs(essentia_ll_features, _clean_cols, with_mbid=False),
    'll': _count_ids_and_songs(ll_features, _clean_cols_merged),
    'all': _count_ids_and_songs(all_features, _clean_cols_merged_twice, with_mbid=False),
    'all_ab': _count_ids_and_songs(all_ab_features, _clean_cols_merged),
    'all_essentia': _count_ids_and_songs(all_essentia_features, _clean_cols_merged, with_mbid=False),
})
display(counts)

# Release every hit frame built in this cell: the per-source frames, the
# per-level merges and the combined merges. This is the same set of names
# the non-hit cell releases, so no merged frame outlives its summary table.
del ab_hl_features
del essentia_hl_features
del ab_ll_features
del essentia_ll_features

del hl_features
del ll_features

del all_features
del all_ab_features
del all_essentia_features

In [None]:
# Feature column whose spread across duplicated MSD ids is inspected.
feature = 'highlevel.gender.all.female'

ab_hl_features = pd.read_parquet(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_cleaned_matches_ab_hl_features.parquet'
)
data = pd.merge(ab_hl_features, msd_bb_mbid_cleaned_matches, on='mbid')

# Standard deviation of the feature over all joined rows, used below as the
# reference scale.
stddev = data[feature].std()

# Every row whose MSD id occurs more than once after the join.
dup = data[data.duplicated(subset=['msd_id'], keep=False)]


def _transposed_summary(group):
    """Return the group's `describe()` table with one row per described column."""
    return group.describe().T


# Per-MSD-id summary statistics of the feature, plus the min-max spread and
# the group's standard deviation relative to the overall one.
describtion = dup[['msd_id', feature]].groupby('msd_id').apply(_transposed_summary)
describtion['diff'] = describtion['max'] - describtion['min']
describtion['ratio'] = describtion['std'] / stddev

display(describtion.describe())

# MSD ids whose within-group deviation is at least the overall deviation.
describtion.loc[describtion['ratio'] >= 1.]