# Dataset Statistics: MSD, Billboard (BB) and MBID

In [None]:
import pandas as pd


# Load the two interim tables; the first CSV column is used as the index.
msd_bb_mbid_matches = pd.read_csv(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_matches.csv',
    index_col=0,
)
msd_bb_mbid_non_matches = pd.read_csv(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_non_matches.csv',
    index_col=0,
)

# One row per table, holding the number of distinct msd_id and mbid values.
counts = pd.DataFrame({
    label: {
        'msd_id_count': table['msd_id'].nunique(),
        'mbid_count': table['mbid'].nunique(),
    }
    for label, table in (
        ('matches', msd_bb_mbid_matches),
        ('non_matches', msd_bb_mbid_non_matches),
    )
}).T

# Distinct mbid values per distinct msd_id value.
counts['ratio'] = counts['mbid_count'].div(counts['msd_id_count'])
display(counts)

In [None]:
from hit_prediction_code.dataloaders import millionsongdataset
import re

# Load the MSD table through the project dataloader and the Billboard chart CSV.
msd = millionsongdataset.read_msd_unique_tracks('..')
bb = pd.read_csv('../data/billboard/billboard_hot-100_1958-08-11_2019-07-06.csv', header=0)


def _lower_and_squeeze(value):
    """Stringify, lower-case and strip `value`, collapsing whitespace runs to one space."""
    return re.sub(r'\s+', ' ', str(value).lower().strip())


# Add the normalised artist/title columns to both frames (in place).
for frame in (msd, bb):
    frame['artist_lower'] = frame['artist'].apply(_lower_and_squeeze)
    frame['title_lower'] = frame['title'].apply(_lower_and_squeeze)

# Exact-string join versus join on the normalised strings.
msd_bb = msd.merge(bb, on=['artist', 'title'])
msd_bb_lower = msd.merge(bb, on=['artist_lower', 'title_lower'])

In [None]:
# Matches every character that is neither a word character nor whitespace.
clean_re = r'[^\w\s]'


def _strip_non_word(text):
    """Delete every character of `text` matched by `clean_re`."""
    return re.sub(clean_re, '', text)


# Derive the cleaned columns from the lower-cased ones on both frames (in place).
for frame in (msd, bb):
    frame['artist_clean'] = frame['artist_lower'].apply(_strip_non_word)
    frame['title_clean'] = frame['title_lower'].apply(_strip_non_word)

msd_bb_clean = msd.merge(bb, on=['artist_clean', 'title_clean'])

# Row counts of the exact, lower-cased and cleaned joins.
display(len(msd_bb), len(msd_bb_lower), len(msd_bb_clean))

# Rows of the cleaned join whose msd_id is absent from the lower-cased join.
only_after_cleaning = ~msd_bb_clean.msd_id.isin(msd_bb_lower['msd_id'])
display(msd_bb_clean.loc[only_after_cleaning, ['artist_x', 'artist_y', 'title_x', 'title_y']])

In [None]:
# Both CSVs are read with the same explicitly supplied column names.
mapping_columns = ['msd_id', 'mbid', 'title', 'artist']

msd_mbid_map = pd.read_csv(
    '../data/hit_song_prediction_ismir2020/raw/msd-mbid-2016-01-results-ab.csv',
    names=mapping_columns,
)
msd_mbid = pd.read_csv(
    '../data/hit_song_prediction_ismir2019/msd_bb_matches.csv',
    names=mapping_columns,
)

# NOTE(review): this rebinds `msd_bb`, which an earlier cell assigned to the
# exact-string join; re-running that cell's length comparison after this one
# will report a different number.
msd_bb = (
    msd_bb_clean
    .merge(msd_mbid_map, on=['msd_id'])
    .drop_duplicates(['artist_clean', 'title_clean'])
)

# Distinct msd_id values in the new join versus the ismir2019 file.
display(msd_bb['msd_id'].nunique(), msd_mbid['msd_id'].nunique())

## Merged dataset sizes for non-hits

In [None]:
# High-level features: keep only the key column of each parquet file; the
# mbid-keyed frame gets its msd_id through the non-matches table.
ab_hl_features = (
    pd.read_parquet('../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_non_matches_ab_hl_features.parquet')[['mbid']]
    .merge(msd_bb_mbid_non_matches, on=['mbid'])[['msd_id', 'mbid']]
)
msd_hl_features = pd.read_parquet(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_non_matches_msd_hl_features.parquet'
)[['msd_id']]

# Low-level features: same procedure.
ab_ll_features = (
    pd.read_parquet('../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_non_matches_ab_ll_features.parquet')[['mbid']]
    .merge(msd_bb_mbid_non_matches, on=['mbid'])[['msd_id', 'mbid']]
)
msd_ll_features = pd.read_parquet(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_non_matches_msd_ll_features.parquet'
)[['msd_id']]

# Inner joins on msd_id: tracks present in both sources, then in all four.
hl_features = ab_hl_features.merge(msd_hl_features, on=['msd_id'])
ll_features = ab_ll_features.merge(msd_ll_features, on=['msd_id'])
all_features = ll_features.merge(hl_features, on=['msd_id'])

# Number of distinct msd_id values in each frame, one column per frame.
frames_by_label = {
    'ab_hl': ab_hl_features,
    'ab_ll': ab_ll_features,
    'msd_hl': msd_hl_features,
    'msd_ll': msd_ll_features,
    'hl': hl_features,
    'll': ll_features,
    'all': all_features,
}
counts = pd.DataFrame({
    label: {'msd_id_count': frame['msd_id'].nunique()}
    for label, frame in frames_by_label.items()
})
display(counts)

## Merged dataset sizes for hits

In [None]:
# High-level features: keep only the key column of each parquet file; the
# mbid-keyed frame gets its msd_id through the matches table.
ab_hl_features = (
    pd.read_parquet('../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_matches_ab_hl_features.parquet')[['mbid']]
    .merge(msd_bb_mbid_matches, on=['mbid'])[['msd_id', 'mbid']]
)
msd_hl_features = pd.read_parquet(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_matches_msd_hl_features.parquet'
)[['msd_id']]

# Low-level features: same procedure.
ab_ll_features = (
    pd.read_parquet('../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_matches_ab_ll_features.parquet')[['mbid']]
    .merge(msd_bb_mbid_matches, on=['mbid'])[['msd_id', 'mbid']]
)
msd_ll_features = pd.read_parquet(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_matches_msd_ll_features.parquet'
)[['msd_id']]

# Inner joins on msd_id: tracks present in both sources of each feature level.
hl_features = ab_hl_features.merge(msd_hl_features, on=['msd_id'])
ll_features = ab_ll_features.merge(msd_ll_features, on=['msd_id'])

# NOTE(review): read_pickle can execute arbitrary code; this assumes the file
# was produced by this project's own pipeline - confirm before sharing.
mel_features = pd.read_pickle(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_matches.pickle'
)[['msd_id']]

# Tracks present in every feature set, including the pickle-backed one.
all_features = (
    ll_features
    .merge(hl_features, on=['msd_id'])
    .merge(mel_features, on=['msd_id'])
)

# Number of distinct msd_id values in each frame, one column per frame.
frames_by_label = {
    'ab_hl': ab_hl_features,
    'ab_ll': ab_ll_features,
    'msd_hl': msd_hl_features,
    'msd_ll': msd_ll_features,
    'hl': hl_features,
    'll': ll_features,
    'mel': mel_features,
    'all': all_features,
}
counts = pd.DataFrame({
    label: {'msd_id_count': frame['msd_id'].nunique()}
    for label, frame in frames_by_label.items()
})
display(counts)

In [None]:
# Column whose spread across rows sharing one msd_id is inspected below.
feature = 'highlevel.gender.all.female'

ab_hl_features = pd.read_parquet(
    '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_matches_ab_hl_features.parquet'
)
data = ab_hl_features.merge(msd_bb_mbid_matches, on=['mbid'])

# Keep every row whose msd_id occurs more than once (all occurrences, not just the extras).
shares_msd_id = data.duplicated(['msd_id'], keep=False)
dup = data[shares_msd_id]

# Summary statistics per msd_id; the transpose puts the statistics in the columns.
describtion = (
    dup[['msd_id', feature]]
    .groupby('msd_id')
    .apply(lambda group: group.describe().T)
)
describtion['diff'] = describtion['max'] - describtion['min']

# Standard deviation within each msd_id group relative to that of the whole merged table.
stddev = data[feature].std()
describtion['ratio'] = describtion['std'] / stddev

display(describtion.describe())

# Groups that vary at least as much as the full table does.
describtion.loc[describtion['ratio'] >= 1.]