# Clustering

In [None]:
import pandas as pd

from hit_prediction_code import common
from hit_prediction_code.dataloaders import matcher


# Bookkeeping columns that are carried along next to the high-level features
# when the feature frame is trimmed down to the columns of interest.

# `metadata.audio_properties.*` columns.
metadata = [
    'metadata.audio_properties.analysis_sample_rate',
    'metadata.audio_properties.bit_rate',
    'metadata.audio_properties.codec',
    'metadata.audio_properties.downmix',
    'metadata.audio_properties.equal_loudness',
    'metadata.audio_properties.length',
    'metadata.audio_properties.lossless',
    'metadata.audio_properties.md5_encoded',
    'metadata.audio_properties.replay_gain',
    'metadata.audio_properties.sample_rate',
]

# `metadata.version.highlevel.*` columns.
metadata += [
    'metadata.version.highlevel.essentia',
    'metadata.version.highlevel.essentia_build_sha',
    'metadata.version.highlevel.essentia_git_sha',
    'metadata.version.highlevel.extractor',
    'metadata.version.highlevel.gaia',
    'metadata.version.highlevel.gaia_git_sha',
    'metadata.version.highlevel.models_essentia_git_sha',
]

# `metadata.version.lowlevel.*` columns.
metadata += [
    'metadata.version.lowlevel.essentia',
    'metadata.version.lowlevel.essentia_build_sha',
    'metadata.version.lowlevel.essentia_git_sha',
    'metadata.version.lowlevel.extractor',
]

# Input files (paths are relative to the notebook's working directory).
hl_features_path = '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_cleaned_matches_ab_hl_features.parquet'
matches_path = '../data/hit_song_prediction_ismir2020/interim/msd_bb_mbid_cleaned_matches.csv'

# Load the feature table and keep only identifiers, the columns selected by
# the high-level regex, and the metadata columns.
hl = pd.read_parquet(hl_features_path)
features = common.filter_features(hl.columns, common.highlevel_regex())
selected_cols = ['mbid', 'file', 'file_id'] + features + metadata
hl = hl[selected_cols]

# Load the match table and collapse duplicate `mbid` entries via the project
# helper (configured on `peakPos` with keep_lowest=True).
msd_bb_mbid_cleaned_matches = matcher.filter_duplicates(
    pd.read_csv(matches_path, header=0, index_col=0),
    id_cols=['mbid'],
    target_col='peakPos',
    keep_lowest=True,
)

# Join features with matches on `mbid`, drop every row containing a missing
# value, then keep one row per (artist, title, file_id) combination.
hl = (
    hl.merge(msd_bb_mbid_cleaned_matches, on=['mbid'])
    .dropna()
    .drop_duplicates(['artist_clean', 'title_clean', 'file_id'])
)

In [None]:
# Accepted range for `metadata.audio_properties.length`: [min_length, max_length).
min_length = 2 * 60
max_length = 10 * 60

song_key = ['artist_clean', 'title_clean']

# Sizes before filtering: distinct (artist, title) pairs and total rows.
song_count = len(hl.drop_duplicates(subset=song_key))
hl_len = len(hl)

# Keep rows whose length lies inside the half-open interval.
track_length = hl['metadata.audio_properties.length']
hl = hl[(track_length >= min_length) & (track_length < max_length)]

# Report: songs before, rows before, songs after, rows after.
print(song_count, hl_len, len(hl.drop_duplicates(subset=song_key)), len(hl))

# Keep only songs that still have more than one row.
has_multiple_rows = hl.duplicated(song_key, keep=False)
hl = hl[has_multiple_rows]
# NOTE(review): head(400) truncates by row position, so some songs may end up
# with a single row again — confirm this subsample is intended.
hl = hl.head(400)

In [None]:
from hit_prediction_code.cluster import select_features_with_clustering

# Run the project's clustering-based selection, grouping rows by
# (artist, title) and identifying each row by `file_id`.
# Later cells read `cluster_id` and `song_id` from the result.
data = select_features_with_clustering(
    hl,
    song_id_cols=['artist_clean', 'title_clean'],
    feature_cols=features,
    certainty=1.2,
    feature_id='file_id',
)

# Row counts of the input frame and of the returned frame.
display(len(hl), len(data))

# Continue the notebook with the returned frame.
hl = data

In [None]:
from sklearn import manifold

# Project the feature columns to 2-D with t-SNE; random_state is fixed so the
# embedding is reproducible across runs.
dim_red = manifold.TSNE(n_components=2, init='pca', random_state=0)

# fit_transform returns a plain array, so the frame built from it gets a fresh
# 0..n-1 index with columns 0 and 1 holding the two embedding coordinates.
data = pd.DataFrame(dim_red.fit_transform(hl[features]))

# Copy the labels by position. Assigning the Series directly would align on
# index labels, and `hl` has been filtered/truncated in earlier cells, so its
# index is not guaranteed to be 0..n-1 — the labels would then come out as NaN
# or be attached to the wrong points.
data['cluster_id'] = hl['cluster_id'].to_numpy()
data['song_id'] = hl['song_id'].to_numpy()

# One scatter plot per labelling, coloured by the label value.
display(data.plot.scatter(x=0, y=1, c='song_id', colormap='nipy_spectral', title='TSNE song_id'))
display(data.plot.scatter(x=0, y=1, c='cluster_id', colormap='nipy_spectral', title='TSNE cluster_id'))

In [None]:
# Columns shown for each inspected cluster.
inspect_cols = ['cluster_id', 'song_id', 'artist_clean', 'title_clean', 'mbid', 'peakPos', 'weeks']

# Display every cluster whose rows belong to more than one distinct song_id.
for _, members in hl.groupby(['cluster_id']):
    if members['song_id'].nunique() <= 1:
        continue
    display(members[inspect_cols])