# Analyze Dataset

In [None]:
import glob
import json

import numpy as np
import pandas as pd


# Column-name prefixes that are excluded from the column list built below.
feature_prefixes = ('highlevel', 'lowlevel', 'rhythm', 'tonal')

# Base dataset, plus the names of its columns that start with none of the prefixes.
dataset = pd.read_parquet('../data/hit_song_prediction_lfm_popularity/raw/dataset_20.parquet')
dataset_cols = [col for col in dataset.columns if not col.startswith(feature_prefixes)]

# Chart table (first CSV row is the header, first CSV column becomes the index)
# and the pickled YouTube info table.
charts = pd.read_csv(
    '/storage/nas3/datasets/music/billboard/billboard_hot-100_1958-08-11_2019-07-06.csv',
    header=0,
    index_col=0,
)
yt = pd.read_pickle('../data/interim/lfm_popularity/mp3s/yt_info.pickle')

# Join keys: charts are matched on (artist, title), YouTube info on the recording MBID.
chart_left_key, chart_right_key = ['artist_name', 'recording_name'], ['artist', 'title']
yt_left_key, yt_right_key = ['recording_mbid'], ['mbid']

# merge() is called with its default join type in all three cases.
dataset_charts = dataset.merge(charts, left_on=chart_left_key, right_on=chart_right_key)
dataset_yt = dataset.merge(yt, left_on=yt_left_key, right_on=yt_right_key)
dataset_all = dataset_charts.merge(yt, left_on=yt_left_key, right_on=yt_right_key)

## Show duplicates

In [None]:
# Collect every row whose (artist_name, recording_name) pair occurs more than
# once; keep=False flags all members of a duplicate group, not only the repeats.
duplicate_key = ['artist_name', 'recording_name']
is_duplicate = dataset.duplicated(duplicate_key, keep=False)

# Sort with the non-inplace form: the boolean selection is a derived frame, and
# sorting it in place can raise pandas' SettingWithCopyWarning.
duplicates = dataset[is_duplicate].sort_values(duplicate_key)

# Only display duplicates with more than 100,000 listeners.
duplicates = duplicates[duplicates['listener_count'] > 100_000]
display(duplicates[dataset_cols])

## Charts

In [None]:
import seaborn as sns

# Pearson correlation between the listener/play counts and the chart columns.
corr = dataset_charts[['listener_count', 'play_count', 'peakPos', 'rank', 'weeks']].corr()

# Correlation coefficients lie in [-1, 1]. With the colour scale limited to
# [0, 1] every negative coefficient is drawn in the same colour as 0, so the
# full range is used here.
ax = sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1., vmax=1.)
ax.set_title('Pearson')

In [None]:
# Kendall rank correlation between the listener/play counts and the chart columns.
kendall_corr = dataset_charts[['listener_count', 'play_count', 'peakPos', 'rank', 'weeks']].corr(method='kendall')

# Correlation coefficients lie in [-1, 1]. With the colour scale limited to
# [0, 1] every negative coefficient is drawn in the same colour as 0, so the
# full range is used here.
ax = sns.heatmap(kendall_corr, annot=True, cmap='coolwarm', vmin=-1., vmax=1.)
ax.set_title('kendall')

## Target Values

In [None]:
targets = ['play_count', 'listener_count', 'yt_view_count', 'yt_like_count', 'yt_dislike_count', 'log_pc', 'log_lc', 'log_yt_vc', 'lfm_hit_score', 'log_lfm_hit_score']

# Keep rows with positive listener and play counts, and one row per
# (artist_name, recording_name) pair.
# The selection ends in an explicit .copy(): dropping duplicates in place and
# assigning new columns on a filtered selection can raise pandas'
# SettingWithCopyWarning.
has_counts = (dataset_yt['listener_count'] > 0) & (dataset_yt['play_count'] > 0)
dataset_yt = (
    dataset_yt[has_counts]
    .drop_duplicates(['artist_name', 'recording_name'])
    .copy()
)

# Natural logs of the two counts that were filtered to be positive above.
dataset_yt['log_pc'] = np.log(dataset_yt['play_count'])
dataset_yt['log_lc'] = np.log(dataset_yt['listener_count'])

# NOTE(review): yt_view_count is not filtered like the two counts above, so a
# zero view count would give -inf here — confirm whether that is intended.
dataset_yt['log_yt_vc'] = np.log(dataset_yt['yt_view_count'])

# Hit score: product of the two log counts, and the log of that product.
# NOTE(review): a count of exactly 1 has log 0, which makes the product 0 and
# its log -inf — confirm whether such rows should be excluded.
dataset_yt['lfm_hit_score'] = dataset_yt['log_pc'] * dataset_yt['log_lc']
dataset_yt['log_lfm_hit_score'] = np.log(dataset_yt['lfm_hit_score'])

# Pearson correlation between all target columns.
corr = dataset_yt[targets].corr()
ax = sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=0., vmax=1.)
ax.set_title('Pearson')

In [None]:
# Kendall rank correlation between the columns listed in `targets`,
# drawn as an annotated heatmap with the colour scale fixed to [0, 1].
kendall_corr = dataset_yt[targets].corr(method='kendall')
heatmap_ax = sns.heatmap(
    kendall_corr,
    annot=True,
    cmap='coolwarm',
    vmin=0.,
    vmax=1.,
)
heatmap_ax.set_title('kendall')

In [None]:
# Restrict to the rows whose yt_release_date is present.
has_release_date = dataset_yt['yt_release_date'].notna()
reduced_dataset = dataset_yt[has_release_date]

# Kendall rank correlation of the target columns on the restricted rows.
kendall_corr = reduced_dataset[targets].corr(method='kendall')
heatmap_ax = sns.heatmap(kendall_corr, annot=True, cmap='coolwarm', vmin=0., vmax=1.)
heatmap_ax.set_title('kendall')

# Row counts before and after the restriction.
display(len(dataset_yt), len(reduced_dataset))

# First ten rows of the target columns.
display(reduced_dataset[targets].head(10))

In [None]:
plot_columns = ['log_pc', 'log_lc', 'log_yt_vc', 'lfm_hit_score', 'log_lfm_hit_score']

# Order the rows by log view count, then renumber them so the new index
# follows the sorted order (the previous index is kept as a column).
sorted_by_views = dataset_yt.sort_values(by='log_yt_vc')
plot_data = sorted_by_views.reset_index()

# Last ten rows of the sorted frame, labelled by artist and recording name.
label_columns = ['artist_name', 'recording_name']
display(plot_data[label_columns + plot_columns].tail(10))

# One line per plotted column, drawn against the renumbered index.
plot_data[plot_columns].plot()

In [None]:
targets = ['play_count', 'listener_count', 'peakPos', 'rank', 'weeks', 'log_pc', 'log_lc', 'lfm_hit_score', 'log_lfm_hit_score']

# Keep rows with positive listener and play counts.
# The selection is copied explicitly: assigning new columns on a filtered
# selection can raise pandas' SettingWithCopyWarning.
has_counts = (dataset_charts['listener_count'] > 0) & (dataset_charts['play_count'] > 0)
dataset_charts = dataset_charts[has_counts].copy()

# Natural logs of the two counts that were filtered to be positive above.
dataset_charts['log_pc'] = np.log(dataset_charts['play_count'])
dataset_charts['log_lc'] = np.log(dataset_charts['listener_count'])

# Hit score: product of the two log counts, and the log of that product.
# NOTE(review): a count of exactly 1 has log 0, which makes the product 0 and
# its log -inf — confirm whether such rows should be excluded.
dataset_charts['lfm_hit_score'] = dataset_charts['log_pc'] * dataset_charts['log_lc']
dataset_charts['log_lfm_hit_score'] = np.log(dataset_charts['lfm_hit_score'])

# Pearson correlation between all target columns.
corr = dataset_charts[targets].corr()

# Correlation coefficients lie in [-1, 1]. With the colour scale limited to
# [0, 1] every negative coefficient is drawn in the same colour as 0, so the
# full range is used here.
ax = sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1., vmax=1.)
ax.set_title('Pearson')

In [None]:
# Kendall rank correlation between the columns listed in `targets`.
kendall_corr = dataset_charts[targets].corr(method='kendall')

# Correlation coefficients lie in [-1, 1]. With the colour scale limited to
# [0, 1] every negative coefficient is drawn in the same colour as 0, so the
# full range is used here.
ax = sns.heatmap(kendall_corr, annot=True, cmap='coolwarm', vmin=-1., vmax=1.)
ax.set_title('kendall')

In [None]:
plot_columns = ['log_pc', 'log_lc', 'lfm_hit_score', 'log_lfm_hit_score']

# Rows with a hit score above 10, ordered by hit score and renumbered so the
# new index follows the sorted order (the previous index is kept as a column).
# NOTE(review): this reads dataset_yt although the preceding cells work on
# dataset_charts — confirm which frame is intended.
plot_data = dataset_yt[dataset_yt['lfm_hit_score'] > 10].sort_values(by=['lfm_hit_score']).reset_index()

# Last ten rows of the sorted frame, labelled by artist and recording name.
display(plot_data[['artist_name', 'recording_name'] + plot_columns].tail(10))

# One line per plotted column, drawn against the renumbered index.
plot_data[plot_columns].plot()