# Map tracks in 30Music dataset to MSD

In [None]:
%matplotlib inline
import os, sys
import pickle as pkl
import re
from urllib.parse import unquote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Root folder of the 30Music dump; every input file is addressed relative to it.
data_dir = 'data/30music'

# Track table, artist ("persons") table and the pickled LastFM track list.
ftrack, fartist, flastfm_track = (
    os.path.join(data_dir, relpath)
    for relpath in ('tracks.csv', 'persons.csv', 'lastfm/lastfm_tracks.pkl')
)

## Load data

Tracks data.

In [None]:
# Read the ';'-separated track table. keep_default_na=False switches off
# pandas' built-in NaN markers, so empty or 'NA'-like fields stay strings.
# NOTE(review): presumably needed because some track names look like NA
# markers -- confirm against the data.
tracks = pd.read_csv(ftrack, sep=';', keep_default_na=False)  #, index_col='ID')

In [None]:
#tracks.set_index('ID', inplace=True)

In [None]:
# 'ID' is an ordinary column here (the CSV is read without index_col), so the
# number of distinct tracks has to be measured on that column. The default
# positional index is unique by construction and would always just repeat the
# row count.
print(tracks.shape[0])
print('#tracks:', tracks['ID'].unique().shape[0])
tracks.head()

In [None]:
tracks.dtypes

In [None]:
tnames = tracks['Name'].values

In [None]:
tnames[9]

In [None]:
unquote_plus(tnames[9])

In [None]:
unquote_plus(tnames[9]).split('/')[-1].split('_')[-1]

Artist data.

In [None]:
artists = pd.read_csv(fartist, index_col='ID', sep=';')

In [None]:
# Row count, then the number of distinct index values (the frame is indexed by 'ID').
print(len(artists))
print('#artists:', len(artists.index.unique()))
artists.head()

In [None]:
anames = artists['Name'].values

In [None]:
unquote_plus(anames[1])

## Build mapping

### 1. Match artist

LastFM artists.

In [None]:
# Load the pickled LastFM track list. The context manager closes the file
# handle as soon as unpickling finishes; a bare open() call inside pkl.load()
# leaves it open.
# NOTE: unpickling can execute arbitrary code -- only load a file you produced
# yourself or otherwise trust.
with open(flastfm_track, 'rb') as fin:
    lastfm_tracks = pkl.load(fin)

In [None]:
print(len(lastfm_tracks))
lastfm_tracks[0]

In [None]:
# Distinct values of field 2 of every LastFM record, stringified, lower-cased
# and sorted.
lastfm_artists = sorted(set(map(lambda rec: str(rec[2]).lower(), lastfm_tracks)))

In [None]:
print(len(lastfm_artists))
lastfm_artists[1100]

In [None]:
dat = np.random.rand(3, 5)
dat

In [None]:
np.mean(dat, axis=0)

In [None]:
np.var(dat, axis=0)

In [None]:
from scipy.stats import moment, kurtosis, skew, describe
# 1st, 2nd and 3rd central moments of `dat`, computed along axis 0.
moment(dat, moment=[1,2,3], axis=0)

In [None]:
rset = describe(dat, axis=0)

In [None]:
type(rset)

In [None]:
dat

In [None]:
rset

In [None]:
describe([1,2,3,4,5])

In [None]:
ab = np.zeros(6)

In [None]:
ab.ndim

In [None]:
np.zeros(6).tolist()

In [None]:
# Grow the list in place, one pair at a time, then display it.
aa = []
aa.extend([1, 2])
aa.extend([3, 4])
aa.extend([5, 6])
aa

In [None]:
rset.kurtosis

In [None]:
rset.skewness

In [None]:
skew(dat, axis=0)

30Music artists.

In [None]:
def parse_artist_name(artist_name):
    """Extract a plain artist name from a URL-encoded name field.

    The value is URL-decoded ('+' becomes a space, %XX escapes are resolved).
    Then only the text after the last '/', then after the last '_', then
    after the last '!' is kept, and surrounding whitespace is removed.

    Parameters
    ----------
    artist_name : str
        URL-encoded name.

    Returns
    -------
    str
        The decoded, trimmed trailing name component.
    """
    decoded = unquote_plus(artist_name)
    # rpartition(sep)[2] is the text after the last `sep`, or the whole string
    # when `sep` does not occur.
    for sep in ('/', '_', '!'):
        decoded = decoded.rpartition(sep)[2]
    return decoded.strip()

In [None]:
# Distinct parsed artist names from the 30Music artist table, sorted.
# NOTE(review): lastfm_artists is lower-cased but these names are not --
# confirm whether the two lists are meant to be compared case-insensitively.
artists_30music = sorted(set(map(parse_artist_name, map(str, artists['Name'].values))))

In [None]:
print(len(artists_30music))
artists_30music[1700]

LastFM `(title, artist) <--> [track_id, ...]` mapping; one (title, artist) pair can map to more than one track.

In [None]:
# (Re)load the pickled LastFM track list from flastfm_track. The context
# manager guarantees the file handle is closed after unpickling instead of
# being leaked by a bare open() call.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(flastfm_track, 'rb') as fin:
    lastfm_tracks = pkl.load(fin)

In [None]:
ta2tid = dict()

In [None]:
# Bucket LastFM track ids (field 0) under their (field 1, field 2) pair,
# i.e. the (title, artist) key described above.
total = len(lastfm_tracks)
for pos, record in enumerate(lastfm_tracks, start=1):
    # Refresh the single-line progress counter once every 1000 records.
    if pos % 1000 == 0:
        sys.stdout.write('\r%d / %d' % (pos, total))
        sys.stdout.flush()

    track_id, title, artist = record[0], record[1], record[2]
    # setdefault creates the list on first sight of a key, then appends.
    ta2tid.setdefault((title, artist), []).append(track_id)

In [None]:
len(ta2tid)

In [None]:
np.sum([len(x) for x in ta2tid.values()])

30Music `(title, artist) <--> track_id` mapping.

In [None]:
aa = 'hello (year 2018)'
aa

In [None]:
# str.replace() treats its first argument as literal text, so a regex-looking
# pattern never matches and the string comes back unchanged. re.sub() with a
# raw-string pattern actually removes the parenthesised part.
bb = re.sub(r'\(.*\)', '', aa)
bb

In [None]:
def parse_track_name(track_name):
    """Extract a plain track title from a URL-encoded name field.

    The value is URL-decoded ('+' becomes a space, %XX escapes are resolved)
    and reduced to the text after the last '/' and then after the last '_'.
    Parenthesised parts such as "(year 2018)" are removed and surrounding
    whitespace is trimmed.

    Parameters
    ----------
    track_name : str
        URL-encoded name.

    Returns
    -------
    str
        The cleaned track title.
    """
    name = unquote_plus(track_name).split('/')[-1].split('_')[-1]
    # [^)]* keeps every match inside a single pair of parentheses, so text
    # between two separate parenthesised parts is preserved.
    name = re.sub(r'\([^)]*\)', '', name)
    return name.strip()

In [None]:
ta2num = dict()

In [None]:
# Bucket 30Music track IDs under their (track name, artist name) pair.
# NOTE(review): both names are used exactly as stored in the CSVs (still
# URL-encoded, not passed through the parse_* helpers) -- confirm this is
# intended before comparing these keys with the LastFM ones.
n_tracks = tracks.shape[0]
for ix in tracks.index:
    done = ix + 1
    # Refresh the single-line progress counter once every 1000 rows.
    if done % 1000 == 0:
        sys.stdout.write('\r%d / %d' % (done, n_tracks))
        sys.stdout.flush()

    row = tracks.loc[ix]
    num, title, aid = row['ID'], row['Name'], row['ArtistsID']
    # Resolve the artist ID to its name via the ID-indexed artist table.
    artist = artists.loc[aid, 'Name']
    ta2num.setdefault((title, artist), []).append(num)

In [None]:
len(ta2num)

In [None]:
np.sum([len(x) for x in ta2num.values()])

## Match

In [None]:
# (title, artist) keys that occur in both mappings.
intersection = set(ta2tid).intersection(ta2num)

In [None]:
len(intersection)

### Playlist

In [None]:
fplaylist = os.path.join(data_dir, 'playlist.csv')

In [None]:
playlist = pd.read_csv(fplaylist, index_col='ID', sep=';')

In [None]:
playlist.head()

Filter out playlists that have no track data.

In [None]:
#playlist[playlist['TracksID'].isin([np.nan])].head()
playlist[playlist['TracksID'].isnull()].head()

In [None]:
playlist[playlist['TracksID'].notnull()].shape

In [None]:
# Keep only playlists whose 'TracksID' field is present, then report the row
# count and the number of distinct playlist IDs (the index).
playlist = playlist.dropna(subset=['TracksID'])
print(len(playlist))
print('#playlist:', len(playlist.index.unique()))

Histogram of playlist length (i.e., the number of tracks/songs).

In [None]:
# Histogram of the '#Tracks' column with a log-scaled count axis.
ax = plt.subplot(111)
playlist['#Tracks'].hist(ax=ax)
# One batched setter call; the trailing ';' suppresses the cell output.
ax.set(xlabel='playlist length', ylabel='#playlists', yscale='log');

In [None]:
playlist['#Tracks'].describe()

In [None]:
playlist['#Tracks'].median()