## Map songs to artists and tracks

In [1]:
import os, sys, time, gzip
import sqlite3
import pickle as pkl
import numpy as np
from collections import Counter

In [2]:
# Root directory of the dataset files; every path below is relative to it.
data_dir = 'data'
# fdb: SQLite metadata DB (input); fsong2track / fsong2artist: gzip-pickled
# mappings written by this notebook (outputs).
fdb, fsong2track, fsong2artist = (
    os.path.join(data_dir, rel_path)
    for rel_path in (
        'msd/track_metadata.db',
        'msd/song2tracks.pkl.gz',
        'msd/song2artist.pkl.gz',
    )
)

Connect to the MSD metadata SQLite database.

In [3]:
# Open a connection to the SQLite file at `fdb`; the queries below all go
# through this single connection.
conn = sqlite3.connect(fdb)

Get table names.

In [4]:
# List every table in the database by querying SQLite's schema catalogue.
# A read-only SELECT does not open a transaction, so no commit is needed.
res = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
data = res.fetchall()

In [5]:
# Echo the rows fetched above (one 1-tuple per table name).
data

[('songs',)]

Get column names.

In [6]:
# Fetch the column definitions of the `songs` table. Each returned row is
# (cid, name, type, notnull, dflt_value, pk). PRAGMA table_info only reads
# the schema, so there is nothing to commit afterwards.
res = conn.execute('PRAGMA table_info(songs)')
data = res.fetchall()

In [7]:
# Echo the column descriptions fetched above.
data

[(0, 'track_id', 'text', 0, None, 1),
 (1, 'title', 'text', 0, None, 0),
 (2, 'song_id', 'text', 0, None, 0),
 (3, 'release', 'text', 0, None, 0),
 (4, 'artist_id', 'text', 0, None, 0),
 (5, 'artist_mbid', 'text', 0, None, 0),
 (6, 'artist_name', 'text', 0, None, 0),
 (7, 'duration', 'real', 0, None, 0),
 (8, 'artist_familiarity', 'real', 0, None, 0),
 (9, 'artist_hotttnesss', 'real', 0, None, 0),
 (10, 'year', 'int', 0, None, 0)]

## Map `song_id` to `artist_mbid`

In [None]:
# song_id -> artist_mbid; populated by the loop further down.
song2artist = {}

In [None]:
# Load every (song_id, artist_mbid) pair from the `songs` table into memory.
# The query is read-only, so no commit is required.
res = conn.execute("SELECT song_id, artist_mbid FROM songs")
data = res.fetchall()

In [None]:
#type(data)

In [None]:
#data[0]

In [None]:
#data[100]

In [None]:
#type(data[0][1])

In [None]:
# Record the artist MBID of every song that has one. If a song_id occurs in
# several rows, the last row seen wins.
for i, (sid, aid) in enumerate(data):
    # artist_mbid is a nullable column, so a NULL arrives here as None; treat
    # it the same as a blank string instead of failing on .strip().
    if aid is None or not aid.strip():
        continue
    song2artist[sid] = aid
    # Rewrite a single progress line every 10,000 rows ('\r' returns the
    # cursor to the start of the line).
    if (i + 1) % 10000 == 0:
        sys.stdout.write('\r%d / %d' % (i + 1, len(data)))
        sys.stdout.flush()

In [None]:
# Number of songs that ended up with an artist MBID.
len(song2artist)

In [None]:
# Number of distinct artist MBIDs among the mapped songs.
len(set(song2artist.values()))

In [None]:
# Persist the song_id -> artist_mbid mapping as a gzip-compressed pickle.
# The context manager closes the file explicitly, so the compressed stream is
# fully flushed to disk rather than relying on garbage collection.
with gzip.open(fsong2artist, 'wb') as fout:
    pkl.dump(song2artist, fout)

## Map `song_id` to a list of `track_id`

In [None]:
# Load every (track_id, song_id) pair from the `songs` table into memory.
# The query is read-only, so no commit is required.
res = conn.execute("SELECT track_id, song_id FROM songs")
data = res.fetchall()

In [None]:
# Check which container type fetchall() returned.
type(data)

In [None]:
# Number of (track_id, song_id) rows fetched.
len(data)

In [None]:
# Peek at the first row to confirm the (track_id, song_id) layout.
data[0]

In [None]:
# song_id -> list of track_id; populated by the loop further down.
song2tracks = {}

In [None]:
# Group track IDs under the song they belong to, preserving row order within
# each song's list.
for i, (tid, sid) in enumerate(data):
    # Start a new list the first time a song_id is seen, then append to it.
    song2tracks.setdefault(sid, []).append(tid)
    # Only refresh the progress line on every 10,000th row.
    if (i + 1) % 10000:
        continue
    sys.stdout.write('\r%d / %d' % (i + 1, len(data)))
    sys.stdout.flush()

In [None]:
# Number of distinct song_id keys.
len(song2tracks)

In [None]:
# Preview the first 10 (song_id, track list) pairs in sorted order; note that
# this sorts every item before slicing.
sorted(song2tracks.items())[:10]

In [None]:
# Number of tracks attached to each song, in the dict's iteration order.
ntids = list(map(len, song2tracks.values()))

In [None]:
# One entry per song, so this matches len(song2tracks).
len(ntids)

In [None]:
# Largest number of tracks attached to a single song.
max(ntids)

In [None]:
# Total number of track IDs across all songs; each fetched row contributed
# exactly one, so this should equal the row count.
np.sum(ntids)

In [None]:
# Histogram of tracks-per-song: key = number of tracks, value = number of
# songs with that many tracks.
counter = Counter(ntids)
counter

Close the SQLite DB connection.

In [None]:
# Release the SQLite connection opened earlier.
conn.close()

Save the song-to-tracks mapping.

In [None]:
# Persist the song_id -> track_id list mapping as a gzip-compressed pickle.
# The context manager closes the file explicitly, so the compressed stream is
# fully flushed to disk rather than relying on garbage collection.
with gzip.open(fsong2track, 'wb') as fout:
    pkl.dump(song2tracks, fout)