In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import json
%matplotlib inline
import matplotlib.pyplot as plt

In [3]:
import os
import sys
import time
import glob
import datetime
import sqlite3

import re

In [28]:
artist_id_file = open("mxm_779k_matches.txt")
msd_name_to_mxm = {} # map MSD artist name to MXM artist name
artist_name_to_tracks = {} # map MSD artist name to list of (MSD, MXM track IDs)

lines = artist_id_file.readlines()
for line in lines:
    if line[0] == '#':
        continue
    terms = re.split('<SEP>', line.rstrip())
    # we know each line is formatted as such:
    # tid|artist name|title|mxm tid|artist_name|title
    msd_name_to_mxm[terms[1]] = terms[4]
    if terms[1] in artist_name_to_tracks:
        artist_name_to_tracks[terms[1]] += (terms[0], terms[3])
    else:
        artist_name_to_tracks[terms[1]] = [(terms[0], terms[3])]
    
artist_id_file.close()

In [30]:
print(artist_name_to_tracks['Beyonce'])

[('TRGQFGC12903D0612A', '5480231'), 'TRRRDQK128F4240B88', '4610816', 'TRQSFVW128F93517D6', '6490445', 'TRNEUCT128F14A5161', '1505921', 'TRYWTDN12903CC1EC3', '9209890']


In [52]:
def construct_track_to_words(lines):
    track_to_words = {} # maps MSD track ID to np.array of word counts, indexed by all_words
    all_words = []
    for line in lines:
        if line[0] == '#':
            continue
        if line[0] == '%':
            all_words = re.split(',', line[1:].rstrip())
            continue
        content = re.split(',', line.rstrip())
        # know that lines are formatted as TID,MXMID,idx:cnt,idx:cnt,...
        word_counts = np.zeros(len(all_words))
        for term in content[2:]:
            index, count = re.split(':', term)
            index, count = int(index), int(count)
            word_counts[index-1] = count
        track_to_words[content[0]] = word_counts
    return all_words, track_to_words

In [57]:
words_dataset = open("mxm_dataset_train.txt")
lines = words_dataset.readlines()
all_words, track_to_words = construct_track_to_words(lines)
words_dataset.close()

In [58]:
words_dataset = open("mxm_dataset_test.txt")
lines = words_dataset.readlines()
output = construct_track_to_words(lines)
words_dataset.close()

In [59]:
print(len(track_to_words))
track_to_words.update(output[1])
print(len(track_to_words))

210519
237662


In [60]:
artist_name_to_word_count = {} # maps MSD artist name to np array of word count

for artist in artist_name_to_tracks:
    word_counts = np.zeros(len(all_words))
    for track_id in artist_name_to_tracks[artist]:
        track_id = track_id[0]
        if track_id in track_to_words:
            word_counts += track_to_words[track_id]
    artist_name_to_word_count[artist] = word_counts

In [61]:
artists_in_dataset = [name for name in artist_name_to_word_count if np.any(artist_name_to_word_count[name])]

In [62]:
print(len(artists_in_dataset))
print(len(artist_name_to_word_count))

11228
52313
