In [1]:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 14 09:52:15 2017

@author: zhouyi
"""

import json
import os
import sqlite3

import numpy as np
import pandas as pd


# Directory that holds the Million Song Dataset files. Set the MSD_DATA_DIR
# environment variable to point at your own copy; the path below is only the
# fallback used when that variable is not set.
TPS_DIR = os.environ.get('MSD_DATA_DIR', '/Users/zhouyi/Documents/GraduatePJ/MSD')

# The dataset can be obtained here:
# http://labrosa.ee.columbia.edu/millionsong/sites/default/files/challenge/train_triplets.txt.zip
TP_file = os.path.join(TPS_DIR, 'train_triplets.txt')
# track_metadata.db contains all the metadata. It is not required to subsample the data; it is only
# used when looking up information about particular pieces (e.g. artist, song name, etc.)
# Available here: http://labrosa.ee.columbia.edu/millionsong/sites/default/files/AdditionalFiles/track_metadata.db
md_dbfile = os.path.join(TPS_DIR, 'track_metadata.db')

# We only keep songs that are listened to by at least MIN_SONG_COUNT users and users who have listened
# to at least MIN_USER_COUNT songs
MIN_USER_COUNT = 20
MIN_SONG_COUNT = 50

# The triplet file has no header row; each line becomes one (uid, sid, count) record.
tp = pd.read_table(TP_file, header=None, names=['uid', 'sid', 'count'])

In [2]:
def get_count(tp, id):
    """Count how many rows of ``tp`` carry each distinct value of column ``id``.

    Parameters
    ----------
    tp : pandas.DataFrame
        Table of triplets; must contain the column named by ``id``.
    id : str
        Name of the column to group by (the callers in this file pass 'uid' or 'sid').

    Returns
    -------
    pandas.Series
        Row count per distinct ``id`` value, indexed by that value.
    """
    # Group with the default as_index=True: size() then yields a Series keyed by the
    # id on every pandas version. With as_index=False, pandas >= 1.1 returns a
    # DataFrame instead, which breaks callers that do `count.index[count >= n]`.
    return tp.groupby(id).size()

def filter_triplets(tp, min_uc=MIN_USER_COUNT, min_sc=MIN_SONG_COUNT):
    """Drop rarely-played songs, then low-activity users, from the triplet table.

    The two passes run once each, songs first. Because users are removed after the
    song pass, some surviving songs may end up with fewer than ``min_sc`` listeners.

    Parameters
    ----------
    tp : pandas.DataFrame
        Triplets with 'uid', 'sid' and 'count' columns.
    min_uc : int
        Minimum number of triplets a user needs in order to be kept.
    min_sc : int
        Minimum number of triplets a song needs in order to be kept.

    Returns
    -------
    tuple
        ``(tp, usercount, songcount)``: the filtered triplets and the per-user and
        per-song counts recomputed on the filtered table.
    """
    # Pass 1: keep only the songs that reach the min_sc threshold.
    per_song = get_count(tp, 'sid')
    kept_songs = per_song.index[per_song >= min_sc]
    tp = tp[tp['sid'].isin(kept_songs)]

    # Pass 2: on what is left, keep only the users that reach the min_uc threshold.
    per_user = get_count(tp, 'uid')
    kept_users = per_user.index[per_user >= min_uc]
    tp = tp[tp['uid'].isin(kept_users)]

    # Both counts are recomputed so that they describe the returned table.
    return tp, get_count(tp, 'uid'), get_count(tp, 'sid')

# Apply the default thresholds to the full triplet table
tp, usercount, songcount = filter_triplets(tp)

# Share of the (users x songs) grid for which a triplet is present
sparsity_level = float(len(tp)) / (len(usercount) * len(songcount))
print("After filtering, there are %d triplets from %d users and %d songs (sparsity level %.3f%%)"
      % (len(tp), len(usercount), len(songcount), sparsity_level * 100))


After filtering, there are 39730795 triplets from 629112 users and 98485 songs (sparsity level 0.064%)


In [3]:
# Histograms of the per-user and the per-song counts, one figure each
import matplotlib.pyplot as plt

for counts, axis_label in ((usercount, 'number of songs each user listens to'),
                           (songcount, 'number of users by which each song is listened to')):
    plt.figure(figsize=(10, 4))
    counts.hist(bins=100)
    plt.xlabel(axis_label)

# take a look at the top 10 most listened songs
def get_song_info_from_sid(conn, sid):
    """Look up the title and artist name stored for one song id.

    Parameters
    ----------
    conn : sqlite3.Connection
        Open connection to a database that has a ``songs`` table with
        ``song_id``, ``title`` and ``artist_name`` columns.
    sid : str
        Song id to look up.

    Returns
    -------
    tuple
        ``(title, artist_name)`` of the first matching row.

    Raises
    ------
    TypeError
        If no row matches ``sid`` (``fetchone()`` returns None, which cannot be unpacked).
    """
    cur = conn.cursor()
    # Bind sid as a parameter instead of formatting it into the SQL text, so ids
    # containing quotes work and cannot alter the statement.
    cur.execute("SELECT title, artist_name FROM songs WHERE song_id = ?", (sid,))
    title, artist = cur.fetchone()
    return title, artist

# Rank songs from most to least listened. The in-place Series.sort() no longer
# exists in current pandas, so rebind the name to the sorted copy; later cells
# that read songcount still see it in descending order.
songcount = songcount.sort_values(ascending=False)

# A sqlite3 connection used as a context manager only wraps a transaction and
# leaves the connection open, so close it explicitly once the lookups are done.
conn = sqlite3.connect(md_dbfile)
try:
    # Positional slice: yields at most 10 rows and never falls back on label lookup.
    top_songs = songcount.iloc[:10]
    for sid, n_listeners in zip(top_songs.index, top_songs.values):
        title, artist = get_song_info_from_sid(conn, sid)
        print("%s BY %s -- count: %d" % (title, artist, n_listeners))
finally:
    conn.close()




Sehr kosmisch BY Harmonia -- count: 82524
Dog Days Are Over (Radio Edit) BY Florence + The Machine -- count: 73359
Undo BY Björk -- count: 64711
Secrets BY OneRepublic -- count: 62270
You're The One BY Dwight Yoakam -- count: 61191
Revelry BY Kings Of Leon -- count: 60286
Fireflies BY Charttraxx Karaoke -- count: 51811
Hey_ Soul Sister BY Train -- count: 51280
Horn Concerto No. 4 in E flat K495: II. Romance (Andante cantabile) BY Barry Tuckwell/Academy of St Martin-in-the-Fields/Sir Neville Marriner -- count: 50840
Tive Sim BY Cartola -- count: 45128


In [4]:
# Draw a weighted subsample of users, then of songs, and re-apply the count filter
np.random.seed(98765)  # fixed seed: the same subset is drawn on every run

# Users: choose n_users index positions without replacement, each weighted by the
# user's share of the total count
unique_uid = usercount.index
n_users = 10000
p_users = usercount / usercount.sum()
idx = np.random.choice(len(unique_uid), size=n_users, replace=False, p=p_users.tolist())
unique_uid = unique_uid.take(idx)
tp = tp.loc[tp['uid'].isin(unique_uid)]

# Songs: same procedure, weighted by each song's share of the total count
unique_sid = songcount.index
n_songs = 1000
p_songs = songcount / songcount.sum()
idx = np.random.choice(len(unique_sid), size=n_songs, replace=False, p=p_songs.tolist())
unique_sid = unique_sid.take(idx)
tp = tp.loc[tp['sid'].isin(unique_sid)]

# The subsample may push users/songs back under the thresholds, so filter once more
tp, usercount, songcount = filter_triplets(tp, min_uc=MIN_USER_COUNT, min_sc=MIN_SONG_COUNT)
unique_uid = usercount.index
unique_sid = songcount.index

sparsity_level = float(len(tp)) / (len(usercount) * len(songcount))
print("After subsampling and filtering, there are %d triplets from %d users and %d songs (sparsity level %.3f%%)"
      % (len(tp), len(usercount), len(songcount), sparsity_level * 100))

# Map every surviving song / user id to its position in the corresponding index
song2id = dict(zip(unique_sid, range(len(unique_sid))))
user2id = dict(zip(unique_uid, range(len(unique_uid))))

# Persist the id lists, one id per line (users first, then songs)
for fname, ids in (('unique_uid_sub.txt', unique_uid), ('unique_sid_sub.txt', unique_sid)):
    with open(os.path.join(TPS_DIR, fname), 'w') as f:
        f.writelines('%s\n' % item for item in ids)

After subsampling and filtering, there are 34147 triplets from 1013 users and 400 songs (sparsity level 8.427%)
