In [1]:
# usual imports
import os
import sys
import time
import glob
import datetime
import sqlite3
import pandas as pd
import numpy as np # get it at: http://numpy.scipy.org/
# path to the Million Song Dataset subset (uncompressed)
# CHANGE IT TO YOUR LOCAL CONFIGURATION
msd_subset_path='./data/MSD/MillionSongSubset'
msd_subset_data_path=os.path.join(msd_subset_path,'data')
msd_subset_addf_path=os.path.join(msd_subset_path,'AdditionalFiles')
assert os.path.isdir(msd_subset_path),'wrong path' # sanity check
# path to the Million Song Dataset code
# CHANGE IT TO YOUR LOCAL CONFIGURATION
msd_code_path='./data/MSD/MSongsDB'
assert os.path.isdir(msd_code_path),'wrong path' # sanity check
# we add some paths to python so we can import MSD code
# Ubuntu: you can change the environment variable PYTHONPATH
# in your .bashrc file so you do not have to type these lines
sys.path.append(os.path.join(msd_code_path,'PythonSrc') )

taste_profile_data_path="./data/TeenyTinyEchoNestTasteProfileSubset.csv"
assert os.path.isfile(taste_profile_data_path)

# imports specific to the MSD
import hdf5_getters as GETTERS

from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [113]:
def reduce_taste_subset(path='./data/FullEchoNestTasteProfileSubset.txt', 
                        to_path='./data/TeenyTinyEchoNestTasteProfileSubset.csv', downsample=0.001):
    data = pd.read_csv(path, sep="\t", header=None)
    data.columns = ['user', 'song', 'play_count']
    data.astype({'user': np.str, 'song': np.str, 'play_count': np.int32})
    data.sample(frac=downsample).to_csv(to_path, index=False)
# the following function simply gives us a nice string for
# a time lag in seconds
def strtimedelta(starttime,stoptime):
    return str(datetime.timedelta(seconds=stoptime-starttime))

In [2]:
data = pd.read_csv(taste_profile_data_path)

In [3]:
data.head()

Unnamed: 0,user,song,play_count
0,e45503e1d7c3c88fcea0da0947292afdbb26d27c,SOISQID12A6D4FD97C,1
1,ec04a29f15f25f1413256035c76be47675861682,SODSESK12A81C2178C,1
2,144889203bd3fbd4dcb1a64dca945c7aece4c14b,SOLNDSO12AF72ABA6C,1
3,d4ed4f49f3020cc18dd413acc8824ee02d72c48b,SOBOFXF12A6D4F83F0,2
4,67ab466c740e28e8afc3762b75571f5162ae9411,SOIGGWG12A6D4F85D4,1


In [118]:
song_id = data.iloc[1]['song']
print(data.shape)

(48374, 3)


In [10]:
a = "e45503e1d7c3c88fcea0da0947292afdbb26d27c"


ValueError: invalid literal for int() with base 10: 'e45503e1d7c3c88fcea0da0947292afdbb26d27c'

In [119]:
# Get the song ids which are also in the database. 
valid_song_ids = []

for song_id in data['song'].values:
    # let's redo all this work in SQLite in a few seconds
    t1 = time.time()
    # connect to database to get the metadata from MSD
    conn = sqlite3.connect(os.path.join(msd_subset_addf_path,
                                        'subset_track_metadata.db'))

    q = "SELECT DISTINCT song_id,artist_id,artist_name,Count(track_id) FROM songs"
    q += " WHERE song_id='" + song_id + "'"
    res = conn.execute(q)
    res = res.fetchall()
    conn.close()
    t2 = time.time()
    if res[0] is not None:
        valid_song_ids.append(song_id)


In [4]:


def tune_ALS(train_data, validation_data, maxIter, regParams, ranks):
    """
    grid search function to select the best model based on RMSE of
    validation data
    Parameters
    ----------
    train_data: spark DF with columns ['user', 'song', 'play_count']
    
    validation_data: spark DF with columns ['user', 'song', 'play_count']
    
    maxIter: int, max number of learning iterations
    
    regParams: list of float, one dimension of hyper-param tuning grid
    
    ranks: list of float, one dimension of hyper-param tuning grid
    
    Return
    ------
    The best fitted ALS model with lowest RMSE score on validation data
    """
    # initial
    min_error = float('inf')
    best_rank = -1
    best_regularization = 0
    best_model = None
    for rank in ranks:
        for reg in regParams:
            # get ALS model
            als = ALS(userCol="user", itemCol="song", ratingCol="play_count").setMaxIter(maxIter).setRank(rank).setRegParam(reg)
            # train ALS model
            model = als.fit(train_data)
            # evaluate the model by computing the RMSE on the validation data
            predictions = model.transform(validation_data)
            evaluator = RegressionEvaluator(metricName="rmse",
                                            labelCol="rating",
                                            predictionCol="prediction")
            rmse = evaluator.evaluate(predictions)
            print('{} latent factors and regularization = {}: '
                  'validation RMSE is {}'.format(rank, reg, rmse))
            if rmse < min_error:
                min_error = rmse
                best_rank = rank
                best_regularization = reg
                best_model = model
    print('\nThe best model has {} latent factors and '
          'regularization = {}'.format(best_rank, best_regularization))
    return best_model

In [138]:
split_frac = 0.7
train_data = data.iloc[:int(data.shape[0] * split_frac)]
val_data = data.iloc[int(data.shape[0] * split_frac):]

In [5]:
spark_data = spark.createDataFrame(data)
(training, val) = spark_data.randomSplit([0.8, 0.2])


In [7]:
tune_ALS(training, val, 5, [0.1], [5])

IllegalArgumentException: u'requirement failed: Column user must be of type numeric but was actually of type string.'