In [0]:
%spark.pyspark
from pyspark.sql import SQLContext
# Configure the s3a connector on the Hadoop configuration behind the Spark
# context. ACCESS_KEY and SECRET_KEY must already be defined in the session;
# they are not set anywhere in this cell.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", ACCESS_KEY)
hadoop_conf.set("fs.s3a.secret.key", SECRET_KEY)
hadoop_conf.set("fs.s3a.endpoint", "s3.amazonaws.com")

# Read the parquet data stored under the bucket and display its schema.
sqlContext = SQLContext(sc)
parquet_reader = sqlContext.read
dfMSD = parquet_reader.parquet("s3a://msdbucket/parquet")
dfMSD.printSchema()


In [1]:
%spark.pyspark
# Load taste profile dataset and convert to df with defined schema
from pyspark.sql.types import IntegerType, StringType, StructField, StructType
def _parse_triplet(line):
    """Split one tab-separated line into (user, song, playCount as int)."""
    fields = line.split("\t")
    return (fields[0], fields[1], int(fields[2]))


# One record per line of the local triplets file.
rddTaste = sc.textFile("file:///home/hadoop/train_triplets.txt").map(_parse_triplet)

# Explicit schema: two nullable string ids and a nullable integer play count.
schemaTaste = StructType([
    StructField("user", StringType(), True),
    StructField("song", StringType(), True),
    StructField("playCount", IntegerType(), True),
])
dfTaste = sqlContext.createDataFrame(rddTaste, schema=schemaTaste)

In [2]:
%spark.pyspark
from pyspark.sql import functions as F
# Pair every distinct string id with a generated numeric id
distinct_users = dfTaste.select('user').distinct()
distinct_songs = dfTaste.select('song').distinct()
user_change = distinct_users.select(
    'user',
    F.monotonically_increasing_id().alias('new_user'),
)
song_change = distinct_songs.select(
    'song',
    F.monotonically_increasing_id().alias('new_song'),
)

# Count the distinct users and songs, then report both totals
unique_users = user_change.count()
unique_songs = song_change.count()
print('Number of unique users: {0}'.format(unique_users))
print('Number of unique songs: {0}'.format(unique_songs))

In [3]:
%spark.pyspark
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
# Run a string indexer on several columns: each one is fitted on dfTaste and
# writes its result to "<column>_index".
indexers = []
for column in ["user", "song"]:
    indexer = StringIndexer(inputCol=column, outputCol=column+"_index")
    indexers.append(indexer.fit(dfTaste))

# Chain the fitted indexers in a pipeline and apply them to the data.
pipeline = Pipeline(stages=indexers)
pipeline_model = pipeline.fit(dfTaste)
dfTaste_idx = pipeline_model.transform(dfTaste)

dfTaste_idx.show(n=5)

In [4]:
%spark.pyspark
# Replace both index columns with their Integer-typed equivalents
for index_col in ("user_index", "song_index"):
    dfTaste_idx = dfTaste_idx.withColumn(index_col, dfTaste_idx[index_col].cast(IntegerType()))

In [5]:
%spark.pyspark
# Mark the indexed DataFrame for caching under its working name and preview it
tasteDf_with_songId = dfTaste_idx
tasteDf_with_songId.cache()
tasteDf_with_songId.show(n=5)

In [6]:
%spark.pyspark
# Hold out 60% for training, 20% of the data for validation, and leave 20% for testing.
# The seed is a plain integer literal: the previous `L` (long) suffix is only
# valid on Python 2 and is a SyntaxError on Python 3, while this form is
# accepted by both.
seed = 1800009193
(split_60_df, split_a_20_df, split_b_20_df) = tasteDf_with_songId.randomSplit([0.6, 0.2, 0.2], seed = seed)

# Let's cache these datasets for performance
training_df = split_60_df.cache()
validation_df = split_a_20_df.cache()
test_df = split_b_20_df.cache()

# Report the size of each split, then preview a few rows of each.
print('Training: {0}, validation: {1}, test: {2}\n'.format(
  training_df.count(), validation_df.count(), test_df.count())
)
training_df.show(3)
validation_df.show(3)
test_df.show(3)

In [7]:
%spark.pyspark
from pyspark.sql.types import DoubleType

# Convert the validation play counts from integer to double
play_count_as_double = validation_df["playCount"].cast(DoubleType())
validation_df = validation_df.withColumn("playCount", play_count_as_double)
validation_df.show(n=10)

In [8]:
%spark.pyspark
# Alternating least squares
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Grid search over ALS rank and regularization parameter.
# For every (regParam, rank) pair: fit on training_df, predict on
# validation_df, round the predictions, and score them with RMSE. The fitted
# model with the lowest validation RMSE ends up in `my_model`.

# Let's initialize the ALS learner
als = ALS()

# Now set the parameters shared by every candidate model
als.setMaxIter(5)\
   .setSeed(seed)\
   .setItemCol("song_index")\
   .setRatingCol("playCount")\
   .setUserCol("user_index")

# Create regression evaluator
reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="playCount", metricName="rmse")

# Hyperparameter tuning to find best rank and reg param
tolerance = 0.03  # not used in this cell; kept in case other cells read it
ranks = [4, 8, 12, 16]
regParams = [0.15, 0.2, 0.25]
# Build every row as its own list. `[[0]*n]*m` repeats ONE row object m times,
# so a write to grid[i][j] would appear in every row and a later regParam would
# overwrite the model/error stored for an earlier one.
errors = [[0] * len(ranks) for _ in regParams]
models = [[0] * len(ranks) for _ in regParams]
err = 0  # not used in this cell; kept in case other cells read it
min_error = float('inf')
best_rank = -1
best_params = None
for i, regParam in enumerate(regParams):
  for j, rank in enumerate(ranks):
    # Set the rank and regularization for this candidate
    als.setParams(rank = rank, regParam = regParam)
    # Create the model with these parameters.
    model = als.fit(training_df)
    # Run the model to create a prediction. Predict against the validation_df.
    predict_df = model.transform(validation_df)

    # Remove NaN values from prediction (due to SPARK-14489).
    # NOTE(review): this relies on Spark SQL comparing NaN as equal to NaN
    # (unlike plain Python floats) -- confirm for the Spark version in use.
    predicted_plays_df = predict_df.filter(predict_df.prediction != float('nan'))
    # Round predictions to whole, non-negative play counts
    predicted_plays_df = predicted_plays_df.withColumn("prediction", F.abs(F.round(predicted_plays_df["prediction"],0)))
    # Run the previously created RMSE evaluator, reg_eval, on the predicted_plays_df DataFrame
    error = reg_eval.evaluate(predicted_plays_df)
    errors[i][j] = error
    models[i][j] = model
    print('For rank %s, regularization parameter %s the RMSE is %s' % (rank, regParam, error))
    # A NaN error never compares less than min_error, so it can never win
    if error < min_error:
      min_error = error
      best_params = [i,j]

if best_params is None:
  # Every candidate produced a non-finite RMSE; fail with a clear message
  # instead of a NameError on best_params further down.
  raise ValueError('No parameter combination produced a finite RMSE on the validation set')

best_rank = ranks[best_params[1]]
als.setRegParam(regParams[best_params[0]])
als.setRank(best_rank)
print('The best model was trained with regularization parameter %s' % regParams[best_params[0]])
print('The best model was trained with rank %s' % best_rank)
my_model = models[best_params[0]][best_params[1]]

In [9]:
%spark.pyspark
# predicted_plays_df is reassigned on every pass of the tuning loop, so this
# previews the predictions of the last parameter pair tried.
predicted_plays_df.show(n=10)

In [10]:
%spark.pyspark
# Evaluate the selected model on the held-out test split.
# In ML Pipelines the transform step has a bug that produces unwanted NaN
# predictions, which have to be filtered out.
# See https://issues.apache.org/jira/browse/SPARK-14489

# Cast the label to double, then score the test set with the chosen model
test_play_count = test_df["playCount"].cast(DoubleType())
test_df = test_df.withColumn("playCount", test_play_count)
predict_df = my_model.transform(test_df)

# Drop rows whose prediction is NaN (due to SPARK-14489)
nan_value = float('nan')
predicted_test_df = predict_df.filter(predict_df["prediction"] != nan_value)

# Round predictions to whole numbers and take the absolute value
rounded_prediction = F.abs(F.round(predicted_test_df["prediction"], 0))
predicted_test_df = predicted_test_df.withColumn("prediction", rounded_prediction)

# Score predicted_test_df with the RMSE evaluator created earlier (reg_eval)
test_RMSE = reg_eval.evaluate(predicted_test_df)

print('The model had a RMSE on the test set of {0}'.format(test_RMSE))

In [11]:
%spark.pyspark
# Baseline: compare the model with the error on a test set where every
# prediction is the average number of plays from the training set.
# The play-count column is named 'playCount' in the schema; the previous
# reference to a 'Plays' column does not exist there and fails at analysis time.
avg_plays_df = training_df.groupBy().avg('playCount').select(F.round('avg(playCount)'))

avg_plays_df.show(3)
# Extract the rounded average play count. (This is row 0, column 0.)
training_avg_plays = avg_plays_df.collect()[0][0]

print('The average number of plays in the dataset is {0}'.format(training_avg_plays))

# Add a constant prediction column holding the training average
test_for_avg_df = test_df.withColumn('prediction', F.lit(training_avg_plays))

# Run the previously created RMSE evaluator, reg_eval, on the test_for_avg_df DataFrame
test_avg_RMSE = reg_eval.evaluate(test_for_avg_df)

print("The RMSE on the average set is {0}".format(test_avg_RMSE))