# Compare to Random Models

In [1]:
import math
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))
from src import EDA
from src import ModelEvaluation
%load_ext autoreload
%autoreload 2

## This notebook explores how evaluation metrics differ
### between randomly guessed ratings and ALS predictions

In [2]:
# Load the Steam data; 5 is presumably the minimum-activity threshold used to
# avoid cold-start users/games — TODO confirm against src/EDA.
raw_df = EDA.load_without_cold_start(5)
# Keep only 'play' rows. Take an explicit copy so the column assignment below
# writes to an independent frame instead of a filtered slice of raw_df
# (avoids pandas' SettingWithCopyWarning).
played_df = raw_df[raw_df['purchase_action'] == 'play'].copy()
played_df["playtime_rank"] = played_df['playtime'].map(EDA.rank_playtime)
# Map each game name to an integer id in the game_uid column
steam_df = EDA.get_uids(played_df, from_column='game_name', to_column='game_uid')
with_summaries_df = EDA.add_summaries(steam_df)
with_summaries_df.head(2)

  return filtered_users[steam_df['game_name'].isin(usable_games['game_name'].values)]
  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0,uid,game_name,purchase_action,playtime,playtime_rank,game_uid,playtime_mean,playtime_min,playtime_max,game_counts,min_max
1,151603712,The Elder Scrolls V Skyrim,play,273.0,3,0,105.72153,0.1,1986.0,562,0.412256
3,151603712,Fallout 4,play,87.0,3,1,65.274172,0.2,629.0,151,0.414122


# Random Model

In [3]:
# Observed range of the min_max rating; max_score is the upper bound used for the random guesses
min_max_scores = with_summaries_df['min_max']
min_score, max_score = min_max_scores.min(), min_max_scores.max()
def get_random_scores(n, max_score, min_score=0.0):
    """
    Draw uniformly distributed random scores to use as a baseline "model".

    Args:
        n: Number of scores to draw
        max_score: Exclusive upper bound of the scores
        min_score: Inclusive lower bound of the scores. The default of 0.0
            gives exactly the same values as scaling by max_score alone.
    Returns:
        numpy.ndarray of n floats in [min_score, max_score)
    """
    return min_score + np.random.random_sample(n) * (max_score - min_score)

In [4]:
# Spot-check: draw 100 random scores and show the first five
get_random_scores(100, max_score)[:5]

array([ 2.17048661,  0.20939214,  1.72070346,  2.94262984,  0.49252574])

In [5]:
# Attach one random guess per row as the baseline "prediction";
# assign() returns a new frame, so with_summaries_df itself is left untouched
random_scores = get_random_scores(len(with_summaries_df), max_score)
steam_df_random_predictions = with_summaries_df.assign(random_prediction=random_scores)

In [6]:
# Preview the first rows, including the random_prediction column
steam_df_random_predictions.head(10)

Unnamed: 0,uid,game_name,purchase_action,playtime,playtime_rank,game_uid,playtime_mean,playtime_min,playtime_max,game_counts,min_max,random_prediction
1,151603712,The Elder Scrolls V Skyrim,play,273.0,3,0,105.72153,0.1,1986.0,562,0.412256,2.088341
3,151603712,Fallout 4,play,87.0,3,1,65.274172,0.2,629.0,151,0.414122,0.200003
5,151603712,Spore,play,14.9,2,2,26.016667,0.1,417.0,54,0.1065,0.627496
7,151603712,Fallout New Vegas,play,12.1,2,3,52.247843,0.1,417.0,255,0.086352,1.654936
9,151603712,Left 4 Dead 2,play,8.9,2,4,43.288872,0.1,2710.0,665,0.009742,1.304388
11,151603712,HuniePop,play,8.5,2,5,20.82,0.8,70.0,20,0.333815,1.34987
13,151603712,Path of Exile,play,8.1,2,6,52.956395,0.1,1158.0,172,0.020727,0.933534
15,151603712,Poly Bridge,play,7.5,2,7,5.825,1.7,11.8,8,1.722772,2.033377
17,151603712,Left 4 Dead,play,3.3,1,8,46.823977,0.1,817.0,171,0.011752,0.444031
19,151603712,Team Fortress 2,play,2.8,1,9,96.972904,0.1,3503.0,1181,0.002312,2.324478


In [7]:
# Move the pandas frame into Spark and split it in half for training and testing.
# (The former bare `spark_df.count()` was dropped: its result was neither stored
# nor displayed, so it only cost an extra Spark job.)
spark = SparkSession.builder.getOrCreate()
spark_df = spark.createDataFrame(steam_df_random_predictions)
# Fixed seed so the split can be reproduced
train, test = spark_df.randomSplit([0.5, 0.5], seed=427471138)
print('Training size: ', train.count())
print('Test size: ', test.count())

Training size:  29069
Test size:  28720


In [8]:
# Fit ALS on the training half, using the min_max column as the rating
als_model = ALS(
    userCol='uid',
    itemCol='game_uid',
    ratingCol='min_max',
    rank=10,
    regParam=0.1,
    nonnegative=True,
    coldStartStrategy="drop",  # Drops if user or item in test was not in train
)
fitted_als_model = als_model.fit(train)

# Score the held-out half and measure RMSE against the actual min_max values
predictions = fitted_als_model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="min_max",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
rmse

0.7367555831067734

In [9]:
# Random RMSE
# Would need multiple random models to be confident since it will be... random

# The prediction count is less than the test item count (NaNs)
prediction_count = predictions.count()
predictions_rdd = predictions.rdd
# Squared error of the random guess for every predicted row, then summed
squared_errors = predictions_rdd.map(lambda r: (r['min_max'] - r['random_prediction'])**2)
random_SSE = squared_errors.reduce(lambda total, x: total + x)
random_rmse = math.sqrt(random_SSE / prediction_count)

In [10]:
# Relative increase of the random baseline's RMSE over the ALS RMSE, in percent
rmse_increase_pct = ((random_rmse / rmse) - 1) * 100
print('Random RMSE: ', random_rmse)
print('% Increase in Random Model RMSE: ', rmse_increase_pct)

Random RMSE:  1.5662927061617042
% Increase in Random Model RMSE:  112.59325915888057


In [11]:
# The large RMSE gap between the random baseline and ALS, together with the low ALS RMSE,
# suggests the model may be useful for predicting the number of hours played for a given game

In [12]:
# For NDCG Calculation
# fitted_als_model.recommendForAllUsers(n) may be an interesting alternative on train
# predictions.groupBy('uid') may be more efficient - df is more efficient than rdd

# Hand-written sample in the shape the NDCG pipeline builds per user:
# (uid, [(actual min_max, predicted value), ...])
test_point = (309404240,
  [(0.0006331785563528914, 0.000634816475212574),
   (0.42567567567567566, 0.008724773302674294)])

def sort_predictions_slice(arr, n):
    """
    Order (actual, prediction) pairs by prediction, highest first, and keep the top n.

    Args:
        arr: List of (actual, prediction) pairs
        n: Maximum number of pairs to return
    Returns:
        List of [actual, prediction] lists sorted by descending prediction;
        an empty list when arr is empty
    """
    actual_and_pred = np.array(arr)
    # An empty input becomes a 1-D array, which the column index below cannot handle
    if actual_and_pred.size == 0:
        return []
    # sort by predictions (ascending), then reverse so the highest prediction comes first
    indices = np.argsort(actual_and_pred[:, 1])
    return actual_and_pred[indices[::-1]][:n].tolist()

# lambda functions in RDDs can't import modules
def dcg_at_k(scores, k):
    """
    Discounted cumulative gain
    See http://fastml.com/evaluating-recommender-systems/
    Args:
        scores: List - Relevance scores in rank order
        k: Number of results to consider
    Returns:
        Float (0. when there are no scores to consider)
    """
    # np.asfarray was removed in NumPy 2.0; asarray with a float dtype is the
    # equivalent conversion and works on older versions too
    r = np.asarray(scores, dtype=float)[:k]
    if r.size:
        # item 1 and 2 have same weights: rank 1 is undiscounted and log2(2) == 1
        # for more emphasis on first rank, use instead:
        #   np.sum(r / np.log2(np.arange(2, r.size + 2)))
        return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
    return 0.

def ndcg_at_k(scores, k):
    """
    Normalized discounted cumulative gain: the DCG of the scores in the given
    order divided by the DCG of the same scores sorted in descending order.
    See http://fastml.com/evaluating-recommender-systems/
    Args:
        scores: List - Relevance scores in rank order
        k: Number of results to consider
    Returns:
        Float from 0 to 1 (0. when the ideal DCG is zero)
    """
    ideal_order = sorted(scores, reverse=True)
    best_possible = dcg_at_k(ideal_order, k)
    if best_possible:
        return dcg_at_k(scores, k) / best_possible
    return 0.

# Smoke check: order the sample pairs by prediction, highest first
x = sort_predictions_slice(test_point[1], 3)
print('x: ', x)

x:  [[0.42567567567567566, 0.008724773302674294], [0.0006331785563528914, 0.000634816475212574]]


In [13]:
# NDCG@3 of the ALS ranking; use actual values (min_max) for gain
# sample(withReplacement=False, fraction=1, seed=1): fraction 1 keeps the full data
sampled = predictions_rdd.sample(False, 1, 1)
# One NDCG value per user: collect the user's (actual, prediction) pairs,
# order them by prediction, and score the resulting order of actual values
per_user_ndcg = sampled.map(lambda row: (row['uid'], [(row['min_max'], row['prediction'])])) \
    .reduceByKey(lambda total, val: total + val) \
    .map(lambda kv: (kv[0], sort_predictions_slice(kv[1], 1000))) \
    .map(lambda kv: ndcg_at_k(np.array(kv[1])[:, 0], 3))
# Sum the per-user values and count the users in a single pass
ndcg, ndcg_user_count = per_user_ndcg.map(lambda gain: (gain, 1)) \
    .reduce(lambda total, pair: (total[0] + pair[0], total[1] + pair[1]))
# ndcg is a sum with one term per user, so the average divides by the number
# of users (dividing by the number of predictions understated it)
average_ndcg = ndcg / ndcg_user_count
ndcg

1573.6568822738438

In [14]:
# NDCG@3 of the random ranking; use actual values (min_max) for gain
# sample(withReplacement=False, fraction=1, seed=1): fraction 1 keeps the full data
sampled = predictions_rdd.sample(False, 1, 1)
# One NDCG value per user: collect the user's (actual, random guess) pairs,
# order them by the random guess, and score the resulting order of actual values
per_user_random_ndcg = sampled.map(lambda row: (row['uid'], [(row['min_max'], row['random_prediction'])])) \
    .reduceByKey(lambda total, val: total + val) \
    .map(lambda kv: (kv[0], sort_predictions_slice(kv[1], 1000))) \
    .map(lambda kv: ndcg_at_k(np.array(kv[1])[:, 0], 3))
# Sum the per-user values and count the users in a single pass
random_ndcg, random_ndcg_user_count = per_user_random_ndcg.map(lambda gain: (gain, 1)) \
    .reduce(lambda total, pair: (total[0] + pair[0], total[1] + pair[1]))
# random_ndcg is a sum with one term per user, so the average divides by the
# number of users (dividing by the number of predictions understated it)
average_random_dcg = random_ndcg / random_ndcg_user_count
random_ndcg

1301.6262041264006

In [15]:
# Percentage by which the summed per-user NDCG of ALS exceeds that of the random guesses
# Observed results:
# 7.4% with 80/20 split
# 20% with 50/50 split
print('Increase in average cumulative gain: ', (ndcg / random_ndcg - 1) * 100)

Increase in average cumulative gain:  20.8992933059


## With a small number of test items, their order can be guessed randomly
More data is needed.

In [17]:
# Count how many predictions each user (uid) has
sampled = predictions_rdd.sample(False, 1, 1)
one_per_prediction = sampled.map(lambda row: (row['uid'], 1))
average_value_count = one_per_prediction.reduceByKey(lambda total, val: total + val)
average_value_count.take(2)

[(64350600, 3), (163432200, 38)]

In [24]:
# Median number of predictions per user.
# relativeError=0.0 makes approxQuantile compute the exact quantile; the former
# 0.25 allowed the result to lie anywhere between the 25th and 75th percentile,
# which is too loose to call a median. Exact computation can be expensive on
# large data, so raise the error slightly (e.g. 0.01) if this becomes slow.
med = predictions.groupby('uid').count().approxQuantile("count", [0.5], 0.0)
med

[6.0]

In [125]:
# Number of distinct users (uid) that received at least one prediction
unique_users_in_predictions = predictions.select('uid').distinct().count()

In [126]:
# Mean number of predictions per user: total predictions / distinct users
print("Average number of predictions per user: ", prediction_count / unique_users_in_predictions)

Average number of predictions per user:  5.345505617977528


# MAP will weight the first rank higher
Skewed by uninformative data (e.g. a user with only 1 recommendation).  
Imagine 1000 users with 1 recommendation each and 1 user with many items that are ranked well.   
Really anything that cannot be predicted well will improve real vs random recommendation