# Alternative Train Test Split

In [1]:
import math
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

spark_sess = SparkSession.builder.getOrCreate()
spark_session = spark_sess.newSession()
path = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
# spark_context = spark_session.sparkContext
# spark_context.addFile(path + '/src/', recursive=True)

import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))
from src import EDA
from src import ModelEvaluation
%load_ext autoreload
%autoreload 2

In [2]:
steam_df = EDA.load_without_cold_start(5)
steam_df = steam_df[steam_df['purchase_action'] == 'play']
steam_df["playtime_rank"] = steam_df['playtime'].map(lambda value: EDA.rank_playtime(value))
steam_df = EDA.get_uids(steam_df, from_column='game_name', to_column='game_uid')
with_summaries_df = EDA.add_summaries(steam_df)
with_summaries_df.head(2)

  return filtered_users[steam_df['game_name'].isin(usable_games['game_name'].values)]
  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0,uid,game_name,purchase_action,playtime,playtime_rank,game_uid,playtime_mean,playtime_min,playtime_max,game_counts,min_max
1,151603712,The Elder Scrolls V Skyrim,play,273.0,3,0,105.72153,0.1,1986.0,562,0.412256
3,151603712,Fallout 4,play,87.0,3,1,65.274172,0.2,629.0,151,0.414122


In [3]:
min_score = with_summaries_df['min_max'].min()
max_score = with_summaries_df['min_max'].max()
def get_random_scores(n, max_score):
    return np.random.random_sample(n, ) * max_score

In [4]:
steam_df_random_predictions = with_summaries_df.copy()
steam_df_random_predictions['random_prediction'] = get_random_scores(steam_df_random_predictions.shape[0], max_score)

In [67]:
# split
train_df, test_df = EDA.recommender_train_test_split(
    steam_df_random_predictions,
    user_split_train=.7,
    game_split_train=.3,
    seed=1)

In [68]:
train_spark_df = spark_session.createDataFrame(train_df)
test_spark_df = spark_session.createDataFrame(test_df)
train_spark_df.count()

46520

In [69]:
als_model = ALS(
    itemCol='game_uid',
    userCol='uid',
    ratingCol='min_max',
    nonnegative=True,    
    regParam=0.1,
    coldStartStrategy="drop", # Drops if user or item in test was not in train
    rank=10) 

fitted_als_model = als_model.fit(train_spark_df)

predictions = fitted_als_model.transform(test_spark_df)
evaluator = RegressionEvaluator() \
    .setMetricName("rmse") \
    .setLabelCol("min_max") \
    .setPredictionCol("prediction")
rmse = evaluator.evaluate(predictions)
rmse
# compare to 0.7685464618302161 for 80/20 split

0.7440154765345693

In [70]:
# Random RMSE
# Would need multiple random models to be confident since it will be... random

prediction_count = predictions.count() # less than test item count (NaNs)
predictions_rdd = predictions.rdd
random_SSE = predictions_rdd.map(lambda r: (r['min_max'] - r['random_prediction'])**2) \
    .reduce(lambda total, x: total + x)
random_rmse = math.sqrt(random_SSE / prediction_count)

In [71]:
print('Random RMSE: ', random_rmse)
print('% Increase in Random Model RMSError: ', ((random_rmse / rmse) - 1) * 100)
# base 80/20 split is 104.35477288510509

Random RMSE:  1.576392282698209
% Increase in Random Model RMSError:  111.87627575176724


In [72]:
def sort_predictions_slice(arr, n):
    actual_and_pred = np.array(arr)
    # sort by predictions
    indeces = np.argsort(actual_and_pred[:, 1])
    return actual_and_pred[indeces[::-1]][:n].tolist()

def dcg_at_k(scores, k):
    """
    Discounted cumulative gain
    See http://fastml.com/evaluating-recommender-systems/
    Args:
        r: List - Relevance scores in rank order
        k: Number of results to consider
    Returns:
        Float
    """
    r = np.asfarray(scores)[:k]
    if r.size:
        # item 1 and 2 have same weights
#         return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        # use below for more emphasis on first rank
        return np.sum(r / np.log2(np.arange(2, r.size + 2)))
    return 0.

def ndcg_at_k(scores, k):
    """
    Normalized Discounted cumulative gain
    See http://fastml.com/evaluating-recommender-systems/
    Args:
        r: List - Relevance scores in rank order
        k: Number of results to consider
    Returns:
        Float from 0 to 1
    """
    dcg_max = dcg_at_k(sorted(scores, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_at_k(scores, k) / dcg_max

In [73]:
# use actual values for gain
sampled = predictions_rdd.sample(False, 1, 1)
ndcg = sampled.map(lambda row: (row['uid'], [(row['min_max'], row['prediction'])])) \
    .reduceByKey(lambda total, val: total + val) \
    .map(lambda kv: (kv[0], sort_predictions_slice(kv[1], 1000))) \
    .map(lambda kv: ndcg_at_k(np.array(kv[1])[:, 0], 3)) \
    .reduce(lambda total, gain: total + gain) 
average_ndcg = ndcg / prediction_count
ndcg

# Calling other files with Spark may not be worth it 

# ndcg, average_ndcg = ModelEvaluation.spark_ndcg_at_k(predictions_rdd, 3)
# ndcg

387.31994747111906

In [74]:
# use actual values for gain
# evaluating more than half of the games doesnt seem to be useful - ends up rating everything
# need a cutoff for ndcg
sampled = predictions_rdd.sample(False, 1, 1)
random_ndcg = sampled.map(lambda row: (row['uid'], [(row['min_max'], row['random_prediction'])])) \
    .reduceByKey(lambda total, val: total + val) \
    .map(lambda kv: (kv[0], sort_predictions_slice(kv[1], 1000))) \
    .map(lambda kv: ndcg_at_k(np.array(kv[1])[:, 0], 3)) \
    .reduce(lambda total, gain: total + gain) 
average_random_dcg = random_ndcg / prediction_count
random_ndcg

284.94678683327874

In [75]:
# 7.4% with 80/20 split
# 20% with 50/50 split
# 7.4763370166 with .8 and .75 
# 11.2840101805 with .9 and .5
# 35.9 with .7 and .3 (method 2)
# increase compared to random seems to be completely based on games split
print('Increase in average cumulative gain: ', (ndcg / random_ndcg - 1) * 100)

Increase in average cumulative gain:  35.927115296


In [76]:
# 0.056835339579378932 with 50/50
# 0.14969597434654067 with 80/20
# 0.064924338587312933 with .8 and .5
# increase in ndcg is always related to the games split as well.  
# Holding out 75% of games increases random and model ndcg because there is less to get wrong
# In both cases it is more likely to be the same as ideal
average_ndcg

0.035016720682679602

In [77]:
average_random_dcg

0.025761394705115157

In [29]:
predictions_rdd.count()

19840