# Finding Ideal Train Test Split

In [1]:
import math
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

import os, sys
# Make the project root importable so `from src import ...` works.
# Note that '__file__' is a string literal here, not the module attribute:
# abspath() resolves it against the current working directory, so the two
# dirname() calls yield the parent of the working directory.
parent_folder = os.path.dirname(os.path.dirname(os.path.abspath('__file__')))
sys.path.append(parent_folder)
from src import EDA, ModelEvaluation, Preprocess, Split
%load_ext autoreload
%autoreload 2

# Get (or start) a Spark session, then derive a new session from it.
# `spark_session` is the one the rest of the notebook uses.
spark_sess = SparkSession.builder.getOrCreate()
spark_session = spark_sess.newSession()

# This notebook explores different ways of splitting the data into train and test sets
### Ultimately, k-fold cross-validation was chosen

In [3]:
# Load the Steam play data and run the ALS preprocessing steps in order.
# NOTE(review): the EDA/Preprocess helpers live in `src` and are not visible
# here. Presumably the loader drops users with fewer than 5 games played, and
# judging by the displayed columns the processing adds `playtime_min_max` and
# `game_uid` -- confirm against src.
steam_df = EDA.load_without_cold_start(min_games_played=5)
preprocessor = Preprocess.PandasALSPreprocessor(steam_df)
preprocessor.process_general()
preprocessor.process_buckets()
preprocessor.process_min_max()
preprocessor.keep_standard_columns()
# From here on `steam_df` is the preprocessed frame, not the raw load.
steam_df = preprocessor.get_df()
steam_df.head(2)

Unnamed: 0,uid,playtime,playtime_min_max,game_name,game_uid
1,151603712,273.0,1.412256,The Elder Scrolls V Skyrim,0
3,151603712,87.0,1.414122,Fallout 4,1


In [4]:
# Range of the scaled playtime score (the column ALS is trained on).
# Only max_score feeds the random baseline in this cell; min_score is not used by it.
min_score = steam_df['playtime_min_max'].min()
max_score = steam_df['playtime_min_max'].max()
def get_random_scores(n, max_score, seed=None):
    """
    Random baseline "predictions": n uniform draws from [0, 1) scaled by max_score.

    Args:
        n: Number of scores to draw
        max_score: Factor the uniform draws are multiplied by
        seed: Optional int. When given, the scores come from a dedicated
            generator seeded with it, so the baseline is reproducible and
            numpy's global random state is left untouched. When None (the
            default), numpy's global random state is used, exactly as before.
    Returns:
        numpy array of n floats
    """
    sampler = np.random if seed is None else np.random.RandomState(seed)
    return sampler.random_sample(n) * max_score

# Attach one random baseline score to every row of a copy of the data, so the
# baseline goes through exactly the same splits as the real scores.
steam_df_random_predictions = steam_df.assign(
    random_prediction=get_random_scores(len(steam_df), max_score)
)
steam_df_random_predictions.head(2)

Unnamed: 0,uid,playtime,playtime_min_max,game_name,game_uid,random_prediction
1,151603712,273.0,1.412256,The Elder Scrolls V Skyrim,0,3.274247
3,151603712,87.0,1.414122,Fallout 4,1,1.967614


## Recommender-Specific Split

In [25]:
# Recommender-specific split: both fractions are handed to the project's
# Split.PandasTrainTest (implementation not shown in this notebook).
pandas_train_test = Split.PandasTrainTest(steam_df_random_predictions, seed=1)
custom_train_df, custom_test_df = pandas_train_test.train_test_split(
    game_split_train=0.3,
    user_split_train=0.7,
)
# (train rows, test rows)
len(custom_train_df), len(custom_test_df)

(46520, 11269)

In [None]:
# Convert both pandas frames into Spark DataFrames for ALS.
custom_train_spark_df, custom_test_spark_df = (
    spark_session.createDataFrame(pandas_df)
    for pandas_df in (custom_train_df, custom_test_df)
)
custom_train_spark_df.count()

### RMSE

In [59]:
# ALS on the scaled playtime score.
als_model = ALS(
    userCol='uid',
    itemCol='game_uid',
    ratingCol='playtime_min_max',
    rank=10,
    regParam=0.1,
    nonnegative=True,
    # Drops if user or item in test was not in train
    coldStartStrategy="drop",
)

fitted_als_model = als_model.fit(custom_train_spark_df)
predictions = fitted_als_model.transform(custom_test_spark_df)

# RMSE between the held-out score and the ALS prediction.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="playtime_min_max",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
rmse
# compare to 0.7685464618302161 for 80/20 split
# graph with 80/20, 70/30, custom

0.7494777490045165

In [49]:
# Random RMSE: error of the random baseline over the same rows ALS scored.
# Open question: a single random draw is itself random, so several draws might
# be needed to be confident -- or maybe not, given how many rows of random
# data there are.

predictions_rdd = predictions.rdd
# less than the test item count: rows that would have NaN predictions are gone
prediction_count = predictions.count()
random_SSE = (
    predictions_rdd
    .map(lambda row: (row['playtime_min_max'] - row['random_prediction']) ** 2)
    .reduce(lambda running, squared_error: running + squared_error)
)
random_rmse = math.sqrt(random_SSE / prediction_count)

In [50]:
# Report the random baseline's RMSE and how much larger it is than the ALS
# RMSE, as a percentage of the ALS RMSE.
print('Random RMSE: ', random_rmse)
print('% Increase in Random Model RMSError: ', ((random_rmse / rmse) - 1) * 100)
# For reference: the base 80/20 split gave 104.35477288510509

Random RMSE:  1.581882058073222
% Increase in Random Model RMSError:  111.06457932531492


In [93]:
def sort_predictions_slice(arr, n):
    """
    Order (actual, prediction) pairs by prediction, highest first, and keep
    the first n of them.
    Args:
        arr: List of (actual, prediction) pairs
        n: Number of pairs to keep
    Returns:
        List of [actual, prediction] lists
    """
    pairs = np.array(arr)
    # argsort is ascending, so read it backwards for descending predictions
    descending = np.argsort(pairs[:, 1])[::-1]
    return pairs[descending[:n]].tolist()

def dcg_at_k(scores, k):
    """
    Discounted cumulative gain
    See http://fastml.com/evaluating-recommender-systems/

    The score at 1-based rank i is divided by log2(i + 1), which puts the most
    emphasis on the first rank. (The other common form,
    r[0] + sum(r[1:] / log2(rank)), gives ranks 1 and 2 the same weight and is
    intentionally not used.)
    Args:
        scores: List - Relevance scores in rank order
        k: Number of results to consider
    Returns:
        Float (0. when there are no scores to consider)
    """
    # np.asfarray was removed in NumPy 2.0; asarray with dtype=float is the
    # equivalent conversion and works on old and new NumPy alike.
    r = np.asarray(scores, dtype=float)[:k]
    if r.size:
        # discounts are log2(2), log2(3), ..., log2(r.size + 1)
        return np.sum(r / np.log2(np.arange(2, r.size + 2)))
    return 0.

def ndcg_at_k(scores, k):
    """
    Normalized discounted cumulative gain: the DCG of the scores in the given
    order divided by the DCG of the same scores sorted in descending order.
    See http://fastml.com/evaluating-recommender-systems/
    Args:
        scores: List - Relevance scores in rank order
        k: Number of results to consider
    Returns:
        Float from 0 to 1
    """
    ideal_dcg = dcg_at_k(sorted(scores, reverse=True), k)
    # with a zero ideal DCG there is nothing to normalize against
    return dcg_at_k(scores, k) / ideal_dcg if ideal_dcg else 0.

## NDCG

In [52]:
# Gain comes from the actual scores; the ranking comes from the ALS predictions.
sampled = predictions_rdd.sample(withReplacement=False, fraction=1, seed=1)
ndcg = (
    sampled
    # one (actual, prediction) pair per row, keyed by user
    .map(lambda row: (row['uid'], [(row['playtime_min_max'], row['prediction'])]))
    # gather each user's pairs into a single list
    .reduceByKey(lambda pairs, more_pairs: pairs + more_pairs)
    # rank each user's pairs by prediction, keeping at most 1000
    .mapValues(lambda pairs: sort_predictions_slice(pairs, 1000))
    # NDCG@3 of the actual scores in predicted order
    .values()
    .map(lambda ranked: ndcg_at_k(np.array(ranked)[:, 0], 3))
    .reduce(lambda running, user_ndcg: running + user_ndcg)
)
# `ndcg` is the sum over users; it is divided by the number of predicted rows
average_ndcg = ndcg / prediction_count
ndcg

# Calling other files with Spark may not be worth it 

# ndcg, average_ndcg = ModelEvaluation.spark_ndcg_at_k(predictions_rdd, 3)
# ndcg

384.57723404575029

In [53]:
# Gain comes from the actual scores; the ranking comes from the random baseline.
# Evaluating more than half of the games doesn't seem useful - it ends up
# rating everything - so NDCG needs a cutoff.
sampled = predictions_rdd.sample(withReplacement=False, fraction=1, seed=1)
random_ndcg = (
    sampled
    # one (actual, random prediction) pair per row, keyed by user
    .map(lambda row: (row['uid'], [(row['playtime_min_max'], row['random_prediction'])]))
    # gather each user's pairs into a single list
    .reduceByKey(lambda pairs, more_pairs: pairs + more_pairs)
    # rank each user's pairs by the random score, keeping at most 1000
    .mapValues(lambda pairs: sort_predictions_slice(pairs, 1000))
    # NDCG@3 of the actual scores in that order
    .values()
    .map(lambda ranked: ndcg_at_k(np.array(ranked)[:, 0], 3))
    .reduce(lambda running, user_ndcg: running + user_ndcg)
)
# `random_ndcg` is the sum over users; it is divided by the number of predicted rows
average_random_dcg = random_ndcg / prediction_count
random_ndcg

284.63854600419319

In [54]:
# Percent increase of the model's summed NDCG over the random baseline's.
# Recorded values for other split settings (presumably from earlier runs):
#   7.4% with 80/20 split
#   20% with 50/50 split
#   7.4763370166 with .8 and .75
#   11.2840101805 with .9 and .5
#   35.9 with .7 and .3 (method 2)
# The increase compared to random seems to be driven entirely by the games split.
print('Increase in average normalized cumulative gain: ', (ndcg / random_ndcg - 1) * 100)

Increase in average cumulative gain:  35.1107358594


In [55]:
# Recorded values for other split settings:
#   0.056835339579378932 with 50/50
#   0.14969597434654067 with 80/20
#   0.064924338587312933 with .8 and .5
# The increase in NDCG is always related to the games split as well.
# Holding out 75% of games increases both the random and the model NDCG because
# there is less to get wrong: in both cases the ranking is more likely to match
# the ideal one.
average_ndcg

0.034768758163434617

In [56]:
# Despite the "dcg" in its name, this holds the random baseline's summed NDCG
# divided by prediction_count.
average_random_dcg

0.025733527348720114

In [57]:
# Number of rows in the predictions RDD.
predictions_rdd.count()

11061

# K Folds

In [130]:
# Build 5 folds with game_split_train=.3 and show each fold's (train, test) row counts.
pandas_train_test = Split.PandasTrainTest(steam_df_random_predictions, seed=1)
k_fold_dfs = pandas_train_test.get_k_folds(5, game_split_train=.3)
for i, d in enumerate(k_fold_dfs):
    # d is one fold: d[0] is the train frame, d[1] the test frame
    printed = 'train test split size {}: '.format(i)
    print(printed, (len(d[0]), len(d[1])))

Number of users:  2436
train test split size 0:  (49573, 8170)
train test split size 1:  (49130, 8613)
train test split size 2:  (49552, 8191)
train test split size 3:  (49453, 8290)
train test split size 4:  (50201, 7542)


In [131]:
# Convert every fold's (train, test) pandas pair into a pair of Spark DataFrames.
sparkified_k_fold_dfs = [
    (
        spark_session.createDataFrame(train_pandas_df),
        spark_session.createDataFrame(test_pandas_df),
    )
    for train_pandas_df, test_pandas_df in k_fold_dfs
]

In [132]:
# ALS estimator that is fit on each fold in the cells that follow.
als_model = ALS(
    userCol='uid',
    itemCol='game_uid',
    ratingCol='playtime_min_max',
    rank=10,
    regParam=0.1,
    nonnegative=True,
    # Drops if user or item in test was not in train
    coldStartStrategy="drop",
)

In [133]:
# Fit and score ALS on every fold, then average the per-fold RMSE.
# The evaluator does not depend on the fold, so it is built once up front
# instead of once per iteration.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="playtime_min_max",
    predictionCol="prediction",
)
rmses = []
# NOTE: this loop rebinds the notebook-level `fitted_als_model`, `predictions`
# and `rmse`; after it runs they refer to the last fold.
for train, test in sparkified_k_fold_dfs:
    fitted_als_model = als_model.fit(train)
    predictions = fitted_als_model.transform(test)
    rmse = evaluator.evaluate(predictions)
    print(rmse)
    rmses.append(rmse)
print('mean rmse: ', sum(rmses) / len(rmses))

0.7598384142749224
0.7994124502969006
0.7217479391715643
0.7309140997686916
0.7614382143765143
mean rmse:  0.7546702235777187


In [150]:
def get_rdd_average_ndcg(rdd, top_n=3, label_col='playtime_min_max', prediction_col='prediction', id_col='uid'):
    """
    Sum of the per-id NDCG@top_n over an RDD of rows, divided by the row count.

    Rows are grouped by id_col, each group is ordered by prediction_col
    (highest first, at most 1000 rows kept), and ndcg_at_k is computed on the
    label_col values in that order.
    Args:
        rdd: RDD of rows that can be indexed by id_col, label_col and prediction_col
        top_n: Number of top-ranked rows per id that NDCG considers
        label_col: Name of the column used as the gain
        prediction_col: Name of the column used to rank each id's rows
        id_col: Name of the column rows are grouped by
    Returns:
        Float
    """
    count = rdd.count()
    # Work on the rdd that was passed in. This previously read the
    # notebook-level `sampled` variable, so the argument was ignored for
    # everything but the count.
    ndcg = (
        rdd
        .map(lambda row: (row[id_col], [(row[label_col], row[prediction_col])]))
        .reduceByKey(lambda total, val: total + val)
        .map(lambda kv: (kv[0], sort_predictions_slice(kv[1], 1000)))
        .map(lambda kv: ndcg_at_k(np.array(kv[1])[:, 0], top_n))
        .reduce(lambda total, gain: total + gain)
    )
    # NOTE(review): the denominator is the number of rows, not the number of
    # distinct ids, so this is not a per-id mean. It is kept as is so results
    # stay comparable with the figures already recorded in this notebook.
    average_ndcg = ndcg / count
    return average_ndcg

In [155]:
# Per-fold NDCG of the ALS predictions, followed by the mean over folds.
prediction_ndcgs = []
for train, test in sparkified_k_fold_dfs:
    fitted_als_model = als_model.fit(train)
    predictions_rdd = fitted_als_model.transform(test).rdd
    # sample(False, 1, 1): withReplacement=False, fraction=1, seed=1
    # `sampled` and `predictions_rdd` are notebook-level names shared with other cells.
    sampled = predictions_rdd.sample(False, 1, 1)
    average_ndcg = get_rdd_average_ndcg(sampled, prediction_col='prediction')
    print(average_ndcg)
    prediction_ndcgs.append(average_ndcg)
mean_prediction_ndcg = sum(prediction_ndcgs) / len(prediction_ndcgs)
mean_prediction_ndcg

0.0321947145581
0.030551569029
0.0311043463975
0.031237701536
0.0341942590432


0.031856518112747832

In [156]:
# Per-fold NDCG of the random baseline, followed by the mean over folds.
random_ndcgs = []
for train, test in sparkified_k_fold_dfs:
    # ALS is fit and applied here as well, although only the random_prediction
    # column is ranked; the transformed test rows are what get evaluated.
    fitted_als_model = als_model.fit(train)
    predictions_rdd = fitted_als_model.transform(test).rdd
    # sample(False, 1, 1): withReplacement=False, fraction=1, seed=1
    # `sampled` and `predictions_rdd` are notebook-level names shared with other cells.
    sampled = predictions_rdd.sample(False, 1, 1)
    average_ndcg = get_rdd_average_ndcg(sampled, prediction_col='random_prediction')
    print(average_ndcg)
    random_ndcgs.append(average_ndcg)
mean_random_ndcg = sum(random_ndcgs) / len(random_ndcgs)
mean_random_ndcg

0.0236656948581
0.0219363300881
0.0222226638968
0.0238626922271
0.0256958719646


0.023476650606948018

In [157]:
# Percent increase of the mean k-fold model NDCG over the mean k-fold random NDCG.
print('Increase: ', (mean_prediction_ndcg / mean_random_ndcg - 1) * 100)

Increase:  35.6944763804
