# Initial Model

## Considering Normalization
### Comparing to min / max normalization to bucketization

In [2]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))
from src import EDA, ModelEvaluation, Split
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


![caption](../figures/user_dropoff_5.png)

In [8]:
steam_df = EDA.load_without_cold_start(5)
steam_df.head(2)

Unnamed: 0,uid,game_name,playtime
1,151603712,The Elder Scrolls V Skyrim,273.0
3,151603712,Fallout 4,87.0


## Some normalization can be done for played time

In [9]:
steam_df["playtime_rank"] = steam_df['playtime'].map(lambda value: EDA.rank_playtime(value))
steam_df.head(2)

Unnamed: 0,uid,game_name,playtime,playtime_rank
1,151603712,The Elder Scrolls V Skyrim,273.0,3
3,151603712,Fallout 4,87.0,3


In [51]:
with_summaries_df = EDA.add_summaries(steam_df)
with_summaries_df.head(2)

Unnamed: 0,uid,game_name,playtime,playtime_rank,game_uid,playtime_mean,playtime_min,playtime_max,game_counts,min_max
1,151603712,The Elder Scrolls V Skyrim,273.0,3,0,105.72153,0.1,1986.0,562,0.412256
3,151603712,Fallout 4,87.0,3,1,65.274172,0.2,629.0,151,0.414122


### Game names need to be changed to IDs for Spark ML model

In [52]:
# fitting ALS must have numbers for itemCol and userCol
steam_df = EDA.get_uids(with_summaries_df, from_column='game_name', to_column='game_uid')
steam_df['game_uid'].value_counts().size == steam_df['game_name'].value_counts().size

True

### User and Item Counts

In [53]:
print('Number of users: ', steam_df['uid'].value_counts().size)
print('Number of games: ', steam_df['game_name'].value_counts().size)

Number of users:  2436
Number of games:  3544


In [119]:
# delete?
with_summaries_df.mean()

uid              9.276618e+07
playtime         4.058039e+01
playtime_rank    1.455035e+00
game_uid         6.333881e+02
playtime_mean    4.058039e+01
playtime_min     4.687449e-01
playtime_max     8.067901e+02
game_counts      1.681010e+02
min_max          4.541064e-01
dtype: float64

##### We will use Co-clustering instead of relying only on user-user or item-item similarity

# Spark ALS Model Building

### Simple train test split

In [71]:
spark = SparkSession.builder.getOrCreate()
spark_df = spark.createDataFrame(steam_df)
spark_df.count()
train, test = spark_df.randomSplit([0.8, 0.2], seed=427471138)

print('Training size: ', train.count())
print('Test size: ', test.count())

Training size:  46128
Test size:  11661


In [72]:
als_rank_model = ALS(
    itemCol='game_uid',
    userCol='uid',
    ratingCol='playtime_rank',
    nonnegative=True,    
    regParam=0.1,
    coldStartStrategy="drop", # Drops if user or item in test was not in train
    rank=10) 
fitted_als_rank_model = als_rank_model.fit(train)

In [73]:
# one_row_pandas_df = pd.DataFrame({'uid': [151603712], 'game_uid': [1]})
# one_row_spark_df = spark.createDataFrame(one_row_pandas_df)
# fitted_als_rank_model.transform(one_row_spark_df).show()
# steam_df[steam_df['uid'] == 151603712]

## RMSE

In [82]:
rank_predictions = fitted_als_rank_model.transform(test)
evaluator = RegressionEvaluator() \
    .setMetricName("rmse") \
    .setLabelCol("playtime_rank") \
    .setPredictionCol("prediction")
rmse = evaluator.evaluate(rank_predictions)

In [83]:
rmse
# was 1.046 without restricting to 5+
# was 1.005 with 5+, without min_max hours played
# was 1.015 with 5+, 2+ users, without normalizing hours played
# was 1.03 using 1-4 instead of 0-3

1.0060683366599708

In [78]:
with_summaries_df.head(2)
# TODO drop some columns here

Unnamed: 0,uid,game_name,playtime,playtime_rank,game_uid,playtime_mean,playtime_min,playtime_max,game_counts,min_max
1,151603712,The Elder Scrolls V Skyrim,273.0,3,0,105.72153,0.1,1986.0,562,0.412256
3,151603712,Fallout 4,87.0,3,1,65.274172,0.2,629.0,151,0.414122


In [61]:
# spark_df = spark.createDataFrame(with_summaries_df)
# spark_df.count()
# # train, test = spark_df.randomSplit([0.8, 0.2], seed=427471138)
# print('Training size: ', train.count())
# print('Test size: ', test.count())

In [113]:
als_min_max_model = ALS(
    itemCol='game_uid',
    userCol='uid',
    ratingCol='min_max',
    nonnegative=True,    
    regParam=0.1,
    coldStartStrategy="drop", # Drops if user or item in test was not in train
    rank=10) 

fitted_als_min_max_model = als_min_max_model.fit(train)

min_max_predictions = fitted_als_min_max_model.transform(test)
evaluator = RegressionEvaluator() \
    .setMetricName("rmse") \
    .setLabelCol("min_max") \
    .setPredictionCol("prediction")
min_max_rmse = evaluator.evaluate(min_max_predictions)
min_max_rmse

0.7654772700522948

## Normalizing makes a  difference on predictions

In [64]:
# make sure we are using training point example

one_row_pandas_df = pd.DataFrame({'uid': [151603712], 'game_uid': [1]})
one_row_spark_df = spark.createDataFrame(one_row_pandas_df)
fitted_als_rank_model.transform(one_row_spark_df).show()
with_summaries_df[(with_summaries_df['uid'] == 151603712) & (with_summaries_df['game_uid'] == 1)]

+--------+---------+-----------+
|game_uid|      uid| prediction|
+--------+---------+-----------+
|       1|151603712|0.041270062|
+--------+---------+-----------+



Unnamed: 0,uid,game_name,playtime,playtime_rank,game_uid,playtime_mean,playtime_min,playtime_max,game_counts,min_max
3,151603712,Fallout 4,87.0,3,1,65.274172,0.2,629.0,151,0.414122


# NDCG

In [103]:
rank_prediction_count = rank_predictions.count()
min_max_prediction_count = min_max_predictions.count()
rank_predictions_rdd = rank_predictions.rdd
min_max_predictions_rdd = min_max_predictions.rdd
print(rank_prediction_count)
print(min_max_prediction_count)

11418
11418


In [66]:
# For NDCG Calculation
# fitted_als_rank_model.recommendForAllUsers(n) may be interesting to compare
# predictions.groupBy('user_id') may be more efficient - df is more efficient than rdd

test = (309404240,
  [(0.0006331785563528914, 0.000634816475212574),
   (0.42567567567567566, 0.008724773302674294)])

def do_sort(arr):
    actual_and_pred = np.array(arr)
    indeces = np.argsort(actual_and_pred[:, 1])
    return actual_and_pred[indeces[::-1]].tolist()
    
def sort_predictions_slice_relevance(arr, n):
    actual_and_pred = np.array(arr)
    indeces = np.argsort(actual_and_pred[:, 1])
    return actual_and_pred[indeces[::-1]][:n].tolist()

# lambda functions in rdds cant import modules
def dcg_at_k(scores, k):
    """
    Discounted cumulative gain
    See http://fastml.com/evaluating-recommender-systems/
    Args:
        r: List - Relevance scores in rank order
        k: Number of results to consider
    Returns:
        Float
    """
    r = np.asfarray(scores)[:k]
    if r.size:
        # item 1 and 2 have same weights
        return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        # use below for more emphasis on first rank
        # return np.sum(r / np.log2(np.arange(2, r.size + 2)))
    return 0.

def ndcg_at_k(scores, k, method=0):
    """
    Normalized Discounted cumulative gain
    See http://fastml.com/evaluating-recommender-systems/
    Args:
        r: List - Relevance scores in rank order
        k: Number of results to consider
    Returns:
        Float from 0 to 1
    """
    dcg_max = dcg_at_k(sorted(scores, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_at_k(scores, k) / dcg_max

do_sort(test[1])
x = sort_predictions_slice_relevance(test[1], 3)
print('x: ', x)

x:  [[0.42567567567567566, 0.008724773302674294], [0.0006331785563528914, 0.000634816475212574]]


In [110]:
# use actual values for gain
rank_sampled = rank_predictions_rdd.sample(False, 1, 1)
rank_ndcg = rank_sampled.map(lambda row: (row['uid'], [(row['playtime_rank'], row['prediction'])])) \
    .reduceByKey(lambda total, val: total + val) \
    .map(lambda kv: (kv[0], sort_predictions_slice_relevance(kv[1], 1000))) \
    .map(lambda kv: ndcg_at_k(np.array(kv[1])[:, 0], 5)) \
    .reduce(lambda total, gain: total + gain)
rank_ndcg / prediction_count

0.15631623239587861

### NDCGain is higher on min_max splits

In [111]:
# use actual values for gain
min_max_sampled = min_max_predictions_rdd.sample(False, 1, 1)
min_max_ndcg = min_max_sampled.map(lambda row: (row['uid'], [(row['min_max'], row['prediction'])])) \
    .reduceByKey(lambda total, val: total + val) \
    .map(lambda kv: (kv[0], sort_predictions_slice_relevance(kv[1], 1000))) \
    .map(lambda kv: ndcg_at_k(np.array(kv[1])[:, 0], 5)) \
    .reduce(lambda total, gain: total + gain)
min_max_ndcg / prediction_count

0.15707517048126932

In [112]:
min_max_ndcg / rank_ndcg

1.0048551457117305

In [117]:
rank_predictions_rdd.take(40)

[Row(uid=61632730, game_name='H1Z1', playtime=418.0, playtime_rank=3, game_uid=463, playtime_mean=54.08468468468469, playtime_min=0.1, playtime_max=602.0, game_counts=111, min_max=2.082904136899817, prediction=1.8934279680252075),
 Row(uid=18074031, game_name='H1Z1', playtime=70.0, playtime_rank=3, game_uid=463, playtime_mean=54.08468468468469, playtime_min=0.1, playtime_max=602.0, game_counts=111, min_max=0.3483967436451238, prediction=1.4224703311920166),
 Row(uid=24919113, game_name='H1Z1', playtime=25.0, playtime_rank=2, game_uid=463, playtime_mean=54.08468468468469, playtime_min=0.1, playtime_max=602.0, game_counts=111, min_max=0.12410699451736168, prediction=1.2894067764282227),
 Row(uid=43684632, game_name='H1Z1', playtime=4.8, playtime_rank=1, game_uid=463, playtime_mean=54.08468468468469, playtime_min=0.1, playtime_max=602.0, game_counts=111, min_max=0.023425818242232933, prediction=1.7647634744644165),
 Row(uid=105159791, game_name='H1Z1', playtime=1.0, playtime_rank=0, game_

In [118]:
min_max_predictions_rdd.take(40)

[Row(uid=61632730, game_name='H1Z1', playtime=418.0, playtime_rank=3, game_uid=463, playtime_mean=54.08468468468469, playtime_min=0.1, playtime_max=602.0, game_counts=111, min_max=2.082904136899817, prediction=0.13349205255508423),
 Row(uid=18074031, game_name='H1Z1', playtime=70.0, playtime_rank=3, game_uid=463, playtime_mean=54.08468468468469, playtime_min=0.1, playtime_max=602.0, game_counts=111, min_max=0.3483967436451238, prediction=0.019686125218868256),
 Row(uid=24919113, game_name='H1Z1', playtime=25.0, playtime_rank=2, game_uid=463, playtime_mean=54.08468468468469, playtime_min=0.1, playtime_max=602.0, game_counts=111, min_max=0.12410699451736168, prediction=0.0),
 Row(uid=43684632, game_name='H1Z1', playtime=4.8, playtime_rank=1, game_uid=463, playtime_mean=54.08468468468469, playtime_min=0.1, playtime_max=602.0, game_counts=111, min_max=0.023425818242232933, prediction=0.4433247447013855),
 Row(uid=105159791, game_name='H1Z1', playtime=1.0, playtime_rank=0, game_uid=463, pla