# Initial Model

### From playtime_distribution, we found 2-5, 6-25, and 26+ hours played to be good initial cutoffs

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# needs python 2
# import graphlab as gl
# gl.canvas.set_target('ipynb')

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))
from src import EDA
from src import ModelEvaluation
%load_ext autoreload
%autoreload 2

In [4]:
# TODO Maybe show playtime distribution picture here

In [5]:
# Load the interaction data with cold-start entries already filtered out,
# then keep only the rows whose purchase_action is 'play'.
# NOTE(review): the argument 5 is presumably the minimum-count cutoff -- confirm in src/EDA.
steam_df = EDA.load_without_cold_start(5)
played_mask = steam_df['purchase_action'].eq('play')
steam_df = steam_df.loc[played_mask]
steam_df.head(2)

  return filtered_users[steam_df['game_name'].isin(usable_games['game_name'].values)]


Unnamed: 0,uid,game_name,purchase_action,playtime
1,151603712,The Elder Scrolls V Skyrim,play,273.0
3,151603712,Fallout 4,play,87.0


## Some normalization needs to be done for played time

In [6]:
# Map every raw playtime value to its rank using EDA.rank_playtime.
steam_df["playtime_rank"] = steam_df['playtime'].map(EDA.rank_playtime)

### Game names need to be changed to IDs for Spark ML model

In [7]:
# Spark ALS needs numeric values in userCol and itemCol, so derive a
# 'game_uid' column from 'game_name'.
steam_df = EDA.get_uids(steam_df, from_column='game_name', to_column='game_uid')
# Sanity check: there should be exactly as many distinct ids as distinct names.
steam_df['game_uid'].nunique() == steam_df['game_name'].nunique()

True

### User-User vs Item-Item

In [9]:
# Count distinct users and distinct games in the filtered data.
n_users = steam_df['uid'].nunique()
n_games = steam_df['game_name'].nunique()
print('Number of users: ', n_users)
print('Number of games: ', n_games)

Number of users:  2436
Number of games:  3544


##### We will use Co-clustering instead of relying only on user-user or item-item similarity

### Spark ALS Model Building

In [99]:
# Set up a SparkSession (getOrCreate reuses an existing session if one is active)
spark = SparkSession.builder.getOrCreate()
# Convert the pandas DataFrame to a Spark DataFrame
spark_df = spark.createDataFrame(steam_df)
# NOTE: this is not the last expression of the cell, so the count is computed but never displayed
spark_df.count()
# 80/20 train/test split with a fixed seed so the split can be reproduced
train, test = spark_df.randomSplit([0.8, 0.2], seed=427471138)
# can broadcast these

# Report the size of each split
print('Training size: ', train.count())
print('Test size: ', test.count())

Training size:  46128
Test size:  11661


In [56]:
# ALS estimator that uses the playtime rank as the rating.
als_model = ALS(
    userCol='uid',
    itemCol='game_uid',
    ratingCol='playtime_rank',
    rank=10,
    regParam=0.1,
    nonnegative=True,
    # Drop test rows whose user or item was not seen in train
    coldStartStrategy="drop",
)

In [57]:
# Fit the ALS estimator on the training split only
fitted_als_model = als_model.fit(train)

In [58]:
# Smoke test: build a one-row frame for a single (uid, game_uid) pair and
# ask the fitted model for its prediction.
one_row_pandas_df = pd.DataFrame({'uid': [151603712], 'game_uid': [1]})
one_row_spark_df = spark.createDataFrame(one_row_pandas_df)
fitted_als_model.transform(one_row_spark_df).show()

+--------+---------+----------+
|game_uid|      uid|prediction|
+--------+---------+----------+
|       1|151603712| 1.3429985|
+--------+---------+----------+



In [59]:
# Score the held-out split, then measure RMSE between the predicted value
# and the playtime_rank label.
predictions = fitted_als_model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="playtime_rank",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)

In [60]:
# Display the RMSE of the current run; results of earlier runs are kept below for comparison.
rmse
# was 1.046 without restricting to 5+
# was 1.005 with 5+, without normalizing hours played
# was 1.015 with 5+, 2+ users, without normalizing hours played

1.0051786784416223

In [16]:
# collect() brings every prediction row back to the driver as a Python list of Row objects
preds = predictions.collect()

In [39]:
# Peek at the first five collected rows to compare playtime_rank with prediction
preds[0:5]

[Row(uid=116876958, game_name='No Time To Explain Remastered', purchase_action='play', playtime=3.4, playtime_rank=1, game_uid=148, prediction=0.8219438791275024),
 Row(uid=11373749, game_name='Shadow Puppeteer', purchase_action='play', playtime=0.6, playtime_rank=0, game_uid=463, prediction=0.0),
 Row(uid=45617627, game_name='Tomb Raider I', purchase_action='play', playtime=0.2, playtime_rank=0, game_uid=471, prediction=0.8916951417922974),
 Row(uid=101687527, game_name='Tomb Raider I', purchase_action='play', playtime=0.7, playtime_rank=0, game_uid=471, prediction=0.9380249977111816),
 Row(uid=44472980, game_name='Mortal Kombat Komplete Edition', purchase_action='play', playtime=505.0, playtime_rank=3, game_uid=496, prediction=1.6772217750549316)]

In [18]:
# TODO find examples of each item rank to get a sense of types of items
# TODO better evaluation than rmse - this is not good if recommending a small subset of items

In [22]:
# we want to recommend on test set
userRecs = fitted_als_model.recommendForAllUsers(5)

In [23]:
# Show the first row: one uid together with its list of (game_uid, rating) recommendations
userRecs.head()

Row(uid=46014950, recommendations=[Row(game_uid=832, rating=2.1705732345581055), Row(game_uid=2375, rating=2.0654635429382324), Row(game_uid=2034, rating=2.061188220977783), Row(game_uid=2857, rating=2.0255558490753174), Row(game_uid=214, rating=1.932112216949463)])

In [31]:
# Quick check of the project's dcg_at_k helper on a toy list
# (presumably relevance scores and the cutoff k -- see src/ModelEvaluation)
ModelEvaluation.dcg_at_k([1,2,1],3)

3.6309297535714578

In [40]:
# Group the scored test rows per user. The user column in `predictions` is
# 'uid' (see the Row fields shown above); there is no 'user_id' column.
# Spark resolves column names lazily, so the wrong name would only fail later,
# when an aggregation is run on the grouped data.
predictions.groupBy('uid')

<pyspark.sql.group.GroupedData at 0x1150dc208>

In [81]:
# Build a copy of the data with extra summary columns via EDA.add_summaries
# (the columns it adds are visible in the head() output of the next cell)
with_summaries_df = EDA.add_summaries(steam_df)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [82]:
# Inspect the first rows, including the added summary columns
with_summaries_df.head(4)
# can drop some columns here

Unnamed: 0,uid,game_name,purchase_action,playtime,playtime_rank,game_uid,playtime_mean,playtime_min,playtime_max,game_counts,min_max
1,151603712,The Elder Scrolls V Skyrim,play,273.0,3,0,105.72153,0.1,1986.0,562,0.412256
3,151603712,Fallout 4,play,87.0,3,1,65.274172,0.2,629.0,151,0.414122
5,151603712,Spore,play,14.9,2,2,26.016667,0.1,417.0,54,0.1065
7,151603712,Fallout New Vegas,play,12.1,2,3,52.247843,0.1,417.0,255,0.086352


In [83]:
# Rebuild the Spark DataFrame from the frame that includes the summary columns.
# NOTE: this overwrites spark_df, train and test from the earlier split cell.
spark_df = spark.createDataFrame(with_summaries_df)
# NOTE: not the last expression of the cell, so this count is computed but never displayed
spark_df.count()
# Same 80/20 weights and seed as the earlier split
train, test = spark_df.randomSplit([0.8, 0.2], seed=427471138)
print('Training size: ', train.count())
print('Test size: ', test.count())

Training size:  46128
Test size:  11661


In [None]:
# ALS trained with the min_max column as the rating.
als_model = ALS(
    itemCol='game_uid',
    userCol='uid',
    ratingCol='min_max',
    nonnegative=True,
    regParam=0.1,
    coldStartStrategy="drop",  # Drops if user or item in test was not in train
    rank=10)

fitted_als_model = als_model.fit(train)

predictions = fitted_als_model.transform(test)
# Evaluate against the same column the model was trained on. Using
# 'playtime_rank' as the label here would compare min_max-scale predictions
# with rank-scale labels and inflate the RMSE.
# NOTE: this RMSE is on the min_max scale, so it is not directly comparable
# with the RMSE of the model trained on playtime_rank.
evaluator = RegressionEvaluator() \
    .setMetricName("rmse") \
    .setLabelCol("min_max") \
    .setPredictionCol("prediction")
rmse = evaluator.evaluate(predictions)
rmse

In [92]:
# Higher RMSE using min-max
# Column-wise minimum of steam_df, to check the value ranges
steam_df.min()

uid                       5250
game_name          007 Legends
purchase_action           play
playtime                   0.1
playtime_rank                0
game_uid                     0
dtype: object

In [91]:
# Column-wise minimum of the frame with summary columns, including min_max
with_summaries_df.min()

uid                       5250
game_name          007 Legends
purchase_action           play
playtime                   0.1
playtime_rank                0
game_uid                     0
playtime_mean              0.1
playtime_min               0.1
playtime_max               0.1
game_counts                  1
min_max                      0
dtype: object

In [None]:
# cumulative survivorship score
# removing users with 5+ games but no games outside of bucket 1
# maybe remove gamers with 100+ games
# normalize to users instead of games (does ALS model adjust for user bias?)

In [None]:
# ALS trained with the min_max column as the rating.
als_model = ALS(
    itemCol='game_uid',
    userCol='uid',
    ratingCol='min_max',
    nonnegative=True,
    regParam=0.1,
    coldStartStrategy="drop",  # Drops if user or item in test was not in train
    rank=10)

fitted_als_model = als_model.fit(train)

predictions = fitted_als_model.transform(test)
# The label must be the column the model was trained on ('min_max'). Scoring
# against 'playtime_rank' would mix two different scales and inflate the RMSE.
evaluator = RegressionEvaluator() \
    .setMetricName("rmse") \
    .setLabelCol("min_max") \
    .setPredictionCol("prediction")
rmse = evaluator.evaluate(predictions)
rmse