# Initial Model

### from playtime_distribution, we found 2-5, 6-25, 26+ as good initial cutoffs

In [27]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# needs python 2
# import graphlab as gl
# gl.canvas.set_target('ipynb')

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))
from src import EDA
from src import ModelEvaluation
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# TODO Maybe show playtime distribution picture here

In [41]:
steam_df = EDA.load_200k_n_games_played(5)
steam_df = steam_df[steam_200k_df['purchase_action'] == 'play']
steam_df.head(2)

  


Unnamed: 0,uid,game_name,purchase_action,playtime
1,151603712,The Elder Scrolls V Skyrim,play,273.0
3,151603712,Fallout 4,play,87.0


In [42]:
steam_df["playtime_rank"] = steam_df['playtime'].map(lambda value: EDA.rank_playtime(value))

In [43]:
# fitting ALS must have numbers for itemCol and userCol
game_uid = 0
game_uid_map = {}
for item in steam_df['game_name']:
    if item in game_uid_map:
        continue
    game_uid_map[item] = game_uid
    game_uid += 1
steam_df['game_name'].value_counts().size == len(game_uid_map.values())

True

In [44]:
steam_df['game_uid'] = steam_df['game_name'].map(lambda name: game_uid_map[name])
steam_df['game_uid'].value_counts().size == steam_df['game_name'].value_counts().size

True

In [45]:
# Should we use user-user or item-item?
print('Number of users: ', steam_df['uid'].value_counts().size)
print('Number of games: ', steam_df['game_name'].value_counts().size)
# Use the games since there are less games than users (good for steam!)

Number of users:  2436
Number of games:  3544


In [46]:
# Setup a SparkSession
spark = SparkSession.builder.getOrCreate()
# Convert a Pandas DF to a Spark DF
spark_df = spark.createDataFrame(steam_df)
spark_df.count()

57789

In [47]:
train, test = spark_df.randomSplit([0.8, 0.2], seed=427471138)

In [48]:
als_model = ALS(
    itemCol='game_uid',
    userCol='uid',
    ratingCol='playtime_rank',
    nonnegative=True,    
    regParam=0.1,
    coldStartStrategy="drop", # Drops if user or item in test was not in train
    rank=10) 

In [50]:
fitted_als_model = als_model.fit(train)

In [51]:
one_row_pandas_df = pd.DataFrame({'uid': [151603712], 'game_uid': [1]})
one_row_spark_df = spark.createDataFrame(one_row_pandas_df)

In [52]:
fitted_als_model.transform(one_row_spark_df).show()

+--------+---------+----------+
|game_uid|      uid|prediction|
+--------+---------+----------+
|       1|151603712| 1.3429985|
+--------+---------+----------+



In [53]:
# fitted_als_model.setColdStartStrategy("drop")
predictions = fitted_als_model.transform(test)
# preds = predictions.collect()
# predictions.take(1)
evaluator = RegressionEvaluator() \
    .setMetricName("rmse") \
    .setLabelCol("playtime_rank") \
    .setPredictionCol("prediction")
rmse = evaluator.evaluate(predictions)

In [54]:
rmse
# was 1.046 without restricting to 5+
# was 1.005 without normalizing hours played

1.0051786784416223

In [16]:
preds = predictions.collect()

In [39]:
preds[0:5]

[Row(uid=116876958, game_name='No Time To Explain Remastered', purchase_action='play', playtime=3.4, playtime_rank=1, game_uid=148, prediction=0.8219438791275024),
 Row(uid=11373749, game_name='Shadow Puppeteer', purchase_action='play', playtime=0.6, playtime_rank=0, game_uid=463, prediction=0.0),
 Row(uid=45617627, game_name='Tomb Raider I', purchase_action='play', playtime=0.2, playtime_rank=0, game_uid=471, prediction=0.8916951417922974),
 Row(uid=101687527, game_name='Tomb Raider I', purchase_action='play', playtime=0.7, playtime_rank=0, game_uid=471, prediction=0.9380249977111816),
 Row(uid=44472980, game_name='Mortal Kombat Komplete Edition', purchase_action='play', playtime=505.0, playtime_rank=3, game_uid=496, prediction=1.6772217750549316)]

In [18]:
# TODO find examples of each item rank to get a sense of types of items
# TODO better evaluation than rmse - this is not good if recommending a small subset of items

In [22]:
# we want to recommend on test set
userRecs = fitted_als_model.recommendForAllUsers(5)

In [23]:
userRecs.head()

Row(uid=46014950, recommendations=[Row(game_uid=832, rating=2.1705732345581055), Row(game_uid=2375, rating=2.0654635429382324), Row(game_uid=2034, rating=2.061188220977783), Row(game_uid=2857, rating=2.0255558490753174), Row(game_uid=214, rating=1.932112216949463)])

In [31]:
ModelEvaluation.dcg_at_k([1,2,1],3)

3.6309297535714578

In [35]:
predictions.head(10)

[Row(uid=16167221, game_name='Total War ROME II - Emperor Edition', purchase_action='play', playtime=15.9, playtime_rank=2, game_uid=516, prediction=1.7675402164459229),
 Row(uid=16167221, game_name='Counter-Strike Condition Zero Deleted Scenes', purchase_action='play', playtime=2.8, playtime_rank=1, game_uid=481, prediction=0.6783820390701294),
 Row(uid=16167221, game_name='The Forest', purchase_action='play', playtime=19.7, playtime_rank=2, game_uid=625, prediction=1.2680799961090088),
 Row(uid=16167221, game_name='Hitman 2 Silent Assassin', purchase_action='play', playtime=4.7, playtime_rank=1, game_uid=725, prediction=0.828895092010498),
 Row(uid=16167221, game_name='Team Fortress 2', purchase_action='play', playtime=2.1, playtime_rank=1, game_uid=9, prediction=0.8562924265861511),
 Row(uid=16167221, game_name='The Escapists', purchase_action='play', playtime=0.3, playtime_rank=0, game_uid=733, prediction=1.1161251068115234),
 Row(uid=55120589, game_name='Counter-Strike Condition Z

In [40]:
predictions.groupBy('user_id')

<pyspark.sql.group.GroupedData at 0x1150dc208>

In [38]:
train.head(4)

[Row(uid=1364546, game_name='Counter-Strike', purchase_action='play', playtime=553.0, playtime_rank=3, game_uid=273),
 Row(uid=1950243, game_name='3DMark', purchase_action='play', playtime=26.0, playtime_rank=3, game_uid=707),
 Row(uid=1950243, game_name='Age of Empires II HD Edition', purchase_action='play', playtime=1.3, playtime_rank=1, game_uid=83),
 Row(uid=1950243, game_name='Battlefield Bad Company 2', purchase_action='play', playtime=40.0, playtime_rank=3, game_uid=477)]