# Initial Model

### from playtime_distribution, we found 2-5, 6-25, 26+ as good initial cutoffs

In [100]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# needs python 2
# import graphlab as gl
# gl.canvas.set_target('ipynb')

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))
from src import EDA
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# TODO Maybe show playtime distribution picture here

In [95]:
steam_200k_df = EDA.load_200k()
steam_200k_df = steam_200k_df[steam_200k_df['purchase_action'] == 'play']
steam_200k_df.head(2)

Unnamed: 0,uid,game_name,purchase_action,playtime
1,151603712,The Elder Scrolls V Skyrim,play,273.0
3,151603712,Fallout 4,play,87.0


In [96]:
def rank_playtime(time):
    if time <= 1:
        return 0
    if time > 1 and time <= 5:
        return 1
    if time > 6 and time <= 25:
        return 2
    return 3
steam_200k_df["playtime_rank"] = steam_200k_df['playtime'].map(lambda value: rank_playtime(value))

In [97]:
steam_200k_df['playtime_rank'].value_counts()

1    18918
0    17850
3    16961
2    16760
Name: playtime_rank, dtype: int64

In [83]:
# fitting ALS must have numbers for itemCol and userCol
game_uid = 0
game_uid_map = {}
for item in steam_200k_df['game_name']:
    if item in game_uid_map:
        continue
    game_uid_map[item] = game_uid
    game_uid += 1
steam_200k_df['game_name'].value_counts().size == len(game_uid_map.values())

True

In [84]:
steam_200k_df['game_uid'] = steam_200k_df['game_name'].map(lambda name: game_uid_map[name])
steam_200k_df['game_uid'].value_counts().size == steam_200k_df['game_name'].value_counts().size

True

In [85]:
# Should we use user-user or item-item?
print('Number of users: ', steam_200k_df['uid'].value_counts().size)
print('Number of games: ', steam_200k_df['game_name'].value_counts().size)
# Use the games since there are less games than users (good for steam!)

Number of users:  11350
Number of games:  3600


In [86]:
# Setup a SparkSession
spark = SparkSession.builder.getOrCreate()
# Convert a Pandas DF to a Spark DF
spark_df = spark.createDataFrame(steam_200k_df)
spark_df.count()

70489

In [87]:
train, test = spark_df.randomSplit([0.8, 0.2], seed=427471138)

In [131]:
als_model = ALS(
    itemCol='game_uid',
    userCol='uid',
    ratingCol='playtime_rank',
    nonnegative=True,    
    regParam=0.1,
    coldStartStrategy="drop", # TODO
    rank=10) 

In [132]:
fitted_als_model = als_model.fit(train)

In [133]:
one_row_pandas_df = pd.DataFrame({'uid': [151603712], 'game_uid': [1]})
one_row_spark_df = spark.createDataFrame(one_row_pandas_df)

In [134]:
fitted_als_model.transform(one_row_spark_df).show()

+--------+---------+----------+
|game_uid|      uid|prediction|
+--------+---------+----------+
|       1|151603712| 1.6639204|
+--------+---------+----------+



In [135]:
# fitted_als_model.setColdStartStrategy("drop")
predictions = fitted_als_model.transform(test)
# preds = predictions.collect()
# predictions.take(1)
evaluator = RegressionEvaluator() \
    .setMetricName("rmse") \
    .setLabelCol("playtime_rank") \
    .setPredictionCol("prediction")
rmse = evaluator.evaluate(predictions)

In [136]:
rmse

1.0460225606688012

In [137]:
preds = predictions.collect()

In [140]:
preds[0:100]

[Row(uid=116876958, game_name='No Time To Explain Remastered', purchase_action='play', playtime=3.4, playtime_rank=1, game_uid=148, prediction=0.8219438791275024),
 Row(uid=11373749, game_name='Shadow Puppeteer', purchase_action='play', playtime=0.6, playtime_rank=0, game_uid=463, prediction=0.0),
 Row(uid=45617627, game_name='Tomb Raider I', purchase_action='play', playtime=0.2, playtime_rank=0, game_uid=471, prediction=0.8916951417922974),
 Row(uid=101687527, game_name='Tomb Raider I', purchase_action='play', playtime=0.7, playtime_rank=0, game_uid=471, prediction=0.9380249977111816),
 Row(uid=44472980, game_name='Mortal Kombat Komplete Edition', purchase_action='play', playtime=505.0, playtime_rank=3, game_uid=496, prediction=1.6772217750549316),
 Row(uid=87740709, game_name='Mortal Kombat Komplete Edition', purchase_action='play', playtime=9.2, playtime_rank=2, game_uid=496, prediction=1.8232853412628174),
 Row(uid=24469287, game_name='Mortal Kombat Komplete Edition', purchase_acti

In [141]:
# TODO find examples of each item rank to get a sense of types of items
# TODO better evaluation than rmse