In [1]:
import math
import random
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS

import os, sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))
from src import EDA, ModelEvaluation, Preprocess, Split
%load_ext autoreload
%autoreload 2

spark_session = SparkSession.builder.getOrCreate()

In [2]:
# need holdout users.  Hold out 10%.  We could put some percent of their games into
# the training but it will make it very complicated
steam_df = EDA.load_without_cold_start(min_games_played=5, min_users_for_game=3)
preprocessor = Preprocess.PandasALSPreprocessor(steam_df)
preprocessor.process_general()
preprocessor.process_buckets()
preprocessor.process_min_max()
preprocessor.keep_standard_columns()
steam_df = preprocessor.get_df()
pandas_train_test = Split.PandasTrainTest(steam_df, seed=1)
train_val, holdout_test = pandas_train_test.user_only_split(user_split_train=.9)
print('split sizes: ', (train_val.shape[0], holdout_test.shape[0]))
train_val.head(2)

split sizes:  (49854, 5213)


Unnamed: 0,uid,playtime,playtime_min_max,game_name,game_uid
1,151603712,273.0,0.412256,The Elder Scrolls V Skyrim,0
3,151603712,87.0,0.414122,Fallout 4,1


In [None]:
final_model_training = EDA.restrict_user_item(train_val, min_games_played=5, min_users_for_game=3)
final_model_holdout = EDA.restrict_user_item(holdout_test, min_games_played=5, min_users_for_game=3)
final_splitter = Split.PandasTrainTest(final_model_holdout)
holdout_train_games, holdout_test_games  = final_splitter.user_games_split(final_model_holdout, game_split_train=.3)
final_model_training = final_model_training.append(holdout_train_games)

In [3]:
als_model = ALS(
    itemCol='game_uid',
    userCol='uid',
    ratingCol='playtime_min_max',
    nonnegative=True,    
    regParam=0.1,
    coldStartStrategy="drop", # Drops if user or item in test was not in train
    rank=10
);

In [6]:
spark_dataset = spark_session.createDataFrame(train_val)
fitted_model = als_model.fit(spark_dataset)

In [5]:
# fitted_model.recommendFor
# final_test_predictions = training_fitted_model.transform(holdout_test_spark_dataset)

In [26]:
fitted_model.transform(spark_dataset.where('uid == 151603712')) \
    .select('game_uid', 'prediction') \
    .orderBy('prediction', ascending=False) \
    .limit(5) \
    .collect()

[Row(game_uid=7, prediction=1.1215405464172363),
 Row(game_uid=5, prediction=0.2440052032470703),
 Row(game_uid=6, prediction=0.16802188754081726),
 Row(game_uid=3, prediction=0.1376160979270935),
 Row(game_uid=10, prediction=0.126241534948349)]

In [53]:
recommendation_per_user = fitted_model.recommendForAllUsers(5).take(100)

### Easier to Use Format

In [66]:
formatted_per_user_recommendations = []
for recommendation in recommendation_per_user:
    user_id = recommendation.uid
    game_ids = [rec.game_uid for rec in recommendation.recommendations]
    formatted_per_user_recommendations.append({'uid': user_id, 'game_ids': game_ids})
formatted_per_user_recommendations[1]

{'game_ids': [669, 1480, 1643, 1648, 610], 'uid': 208061820}

In [93]:
trial = formatted_per_user_recommendations[2]
top_played = train_val[train_val['uid'] == trial['uid']].sort_values(['playtime_min_max'], ascending=False)
top_5_played = top_played[['playtime_min_max', 'game_name']].head(5)

In [94]:
game_names = []
for game_id in trial['game_ids']:
    game_names.append(train_val[train_val['game_uid'] == game_id]['game_name'].iloc[0])

In [95]:
print('recommendations: {}'.format(game_names))
top_5_played

recommendations: ['SiN Episodes Emergence', 'Serious Sam Double D XXL', 'Requiem', 'Krosmaster Arena', 'Tomb Raider II']


Unnamed: 0,playtime_min_max,game_name
4294,3.0,Counter-Strike Condition Zero
4312,3.0,Oddworld Abe's Exoddus
4296,3.0,Fable III
4302,3.0,Dark Messiah of Might & Magic Single Player
4316,1.007634,Hitman 2 Silent Assassin


In [108]:
top_5_played[['game_name']]['game_name'].values

array(['Counter-Strike Condition Zero', "Oddworld Abe's Exoddus",
       'Fable III', 'Dark Messiah of Might & Magic Single Player',
       'Hitman 2 Silent Assassin'], dtype=object)