In [17]:
import math
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import graphlab
from graphlab import factorization_recommender

# For comparison to Spark
## This notebook was taking a quick look at RMSE for graphlab

In [8]:
# Graphlab is python 2, so avoiding import differences
def load_200k():
    steam_200k_df = pd.read_csv('../data/steam-200k.csv', header=None)
    steam_200k_df.columns = ['uid', 'game_name', 'purchase_action', 'playtime', 'extra']
    steam_200k_df = steam_200k_df.drop('extra', axis=1)
    steam_200k_df = steam_200k_df[steam_200k_df['purchase_action'] == 'play']
    steam_200k_df = steam_200k_df.drop('purchase_action', axis=1)
    return steam_200k_df

def load_without_cold_start(min_games=5, min_users=0):
    steam_df = load_200k()
    # filter users
    game_counts = steam_df.groupby('uid').count()
    usable_users = game_counts[game_counts['game_name'] >= min_games].reset_index()
    filtered_users = steam_df[steam_df['uid'].isin(usable_users['uid'].values)]
    # filter games
    user_counts = steam_df.groupby('game_name').count()
    usable_games = user_counts[user_counts['playtime'] > min_users].reset_index()
    return filtered_users[steam_df['game_name'].isin(usable_games['game_name'].values)]

import numpy as np
import pandas as pd
import random

class PandasALSPreprocessor(object):
    def __init__(self, df):
        self.df = df.copy()
        self.standard_columns = ['uid', 'game_uid', 'game_name', 'playtime', 'playtime_min_max']

    def _rank_playtime(self, time):
        if time <= 1:
            return 0
        if time > 1 and time <= 5:
            return 1
        if time > 6 and time <= 25:
            return 2
        return 3

    def _min_max(self, row):
        # TODO double check this
        if (row['playtime_max'] == row['playtime_min']):
            return 1.0 / row['game_counts']
        diff = row['playtime_max'] - row['playtime_min']
        return (row['playtime'] - row['playtime_min']) / diff

    def _add_playtime_summaries(self, df, max_rank=3):
        aggs = {'playtime_mean': np.mean, 'playtime_min': np.min, 'playtime_max': np.max, 'game_counts': 'count'}
        grouped_means = df.groupby('game_name').agg({'playtime': aggs})
        grouped_means.columns = [col[1] for col in grouped_means.columns]
        joined = df.join(grouped_means, on='game_name')
        joined['playtime_min_max'] = joined.apply(lambda x: (self._min_max(x) * max_rank) + 1, axis=1)
        return joined

    def _create_uids(self, df, from_column='game_name', to_column='game_uid'):
        # fitting ALS must have numbers for itemCol and userCol
        uid = 0
        uid_map = {}
        for item in df[from_column]:
            if item in uid_map:
                continue
            uid_map[item] = uid
            uid += 1
        df[to_column] = df[from_column].map(lambda name: uid_map[name])
        return df

    def get_df(self):
        return self.df

    def process_general(self):
        self.df = self._create_uids(self.df, from_column='game_name', to_column='game_uid')
        return self.df

    def process_buckets(self):
        self.df["playtime_rank"] = self.df['playtime'].map(lambda value: self._rank_playtime(value))
        return self.df

    def process_min_max(self, max_rank=3):
        self.df = self._add_playtime_summaries(self.df, max_rank=max_rank)
        return self.df

    def keep_columns(self, keep_columns):
        existing_columns = set(self.df.columns)
        intersection = existing_columns.intersection(set(keep_columns))
        self.df = self.df[list(intersection)]
        return self.df

    def keep_standard_columns(self):
        self.keep_columns(self.standard_columns)
        return self.df

class PandasTrainTest(object):
    '''
    '''
    def __init__(self, df, seed=None):
        self.df = df
        if (seed):
            random.seed(seed)

    def _user_games_split(self, test_df, game_split_train=.5):
        train_indices = []
        test_indices = []
        # can do uid in users and game in gameid
        for uid in test_df['uid'].unique():
            user_data = test_df[test_df['uid'] == uid]
            indices = user_data.index.values
            random.shuffle(indices)
            train_game_indices = indices[:int((len(indices) + 1) * game_split_train)]
            test_game_indices = indices[int((len(indices) + 1) * game_split_train):]
            train_indices += train_game_indices.tolist()
            test_indices += test_game_indices.tolist()
        train_games = test_df.loc[train_indices,:]
        test_games = test_df.loc[test_indices,:]
        return (train_games, test_games)

    def _get_unique_users(self, df, user_column='uid'):
        unique_users = df[user_column].unique()
        random.shuffle(unique_users)
        return unique_users

    def user_only_split(self, user_column='uid', user_split_train=.8):
        df = self.df
        unique_users = self._get_unique_users(df, user_column)
        train_users = unique_users[:int((len(unique_users) + 1) * user_split_train)]
        test_users = unique_users[int((len(unique_users) + 1) * user_split_train):]
        train_df = df[df[user_column].isin(train_users)]
        test_df = df[df[user_column].isin(test_users)]
        return (train_df, test_df)

    # Consider sampling with replacement?  What does that mean in rec
    def train_test_split(
        self,
        user_column='uid',
        user_split_train=.8,
        game_split_train=.5,
    ):
        '''
            Custom train test split for recommender.
            Puts n percent of users into train and 1-n into test.
            For each user in test, put half of their games back into train
        '''
        train_df, test_df = self.user_only_split(user_column, user_split_train)
        # append test user training games to train users
        train_games_df, test_games_df = self._user_games_split(test_df, game_split_train=game_split_train)
        final_train_df = train_df.append(train_games_df)
        return (final_train_df, test_games_df)

    def get_k_folds(self, k, user_column='uid', game_split_train=.5):
        unique_users = self._get_unique_users(self.df)
        print('Number of users: ', len(unique_users))
        len_over_k = int((len(unique_users) + 1) / k)
        user_divisions = []
        for i in range(0, k):
            user_subset = unique_users[i * len_over_k : (i + 1) * len_over_k]
            user_divisions.append(user_subset)
        # return user_divisions
        # just do range
        finals = []
        for i in range(0, len(user_divisions)):
            k_test_ids = user_divisions[i]
            k_train = user_divisions[0:i] + user_divisions[i+1:]
            k_train_ids = [item for sublist in k_train for item in sublist]
            k_test_df = self.df[self.df[user_column].isin(k_test_ids)]
            k_train_df = self.df[self.df[user_column].isin(k_train_ids)]
            k_train_games_df, k_test_games_df = self._user_games_split(k_test_df, game_split_train=game_split_train)
            final_k_train_df = k_train_df.append(k_train_games_df)
            finals.append((final_k_train_df, k_test_games_df))
        return finals

    def run_k_folds(self, model, evaluator):
        # get k folds
        # run function on each split
        # evaluate
        # return average loss
        pass

In [6]:
steam_df = load_without_cold_start(min_games=5)
preprocessor = PandasALSPreprocessor(steam_df)
preprocessor.process_general()
preprocessor.process_buckets()
preprocessor.process_min_max()
preprocessor.keep_standard_columns()
steam_df = preprocessor.get_df()
steam_df.head(2)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0,game_name,game_uid,playtime_min_max,uid,playtime
1,The Elder Scrolls V Skyrim,0,1.412256,151603712,273.0
3,Fallout 4,1,1.414122,151603712,87.0


In [10]:
# need holdout users.  Hold out 10%.  We could put some percent of their games into
# the training but it will make it very complicated
pandas_train_test = PandasTrainTest(steam_df, seed=1)
train_val, holdout_test = pandas_train_test.user_only_split(user_split_train=.9)
print('split sizes: ', (train_val.shape[0], holdout_test.shape[0]))

('split sizes: ', (52831, 4958))


In [11]:
pandas_train_test = PandasTrainTest(train_val, seed=1)
k_fold_dfs = pandas_train_test.get_k_folds(5, game_split_train=.3)
for i, d in enumerate(k_fold_dfs):
    printed = 'train test split size ' + str(i) + ': '
    print(printed, (d[0].shape[0], d[1].shape[0]))

('Number of users: ', 2193)
('train test split size 0: ', (45376, 7414))
('train test split size 1: ', (44843, 7947))
('train test split size 2: ', (45393, 7397))
('train test split size 3: ', (45753, 7037))
('train test split size 4: ', (45278, 7512))


In [42]:
test = k_fold_dfs[0][0]
test.columns
test.columns = 'game_name', 'item_id', 'playtime_min_max', 'user_id', 'rating'
test.head(2)

test_test = k_fold_dfs[0][1]
test_test.columns
test_test.columns = 'game_name', 'item_id', 'playtime_min_max', 'user_id', 'rating'
test_test.head(2)

Unnamed: 0,game_name,item_id,playtime_min_max,user_id,rating
847,Dungeon Defenders,235,1.046283,26122540,6.8
851,Day of Defeat Source,237,1.002054,26122540,1.1


In [47]:
sf = graphlab.SFrame(test)
sf_test = graphlab.SFrame(test_test)

In [84]:
m1 = graphlab.ranking_factorization_recommender.create(sf, target='rating', ranking_regularization = 0.1 )

In [85]:
m1.evaluate_precision_recall(sf_test)

{'precision_recall_by_user': Columns:
 	user_id	int
 	cutoff	int
 	precision	float
 	recall	float
 	count	int
 
 Rows: 7884
 
 Data:
 +---------+--------+-----------+--------+-------+
 | user_id | cutoff | precision | recall | count |
 +---------+--------+-----------+--------+-------+
 |   5250  |   1    |    0.0    |  0.0   |   4   |
 |   5250  |   2    |    0.0    |  0.0   |   4   |
 |   5250  |   3    |    0.0    |  0.0   |   4   |
 |   5250  |   4    |    0.0    |  0.0   |   4   |
 |   5250  |   5    |    0.0    |  0.0   |   4   |
 |   5250  |   6    |    0.0    |  0.0   |   4   |
 |   5250  |   7    |    0.0    |  0.0   |   4   |
 |   5250  |   8    |    0.0    |  0.0   |   4   |
 |   5250  |   9    |    0.0    |  0.0   |   4   |
 |   5250  |   10   |    0.0    |  0.0   |   4   |
 +---------+--------+-----------+--------+-------+
 [7884 rows x 5 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns

In [86]:
m1.evaluate_precision_recall(sf)

{'precision_recall_by_user': Columns:
 	user_id	int
 	cutoff	int
 	precision	float
 	recall	float
 	count	int
 
 Rows: 39420
 
 Data:
 +---------+--------+-----------+--------+-------+
 | user_id | cutoff | precision | recall | count |
 +---------+--------+-----------+--------+-------+
 |   5250  |   1    |    0.0    |  0.0   |   2   |
 |   5250  |   2    |    0.0    |  0.0   |   2   |
 |   5250  |   3    |    0.0    |  0.0   |   2   |
 |   5250  |   4    |    0.0    |  0.0   |   2   |
 |   5250  |   5    |    0.0    |  0.0   |   2   |
 |   5250  |   6    |    0.0    |  0.0   |   2   |
 |   5250  |   7    |    0.0    |  0.0   |   2   |
 |   5250  |   8    |    0.0    |  0.0   |   2   |
 |   5250  |   9    |    0.0    |  0.0   |   2   |
 |   5250  |   10   |    0.0    |  0.0   |   2   |
 +---------+--------+-----------+--------+-------+
 [39420 rows x 5 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and colum

In [87]:
test_eval = m1.evaluate(sf_test)


Precision and recall summary statistics by cutoff
+--------+----------------+-----------------+
| cutoff | mean_precision |   mean_recall   |
+--------+----------------+-----------------+
|   1    | 0.342465753425 | 0.0319927487667 |
|   2    | 0.285388127854 | 0.0535668354547 |
|   3    | 0.269406392694 | 0.0737194791988 |
|   4    | 0.255136986301 | 0.0908910421853 |
|   5    | 0.232876712329 |  0.101598745005 |
|   6    | 0.212709284627 |  0.109460616663 |
|   7    | 0.196999347684 |  0.117189984177 |
|   8    | 0.181792237443 |  0.122197651809 |
|   9    | 0.170725520041 |  0.129095714461 |
|   10   | 0.161415525114 |  0.133606295497 |
+--------+----------------+-----------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 236.32466262739274)

Per User RMSE (best)
+-----------+-------+---------------+
|  user_id  | count |      rmse     |
+-----------+-------+---------------+
| 203092936 |   4   | 24.4625835707 |
+-----------+-------+---------------+
[1 rows x 3 columns]


Per Us

In [88]:
test_test.shape[0]

7414

In [90]:
test_eval.viewkeys()

dict_keys(['rmse_by_user', 'precision_recall_overall', 'rmse_by_item', 'precision_recall_by_user', 'rmse_overall'])

In [97]:
pr = test_eval['rmse_overall']

In [98]:
pr

236.32466262739274

In [94]:
not_zero_df.mean()

user_id      1.152340e+08
cutoff       1.801234e+01
precision    2.432792e-01
recall       2.082940e-01
count        2.012344e+01
dtype: float64