In [2]:
import pandas as pd
import numpy as np 
import matplotlib
import scipy 
import random
# import matplotlib.pyplot as plt 
import seaborn as sns
# import turicreate
import sklearn as sk
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split

In [3]:
interactions_full_df = pd.read_pickle("../Data_Cleaning/clean_steam_advanced.pkl")
game_map = pd.read_pickle("../Data_Cleaning/gameMap.pkl")
interactions_full_df.describe()

Unnamed: 0,userId,play,Actions,gameId
count,128792.0,128792.0,128792.0,128792.0
mean,102529700.0,0.547122,0.096225,2498.26695
std,72430570.0,0.497776,0.256709,1472.513412
min,5250.0,0.0,4e-06,0.0
25%,45532210.0,0.0,6.1e-05,1179.0
50%,86055700.0,1.0,0.001025,2338.0
75%,154230700.0,1.0,0.018092,3827.0
max,309903100.0,1.0,1.0,5154.0


### split train and test set

In [3]:
interactions_train_df, interactions_test_df = train_test_split(interactions_full_df, 
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 103033
# interactions on Test set: 25759


### first of all, we want to declare our evaluation function

In [5]:
#Indexing by personId to speed up the searches during evaluation
interactions_full_indexed_df = interactions_full_df.set_index('userId')
interactions_train_indexed_df = interactions_train_df.set_index('userId')
interactions_test_indexed_df = interactions_test_df.set_index('userId')

In [114]:
def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the movie information.
    if not(person_id in interactions_df['gameId']):
        return []
    else:
        interacted_items = interactions_df.loc[person_id]['gameId']
        return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [115]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100
class ModelEvaluator:
    # sample could be  any items that a user has not purchased
    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, interactions_full_indexed_df)
        all_items = set(game_map['gameId'])
        non_interacted_items = all_items - interacted_items
        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)
    
    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

        # basically what this model is doing is that for every item in the test set, combine it with 100 random non interacted 
        # items and see when the recommender is doing recommendations, if the one item in the test set will rank higher
        # than the other non interactive items.
    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set for a given userId
        interacted_values_testset = interactions_test_indexed_df.loc[person_id] 
        
        if type(interacted_values_testset['gameId']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['gameId'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['gameId'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        #print(person_id)
        person_recs_df = model.recommend_items(person_id, items_to_ignore=get_items_interacted(person_id, interactions_train_indexed_df), topn=10000000000)
        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=item_id%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            # notice here the original order will be maintained
            valid_recs_df = person_recs_df[person_recs_df['gameId'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['gameId'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    
    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        #for all the person_id appearing in the interactions_test_indexed_df
        for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            # we only predict items that have at least one record in train set
            if (person_id in interactions_train_indexed_df['gameId']):
                person_metrics = self.evaluate_model_for_user(model, person_id)  
                person_metrics['_person_id'] = person_id
                people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator()    


### Next, we want to declare our SVD model

In [9]:
#Creating a sparse pivot table with users in rows and items in columns
users_items_pivot_matrix_df = interactions_train_df.pivot(index='userId', columns='gameId', values='Actions').fillna(0)
users_items_pivot_matrix_df.head()

gameId,0,1,2,3,4,5,6,7,8,9,...,5145,5146,5147,5148,5149,5150,5151,5152,5153,5154
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
144736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
users_items_pivot_matrix = users_items_pivot_matrix_df.values
users_items_pivot_matrix[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [21]:
users_ids = list(users_items_pivot_matrix_df.index)
users_ids[:10]

[5250, 76767, 86540, 103360, 144736, 181212, 229911, 298950, 299153, 381543]

In [13]:
from scipy.sparse.linalg import svds
#The number of factors to factor the user-item matrix. This could be tuned later
NUMBER_OF_FACTORS_MF = 15
#Performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)

In [14]:
U.shape

(11197, 15)

In [15]:
Vt.shape

(15, 4917)

In [16]:
sigma = np.diag(sigma)
sigma.shape

(15, 15)

In [19]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_ratings

array([[ 5.28706315e-10,  1.74599968e-09,  2.54871389e-08, ...,
         1.94219075e-07,  1.87624992e-06,  1.71108806e-07],
       [ 3.62376612e-08,  5.14720261e-10,  7.01724522e-07, ...,
        -1.33344913e-07, -6.38452787e-07,  1.25235756e-07],
       [-3.23705445e-09,  3.40049737e-09,  4.58062570e-07, ...,
         2.75657498e-07,  1.01867684e-05,  5.60601353e-07],
       ...,
       [-8.58999886e-12,  9.32972520e-10,  1.80303975e-08, ...,
         6.05283784e-07,  3.13418939e-06,  8.85204037e-07],
       [ 1.10998641e-08,  1.82459026e-09,  2.57095343e-08, ...,
         2.08437884e-07,  1.40033436e-06,  2.06709636e-07],
       [-2.59468504e-10,  3.41019189e-08,  4.00351541e-07, ...,
         2.14142009e-05,  2.90973628e-04,  4.82994382e-06]])

In [30]:
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings, columns = users_items_pivot_matrix_df.columns, index=users_ids).transpose()
cf_preds_df

Unnamed: 0_level_0,5250,76767,86540,103360,144736,181212,229911,298950,299153,381543,...,309216884,309228590,309262440,309265377,309375103,309404240,309434439,309554670,309626088,309812026
gameId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,5.287063e-10,3.623766e-08,-3.237054e-09,2.705076e-13,-1.594936e-11,1.358793e-10,7.347264e-08,6.642944e-09,5.176635e-11,-1.252860e-10,...,-9.579626e-11,-9.579626e-11,-8.218618e-10,9.202214e-12,-5.858753e-14,-1.954365e-10,-9.579626e-11,-8.589999e-12,1.109986e-08,-2.594685e-10
1,1.746000e-09,5.147203e-10,3.400497e-09,1.571875e-10,2.209335e-11,7.443873e-10,8.493245e-10,3.801202e-10,1.042974e-09,-8.143661e-11,...,-1.261584e-10,-1.261584e-10,-1.280954e-09,9.777591e-10,5.360438e-12,-1.892597e-09,-1.261584e-10,9.329725e-10,1.824590e-09,3.410192e-08
2,2.548714e-08,7.017245e-07,4.580626e-07,1.095793e-08,1.889965e-08,6.014241e-08,1.397242e-06,1.601612e-07,4.123148e-09,6.790619e-08,...,1.178850e-07,1.178850e-07,2.436078e-07,1.392001e-08,1.083387e-10,7.923773e-07,1.178850e-07,1.803040e-08,2.570953e-08,4.003515e-07
3,5.795020e-09,2.866904e-08,1.419756e-07,3.326944e-09,4.569117e-09,9.590736e-09,4.216444e-08,1.809730e-08,3.777532e-09,9.740168e-09,...,-3.509032e-09,-3.509032e-09,-2.715321e-08,4.449880e-09,4.232801e-11,7.797881e-08,-3.509032e-09,1.803775e-08,7.314544e-09,1.086126e-07
4,2.359294e-09,-3.502950e-10,1.261708e-06,2.154919e-11,1.311650e-11,1.502848e-10,-3.027806e-09,1.294508e-07,6.064243e-11,-6.315252e-11,...,-1.257771e-09,-1.257771e-09,1.125467e-07,1.448340e-10,1.643409e-12,1.065546e-08,-1.257771e-09,-1.338998e-12,4.187371e-09,9.045164e-09
5,2.031198e-07,-4.345659e-07,1.600633e-04,-2.757436e-09,-9.256346e-08,-6.843021e-07,-1.368899e-06,1.235191e-05,-2.544052e-07,-8.414867e-07,...,-3.298305e-07,-3.298305e-07,-1.687272e-06,8.902102e-08,1.813481e-09,-6.807843e-07,-3.298305e-07,9.867863e-07,5.307110e-07,5.880727e-07
6,2.359661e-11,5.182029e-11,9.217647e-10,-6.824951e-12,-1.817573e-11,-5.471819e-11,1.127436e-10,5.975348e-11,-9.645224e-12,-6.988315e-11,...,-2.811940e-11,-2.811940e-11,-9.044685e-11,3.633810e-11,2.654853e-13,3.970579e-11,-2.811940e-11,4.175296e-12,3.869810e-11,9.430004e-10
7,1.037672e-08,4.594336e-06,4.838322e-07,1.705372e-06,3.521202e-06,1.032260e-05,1.689395e-06,-3.213682e-09,6.680211e-08,1.345147e-05,...,2.810537e-07,2.810537e-07,-1.621540e-07,3.797105e-08,3.765849e-10,-1.931971e-07,2.810537e-07,-3.987701e-09,3.590377e-08,1.134044e-06
8,7.688597e-09,-1.209600e-09,4.809304e-07,-3.089016e-09,-1.051323e-08,-3.102957e-08,-2.619475e-08,1.926773e-08,-1.262669e-08,-3.852023e-08,...,2.332821e-07,2.332821e-07,-8.713953e-08,3.567343e-08,2.723919e-10,-1.084823e-07,2.332821e-07,-1.956390e-09,1.377182e-08,7.771852e-07
9,6.878473e-09,-4.641202e-10,3.617694e-07,-2.117589e-09,-8.105231e-09,-2.344686e-08,-3.241893e-09,2.269588e-08,-1.735058e-09,-3.059486e-08,...,-1.384799e-08,-1.384799e-08,-4.094776e-08,2.909279e-08,1.881200e-10,-6.548237e-08,-1.384799e-08,-1.493340e-10,1.052841e-08,6.313519e-07


In [24]:
len(cf_preds_df.columns)

11197

In [111]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=50, verbose=False):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'recGame'})
        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['gameId'].isin(items_to_ignore)] \
                               .sort_values('recGame', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'gameId', 
                                                          right_on = 'gameId')[['recGame', 'gameId', 'title', 'url', 'lang']]


        return recommendations_df
    
cf_recommender_model = CFRecommender(cf_preds_df, game_map)

### now we evaluate our model

In [116]:
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...
5765 users processed

Global metrics:
{'modelName': 'Collaborative Filtering', 'recall@5': 0.4496630590157239, 'recall@10': 0.5821931794976516}


Unnamed: 0,_person_id,hits@10_count,hits@5_count,interacted_count,recall@10,recall@5
153,62990992,44,28,205,0.214634,0.136585
85,33865373,33,17,150,0.22,0.113333
134,30246419,38,21,149,0.255034,0.14094
210,58345543,35,25,139,0.251799,0.179856
22,22301321,37,24,116,0.318966,0.206897
20,11403772,35,22,116,0.301724,0.189655
271,47457723,38,20,115,0.330435,0.173913
252,53875128,48,30,111,0.432432,0.27027
158,20772968,50,31,108,0.462963,0.287037
227,24721232,43,28,107,0.401869,0.261682


### Take a look at the result on some users

In [66]:
interactions_test_df.reset_index()
group = interactions_test_df.sort_values('userId')

Unnamed: 0,userId,gameName,play,Actions,gameId
41880,5250,Alien Swarm,1.0,0.021657,227
41879,5250,Portal 2,1.0,0.06011,3223
35796,76767,Day of Defeat,0.0,4.1e-05,1131
35806,76767,Thief - The Bank Heist,0.0,4.1e-05,4543
35802,76767,Thief,0.0,4.1e-05,4539


In [53]:
rec = cf_recommender_model.recommend_items(44314169)

In [61]:
testSet = interactions_test_df[interactions_test_df.userId== 44314169]
testSet.count()

userId      47
gameName    47
play        47
Actions     47
gameId      47
dtype: int64

In [60]:
interactions_full_df[interactions_full_df.userId== 44314169]

Unnamed: 0,userId,gameName,play,Actions,gameId
106456,44314169,Viscera Cleanup Detail Shadow Warrior,0.0,0.000040,4852
106457,44314169,XCOM Enemy Unknown,0.0,0.000040,5053
106451,44314169,The Walking Dead Season Two,0.0,0.000040,4517
106452,44314169,Tom Clancy's Ghost Recon Advanced Warfighter 2,0.0,0.000040,4597
106441,44314169,Natural Selection 2,0.0,0.000040,2901
106442,44314169,Patch testing for Chivalry,0.0,0.000040,3113
106363,44314169,Just Cause 2 Multiplayer Mod,1.0,0.000484,2345
106443,44314169,PixelJunk Monsters Ultimate,0.0,0.000040,3185
106440,44314169,Metro Last Light Redux,0.0,0.000040,2716
106439,44314169,Metro 2033,0.0,0.000040,2712


In [54]:
rec.head(50)

Unnamed: 0,gameId,recGame
0,4257,0.004138
1,984,0.002122
2,4364,0.000964
3,4899,0.000789
4,4276,0.000705
5,2475,0.000643
6,1894,0.000623
7,3825,0.000415
8,737,0.000356
9,1499,0.000319


In [55]:
testSet.merge(rec, on="gameId", how="left") 

Unnamed: 0,userId,gameName,play,Actions,gameId,recGame
0,44314169,Need for Speed Hot Pursuit,1.0,0.001048,2911,
1,44314169,Dungeon Defenders,1.0,0.023387,1405,
2,44314169,Borderlands,1.0,0.020968,618,
3,44314169,Tom Clancy's Splinter Cell Conviction,1.0,0.002581,4642,
4,44314169,Fallout 3 - Game of the Year Edition,1.0,0.003226,1674,
5,44314169,Tom Clancy's Ghost Recon Advanced Warfighter 2,0.0,4e-05,4597,
6,44314169,Just Cause 2 Multiplayer Mod,1.0,0.000484,2345,
7,44314169,XCOM Enemy Unknown,0.0,4e-05,5053,
8,44314169,Sniper Elite Nazi Zombie Army,1.0,0.003306,3896,
9,44314169,Assassin's Creed II,0.0,4e-05,356,
