In [2]:
import pandas as pd
import numpy as np
import pickle
import json
import ast
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split, cross_validate
from sklearn.metrics.pairwise import cosine_similarity
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from scipy import sparse
import operator

In [6]:
# Load the per-game metadata table; the preview below shows it carries an `appid`
# column alongside name, developer, price, genre, tags, etc.
# NOTE(review): the "Unnamed: 0" column in the preview suggests the CSV was written
# with its index included — consider reading with index_col=0 if it is not needed.
game_data = pd.read_csv('data/steamspy_data.csv')
# Preview the first rows to sanity-check the columns.
game_data.head()

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
0,10,Counter-Strike,Valve,Valve,,185686,4807,0,"10,000,000 .. 20,000,000",9363,426,262,323,199,999,80,"English, French, German, Italian, Spanish - Sp...",Action,11955,"{'Action': 5372, 'FPS': 4796, 'Multiplayer': 3..."
1,20,Team Fortress Classic,Valve,Valve,,5235,874,0,"2,000,000 .. 5,000,000",852,3,27,3,99,499,80,"English, French, German, Italian, Spanish - Sp...",Action,94,"{'Action': 745, 'FPS': 306, 'Multiplayer': 258..."
2,30,Day of Defeat,Valve,Valve,,4885,541,0,"5,000,000 .. 10,000,000",811,0,16,0,99,499,80,"English, French, German, Italian, Spanish - Spain",Action,119,"{'FPS': 785, 'World War II': 246, 'Multiplayer..."
3,40,Deathmatch Classic,Valve,Valve,,1791,403,0,"5,000,000 .. 10,000,000",271,0,12,0,99,499,80,"English, French, German, Italian, Spanish - Sp...",Action,10,"{'Action': 628, 'FPS': 138, 'Classic': 106, 'M..."
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,,12501,638,0,"5,000,000 .. 10,000,000",1919,3,171,5,99,499,80,"English, French, German, Korean",Action,122,"{'FPS': 879, 'Action': 321, 'Classic': 250, 'S..."


In [9]:
# Load the per-user library table (columns shown in the preview: steamid, library).
# NOTE(review): in the preview, `library` is a stringified list of dicts for some
# users and the literal string 'hidden' for others, so it needs parsing/filtering
# before it can be used numerically.
library_df = pd.read_csv('data/library_data.csv')
library_df.head()

Unnamed: 0,steamid,library
0,76561198219067393,"[{'appid': 220, 'name': 'Half-Life 2', 'hours'..."
1,76561198148157441,"[{'appid': 17390, 'name': 'Spore', 'hours': 26..."
2,76561198993539076,hidden
3,76561198247182340,hidden
4,76561198278705159,hidden


In [20]:
# Load a previously pickled object (presumably the processed library DataFrame —
# confirm against the cell that wrote it).
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles that were produced locally by this project.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError, so the
# pickle must be regenerated (or the path corrected) before the notebook can run
# top to bottom.
with open('data/modded_library_df.pickle', 'rb') as handle:
    modded_library_df = pickle.load(handle)
modded_library_df

FileNotFoundError: [Errno 2] No such file or directory: 'data/modded_library_df.pickle'

In [None]:
# Build the user x game matrix in one chain: one row per steamid, one column per
# appid, each cell holding that pair's rating (pivot_table averages duplicates).
# Pairs with no rating come out as NaN and are replaced with 0.
rating_matrix = (
    big_rating_df
    .pivot_table(index='steamid', columns='appid', values='rating')
    .fillna(0)
)
# Preview the first 50 users.
rating_matrix.head(50)

In [None]:
def find_similar_users(steamid, matrix, k=3):
    """Return the index labels of the k rows most similar to one user's row.

    Parameters
    ----------
    steamid : index label of the target user's row in ``matrix``.
    matrix : DataFrame with one row per user; rows are compared as vectors.
    k : maximum number of similar users to return (default 3).

    Returns
    -------
    list
        Index labels of the other rows, ordered from highest to lowest cosine
        similarity with the target row, truncated to ``k`` entries.

    Raises
    ------
    KeyError
        If ``steamid`` is not a label in ``matrix.index``.
    """
    # The target row as a 1 x n_columns array, the shape cosine_similarity expects.
    target_vector = np.array(matrix.loc[steamid]).reshape(1, -1)

    # Every row except the target's own.
    candidates = matrix[matrix.index != steamid]
    scores = cosine_similarity(target_vector, candidates)[0].tolist()

    # Map each candidate's label to its similarity score.
    score_by_user = dict(zip(candidates.index.tolist(), scores))

    # Sort ascending and then flip, so that among equal scores the later rows
    # of the matrix come first (same ordering as a sort followed by reverse()).
    ranked = sorted(score_by_user.items(), key=lambda pair: pair[1])[::-1]

    return [user_id for user_id, _score in ranked[:k]]

In [None]:
def recommend_item(steamid, matrix, items=5, k=3):
    """Recommend games a user has not rated, based on the most similar users.

    Parameters
    ----------
    steamid : index label of the target user's row in ``matrix``.
    matrix : user x game DataFrame (rows are users, columns are appids); a value
        of 0 in the target user's row is treated as "not rated / unseen".
    items : maximum number of games to return (default 5).
    k : number of similar users whose ratings are averaged (default 3).

    Returns
    -------
    DataFrame
        The rows of the notebook-level ``game_data`` table for the recommended
        appids. Rows keep ``game_data``'s own order, not the ranking order.

    Raises
    ------
    KeyError
        If ``steamid`` is not a label in ``matrix.index``.
    """
    # Use the matrix that was passed in; the earlier version silently used the
    # global `rating_matrix` here, so any other matrix argument was ignored.
    similar_user_ids = find_similar_users(steamid, matrix, k=k)

    # Average rating per game across the similar users.
    neighbour_mean = matrix[matrix.index.isin(similar_user_ids)].mean(axis=0)

    # Keep only the games the target user has no rating for (value 0).
    user_ratings = matrix.loc[steamid]
    games_unseen = user_ratings[user_ratings == 0].index

    # Rank the unseen games by the neighbours' mean rating and take the top n.
    candidates = neighbour_mean[neighbour_mean.index.isin(games_unseen)]
    top_appids = [int(appid) for appid in
                  candidates.sort_values(ascending=False).head(items).index]

    # Look the games up by their `appid` column. The positional row index of
    # game_data is not the appid, so matching on the index returned the wrong
    # (or no) games. Fall back to the index only if game_data has been
    # re-indexed by appid and no longer has that column.
    if 'appid' in game_data.columns:
        selected = game_data['appid'].isin(top_appids)
    else:
        selected = game_data.index.isin(top_appids)

    return game_data[selected]

In [None]:
# Instantiate a Surprise Reader declaring that ratings lie on a 0-to-1 scale,
# then build a Surprise Dataset from the steamid, appid and rating columns
# (passed in that order) of big_rating_df.
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(big_rating_df[['steamid','appid','rating']], reader)

In [None]:
# Train SVD on 75% of the known ratings and hold out the remaining 25% for testing.
# Both the train/test split and SVD's initialisation are random; pinning the seed
# makes the reported error reproducible after Restart Kernel -> Run All.
SEED = 42
trainset, testset = train_test_split(data, test_size=.25, random_state=SEED)
svd = SVD(random_state=SEED)
svd.fit(trainset)
# Predict a rating for every (user, item) pair in the held-out set.
predictions = svd.test(testset)

In [None]:
# Check the accuracy of the held-out predictions (from svd.test above) using
# Root Mean Square Error; lower is better.
accuracy.rmse(predictions)

In [None]:
# Run 5-fold cross-validation of the SVD model over the full dataset, reporting
# RMSE and MAE per fold (verbose=True prints the results table).
# NOTE(review): no seed is passed here, so the fold assignment presumably varies
# between runs — confirm, and pass a seeded splitter as `cv` if reproducibility matters.
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)