In [116]:
import pandas as pd
import math
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import surprise

from surprise import SVD, KNNBaseline, KNNBasic, KNNWithMeans
from surprise.model_selection import cross_validate


In [117]:
df = pd.read_csv("steamWithIds-200k.csv")
df

Unnamed: 0.1,Unnamed: 0,playerId,gameTitle,playerGameStatus,time,zero,gameId
0,0,151603712,The Elder Scrolls V: Skyrim,purchase,1.0,0,72850.0
1,1,151603712,The Elder Scrolls V: Skyrim,play,273.0,0,72850.0
2,2,151603712,Fallout 4,purchase,1.0,0,377160.0
3,3,151603712,Fallout 4,play,87.0,0,377160.0
4,6,151603712,Fallout: New Vegas,purchase,1.0,0,22380.0
...,...,...,...,...,...,...,...
142309,199995,128470551,Titan Souls,play,1.5,0,297130.0
142310,199996,128470551,Grand Theft Auto Vice City,purchase,1.0,0,12110.0
142311,199997,128470551,Grand Theft Auto Vice City,play,1.5,0,12110.0
142312,199998,128470551,RUSH,purchase,1.0,0,38720.0


In [118]:
df_purchase = df.loc[(df['playerGameStatus']=='purchase')]
#df_purchase.to_csv("steamPurchased-200k.csv")

In [119]:
play = df.loc[(df['playerGameStatus']=='play')]
#play.to_csv("steamPlayed-200k.csv")

In [120]:
play

Unnamed: 0.1,Unnamed: 0,playerId,gameTitle,playerGameStatus,time,zero,gameId
1,1,151603712,The Elder Scrolls V: Skyrim,play,273.0,0,72850.0
3,3,151603712,Fallout 4,play,87.0,0,377160.0
5,7,151603712,Fallout: New Vegas,play,12.1,0,22380.0
7,9,151603712,Left 4 Dead 2,play,8.9,0,550.0
9,11,151603712,HuniePop,play,8.5,0,339800.0
...,...,...,...,...,...,...,...
142305,199987,128470551,Rogue Legacy,play,2.6,0,241600.0
142307,199993,128470551,Magic Duels,play,2.2,0,316010.0
142309,199995,128470551,Titan Souls,play,1.5,0,297130.0
142311,199997,128470551,Grand Theft Auto Vice City,play,1.5,0,12110.0


In [84]:
# suppose how much a user likes a game is poroportional to the time played
# this is what we are trying to predict and base the recommendation off
x = play.drop(columns='time')
y = play["time"].values

In [31]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=101)


In [32]:
x_train["time"] = y_train
x_train


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train["time"] = y_train


Unnamed: 0.1,Unnamed: 0,playerId,gameTitle,playerGameStatus,zero,gameId,time
49707,69915,127220350,Portal 2,play,0,620.0,44.0
72647,101991,178038549,PAYDAY 2,play,0,218620.0,17.2
121555,170801,25539892,PAYDAY 2,play,0,218620.0,4.6
36729,51639,33282871,Primal Carnage,play,0,215470.0,13.9
28649,40201,150137344,Counter-Strike Global Offensive,play,0,730.0,153.0
...,...,...,...,...,...,...,...
14894,20947,234108053,Counter-Strike Global Offensive,play,0,730.0,65.0
20715,29100,11403772,Serious Sam 3 BFE,play,0,41070.0,8.5
46070,64920,251410133,Dota 2,play,0,570.0,241.0
46539,65546,54826284,Bully Scholarship Edition,play,0,12200.0,2.3


In [92]:
# baseline estimation (naive)
# Each user will spend the averge amount of time playing the game (doesnt look at users of games)
# Group each game and average the time spent playing.
baseline_y_pred = pd.DataFrame(x_train.groupby('gameId')['time'].mean())

# ground truth that the model will be tested against
gt_df = pd.DataFrame({'gameId': x_test["gameId"], 'Actual time': y_test})
gt_df

Unnamed: 0,gameId,Actual time
14409,232890.0,2.8
79564,228280.0,3898.0
2229,346900.0,5.4
5610,10180.0,40.0
30183,570.0,18.9
...,...,...
60671,220440.0,22.0
134146,377160.0,34.0
58378,570.0,4.1
14444,273110.0,0.5


In [93]:
y_pred_and_y_true = pd.merge(baseline_y_pred, gt_df, on='gameId')
baseline_y_pred_vs_y_true = y_pred_and_y_true.rename(columns={"time": "Predicted rating"})

baseline_y_pred_vs_y_true

Unnamed: 0,gameId,Predicted rating,Actual time
0,10.0,199.959453,675.0
1,10.0,199.959453,3.7
2,10.0,199.959453,11.6
3,10.0,199.959453,0.2
4,10.0,199.959453,0.2
...,...,...,...
15854,997530.0,8.900000,1.0
15855,997530.0,8.900000,49.0
15856,997530.0,8.900000,3.0
15857,997530.0,8.900000,5.2


In [96]:
# calculating RMSE for the baseline model
print("RMSE baseline model: ", math.sqrt(mean_squared_error(baseline_y_pred_vs_y_true["Predicted rating"], 
                                                       baseline_y_pred_vs_y_true["Actual time"])))

RMSE baseline model:  257.5689297823295


In [97]:
x_train.head()

Unnamed: 0.1,Unnamed: 0,playerId,gameTitle,playerGameStatus,zero,gameId,time
49707,69915,127220350,Portal 2,play,0,620.0,44.0
72647,101991,178038549,PAYDAY 2,play,0,218620.0,17.2
121555,170801,25539892,PAYDAY 2,play,0,218620.0,4.6
36729,51639,33282871,Primal Carnage,play,0,215470.0,13.9
28649,40201,150137344,Counter-Strike Global Offensive,play,0,730.0,153.0


In [98]:
game_info = pd.read_csv("gameInfo/steam.csv")
#game_info.columns
game_info = pd.concat([game_info, game_info['steamspy_tags'].str.split(';', expand=True)], axis=1).drop(columns=['platforms', 'required_age', 'categories', 
                                                                                                     'genres', 'steamspy_tags', 'achievements', 'positive_ratings',
                                                                                                    'negative_ratings', 'average_playtime', 'median_playtime', 'owners',
                                                                                                    'release_date','english', 'price'])
game_info

Unnamed: 0,appid,name,developer,publisher,0,1,2
0,10,Counter-Strike,Valve,Valve,Action,FPS,Multiplayer
1,20,Team Fortress Classic,Valve,Valve,Action,FPS,Multiplayer
2,30,Day of Defeat,Valve,Valve,FPS,World War II,Multiplayer
3,40,Deathmatch Classic,Valve,Valve,Action,FPS,Multiplayer
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,FPS,Action,Sci-fi
...,...,...,...,...,...,...,...
27070,1065230,Room of Pandora,SHEN JIAWEI,SHEN JIAWEI,Adventure,Indie,Casual
27071,1065570,Cyber Gun,Semyon Maximov,BekkerDev Studio,Action,Indie,Adventure
27072,1065650,Super Star Blast,EntwicklerX,EntwicklerX,Action,Indie,Casual
27073,1066700,New Yankee 7: Deer Hunters,Yustas Game Studio,Alawar Entertainment,Indie,Casual,Adventure


In [54]:
#Colaborative

In [105]:

#had to drop duplicates because some player has the same game twice
x_train.head()
x_train = x_train.drop_duplicates(subset=['playerId', 'gameId'])
#x_train
x_train.shape

(37675, 7)

In [107]:
user_matrix = x_train.pivot(index='playerId', columns='gameId', values='time')


user_matrix = user_matrix.sub(user_matrix.mean(axis=1), axis=0)
user_matrix = user_matrix.fillna(0.0)


In [108]:
user_dist_matrix = 1 - user_matrix.T.corr()
user_dist_matrix

playerId,5250,76767,86540,144736,181212,229911,298950,547685,554278,561758,...,309181805,309188905,309213952,309255941,309262440,309265377,309404240,309434439,309554670,309903146
playerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5250,0.000000,0.949017,1.0,,1.000000,1.000000,1.007897,1.000000,1.000000,1.142174,...,,1.000000,,,,,1.0,,,
76767,0.949017,0.000000,1.0,,0.518061,0.920894,0.987098,1.206133,0.996449,1.029075,...,,1.000000,,,,,1.0,,,
86540,1.000000,1.000000,0.0,,1.000000,0.968287,1.000775,1.000000,0.995343,0.983401,...,,1.000000,,,,,1.0,,,
144736,,,,,,,,,,,...,,,,,,,,,,
181212,1.000000,0.518061,1.0,,0.000000,0.917929,1.000000,1.269436,1.000000,0.912123,...,,1.000000,,,,,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309265377,,,,,,,,,,,...,,,,,,,,,,
309404240,1.000000,1.000000,1.0,,1.000000,1.000000,1.020427,1.000000,1.000000,1.000000,...,,1.573733,,,,,0.0,,,
309434439,,,,,,,,,,,...,,,,,,,,,,
309554670,,,,,,,,,,,...,,,,,,,,,,


In [47]:
import pandas as pd
from surprise import Dataset
from surprise import Reader

In [2]:
# This is the same data that was plotted for similarity earlier
# with one new user "E" who has rated only movie 1
ratings_dict = {
    "item": [1, 2, 1, 2, 1, 2, 1, 2, 1],
    "user": ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D', 'E'],
    "rating": [1, 2, 2, 4, 2.5, 4, 4.5, 5, 3],
}

df = pd.DataFrame(ratings_dict)
reader = Reader(rating_scale=(1, 5))

# Loads Pandas dataframe
data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)
# Loads the builtin Movielens-100k data
movielens = Dataset.load_builtin('ml-100k')

Dataset ml-100k could not be found. Do you want to download it? [Y/n] n
Ok then, I'm out!


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [3]:
from surprise import KNNWithMeans

# To use item-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
algo = KNNWithMeans(sim_options=sim_options)

In [4]:
data

<surprise.dataset.DatasetAutoFolds at 0x2a22e5d5130>

In [5]:
trainingSet = data.build_full_trainset()

algo.fit(trainingSet)


prediction = algo.predict('E', 2)
prediction.est

Computing the cosine similarity matrix...
Done computing similarity matrix.


4.15

In [9]:
from surprise.model_selection import GridSearchCV

sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [False, True],
}

param_grid = {"sim_options": sim_options}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)


print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [16]:
#############FAAAAAAAKKKKK
# suppose how much a user likes a game is poroportional to the time played
# this is what we are trying to predict and base the recommendation off
x = play.drop(columns='time')
y = play["time"].values

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=101)


In [18]:
x_train["time"] = y_train
x_train


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train["time"] = y_train


Unnamed: 0.1,Unnamed: 0,playerId,gameTitle,playerGameStatus,zero,gameId,time
49707,69915,127220350,Portal 2,play,0,620.0,44.0
72647,101991,178038549,PAYDAY 2,play,0,218620.0,17.2
121555,170801,25539892,PAYDAY 2,play,0,218620.0,4.6
36729,51639,33282871,Primal Carnage,play,0,215470.0,13.9
28649,40201,150137344,Counter-Strike Global Offensive,play,0,730.0,153.0
...,...,...,...,...,...,...,...
14894,20947,234108053,Counter-Strike Global Offensive,play,0,730.0,65.0
20715,29100,11403772,Serious Sam 3 BFE,play,0,41070.0,8.5
46070,64920,251410133,Dota 2,play,0,570.0,241.0
46539,65546,54826284,Bully Scholarship Edition,play,0,12200.0,2.3


In [21]:
reader = Reader(rating_scale=(0, 9))

data = Dataset.load_from_df(x_train[['playerId', 'gameId', 'time']], reader)

In [24]:
from surprise import SVD, KNNBaseline, KNNBasic, KNNWithMeans
from surprise.model_selection import cross_validate




benchmark = []
# Iterate over all algorithms
for algorithm in [SVD(), KNNBaseline(), KNNBasic(), KNNWithMeans()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,249.853238,1.233929,0.077992
KNNWithMeans,249.893791,1.436149,1.487847
KNNBasic,250.10526,1.362494,1.474279
KNNBaseline,250.194664,1.425324,1.629925


In [109]:
play.drop_duplicates(inplace=True)
#swapping columns
play=play[['playerId','gameId','time']] 
play.columns = ['playerId','gameId','time']

train,test = train_test_split(play, test_size=0.3 )
play.head()

Unnamed: 0,playerId,gameId,time
1,151603712,72850.0,273.0
3,151603712,377160.0,87.0
5,151603712,22380.0,12.1
7,151603712,550.0,8.9
9,151603712,339800.0,8.5


In [114]:
# baseline estimation (naive)
# Each user will spend the averge amount of time playing the game (doesnt look at users of games)
# Group each game and average the time spent playing.
baseline_y_pred = pd.DataFrame(train.groupby('gameId')['time'].mean())

# ground truth that the model will be tested against
#gt_df = pd.DataFrame({'gameId': test["gameId"], 'Actual time': test})
#gt_df
test

Unnamed: 0,playerId,gameId,time
80969,248549548,200510.0,3.0
76585,206346171,236110.0,25.0
85479,62990992,35140.0,15.2
95927,129117376,218620.0,104.0
133585,74764496,271590.0,82.0
...,...,...,...
3375,130280718,304930.0,7.1
136833,16231773,24240.0,26.0
135575,142793906,268420.0,2.0
105939,241862248,320300.0,35.0


In [112]:
y_pred_and_y_true = pd.merge(baseline_y_pred, test, on='gameId')
baseline_y_pred_vs_y_true = y_pred_and_y_true.rename(columns={"time": "Predicted rating"})

baseline_y_pred_vs_y_true

Unnamed: 0,gameId,time_x,playerId,time_y
0,10.0,214.280155,203800963,4.3
1,10.0,214.280155,32498610,104.0
2,10.0,214.280155,9065641,0.4
3,10.0,214.280155,119949344,1.3
4,10.0,214.280155,58977564,0.2
...,...,...,...,...
15866,997530.0,12.500000,135400225,0.7
15867,997530.0,12.500000,117531196,5.2
15868,997530.0,12.500000,134322141,5.8
15869,997530.0,12.500000,58905429,0.9


In [115]:
# calculating RMSE for the baseline model
print("RMSE baseline model: ", math.sqrt(mean_squared_error(baseline_y_pred_vs_y_true["time_x"], 
                                                       baseline_y_pred_vs_y_true["time_y"])))

RMSE baseline model:  232.28659456885035


In [100]:
# when importing from a DF, you only need to specify the scale of the ratings.
reader = surprise.Reader(rating_scale=(1,5)) 
#into surprise:
data = surprise.Dataset.load_from_df(train,reader)
holdout = surprise.Dataset.load_from_df(test,reader)
data

<surprise.dataset.DatasetAutoFolds at 0x2a22f49fac0>

In [101]:
testset

[(72842694, 258200.0, 1.4),
 (94521606, 550.0, 9.1),
 (167738438, 440.0, 0.9),
 (46252394, 214950.0, 58.0),
 (120113754, 230410.0, 520.0),
 (256819019, 338180.0, 24.0),
 (167815968, 339800.0, 5.6),
 (174002551, 1520.0, 3.6),
 (293578749, 301520.0, 0.4),
 (57603447, 200510.0, 0.6),
 (24366790, 240.0, 3.0),
 (31178210, 10.0, 59.0),
 (38049880, 208200.0, 6.8),
 (167443514, 570.0, 0.2),
 (113546110, 320300.0, 1.1),
 (154014392, 240320.0, 3.8),
 (181900480, 204360.0, 1.4),
 (157275758, 248610.0, 4.3),
 (189734107, 22380.0, 7.3),
 (273539240, 570.0, 0.5),
 (124725852, 312150.0, 0.4),
 (187463594, 12120.0, 0.5),
 (172518437, 296470.0, 1.5),
 (112647787, 570.0, 2609.0),
 (254368479, 246620.0, 0.6),
 (105788297, 72850.0, 3.6),
 (115167911, 440.0, 4.5),
 (235692659, 620.0, 13.1),
 (212466515, 4000.0, 2.4),
 (127628725, 231430.0, 0.2),
 (35701646, 22000.0, 8.3),
 (215540943, 49520.0, 1.3),
 (127344335, 730.0, 184.0),
 (188250158, 362620.0, 9.8),
 (189191657, 214420.0, 4.3),
 (242523654, 259080.0,

In [102]:
kSplit = surprise.model_selection.split.KFold(n_splits=10, shuffle=True) # split data into folds. 

In [103]:
sim_options = sim_options = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
               }
collabKNN = surprise.KNNBasic(k=40,sim_options=sim_options) #try removing sim_options. You'll find memory errors. 
rmseKNN = []
rmseSVD = []
for trainset, testset in kSplit.split(data): #iterate through the folds.
    collabKNN.fit(trainset)
    predictionsKNN = collabKNN.test(testset)
    rmseKNN.append(surprise.accuracy.rmse(predictionsKNN,verbose=True))#get root means squared error
    

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 164.2555
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 343.0539
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 237.2175
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 266.9798
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 267.8344
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 245.9864
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 227.7293
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 297.8055
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 262.6306
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 199.8033


In [104]:
sum(rmseKNN)/len(rmseKNN)

251.329622092835

In [105]:
funkSVD = surprise.prediction_algorithms.matrix_factorization.SVD(n_factors=30,n_epochs=10,biased=True)

In [106]:
min_error = 1
for trainset, testset in kSplit.split(data): #iterate through the folds.
    funkSVD.fit(trainset)
    predictionsSVD = funkSVD.test(testset)
    rmseSVD.append(surprise.accuracy.rmse(predictionsSVD,verbose=True))#get root means squared error

RMSE: 239.4394
RMSE: 245.0305
RMSE: 245.9786
RMSE: 306.9683
RMSE: 310.5372
RMSE: 215.9132
RMSE: 230.0264
RMSE: 319.6019
RMSE: 217.1668
RMSE: 190.3489


In [107]:
sum(rmseSVD)/len(rmseSVD)

252.10110742051424