Recommendation System 1: Given a game, recommend another one

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import string
import ast

In [2]:
# building game-genre matrix
filepath = "./Datasets/"
game_genres_matrix = pd.read_csv(filepath + "game_genres_matrix.csv", sep = "|", encoding = "utf-8")
games_final_df = game_genres_matrix[['title', 'id']]
game_genres_matrix.drop(game_genres_matrix.columns[[0, 1]], axis = 1, inplace = True)
game_genres_matrix

Unnamed: 0,id,Action,Casual,Indie,Simulation,Strategy,Free to Play,RPG,Sports,Adventure,...,Animation &amp; Modeling,Video Production,Utilities,Web Publishing,Education,Software Training,Design &amp; Illustration,Audio Production,Photo Editing,Accounting
0,761140.0,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,643980.0,0,0,1,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,670290.0,0,1,1,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,767400.0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,773570.0,1,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32126,773640.0,0,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32127,733530.0,0,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32128,610660.0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32129,658870.0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Principal Component Analysis

In [3]:
# reducing game-genre dimmensionality by performing a PCA analysis with 12 components by 
features = list(game_genres_matrix.columns)[1:]
x = game_genres_matrix.loc[:, features].values
y = game_genres_matrix.loc[:, ['id']].values
# normalizing game's vector components
x = StandardScaler().fit_transform(x)
# applying PCA
pca = PCA(n_components = 12)
principal_components = pca.fit_transform(x)
# building game-PCA matrix
principal_df = pd.DataFrame(data = principal_components, columns = ['pca1', 'pca2', 'pca3', 'pca4', 'pca5', 'pca6', 'pca7', 'pca8', 'pca9', 'pca10', 'pca11', 'pca12'])
final_df = pd.concat([principal_df, game_genres_matrix[['id']]], axis = 1)
final_df

Unnamed: 0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,pca10,pca11,pca12,id
0,-0.709392,0.861016,-0.289405,-0.368894,0.993608,-1.148036,-0.186731,0.548794,-0.101315,0.042968,-0.752811,-1.458808,761140.0
1,-0.597041,-1.157868,3.127492,-1.248051,1.004799,-0.665776,0.753309,0.416259,0.106125,-0.021621,0.773024,-1.099443,643980.0
2,-0.655788,3.331155,2.428161,1.498731,-0.174745,-1.691839,-1.269805,1.501117,-0.023735,0.058382,0.992524,-1.080388,670290.0
3,-0.352468,-0.524772,-0.988471,0.732801,-0.514534,0.170392,-1.634551,-0.039791,0.136705,-0.074684,0.325662,0.292981,767400.0
4,-0.596499,1.588300,-0.210192,2.191854,-0.912503,-1.018975,0.183526,0.874906,0.098595,-0.002525,0.882237,-1.741965,773570.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32126,-0.502857,1.487245,-0.198983,-1.184632,1.634251,-1.426137,-0.224162,0.780006,-0.093858,0.053549,-0.207384,-0.850624,773640.0
32127,-0.408047,0.284383,-0.341127,-1.019122,1.269708,-1.113669,-0.010261,0.678217,-0.009597,0.026616,0.271432,-1.689245,733530.0
32128,-0.274632,2.750431,-0.108327,1.539679,-0.786261,-0.268056,1.892957,-0.002853,0.313769,-0.112418,1.339436,0.503792,610660.0
32129,-0.250128,0.337523,-0.839069,-0.158924,0.456412,-0.644113,-0.999934,0.552623,-0.036008,0.030760,0.285389,-0.836443,658870.0


In [4]:
# reducing from 22 dimensions (game's genres) to 12 (pca) it is still possible to get an explained variance of 75%
print(pca.explained_variance_ratio_)
print(sum(list(pca.explained_variance_ratio_)))

[0.13445116 0.07783383 0.06959081 0.06288534 0.06150512 0.05541542
 0.05213189 0.05055316 0.04598327 0.04501015 0.04445592 0.03994551]
0.7397615949728245


In [5]:
# game's genre vs game's pca transformation matrix
genre_to_pca_matrix = pd.DataFrame(pca.components_, columns = features)
genre_to_pca_matrix

Unnamed: 0,Action,Casual,Indie,Simulation,Strategy,Free to Play,RPG,Sports,Adventure,Racing,...,Animation &amp; Modeling,Video Production,Utilities,Web Publishing,Education,Software Training,Design &amp; Illustration,Audio Production,Photo Editing,Accounting
0,-0.100589,-0.076999,-0.156141,-0.039882,-0.065731,-0.043317,-0.071575,-0.029945,-0.093286,-0.018941,...,0.37225,0.275087,0.415124,0.271227,0.333991,0.353734,0.398611,0.175603,0.215447,0.044504
1,-0.304992,0.215255,-0.280728,0.505984,-0.022119,-0.021843,-0.333763,0.401945,-0.356902,0.318693,...,-0.044803,-0.029525,-0.037717,0.00042,-0.043007,-0.053796,-0.012118,-0.009668,-0.017344,-0.002285
2,-0.044038,-0.130206,-0.132316,0.059793,0.20726,0.60937,0.293982,0.154032,-0.144858,0.056711,...,0.024599,0.022868,0.022059,-0.015114,0.034977,0.037951,-0.008376,0.008325,0.003487,0.000465
3,0.397288,0.027313,0.16786,-0.069622,-0.358043,0.07298,-0.173317,0.328716,0.18429,0.36361,...,0.040973,0.213428,0.115891,-0.291644,0.116998,0.180903,-0.258178,0.21831,-0.037928,0.023075
4,-0.312012,0.171274,0.09535,0.153345,0.338521,-0.067738,0.144984,-0.155953,-0.062186,-0.231967,...,-0.014359,0.319485,0.142007,-0.4125,0.065119,0.171288,-0.383444,0.36171,-0.050862,0.045871
5,0.135443,-0.361336,-0.427146,-0.13144,-0.195445,-0.020845,-0.102582,-0.139827,-0.142888,-0.021138,...,-0.214223,0.259132,0.079689,-0.05684,-0.365,-0.249937,-0.06801,0.360781,0.145585,0.082363
6,0.01823,-0.522427,0.099512,-0.089977,0.411935,-0.304459,0.306857,0.245413,-0.211154,0.368319,...,-0.011512,0.008443,0.014981,0.009344,-0.039636,-0.026197,0.020009,0.01305,0.031993,0.001577
7,-0.112607,0.30844,0.336568,0.042817,0.052276,0.074268,0.048618,0.118526,0.140046,0.004907,...,-0.116585,0.170496,0.148687,0.147495,-0.393868,-0.293285,0.237267,0.170814,0.477875,0.073248
8,-0.003632,-0.029607,-0.038724,-0.035444,0.010993,-0.011531,0.036316,0.030421,0.04582,0.06961,...,0.003334,-0.190398,0.117313,-0.415913,0.094387,0.074521,-0.060753,-0.389995,0.571544,0.44409
9,-0.005153,0.019322,0.031028,0.011329,-0.001725,0.005926,-0.010944,-0.004862,-0.014597,-0.024052,...,-0.015985,-0.003321,-0.13262,0.243535,0.013186,0.001148,0.004088,0.146225,-0.32014,0.883973


Hierarchical Clustering of game's features

In [6]:
# performing an hierarchical clustering of games by theirs pca coordinates affinity
hierarchical_cluster = AgglomerativeClustering(n_clusters = 10, affinity = "euclidean", linkage = "ward")
labels = hierarchical_cluster.fit_predict(final_df)
print(labels)



[2 8 8 ... 4 8 2]


In [7]:
labels[1]

8

In [8]:
# how many games are in each cluster
np.unique(labels, return_counts = True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([5769, 4423, 3743, 4255, 3951, 1868, 3360,    5, 2852, 1905]))

In [13]:
games_final_df['cluster'] = labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  games_final_df['cluster'] = labels


Setting up Recommender System

In [14]:
aust_user_item_df = pd.read_csv(filepath + "aust_user_item_df.csv", sep = "|", encoding = "utf-8")
aust_user_item_df = aust_user_item_df[['items_item_id', 'items_playtime_forever']]
aust_user_item_df

Unnamed: 0,items_item_id,items_playtime_forever
0,10,6
1,20,0
2,30,7
3,40,0
4,50,0
...,...,...
802338,418070,0
802339,368500,291
802340,434570,46
802341,442080,118


In [15]:
def game_total_played_all(x):
  total_hours = x.items_playtime_forever.sum()
  return pd.Series([total_hours], index=['total_hours'])

df1 = aust_user_item_df.groupby('items_item_id').apply(game_total_played_all)
df1 = df1.reset_index()
df1.rename(columns = {'items_item_id': 'id'}, inplace = True)

games_final_df = games_final_df.merge(df1, on='id', how='left')
games_final_df


Unnamed: 0,title,id,cluster,total_hours
0,Lost Summoner Kitty,761140.0,2,
1,Ironbound,643980.0,8,
2,Real Pool 3D - Poolians,670290.0,8,
3,弹炸人2222,767400.0,2,
4,Log Challenge,773570.0,2,
...,...,...,...,...
32126,Colony On Mars,773640.0,2,
32127,LOGistICAL: South Africa,733530.0,2,
32128,Russian Roads,610660.0,4,
32129,EXIT 2 - Directions,658870.0,8,


In [16]:
# now, all NaNs in total_hours are because those games haven't had been bought by any user therefore, they don't show up in dataframe and produces NaNs here
# since they haven't had been bought, the total hours played is "0" --> impute NaNs with "0"
games_final_df = games_final_df.fillna(0)

In [18]:
pd.DataFrame(games_final_df).to_csv(filepath + "games_cluster.csv", sep = "|", encoding = "utf-8", index = False)
games_final_df

Unnamed: 0,title,id,cluster,total_hours
0,Lost Summoner Kitty,761140.0,2,0.0
1,Ironbound,643980.0,8,0.0
2,Real Pool 3D - Poolians,670290.0,8,0.0
3,弹炸人2222,767400.0,2,0.0
4,Log Challenge,773570.0,2,0.0
...,...,...,...,...
32126,Colony On Mars,773640.0,2,0.0
32127,LOGistICAL: South Africa,733530.0,2,0.0
32128,Russian Roads,610660.0,4,0.0
32129,EXIT 2 - Directions,658870.0,8,0.0


In [28]:
games_final_df[games_final_df['id'] == 40.0]

Unnamed: 0,title,id,cluster,total_hours
32104,Deathmatch Classic,40.0,5,4543.0


Recommender System

In [24]:
# so now lets recomend another game given a game
gameid_to_recommend = 10                # <<<<---------------- here goes the id of the input game

# lets see at which cluster it belongs
game_cluster_to_recommend = games_final_df[games_final_df['id'] == gameid_to_recommend]['cluster']
game_cluster_to_recommend

# now lets filter those games that belongs to that cluster
condition = games_final_df['cluster'] == int(game_cluster_to_recommend)
games_in_cluster = games_final_df.loc[condition,:]
games_rec_top10 = games_in_cluster.sort_values('total_hours', ascending = False).head(10)
games_rec_top10

Unnamed: 0,title,id,cluster,total_hours
1044,Counter-Strike: Global Offensive,730.0,5,135159765.0
58,Garry's Mod,4000.0,5,86919468.0
31254,Terraria,105600.0,5,29246990.0
30005,Arma 3,107410.0,5,16810219.0
493,Sid Meier's Civilization® V,8930.0,5,15331061.0
31525,Left 4 Dead 2,550.0,5,14691385.0
30743,Borderlands 2,49520.0,5,13877901.0
32008,Counter-Strike: Source,240.0,5,12326501.0
387,Mount &amp; Blade: Warband,48700.0,5,8454309.0
31444,Arma 2: Operation Arrowhead,33930.0,5,7395224.0


In [20]:
temp_total_hours = games_rec_top10['total_hours'].sum()
games_rec_top10['prob'] = games_rec_top10['total_hours'] / temp_total_hours
games_rec_top10 = games_rec_top10[['title', 'cluster', 'prob']]
games_rec_top10

Unnamed: 0,title,cluster,prob
1044,Counter-Strike: Global Offensive,5,0.39728
58,Garry's Mod,5,0.255486
31254,Terraria,5,0.085967
30005,Arma 3,5,0.049411
493,Sid Meier's Civilization® V,5,0.045063
31525,Left 4 Dead 2,5,0.043183
30743,Borderlands 2,5,0.040792
32008,Counter-Strike: Source,5,0.036232
387,Mount &amp; Blade: Warband,5,0.02485
31444,Arma 2: Operation Arrowhead,5,0.021737


In [21]:
import random
recommended_game = random.choices(list(games_rec_top10['title']), weights=games_rec_top10['prob'], k=1)
recommended_game[0]

'Counter-Strike: Global Offensive'