In [1]:
import pandas as pd
import numpy as np
import ast

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

import matplotlib.cm as cm
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn import preprocessing
from sklearn.cluster import DBSCAN, OPTICS

%matplotlib inline

Load the steam games dataset

In [2]:
# The steam games dataset is not a properly formatted json file, so the file is
# split into lines and every line is parsed on its own with the ast module.
with open('/content/drive/MyDrive/VideoGameRecFiles/steam_games.json','r',encoding='utf8') as f:
  data = f.read().strip().split("\n")

steam_games = []
for entry in data:
  game = ast.literal_eval(entry)
  # Flatten the genres, tags, and specs lists into comma-separated strings;
  # a field that is missing from the record is stored as None.
  for field in ('genres', 'tags', 'specs'):
    game[field] = ','.join(game[field]) if field in game else None
  steam_games.append(game)

# Create a dataframe and release the intermediate list of records
steam_games_df = pd.DataFrame(steam_games)
del steam_games

Create a dictionary that maps each genre, tag, and spec to the set of games that have it

In [3]:
# Create a dictionary mapping every attribute (genre, tag, or spec) to the set of
# ids of the games that have that attribute.
category_sets = {}

# Walk the four columns in parallel instead of materialising each row with .iloc
# four times per game.
game_rows = zip(
  steam_games_df['id'],
  steam_games_df['genres'],
  steam_games_df['tags'],
  steam_games_df['specs'],
)
for game_id, game_genre, game_tags, game_specs in game_rows:
  # Genres first, then tags, then specs: dictionary keys are inserted in this order.
  for joined in (game_genre, game_tags, game_specs):
    # None (field missing) and '' (empty list) contribute no attributes.
    if not joined:
      continue
    for attribute in joined.split(","):
      category_sets.setdefault(attribute, set()).add(game_id)

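Compute a matrix of conditional probabilities over all pairs of attributes. The cell in row *i*, column *j* contains the conditional probability that a game has attribute *j* given that it has attribute *i*, i.e. the number of games having both attributes divided by the number of games having attribute *i*.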

In [4]:
# Attribute names in dictionary insertion order; this order defines the rows and
# columns of the probability matrix.
game_categories = list(category_sets)

In [5]:
# Row attribute r, column attribute c: share of the games having r that also have c.
probability_matrix = [
  [
    len(category_sets[row_attr] & category_sets[col_attr]) / len(category_sets[row_attr])
    for col_attr in game_categories
  ]
  for row_attr in game_categories
]

In [6]:
# Convert the nested list of probabilities into a 2-D numpy array
probability_matrix = np.array(probability_matrix)

In [7]:
# Display the probability matrix (the diagonal is 1: every attribute co-occurs with itself)
probability_matrix

array([[1.00000000e+00, 2.24245218e-01, 6.27871245e-01, ...,
        1.53645233e-04, 1.53645233e-04, 7.68226166e-05],
       [2.98863520e-01, 1.00000000e+00, 6.68168322e-01, ...,
        2.04771168e-04, 1.02385584e-04, 0.00000000e+00],
       [4.63269471e-01, 3.69912708e-01, 1.00000000e+00, ...,
        1.13365832e-04, 1.13365832e-04, 0.00000000e+00],
       ...,
       [5.00000000e-01, 5.00000000e-01, 5.00000000e-01, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 5.00000000e-01, 1.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00]])

In [8]:
# One row and one column per attribute
probability_matrix.shape

(381, 381)

In [9]:
# Fit PCA on the probability matrix, keeping 31 principal components, and project
# every attribute row onto those components.
pca = PCA(n_components=31).fit(probability_matrix)
feature_set = pca.transform(probability_matrix)

# Clustering Game Attributes

We will use the K-Means clustering algorithm to cluster the game attributes.

In [10]:
# Fit K-Means (24 clusters, fixed random_state) on the PCA features, then look up
# the cluster assigned to every attribute.
km = KMeans(n_clusters=24, random_state=25).fit(feature_set)
labels = km.predict(feature_set)
# Series indexed by attribute name whose values are cluster labels
attribute_assignments = pd.Series(data=labels, index=game_categories)

Let's look at the groupings for each attribute

# Building Game Features Table

In [11]:
# Create a list containing the attributes that belong in each cluster
# List of lists
# [[Attributes for cluster 0], [attributes for cluster 1], [attributes for cluster 2]]
# The cluster count is read from the fitted model so it cannot drift from the
# n_clusters value used when K-Means was created.
attribute_clusters = [
  attribute_assignments[attribute_assignments == cluster_id].index.tolist()
  for cluster_id in range(km.n_clusters)
]

In [12]:
# One count column per attribute cluster, plus one trailing column that counts
# games having no attributes at all.
feature_columns = ['clust_'+str(i) for i in range(len(attribute_clusters) + 1)]

In [13]:
# Lookup from attribute name to cluster index, built once so every attribute is
# resolved with a single dictionary access instead of a scan over all cluster lists.
# Every attribute has exactly one cluster label, so it appears in exactly one list.
attribute_to_cluster = {
  attr: cluster_idx
  for cluster_idx, cluster_attrs in enumerate(attribute_clusters)
  for attr in cluster_attrs
}
# Games without any genre, tag, or spec are counted in the column that follows
# the per-cluster columns.
no_attribute_column = 'clust_' + str(len(attribute_clusters))

game_features = []
# Walk the columns in parallel rather than calling .iloc several times per row.
game_rows = zip(
  steam_games_df['id'],
  steam_games_df['genres'],
  steam_games_df['tags'],
  steam_games_df['specs'],
)
for game_id, game_genre, game_tags, game_specs in game_rows:
  data_row = {k:0 for k in feature_columns}
  data_row['id'] = game_id

  # Collect the distinct attributes of the game across genres, tags, and specs
  attributes = set()
  for joined in (game_genre, game_tags, game_specs):
    if joined:
      attributes.update(joined.split(','))

  if attributes:
    # Count how many of the game's attributes fall into each cluster
    for attr in attributes:
      cluster_idx = attribute_to_cluster.get(attr)
      if cluster_idx is not None:
        data_row['clust_'+str(cluster_idx)] += 1
  else:
    data_row[no_attribute_column] += 1
  game_features.append(data_row)
game_features_df = pd.DataFrame(game_features)

In [14]:
# Use the game id as the row index.
# NOTE(review): this reassigns the same name, so re-running only this cell would
# presumably fail because 'id' is no longer a column after the first run.
game_features_df = game_features_df.set_index('id')

In [15]:
# Preview the per-game cluster counts
game_features_df.head(15)

Unnamed: 0_level_0,clust_0,clust_1,clust_2,clust_3,clust_4,clust_5,clust_6,clust_7,clust_8,clust_9,...,clust_15,clust_16,clust_17,clust_18,clust_19,clust_20,clust_21,clust_22,clust_23,clust_24
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
761140,0,5,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
643980,0,9,3,0,0,1,0,1,4,1,...,0,0,0,0,4,0,0,0,0,0
670290,0,6,0,0,0,0,0,0,0,0,...,1,0,0,1,3,0,0,0,0,0
767400,0,3,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
773570,0,4,0,0,0,0,4,0,0,0,...,1,0,0,1,0,0,0,0,0,0
772540,0,5,0,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
774276,0,12,0,0,0,0,0,0,0,0,...,0,0,0,1,2,0,0,0,0,0
774277,0,12,0,0,0,0,0,0,0,0,...,0,0,0,1,2,0,0,0,0,0
774278,0,12,0,0,0,0,0,0,0,0,...,0,0,0,1,2,0,0,0,0,0
768800,0,5,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0


In [16]:
# Nested lookup {column name: {game id: count}} used when building the user features
game_feat_dict = game_features_df.to_dict(orient='dict')

# Building User Features Table

In [17]:
# Read user items data file and build features table
# The dataset is not a properly formatted json file, so the file is split into
# lines and every line is parsed on its own with the ast module.
with open('/content/drive/MyDrive/VideoGameRecFiles/australian_users_items.json','r',encoding='utf8') as f:
  data = f.read().strip().split("\n")

user_features = []
# We need to keep track of all the games each user played so we can avoid recommending games that they have already played.
user_play_list = {}
for user_data in data:
  record = ast.literal_eval(user_data)
  # Ids of every item listed for this user, in file order
  play_list = [item['item_id'] for item in record['items']]

  # A user's features are the column-wise sums of the features of their games;
  # items that are missing from the game features table contribute nothing.
  data_row = dict.fromkeys(feature_columns, 0)
  for col in feature_columns:
    col_lookup = game_feat_dict[col]
    for item_id in play_list:
      if item_id in col_lookup:
        data_row[col] += col_lookup[item_id]
  data_row['user_id'] = record['user_id']

  user_play_list[record['user_id']] = play_list
  user_features.append(data_row)

user_features_df = pd.DataFrame(user_features)

In [18]:
# Drop repeated records of the same user, keeping the first occurrence.
# Duplicates are identified by user_id: comparing feature values alone (which is
# what drop_duplicates does once user_id has become the index) would also discard
# distinct users who happen to share a feature vector, and those users could then
# no longer be looked up by id.
user_features_df = user_features_df.drop_duplicates(subset="user_id").set_index("user_id")

In [19]:
# Preview the per-user cluster totals
user_features_df.head(15)

Unnamed: 0_level_0,clust_0,clust_1,clust_2,clust_3,clust_4,clust_5,clust_6,clust_7,clust_8,clust_9,...,clust_15,clust_16,clust_17,clust_18,clust_19,clust_20,clust_21,clust_22,clust_23,clust_24
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
76561197970982479,149,1740,677,659,5,7,8,146,70,92,...,168,14,6,15,265,52,51,114,0,0
js41637,392,5159,1653,1331,11,15,23,306,133,217,...,677,27,11,73,468,144,164,297,0,0
evcentric,166,922,308,400,7,2,3,80,24,26,...,66,1,0,4,145,21,39,65,0,0
Riot-Punch,151,1751,798,704,10,7,8,125,43,59,...,188,14,3,21,216,59,40,132,0,0
doctr,220,3343,1311,1512,7,5,10,272,121,156,...,253,27,5,38,560,148,92,258,0,0
MinxIsBetterThanPotatoes,137,2564,822,1113,8,9,17,215,90,108,...,140,22,3,34,564,122,71,135,0,0
NitemarePK,117,1988,886,586,8,4,11,111,34,98,...,241,2,3,26,250,85,101,172,0,0
themanwich,156,1696,780,553,2,15,10,131,43,63,...,221,8,4,19,183,74,99,108,0,0
maplemage,245,4327,1981,1161,13,14,14,262,150,237,...,452,6,7,61,537,277,204,294,0,0
Wackky,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Recommender Functions

In [20]:
def cosine_score(user1, user2):
  '''
  Return the cosine similarity between two feature rows.

  Both arguments must expose `.values` (e.g. a pandas Series); each one is reshaped
  into a single-row 2-D array before being compared.
  '''
  first_row = user1.values.reshape(1, -1)
  second_row = user2.values.reshape(1, -1)
  # cosine_similarity returns a 1x1 matrix here; unwrap the single value
  return cosine_similarity(first_row, second_row)[0][0]

In [21]:
def recommend_games(user_id, n=10):
  '''
  Given a user id, recommend games to that user. By default 10 games are recommended.

  All other users are ranked by the cosine similarity between their feature row and
  the target user's feature row. Starting from the most similar user, games from
  their play lists that the target user has not played are collected until n games
  have been found or there are no more users to draw from.

  Parameters:
    user_id: id of the user to recommend for; must be present in user_features_df
             and user_play_list.
    n: maximum number of games to return.

  Returns:
    A list of at most n game ids, without repeats, ordered by the similarity rank of
    the user they were taken from. The list is shorter than n when the other users
    do not have enough unplayed games between them.

  Raises:
    KeyError: if user_id is missing from user_features_df or user_play_list.
  '''

  # Get user features
  user = user_features_df.loc[user_id]
  # Games already played by the user; kept as a set for fast membership tests
  already_played = set(user_play_list[user_id])
  other_users = user_features_df[user_features_df.index != user_id]

  # Nobody to compare against means nothing to recommend
  if other_users.empty:
    return []

  scores = other_users.apply(lambda user2: cosine_score(user, user2), axis=1).sort_values(ascending=False)

  recommended_games = []
  # Ids that must not be added: already played, or already recommended
  excluded = set(already_played)
  # Iterating over the ranking (rather than indexing with a counter) stops cleanly
  # when the users run out before n games have been collected.
  for neighbour_id in scores.index:
    if len(recommended_games) >= n:
      break
    for gid in user_play_list[neighbour_id]:
      if gid not in excluded:
        excluded.add(gid)
        recommended_games.append(gid)

  return recommended_games[:n]

In [22]:
# Example: request the default number of recommendations (10) for one user
recommended_games = recommend_games('evcentric', n=10)

In [23]:
# Show the recommended game ids
recommended_games

['6850',
 '6860',
 '6900',
 '2400',
 '2420',
 '2430',
 '4000',
 '12900',
 '1500',
 '1530']

In [24]:
# Print the name of every recommended game; ids that have no row in the steam
# games table are skipped silently.
for game in recommended_games:
  matching_names = steam_games_df.loc[steam_games_df['id'] == game, 'app_name']
  if len(matching_names) == 0:
    continue
  game_name = matching_names.iloc[0]
  print(game_name)

Hitman 2: Silent Assassin
Hitman: Blood Money
Hitman: Codename 47
The Ship: Murder Party
The Ship: Single Player
Garry's Mod
AudioSurf
Darwinia
Multiwinia
