# Video Game Recommender Model
This step takes the data output from exploration to group and recommend video games by "like" similarity and fine-tune based on text review sentiment scores.

In [1]:
%config Completer.use_jedi = False

In [2]:
import pandas as pd
import numpy as np
import os.path

from ast import literal_eval
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [52]:
path_steam = "C:/users/ggibs/steam_game_recommender"
path_interim = os.path.join(path_steam, "data/interim/")
path_external = os.path.join(path_steam, "data/external/")
path_finished = os.path.join(path_steam, "data/finished/")

In [4]:
# list of game titles
app_list = pd.read_csv(os.path.join(path_external, "app_list.csv"), index_col="appid")
app_list.head()

Unnamed: 0_level_0,name
appid,Unnamed: 1_level_1
10,Counter-Strike
20,Team Fortress Classic
30,Day of Defeat
40,Deathmatch Classic
50,Half-Life: Opposing Force


In [5]:
game_model = pd.read_csv(os.path.join(path_interim, "game_model.csv"))
game_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 779826 entries, 0 to 779825
Data columns (total 23 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   appid                           779826 non-null  int64  
 1   recommendationid                779826 non-null  int64  
 2   rescore                         779826 non-null  float64
 3   timestamp_created               779826 non-null  int64  
 4   voted_up                        779826 non-null  bool   
 5   author.steamid                  779826 non-null  int64  
 6   author.num_games_owned          779826 non-null  int64  
 7   author.num_reviews              779826 non-null  int64  
 8   author.playtime_forever         779739 non-null  float64
 9   author.playtime_last_two_weeks  779739 non-null  float64
 10  author.playtime_at_review       779570 non-null  float64
 11  genre                           779374 non-null  object 
 12  Action          

In [6]:
game_model.rename(columns = {'rescore':'sentiment'}, inplace = True) 

game_model[game_model.duplicated(subset=['author.steamid', 'appid'])==True]

Unnamed: 0,appid,recommendationid,sentiment,timestamp_created,voted_up,author.steamid,author.num_games_owned,author.num_reviews,author.playtime_forever,author.playtime_last_two_weeks,...,Indie,Adventure,RPG,Strategy,Simulation,Casual,Massively,Sports,Racing,Other


No duplicate votes of same game, same user 

In [7]:
# Add numeric for voted_up True / False, 1 / -1, so emtpy locations can be zero
game_model['voted_up_num'] = np.where(game_model['voted_up'] == True, 1, -1)
game_model['voted_up_num'].value_counts()

 1    693628
-1     86198
Name: voted_up_num, dtype: int64

11% of votes are down votes

### Start item-item collaborative filtering

In [8]:
# Start game-neighbors, similar liked-based game recommendations only, drop user and genre, reserve disklikes for exclusion
likes_only = game_model[['appid', 'author.steamid','voted_up']][game_model['voted_up']==True]
likes_only.voted_up = 1
likes_only = likes_only.merge(app_list, on='appid')
likes_only.drop('appid', 1, inplace=True)

dislikes_only = game_model[['appid', 'author.steamid','voted_up']][game_model['voted_up']==False]
dislikes_only.voted_up = -1
dislikes_only = dislikes_only.merge(app_list, on='appid')
dislikes_only.drop('appid', 1, inplace=True)

In [9]:
# Create pivot
likes_only_pivot = likes_only.pivot_table(index = 'author.steamid', columns = 'name', values = 'voted_up').fillna(0)
likes_only_pivot.reset_index(inplace=True)

# Remove index name
likes_only_pivot = likes_only_pivot.rename_axis(None, axis=1).reset_index(drop=True)


dislikes_only_pivot = dislikes_only.pivot_table(index = 'author.steamid', columns = 'name', values = 'voted_up').fillna(0)
dislikes_only_pivot.reset_index(inplace=True)
dislikes_only_pivot = dislikes_only_pivot.rename_axis(None, axis=1).reset_index(drop=True)

In [10]:
# Items only, drop users
likes_only_pivot_items = likes_only_pivot.drop('author.steamid', 1)
likes_only_pivot_items.iloc[:5, :10]

Unnamed: 0,12 is Better Than 6,7 Days to Die,A Plague Tale: Innocence,ARMA: Cold War Assault,ASTRONEER,ATLAS,Ace of Spades: Battle Builder,Age of Empires II (2013),Age of Empires II: Definitive Edition,Age of Empires III: Complete Collection
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
dislikes_only_pivot_items = dislikes_only_pivot.drop('author.steamid', 1)

In [16]:
def magnitude(likes_items):
# Normalize vectors with magnitude so that high-volume gamers do not overwhelm low-volume
    magnitude = np.sqrt(np.square(likes_items).sum(axis=1))
    likes_items = likes_items.divide(magnitude, axis='index')
    return likes_items

In [17]:
likes_only_pivot_items = magnitude(likes_only_pivot_items)

In [13]:
def calculate_similarity(data_items):
    """Calculate the column-wise cosine similarity for a sparse
    matrix. Return a new dataframe matrix with similarities.
    """
    data_sparse = sparse.csr_matrix(data_items)
    similarities = cosine_similarity(data_sparse.transpose())
    sims = pd.DataFrame(data=similarities, index= data_items.columns, columns= data_items.columns)
    return sims

In [14]:
# Build the similarity matrix
gameXgame = calculate_similarity(likes_only_pivot_items)
gameXgame.iloc[:7, :7]

Unnamed: 0,12 is Better Than 6,7 Days to Die,A Plague Tale: Innocence,ARMA: Cold War Assault,ASTRONEER,ATLAS,Ace of Spades: Battle Builder
12 is Better Than 6,1.0,0.001533,0.006094,0.005461,0.0,0.000687,0.0
7 Days to Die,0.001533,1.0,0.007915,0.007277,0.001113,0.045306,0.003646
A Plague Tale: Innocence,0.006094,0.007915,1.0,0.0,0.0,0.004776,0.000707
ARMA: Cold War Assault,0.005461,0.007277,0.0,1.0,0.0,0.0,0.0
ASTRONEER,0.0,0.001113,0.0,0.0,1.0,0.004892,0.0
ATLAS,0.000687,0.045306,0.004776,0.0,0.004892,1.0,0.0
Ace of Spades: Battle Builder,0.0,0.003646,0.000707,0.0,0.0,0.0,1.0


In [15]:
# Identify closest neighbors of video games
game_neighbors = pd.DataFrame(index=gameXgame.columns, columns=range(1,11))
for i in range(0, len(gameXgame.columns)):
    game_neighbors.iloc[i,:10] = gameXgame.iloc[0:,i].sort_values(ascending=False)[:10].index

In [18]:
game_neighbors.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
12 is Better Than 6,12 is Better Than 6,Reus,Prey,Beholder,Guns of Icarus Online,Call of Duty: Ghosts,Mark of the Ninja,The Darkness II,LIMBO,Dark Messiah of Might & Magic
7 Days to Die,7 Days to Die,Terraria,Raft,Space Engineers,ATLAS,Garry's Mod,Counter-Strike: Global Offensive,DayZ,They Are Billions,Left 4 Dead 2
A Plague Tale: Innocence,A Plague Tale: Innocence,Shadow of the Tomb Raider: Definitive Edition,GRIS,STAR WARS Jedi: Fallen Order,Assassin's Creed Odyssey,Resident Evil 2,Ori and the Will of the Wisps,Life is Strange: Before the Storm,Sekiro: Shadows Die Twice,Quantum Break
ARMA: Cold War Assault,ARMA: Cold War Assault,Arma 2: Operation Arrowhead,Day of Defeat,Half-Life 2: Deathmatch,Counter-Strike,Mount & Blade: Warband,POSTAL 2,Counter-Strike: Source,Zombie Army Trilogy,Half-Life
ASTRONEER,ASTRONEER,Terraria,Among Us,The Forest,Oxygen Not Included,The Incredible Adventures of Van Helsing,SUPERHOT,Risk of Rain 2,Counter-Strike: Global Offensive,The Bureau: XCOM Declassified


In [19]:
# Some game series with many editions match mostly to themselves
game_neighbors.loc['Resident Evil':'Resident Evil 6', 2:7]

Unnamed: 0,2,3,4,5,6,7
Resident Evil,Resident Evil 2,Resident Evil 4,Resident Evil 3,Resident Evil Revelations,Resident Evil 5,Resident Evil Revelations 2
Resident Evil 2,Resident Evil 3,Resident Evil,Devil May Cry 5,Resident Evil 4,Sekiro: Shadows Die Twice,DOOM Eternal
Resident Evil 3,Resident Evil 2,Resident Evil,Resident Evil 4,DOOM Eternal,Devil May Cry 5,Resident Evil 5
Resident Evil 4,Resident Evil 5,Resident Evil,Resident Evil 6,Resident Evil 2,Resident Evil Revelations,Resident Evil 3
Resident Evil 5,Resident Evil 6,Resident Evil 4,Resident Evil Revelations,Resident Evil Revelations 2,Resident Evil,Resident Evil 2
Resident Evil 6,Resident Evil 5,Resident Evil Revelations,Resident Evil 4,Resident Evil Revelations 2,Resident Evil 2,Resident Evil


In [19]:
# Tomb Raider did match to Laura Croft
game_neighbors.loc['Tomb Raider II':'Tomb Raider II', 2:5]

Unnamed: 0,2,3,4,5
Tomb Raider II,Tomb Raider: Legend,Tomb Raider: Underworld,Tomb Raider: Anniversary,Lara Croft and the Guardian of Light


In [20]:
# Two games with Chinese matched
game_neighbors.loc['古剑奇谭三(Gujian3)':'古剑奇谭三(Gujian3)', 2:5]

Unnamed: 0,2,3,4,5
古剑奇谭三(Gujian3),Chinese Parents,Conqueror's Blade,Assassin's Creed Odyssey,FINAL FANTASY XV WINDOWS EDITION


### Start user-item collaborative filtering

In [103]:
def game_recommender(likes = likes_only_pivot, dislikes = dislikes_only_pivot, 
                     game_sentiment = None, game_genre = None, player_genre_count = None):
    
    """Takes separate user-game binary tables of "likes" and "dislikes" and game similarity neighbors 
        to make recommendations and scores, options to include sentiment and genre        
    """
    
    # Retain the recommended games and scores
    recommend = pd.DataFrame()
    recommend_scores = pd.DataFrame()

    record=0
    
    # Just items, no users
    likes_items = likes.drop('author.steamid', 1)
    dislikes_items = dislikes.drop('author.steamid', 1)
    
    # Magnitude to balance high and low frequency gamers
    likes_items = magnitude(likes_items)

    for user in likes['author.steamid']:
        record += 1
        print("iteration {}".format(record), end='\r')  # display record counter
        user_index = likes[likes['author.steamid'] == user].index.tolist()[0]
        # Not every game has a dislike vote - error handling
        try:
            user_index_dis = dislikes[dislikes['author.steamid'] == user].index.tolist()[0]
        except:
            user_index_dis = None

        # Get the games the user has played
        known_user_likes = likes_items.iloc[user_index]
        known_user_likes = known_user_likes[known_user_likes >0].index.values
        if user_index_dis is not None:
            known_user_dislikes = dislikes_items.iloc[user_index_dis]
            known_user_dislikes = known_user_dislikes[known_user_dislikes <0].index.values

        # Construct the neighborhood from the most similar items to the ones user has already liked
        most_similar_to_likes = game_neighbors.loc[known_user_likes]
        similar_list = most_similar_to_likes.values.tolist()
        similar_list = list(set([item for sublist in similar_list for item in sublist]))
        neighborhood = gameXgame[similar_list].loc[similar_list]

        # A user vector containing only the neighborhood items and the known user likes
        user_vector = likes_items.iloc[user_index].loc[similar_list]
        # Calculate the score
        score = neighborhood.dot(user_vector).div(neighborhood.sum(axis=1))
        # Drop the known likes and dislikes
        score = score.drop(known_user_likes)
        if user_index_dis is not None:
            score = score.drop(known_user_dislikes, errors='ignore')

# Model enhancement
        if game_sentiment is not None and game_genre is not None and player_genre_count is not None:
        # Add sentiment scores to similarity scores for new total
            score_df = score.to_frame(name='score')
            score_df.index.name = 'name'
            score_df = score_df.join(game_sentiment)
            score_df['total'] = score_df.score + score_df.sentiment

        # Add genre column
            score_df = score_df.join(game_genre.genre_list)

        # Filter recommended games to only those in player's favorite genre
            selection = player_genre_count.genre_max.loc[user]
            score_df = score_df[pd.DataFrame(score_df.genre_list.tolist(), index=score_df.index).isin(selection).any(1).values] 
            score = score_df['total']            
# End model enhancement
            
        topscore = score.nlargest(10)
        topscore.name = user

        # Save player ID and games
        game_names = pd.DataFrame()
        game_names[user] = topscore.index
        game_names = game_names.transpose()
        recommend = recommend.append(game_names)

        # Save player ID and game similarity scores
        game_scores = pd.DataFrame()
        game_scores[user] = topscore.values
        game_scores = game_scores.transpose()
        recommend_scores = recommend_scores.append(game_scores)
    return recommend, recommend_scores

In [49]:
game_recommend, game_recommend_scores = game_recommender()

iteration 179850

In [50]:
game_recommend.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
76561197960267984,Aliens vs. Predator,Resident Evil 2,Tomb Raider,Amnesia: The Dark Descent,BioShock Infinite,A Plague Tale: Innocence,DOOM Eternal,Halo: The Master Chief Collection,Portal 2,Batman: Arkham Knight
76561197960268765,DOOM Eternal,Resident Evil 2,STAR WARS Jedi: Fallen Order,CODE VEIN,FINAL FANTASY XV WINDOWS EDITION,Risk of Rain 2,Destiny 2,DRAGON BALL FighterZ,MORDHAU,Remnant: From the Ashes
76561197960269155,Total War: WARHAMMER II,Assassin's Creed III Remastered,STAR WARS Jedi: Fallen Order,Total War: ROME II - Emperor Edition,Shadow of the Tomb Raider: Definitive Edition,Far Cry New Dawn,Sekiro: Shadows Die Twice,Planet Zoo,MORDHAU,A Plague Tale: Innocence
76561197960269294,Terraria,Slay the Spire,DRAGON BALL XENOVERSE 2,Risk of Rain,Halo: The Master Chief Collection,Remnant: From the Ashes,Skullgirls,DOOM Eternal,Sekiro: Shadows Die Twice,Devil May Cry 5
76561197960269645,Among Us,Counter-Strike: Global Offensive,Portal 2,Left 4 Dead 2,Bloons TD 6,Ravenfield,The Binding of Isaac: Rebirth,Fallout: New Vegas,Half-Life 2,Fall Guys: Ultimate Knockout


In [51]:
game_recommend_scores.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
76561197960267984,0.030982,0.022507,0.019024,0.018436,0.018104,0.016352,0.016254,0.015975,0.015852,0.015732
76561197960268765,0.07683,0.067248,0.064066,0.062007,0.053155,0.052918,0.048502,0.047494,0.046875,0.046416
76561197960269155,0.081284,0.052797,0.052104,0.046214,0.04484,0.038948,0.037996,0.037104,0.035048,0.033774
76561197960269294,0.047962,0.045824,0.045516,0.043749,0.039652,0.037697,0.034396,0.033525,0.032844,0.031398
76561197960269645,0.078445,0.078318,0.070763,0.070283,0.070129,0.059886,0.057821,0.055349,0.055053,0.054524


In [54]:
game_recommend.to_csv(os.path.join(path_finished, "game_recommend.csv"))
game_recommend_scores.to_csv(os.path.join(path_finished, "game_recommend_scores.csv"))
game_neighbors.to_csv(os.path.join(path_finished, "game_neighbors.csv"))

In [16]:
# game_recommend = pd.read_csv(os.path.join(path_finished, "game_recommend.csv"), index_col=[0])
# game_recommend_scores = pd.read_csv(os.path.join(path_finished, "game_recommend_scores.csv"), index_col=[0])

### Validation

In [17]:
# Manually calculate a recall, hide 4, re-recommend, what percentage is included in recommendations?

In [55]:
# Sum games liked
likes_only_pivot_sum = likes_only.pivot_table(index = 'author.steamid', columns = 'name', values = 'voted_up', aggfunc='sum', margins=True).fillna(0)

In [56]:
# Gamers with at least 10 likes
n=10
likes_sample = likes_only_pivot_sum.index[likes_only_pivot_sum['All']>=n]
len(likes_sample)

5761

In [57]:
# Drop last 4 likes
n=4
likes_sample_data = game_model[game_model['author.steamid'].isin(likes_sample)][game_model['voted_up']==True]
likes_sample_sorted = likes_sample_data.groupby(['author.steamid']).apply(lambda x: x.sort_values(['timestamp_created'])).reset_index(drop=True)
likes_sample_sorted = likes_sample_sorted.reset_index(drop=True)
likes_removed = likes_sample_sorted.drop(likes_sample_sorted.groupby(['author.steamid']).tail(n).index, axis=0)

  likes_sample_data = game_model[game_model['author.steamid'].isin(likes_sample)][game_model['voted_up']==True]


In [226]:
# Make a similar source pivot for the recommender as before
def source_pivot(likes_rem):
    likes_removed_only = likes_rem[['appid', 'author.steamid','voted_up']][likes_rem['voted_up']==True]
    likes_removed_only.voted_up = 1
    likes_removed_only = likes_removed_only.merge(app_list, on='appid')
    likes_removed_only.drop('appid', 1, inplace=True)

    # Create pivot
    likes_rem_piv = likes_removed_only.pivot_table(index = 'author.steamid', columns = 'name', values = 'voted_up').fillna(0)
    likes_rem_piv.reset_index(inplace=True)

    # Remove index name
    likes_rem_piv = likes_rem_piv.rename_axis(None, axis=1).reset_index(drop=True)

    # Make same columns as original (restore missing games)
    for column in likes_only_pivot.columns:
        if column not in likes_rem_piv.columns:
            likes_rem_piv[column] = 0.0
    likes_rem_piv.sort_index(axis=1, inplace=True)
    acol = likes_rem_piv['author.steamid']
    likes_rem_piv.drop(labels=['author.steamid'], axis=1, inplace=True)
    likes_rem_piv.insert(0, 'author.steamid', acol)
    return likes_rem_piv

In [None]:
likes_removed_pivot = source_pivot(likes_removed)

In [60]:
# Run the votes-missing-four through the recommendation
game_removed, game_removed_scores = game_recommender(likes = likes_removed_pivot)

iteration 5760

In [61]:
# Identify the 4 original votes removed
likes_sample_games = likes_sample_sorted.merge(app_list, on='appid')
likes_sample_games = likes_sample_games[['author.steamid', 'name']]
likes_removed_games = likes_removed_only[['author.steamid', 'name']]
likes_merge = likes_sample_games.merge(likes_removed_games, how='left', indicator=True)
likes_merge.sort_values(by = ['author.steamid', 'name'], inplace=True)
likes_diff = likes_merge[likes_merge._merge != 'both']
likes_diff.head(12)

Unnamed: 0,author.steamid,name,_merge
6580,76561197960271994,Dark Messiah of Might & Magic,left_only
6699,76561197960271994,Hitman 2: Silent Assassin,left_only
6909,76561197960271994,Prey,left_only
6771,76561197960271994,Soundpad,left_only
25520,76561197960319772,Deus Ex: Human Revolution - Director's Cut,left_only
25303,76561197960319772,Hitman: Blood Money,left_only
25049,76561197960319772,Mass Effect,left_only
25005,76561197960319772,Sniper: Ghost Warrior,left_only
30853,76561197960396581,Among Us,left_only
28611,76561197960396581,DOOM Eternal,left_only


In [62]:
len(likes_removed_games)

60070

In [63]:
len(likes_sample_games)

83110

In [64]:
len(likes_diff)

23040

In [66]:
# How many of original 4 votes removed appeared in new recommendations
game_removed_stack = game_removed.stack().reset_index()
game_removed_stack.columns=['author.steamid', 'num', 'name']
game_removed_stack.head(20)

Unnamed: 0,author.steamid,num,name
0,76561197960271994,0,STAR WARS Knights of the Old Republic II - The...
1,76561197960271994,1,Slay the Spire
2,76561197960271994,2,Risk of Rain
3,76561197960271994,3,Sid Meier's Civilization V
4,76561197960271994,4,Bloons TD 6
5,76561197960271994,5,Fallout: New Vegas
6,76561197960271994,6,Garry's Mod
7,76561197960271994,7,Borderlands 2
8,76561197960271994,8,Counter-Strike: Global Offensive
9,76561197960271994,9,Space Engineers


In [68]:
# Calculate recall
recall_merge = likes_diff[['author.steamid', 'name']].merge(game_removed_stack, indicator=True)
len(recall_merge[recall_merge._merge == 'both'])

5548

In [69]:
print("Recall rate: ", len(recall_merge)/len(likes_diff))

Recall rate:  0.24079861111111112


The recall rate of our model is 24%

In [None]:
recall_merge.to_csv(os.path.join(path_finished, "recall_merge.csv"))

### Model Enhancement

Can the recall rate improve after including player sentiment scores and game genres into recommendation rankings?

In [70]:
# Identify highest genre game count by player
#player_genre_counts = game_model[game_model['voted_up']==True].groupby(['author.steamid'])[[:, 'Action':'Other']].sum()
player_genre = game_model[['author.steamid', 'Action', 'Indie', 'Adventure', 'RPG', 'Strategy', 'Simulation', 'Casual',
                           'Massively', 'Sports','Racing', 'Other']][game_model['voted_up']==True]
player_genre_count = player_genre.groupby(['author.steamid']).sum()
player_genre_count.head()

Unnamed: 0_level_0,Action,Indie,Adventure,RPG,Strategy,Simulation,Casual,Massively,Sports,Racing,Other
author.steamid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
76561197960267984,2,1,0,0,1,0,0,0,0,0,0
76561197960268765,3,0,1,0,0,0,0,0,0,0,0
76561197960269155,2,0,1,1,1,0,0,0,0,0,0
76561197960269294,3,1,0,1,0,0,0,0,0,0,0
76561197960269645,4,4,1,1,1,1,2,1,0,0,0


In [97]:
# Make column listing players' favorite genre or tie

player_genre_count['max'] = player_genre_count.max(axis=1)

cols = ['Action', 'Indie', 'Adventure', 'RPG', 'Strategy', 'Simulation', 'Casual', 'Massively', 'Sports','Racing', 'Other']
player_genre_count['genre_max']=''
for row in range(len(player_genre_count)):
    print("row {}".format(row), end='\r')  # display row counter
    glist = []
    for col in cols:
        if player_genre_count[col].iloc[row] == player_genre_count['max'].iloc[row]:
            glist.append(col)
    player_genre_count['genre_max'].iloc[row] = glist

row 0row 1row 2row 3row 4row 5row 6row 7row 8row 9row 10row 11row 12row 13row 14row 15row 16row 17row 18row 19row 20row 21row 22row 23row 24row 25row 26row 27row 28row 29row 30row 31row 32row 33row 34row 35row 36row 37row 38row 39row 40row 41row 42row 43row 44row 45row 46row 47row 48row 49row 50row 51row 52row 53row 54row 55row 56row 57row 58row 59row 60row 61row 62row 63row 64row 65row 66row 67row 68row 69row 70row 71row 72row 73row 74row 75row 76row 77row 78row 79row 80row 81row 82row 83row 84row 85row 86row 87row 88row 89row 90row 91row 92row 93row 94row 95row 96row 97row 98row 99row 100row 101row 102row 103row 104row 105row 106row 107row 108row 109row 110row 111row 112row 113row 114row 115row 116row 117row 118row 119row 120row 121row 122row 123row 124row 125row 126row 127row 128row 129row 130row 131row 132row 133row 134row 135row 136row 137row 13

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


row 179849

In [98]:
player_genre_count.head()

Unnamed: 0_level_0,Action,Indie,Adventure,RPG,Strategy,Simulation,Casual,Massively,Sports,Racing,Other,max,genre_max
author.steamid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
76561197960267984,2,1,0,0,1,0,0,0,0,0,0,2,[Action]
76561197960268765,3,0,1,0,0,0,0,0,0,0,0,3,[Action]
76561197960269155,2,0,1,1,1,0,0,0,0,0,0,2,[Action]
76561197960269294,3,1,0,1,0,0,0,0,0,0,0,3,[Action]
76561197960269645,4,4,1,1,1,1,2,1,0,0,0,4,"[Action, Indie]"


In [218]:
player_genre_count.to_csv(os.path.join(path_interim, "player_genre_count.csv"))
# player_genre_count = pd.read_csv(os.path.join(path_interim, "player_genre_count.csv"), index_col = [0])
# player_genre_count.loc[:,'genre_max'] = player_genre_count.loc[:,'genre_max'].apply(lambda x: literal_eval(x))

In [99]:
# Identify each game's genres

game_genre = game_model.groupby(['appid', 'Action', 'Indie', 'Adventure', 'RPG', 'Strategy', 'Simulation', 'Casual', 'Massively', 'Sports', 'Racing', 
                    'Other'])['recommendationid'].count().to_frame()
game_genre = game_genre.join(app_list)
game_genre.reset_index(inplace=True)

cols = ['Action', 'Indie', 'Adventure', 'RPG', 'Strategy', 'Simulation', 'Casual', 'Massively', 'Sports','Racing', 'Other']
game_genre['genre_list']=''
for row in range(len(game_genre)):
    print("row {}".format(row), end='\r')  # display row counter
    glist = []
    for col in cols:
        if game_genre[col].iloc[row] == 1:
            glist.append(col)
    game_genre['genre_list'].iloc[row] = glist

game_genre.set_index('name', inplace=True)

row 0row 1row 2row 3row 4row 5row 6row 7row 8row 9row 10row 11row 12row 13row 14row 15row 16row 17row 18row 19row 20row 21row 22row 23row 24row 25row 26row 27row 28row 29row 30row 31row 32row 33row 34row 35row 36row 37row 38row 39row 40row 41row 42row 43row 44row 45row 46row 47row 48row 49row 50row 51row 52row 53row 54row 55row 56row 57row 58row 59row 60row 61row 62row 63row 64row 65row 66row 67row 68row 69row 70row 71row 72row 73row 74row 75row 76row 77row 78row 79row 80row 81row 82row 83row 84row 85row 86row 87row 88row 89row 90row 91row 92row 93row 94row 95row 96row 97row 98row 99row 100row 101row 102row 103row 104row 105row 106row 107row 108row 109row 110row 111row 112row 113row 114row 115row 116row 117row 118row 119row 120row 121row 122row 123row 124row 125row 126row 127row 128row 129row 130row 131row 132row 133row 134row 135row 136row 137row 13

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


row 390row 391row 392row 393row 394row 395row 396row 397row 398row 399row 400row 401row 402row 403row 404row 405row 406row 407row 408row 409row 410row 411row 412row 413row 414row 415row 416row 417row 418row 419row 420row 421row 422row 423row 424row 425row 426row 427row 428row 429row 430row 431row 432row 433row 434row 435row 436row 437row 438row 439row 440row 441row 442row 443row 444row 445row 446row 447row 448row 449row 450row 451row 452row 453row 454row 455row 456row 457row 458row 459row 460row 461row 462row 463row 464row 465row 466row 467row 468row 469row 470row 471row 472row 473row 474row 475row 476row 477row 478row 479row 480row 481row 482row 483row 484row 485row 486row 487row 488row 489row 490row 491row 492row 493row 494row 495row 496row 497

In [100]:
# Average sentiment scores per game
game_sentiment = game_model.groupby(['appid'])['sentiment'].mean().to_frame()
game_sentiment = game_sentiment.merge(app_list, on='appid')
game_sentiment.set_index('name', inplace=True)
game_sentiment.head()

Unnamed: 0_level_0,sentiment
name,Unnamed: 1_level_1
Counter-Strike,0.310351
Team Fortress Classic,0.347132
Day of Defeat,0.360889
Deathmatch Classic,0.406576
Half-Life: Opposing Force,0.444091


In [104]:
# Re-run recommendations with sentiment and genre additions
game_recommend2, game_recommend2_scores = game_recommender(game_sentiment=game_sentiment, game_genre=game_genre, player_genre_count=player_genre_count)

iteration 179850

In [106]:
game_recommend2.to_csv(os.path.join(path_finished, "game_recommend2.csv"))
game_recommend2_scores.to_csv(os.path.join(path_finished, "game_recommend2_scores.csv"))

In [105]:
game_recommend2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
76561197960267984,A Plague Tale: Innocence,Tomb Raider,BioShock Infinite,Aliens vs. Predator,Resident Evil 2,Portal 2,Batman: Arkham Knight,Risk of Rain 2,Space Engineers,Hunt: Showdown
76561197960268765,Ori and the Will of the Wisps,STAR WARS Jedi: Fallen Order,Bloodstained: Ritual of the Night,Remnant: From the Ashes,Resident Evil 2,CODE VEIN,Resident Evil 3,Risk of Rain 2,DRAGON BALL FighterZ,DmC: Devil May Cry
76561197960269155,A Plague Tale: Innocence,STAR WARS Jedi: Fallen Order,Shadow of the Tomb Raider: Definitive Edition,Total War: WARHAMMER II,Resident Evil 2,BATTLETECH,Mount & Blade: Warband,Sekiro: Shadows Die Twice,Far Cry New Dawn,Assassin's Creed III Remastered
76561197960269294,Grim Dawn,Assassin's Creed Odyssey,Remnant: From the Ashes,Torchlight II,Dishonored,Titan Quest Anniversary Edition,The Incredible Adventures of Van Helsing,Borderlands Game of the Year,DRAGON BALL XENOVERSE 2,Terraria
76561197960269645,Ravenfield,Portal 2,Risk of Rain 2,Left 4 Dead 2,Fallout: New Vegas,Beat Saber,Call of Duty 4: Modern Warfare,Half-Life 2,The Binding of Isaac: Rebirth,Call of Duty: Black Ops


### Revalidate Recall with Genre and Sentiment Scores Addition

In [333]:
# Gamers with 10+ upvotes, remove last 4 favorite genre games and see what percentage reappear in recommendations

In [110]:
game_genre.tail()

Unnamed: 0_level_0,appid,Action,Indie,Adventure,RPG,Strategy,Simulation,Casual,Massively,Sports,Racing,Other,recommendationid,genre_list
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Sea of Thieves,1172620,1,0,1,0,0,0,0,0,0,0,0,2814,"[Action, Adventure]"
Red Dead Redemption 2,1174180,1,0,1,0,0,0,0,0,0,0,0,1555,"[Action, Adventure]"
DEATH STRANDING,1190460,1,0,1,0,0,0,0,0,0,0,0,1977,"[Action, Adventure]"
Command & Conquer Remastered Collection,1213210,0,0,0,0,1,0,0,0,0,0,0,1016,[Strategy]
Titanfall 2,1237970,1,0,0,0,0,0,0,0,0,0,0,752,[Action]


In [220]:
# Drop last 4 likes in player's favorite genres
n=4
fav_genre = player_genre_count['genre_max'].to_frame().copy()
likes_sample_data = game_model[game_model['author.steamid'].isin(likes_sample)][game_model['voted_up']==True]

# add favorite genres
likes_sample_data = likes_sample_data.merge(fav_genre, on='author.steamid')

# add to timestamps of favorite genre games to make them last in series
for row in range(len(likes_sample_data)): 
    print("row {}".format(row), end='\r')  # display row counter
    for g in likes_sample_data['genre_max'].iloc[row]:
        if likes_sample_data[g].iloc[row] == 1:
            likes_sample_data['timestamp_created'].iloc[row] = likes_sample_data['timestamp_created'].iloc[row] + 10000000000
            break

likes_sample_sorted = likes_sample_data.groupby(['author.steamid']).apply(lambda x: x.sort_values(['timestamp_created'])).reset_index(drop=True)
likes_sample_sorted = likes_sample_sorted.reset_index(drop=True)

# return timestamps
likes_sample_sorted.loc[likes_sample_sorted['timestamp_created'] >= 10000000000, 'timestamp_created'] = likes_sample_sorted['timestamp_created'] - 10000000000

# remove latest upvotes for favorite genre games
likes_removed = likes_sample_sorted.drop(likes_sample_sorted.groupby(['author.steamid']).tail(n).index, axis=0)

  likes_sample_data = game_model[game_model['author.steamid'].isin(likes_sample)][game_model['voted_up']==True]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


row 83109

In [None]:
# Make a similar source pivot for the recommender as before
likes_removed_pivot = source_pivot(likes_removed)

In [225]:
# Rerun the votes-missing-4 top genres through the new recommendations
game_removed2, game_removed2_scores = game_recommender(likes = likes_removed_pivot)

iteration 5760

In [227]:
# Identify the 4 original votes removed
likes_sample_games = likes_sample_sorted.merge(app_list, on='appid')
likes_sample_games = likes_sample_games[['author.steamid', 'name']]
likes_removed_games = likes_removed_only[['author.steamid', 'name']]
likes_merge = likes_sample_games.merge(likes_removed_games, how='left', indicator=True)
likes_merge.sort_values(by = ['author.steamid', 'name'], inplace=True)
likes_diff = likes_merge[likes_merge._merge != 'both']
likes_diff.head(12)

Unnamed: 0,author.steamid,name,_merge
6718,76561197960271994,Dark Messiah of Might & Magic,left_only
6837,76561197960271994,Hitman 2: Silent Assassin,left_only
6909,76561197960271994,Prey,left_only
6632,76561197960271994,Red Faction Guerrilla Steam Edition,left_only
25520,76561197960319772,Deus Ex: Human Revolution - Director's Cut,left_only
25303,76561197960319772,Hitman: Blood Money,left_only
25049,76561197960319772,Mass Effect,left_only
25005,76561197960319772,Sniper: Ghost Warrior,left_only
29209,76561197960396581,DOOM Eternal,left_only
29968,76561197960396581,Fall Guys: Ultimate Knockout,left_only


In [228]:
len(likes_sample_games)

83110

In [229]:
len(likes_removed_games)

60070

In [230]:
len(likes_diff)

23040

In [231]:
# How many of original 4 votes removed appeared in new recommendations
game_removed2_stack = game_removed2.stack().reset_index()
game_removed2_stack.columns=['author.steamid', 'num', 'name']
game_removed2_stack.head(20)

Unnamed: 0,author.steamid,num,name
0,76561197960271994,0,STAR WARS Knights of the Old Republic II - The...
1,76561197960271994,1,Slay the Spire
2,76561197960271994,2,Risk of Rain
3,76561197960271994,3,Garry's Mod
4,76561197960271994,4,Bloons TD 6
5,76561197960271994,5,Sid Meier's Civilization V
6,76561197960271994,6,Fallout: New Vegas
7,76561197960271994,7,Counter-Strike: Global Offensive
8,76561197960271994,8,Borderlands 2
9,76561197960271994,9,Among Us


In [232]:
# Calculate recall
recall_merge = likes_diff[['author.steamid', 'name']].merge(game_removed2_stack, indicator=True)
len(recall_merge[recall_merge._merge == 'both'])

6175

In [233]:
print("Recall rate: ", len(recall_merge)/len(likes_diff))

Recall rate:  0.2680121527777778


The recall rate of our enhanced model improved slightly from 24% to 26.8%

In [235]:
recall_merge.to_csv(os.path.join(path_finished, "recall_merge2.csv"))