## Content-based recommendation system
### Based on the 'Description' column

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the games table from the local Excel workbook into games_df.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# previously saved index -- confirm whether it should be read with index_col=0.
# NOTE(review): GameWeight/AvgRating show values such as 43206.0 and 761428.0
# while other rows show 3.44; presumably a decimal separator was lost for some
# rows -- verify against the workbook.
games_df = pd.read_excel("games.xlsx")
# Preview the first rows of the frame.
games_df.head()

Unnamed: 0,BGGId,Name,Description,YearPublished,GameWeight,AvgRating,BayesAvgRating,StdDev,MinPlayers,MaxPlayers,...,Rank:partygames,Rank:childrensgames,Cat:Thematic,Cat:Strategy,Cat:War,Cat:Family,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens
0,1,Die Macher,die macher game seven sequential political rac...,1986,43206.0,761428.0,710363.0,157979.0,3,5,...,21926,21926,0,1,0,0,0,0,0,0
1,2,Dragonmaster,dragonmaster tricktaking card game base old ga...,1981,1963.0,664537.0,578447.0,14544.0,3,4,...,21926,21926,0,1,0,0,0,0,0,0
2,3,Samurai,samurai set medieval japan player compete gain...,1998,24859.0,745601.0,723994.0,118227.0,2,4,...,21926,21926,0,1,0,0,0,0,0,0
3,4,Tal der KÃ¶nige,triangular box luxurious large block tal der k...,1992,26667.0,660006.0,567954.0,123129.0,2,4,...,21926,21926,0,0,0,0,0,0,0,0
4,5,Acquire,acquire player strategically invest business t...,1964,25031.0,733861.0,714189.0,133583.0,2,6,...,21926,21926,0,1,0,0,0,0,0,0


In [25]:
# Total number of rows (game records) in the unmodified games_df
print(f"Number of unique games: {len(games_df)}")

Number of unique games: 21925


In [22]:
# Number of distinct values in the 'Name' column (a missing name, if any, counts as one value)
print(f"Number of unique games names: {games_df['Name'].nunique(dropna=False)}")

Number of unique games names: 21521


In [4]:
# Some game names appear more than once in games_df under different BGGId values;
# the name 'Samurai' is used here to illustrate this.
games_df.loc[games_df['Name'] == 'Samurai']

Unnamed: 0,BGGId,Name,Description,YearPublished,GameWeight,AvgRating,BayesAvgRating,StdDev,MinPlayers,MaxPlayers,...,Rank:partygames,Rank:childrensgames,Cat:Thematic,Cat:Strategy,Cat:War,Cat:Family,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens
2,3,Samurai,samurai set medieval japan player compete gain...,1998,24859.0,745601.0,723994.0,118227.0,2,4,...,21926,21926,0,1,0,0,0,0,0,0
2125,3061,Samurai,gmt websitesamurai fifth entry awardwinne grea...,1996,3.44,708258.0,575259.0,149492.0,2,2,...,21926,21926,0,0,1,0,0,0,0,0
5240,11320,Samurai,abstract strategy game japanese warrior chrome...,1975,2.5,61087.0,551543.0,141772.0,2,2,...,21926,21926,0,0,0,0,0,1,0,0


In [5]:
# Count the distinct BGGId values; a count equal to the number of rows means
# BGGId uniquely identifies each game in games_df
games_df['BGGId'].nunique(dropna=False)

21925

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace missing descriptions with an empty string so every entry is text
games_df["Description"]=games_df['Description'].fillna('')

# Word-level TF-IDF over unigrams, bigrams and trigrams. English stop words are
# removed, accents are stripped, and min_df=3 drops rare terms.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1,3),
    stop_words='english',
)

# Learn the vocabulary from the descriptions and build the sparse
# document-term matrix (one row per game)
tfv_matrix = tfv.fit_transform(games_df["Description"])

In [7]:
# Show the summary repr of the TF-IDF matrix produced by fit_transform
tfv_matrix

<21925x154740 sparse matrix of type '<class 'numpy.float64'>'
	with 2339369 stored elements in Compressed Sparse Row format>

In [8]:
# Dimensions of the TF-IDF matrix: one row per description, one column per vocabulary term
tfv_matrix.shape

(21925, 154740)

In [9]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Pairwise sigmoid (hyperbolic-tangent) kernel between all game descriptions,
# i.e. tanh(gamma * <x, y> + coef0) with scikit-learn's default gamma and coef0.
# This is a similarity kernel, not a logistic-regression model.
# With Y omitted the kernel is computed between tfv_matrix and itself; the
# result is a dense square array with one row and one column per game.
sig = sigmoid_kernel(tfv_matrix)

In [10]:
sig[0] # similarity scores between the game at row 0 and every game in games_df

array([0.76159687, 0.76159423, 0.76159418, ..., 0.76159418, 0.76159424,
       0.76159417])

In [11]:
# Lookup table from BGGId to the game's row index in games_df
# (give_rec uses it to pick the matching row of the similarity matrix)
indices = pd.Series(data=games_df.index, index=games_df['BGGId'])

In [12]:
# Inspect the BGGId -> row index lookup
indices

BGGId
1             0
2             1
3             2
4             3
5             4
          ...  
347146    21920
347521    21921
348955    21922
349131    21923
349161    21924
Length: 21925, dtype: int64

In [13]:
def give_rec(BGGId, sig=sig, n=10):
    """Return the names of the games most similar to the given game.

    Parameters
    ----------
    BGGId : int
        BGGId of the query game; looked up in the module-level ``indices``
        Series to find the game's row in ``sig``.
    sig : array-like, optional
        Square pairwise-similarity matrix whose rows and columns follow the
        row order of ``games_df``. Defaults to the ``sig`` matrix that exists
        when this function is defined.
    n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        ``Name`` values of up to ``n`` most similar games, most similar first,
        indexed by their row index in ``games_df``. The query game itself is
        never part of the result.

    Raises
    ------
    KeyError
        If ``BGGId`` is not present in ``indices``.
    """
    # Row of the query game in the similarity matrix
    idx = indices[BGGId]

    # Similarity of the query game to every game
    scores = np.asarray(sig[idx])

    # Rank all games from most to least similar. The stable sort keeps tied
    # scores in ascending row order, matching sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind="stable")

    # Remove the query game explicitly instead of dropping whatever sorted
    # first: when another game ties with it (e.g. identical descriptions), the
    # query game is not guaranteed to be in first position.
    ranked = ranked[ranked != idx][:n]

    # Names of the top-n most similar games
    return games_df['Name'].iloc[ranked]
    

In [14]:
# Test the content-based recommender with a BGGId as input.
# BGGId 3 is one of the three different games in games_df that are all named 'Samurai'.
give_rec(3)

# Returns the names of the 10 games most similar to the given one, indexed by their row index in games_df

18568                                            Rice Dice
12917                            Spirits of the Rice Paddy
19093                                      Seasons of Rice
9710                                    Seven Card Samurai
8323                                                 Bluff
11647                                         Seven Swords
11178                                                  Edo
10779                                        Seven Sisters
59       Samurai: Game of Politics and Warfare in Feuda...
13796                                            Takamatsu
Name: Name, dtype: object

In [15]:
# List every game named 'Samurai' together with its BGGId
games_df.loc[games_df['Name'] == 'Samurai']

Unnamed: 0,BGGId,Name,Description,YearPublished,GameWeight,AvgRating,BayesAvgRating,StdDev,MinPlayers,MaxPlayers,...,Rank:partygames,Rank:childrensgames,Cat:Thematic,Cat:Strategy,Cat:War,Cat:Family,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens
2,3,Samurai,samurai set medieval japan player compete gain...,1998,24859.0,745601.0,723994.0,118227.0,2,4,...,21926,21926,0,1,0,0,0,0,0,0
2125,3061,Samurai,gmt websitesamurai fifth entry awardwinne grea...,1996,3.44,708258.0,575259.0,149492.0,2,2,...,21926,21926,0,0,1,0,0,0,0,0
5240,11320,Samurai,abstract strategy game japanese warrior chrome...,1975,2.5,61087.0,551543.0,141772.0,2,2,...,21926,21926,0,0,0,0,0,1,0,0


In [16]:
# Recommendations for the 'Samurai' entry with BGGId 3061
give_rec(3061)

16820                                          Tenkatoitsu
5232                                      Devil's Horsemen
6923                                                   RAN
4195     Risorgimento 1859: the Second Italian War of I...
2264     Prussia's Glory: The Battles of Frederick the ...
7335                  Sekigahara: The Unification of Japan
1375                                            Cataphract
16424                                        Azuchi Castle
5763                         Carthage: The First Punic War
7681      A Most Dangerous Time: Japan in Chaos, 1570-1584
Name: Name, dtype: object

In [17]:
# Recommendations for the 'Samurai' entry with BGGId 11320
give_rec(11320)

13796                                      Takamatsu
18901                 Swordcrafters Expanded Edition
18339                                  Swordcrafters
17539                                Bushido Breaker
11178                                            Edo
11647                                   Seven Swords
9710                              Seven Card Samurai
17107    Test of Honour: The Samurai Miniatures Game
10699                                 Sake & Samurai
14090                                       HexAgony
Name: Name, dtype: object