# Loading the CSV file into a DataFrame

In [1]:
import pandas as pd

In [2]:
# Read the Board Game Geek data and discard the 'Unnamed: 0' column.
df = pd.read_csv('bgg_data.csv')
df.drop(columns = 'Unnamed: 0', inplace = True)

# Preprocessing the data

## Removing NaN values

In [3]:
# Display every row that has a missing value in at least one column.
rows_with_missing = df.isna().any(axis = 'columns')
df[rows_with_missing]

Unnamed: 0,Title,Geek Rating,Link,Min Player No.,Max Player No.,Weight,Designers,Categories,Mechanics,Reimplements,Reimplemented by
2769,Decktet,5.944,https://boardgamegeek.com/boardgame/37301/decktet,,,,,,,,
2949,Unpublished Prototype,5.912,https://boardgamegeek.com/boardgame/18291/unpu...,,,,,,,,
3054,Traditional Card Games,5.894,https://boardgamegeek.com/boardgame/21804/trad...,,,,,,,,
3736,Outside the Scope of BGG,5.805,https://boardgamegeek.com/boardgame/23953/outs...,,,,,,,,
5212,Miscellaneous Game Accessory,5.682,https://boardgamegeek.com/boardgame/5985/misce...,,,,,,,,
5838,Piecepack,5.649,https://boardgamegeek.com/boardgame/2860/piece...,,,,,,,,
7291,Monsterpocalypse Miniatures Game,5.599,https://boardgamegeek.com/boardgame/248641/mon...,,,,,,,,
7433,Fear God and Dread Nought,5.596,https://boardgamegeek.com/boardgame/6540/fear-...,,,,,,,,
8353,General Quarters,5.577,https://boardgamegeek.com/boardgame/16435/gene...,,,,,,,,
9832,The Badger Deck,5.555,https://boardgamegeek.com/boardgame/160950/bad...,,,,,,,,


In [4]:
# Remove, in place, every row that contains at least one missing value.
df.dropna(axis = 0, how = 'any', inplace = True)

## Cleaning the data

In [5]:
# Convert both player-count columns to integers, value by value.
player_count_columns = ['Min Player No.', 'Max Player No.']
for name in player_count_columns:
    df[name] = df[name].map(int)

In [6]:
from ast import literal_eval

# Each of these columns is read from the CSV as text; literal_eval parses
# every cell back into the Python object it spells out (presumably a list
# of strings, given how later cells iterate over them).
list_columns = ['Designers', 'Categories', 'Mechanics', 'Reimplements', 'Reimplemented by']
for name in list_columns:
    df[name] = df[name].map(literal_eval)

### Ensuring that any bracketed years I remove from the reimplementation list titles are not part of the title of any of the top 10,000 games

In [7]:
# Every distinct title found in any game's 'Reimplements' list.
all_games_reimplemented = {
    reimplemented_title
    for titles in df['Reimplements']
    for reimplemented_title in titles
}

In [8]:
# Every distinct title found in any game's 'Reimplemented by' list.
all_games_reimplementing = {
    reimplementing_title
    for titles in df['Reimplemented by']
    for reimplementing_title in titles
}

In [9]:
# Flag every game whose own title ends in a bracketed year such as
# "Horus Heresy (1993)". These titles would be damaged if a trailing
# "(YYYY)" were stripped from them, so they are listed for inspection.
ends_in_year_brackets = []

for title in df['Title']:
    has_trailing_year = False
    # endswith() is safe on an empty title, unlike indexing with [-1].
    if title.endswith(')'):
        bracketed_string = title[title.rfind('('):]
        # "(YYYY)" is exactly six characters long.
        if len(bracketed_string) == 6:
            try:
                year = int(bracketed_string[1:-1])
            except ValueError:
                # The four bracketed characters are not a number.
                year = None
            has_trailing_year = year is not None and 1000 <= year <= 3000
    ends_in_year_brackets.append(has_trailing_year)

# Titles selected by the boolean mask built above.
game_titles_ending_in_years = list(df.loc[ends_in_year_brackets, 'Title'])

for title in game_titles_ending_in_years:
    print(title)

MIL (1049)
Horus Heresy (1993)
The Invasion of Russia (1812)


In [10]:
# Print each year-suffixed game title that occurs inside at least one
# title from the reimplementation lists (printed once per game title).
reimplementation_titles = all_games_reimplemented.union(all_games_reimplementing)
for game_title in game_titles_ending_in_years:
    if any(game_title in other_title for other_title in reimplementation_titles):
        print(game_title)

### Removing publishing years from reimplementation data titles

In [11]:
# Strip a trailing bracketed publication year from every title in the
# 'Reimplements' lists, e.g. "Some Game (1993)" -> "Some Game".
# The lists stored in the column are edited in place.
for titles in df['Reimplements']:
    for j, title in enumerate(titles):
        # endswith() is safe on an empty string, unlike indexing with [-1].
        if not title.endswith(')'):
            continue
        bracket_start = title.rfind('(')
        # "(YYYY)" is exactly six characters long.
        if bracket_start == -1 or len(title) - bracket_start != 6:
            continue
        try:
            year = int(title[bracket_start + 1:-1])
        except ValueError:
            # The four bracketed characters are not a number; leave the title alone.
            continue
        # Only strip values in the accepted year range, so a bracketed
        # number that is not a plausible year stays part of the title.
        if 1000 <= year <= 3000:
            # Cut at the bracket and trim the separating whitespace, so the
            # result is correct whether or not a space precedes "(".
            titles[j] = title[:bracket_start].rstrip()

In [12]:
# Strip a trailing bracketed publication year from every title in the
# 'Reimplemented by' lists, e.g. "Some Game (1993)" -> "Some Game".
# The lists stored in the column are edited in place.
for titles in df['Reimplemented by']:
    for j, title in enumerate(titles):
        # endswith() is safe on an empty string, unlike indexing with [-1].
        if not title.endswith(')'):
            continue
        bracket_start = title.rfind('(')
        # "(YYYY)" is exactly six characters long.
        if bracket_start == -1 or len(title) - bracket_start != 6:
            continue
        try:
            year = int(title[bracket_start + 1:-1])
        except ValueError:
            # The four bracketed characters are not a number; leave the title alone.
            continue
        # Only strip values in the accepted year range, so a bracketed
        # number that is not a plausible year stays part of the title.
        if 1000 <= year <= 3000:
            # Cut at the bracket and trim the separating whitespace, so the
            # result is correct whether or not a space precedes "(".
            titles[j] = title[:bracket_start].rstrip()

### String cleaning

In [13]:
def clean_data(lst):
    """Return a new list with every string lower-cased and its spaces removed.

    Args:
        lst: iterable of strings.

    Returns:
        A list of the normalised strings, in the same order as ``lst``.
    """
    cleaned = []
    for entry in lst:
        without_spaces = entry.replace(" ", "")
        cleaned.append(without_spaces.lower())
    return cleaned

In [14]:
# Normalise every entry of the three feature columns with clean_data.
for name in ('Designers', 'Categories', 'Mechanics'):
    df[name] = df[name].map(clean_data)

## Soup creation

In [15]:
def create_soup(df):
    """Join one game's designers, categories and mechanics into a single string.

    Args:
        df: a mapping (here, one DataFrame row) whose 'Designers',
            'Categories' and 'Mechanics' entries are lists of strings.

    Returns:
        The three lists joined with single spaces, in that order.
    """
    fields = ('Designers', 'Categories', 'Mechanics')
    return ' '.join(' '.join(df[field]) for field in fields)

In [16]:
# Build the text "soup" for every game by applying create_soup to each row.
df['Soup'] = df.apply(create_soup, axis = 'columns')

# Developing the Recommender

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Count the tokens in each game's soup, ignoring English stop words.
# count_matrix has one row per game, in the same order as df.
count = CountVectorizer(stop_words = 'english')
count_matrix = count.fit_transform(raw_documents = df['Soup'])

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of games; with a single
# argument the matrix is compared against itself.
cosine_sim = cosine_similarity(count_matrix)

In [19]:
# Renumber the rows 0..n-1 so that index labels equal row positions, then
# build a lookup from game title to row number.
df.reset_index(inplace = True, drop = True)
indices = pd.Series(data = df.index, index = df['Title'])

In [20]:
import numpy as np

def get_recommendations(title):
    """Print up to ten games similar to ``title`` and its reimplementation links.

    Reads the module-level ``df``, ``indices`` and ``cosine_sim``.

    The 30 games most similar to the requested one (the game itself
    excluded) are taken as candidates. Candidates are dropped when they are
    listed as a reimplementation of the game (in either direction) or when
    their player-count range does not overlap the game's range. The
    remaining games are ordered by 'Geek Rating' and the top ten printed.

    If several games share ``title``, the user is asked on stdin to pick one
    of the listed row numbers.

    Args:
        title: game title, spelled exactly as in ``df['Title']``.

    Returns:
        None. All results are printed.
    """
    try:
        idx = indices[title]
    except (KeyError, TypeError):
        print('Couldn\'t find the title. Make sure you\'ve spelled it correctly (according to Board Game Geek) and that it is in the top 10,000 Board Game Geek games.')
        return None

    # A duplicated title makes the lookup return a Series of row numbers
    # rather than a single number, so the user has to choose one of them.
    if isinstance(idx, pd.Series):
        candidates = [int(row_number) for row_number in idx]
        print('There are multiple games titled {}. Enter the number corresponding to the correct link:'.format(title))
        for candidate in candidates:
            print(str(candidate) + ': ' + df['Link'].iloc[candidate])
        try:
            idx = int(input())
        except ValueError:
            idx = None
        # Reject anything that is not one of the offered row numbers instead
        # of silently recommending for an unrelated game.
        if idx not in candidates:
            print('That is not one of the listed numbers.')
            return None
    idx = int(idx)

    target = df.iloc[idx]

    # Rank every other game by similarity. The requested game is excluded
    # explicitly: it is not guaranteed to sort first, because another game
    # can tie with it on similarity.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    sim_scores.sort(key = lambda pair: pair[1], reverse = True)
    game_indices = [i for i, _ in sim_scores[:30]]
    similar_games = df.iloc[game_indices]

    # Keep a candidate only if it is not a reimplementation in either
    # direction and its player-count range overlaps the target's range.
    related_titles = list(target['Reimplements']) + list(target['Reimplemented by'])
    keep = (
        ~similar_games['Title'].isin(related_titles)
        & (similar_games['Min Player No.'] <= target['Max Player No.'])
        & (similar_games['Max Player No.'] >= target['Min Player No.'])
    )
    similar_games = similar_games[keep].sort_values('Geek Rating', ascending = False)

    print('Check out these games:\n')

    top_games = similar_games.head(10)
    rows = zip(top_games['Title'], top_games['Weight'], top_games['Link'])
    for rank, (game_title, weight, link) in enumerate(rows, start = 1):
        print('{}: {} has a weight of {}/5. Find more info on it here: {}.'.format(rank, game_title, weight, link))

    # List reimplementations in both directions. Only titles present in df
    # can be linked, so the others are skipped.
    sections = (
        ('Reimplements', '\n\n\n{} reimplements the following games:\n'),
        ('Reimplemented by', '\n\n\n{} is reimplemented by the following games:\n'),
    )
    for column, header in sections:
        related_games = target[column]
        if related_games != []:
            print(header.format(title))
            for related_title in related_games:
                links = df.loc[df['Title'] == related_title, 'Link']
                if not links.empty:
                    print('{}. Find more info on it here: {}.'.format(related_title, links.iloc[0]))

# Recommender 

In [21]:
get_recommendations('Carcassonne')

Check out these games:

1: Kingdomino has a weight of 1.22/5. Find more info on it here: https://boardgamegeek.com/boardgame/204583/kingdomino.
2: Cacao has a weight of 1.82/5. Find more info on it here: https://boardgamegeek.com/boardgame/171499/cacao.
3: Keyper has a weight of 3.6/5. Find more info on it here: https://boardgamegeek.com/boardgame/212516/keyper.
4: Fjords has a weight of 1.72/5. Find more info on it here: https://boardgamegeek.com/boardgame/15511/fjords.
5: Carcassonne: Wheel of Fortune has a weight of 2.05/5. Find more info on it here: https://boardgamegeek.com/boardgame/45748/carcassonne-wheel-fortune.
6: Topoum has a weight of 2.43/5. Find more info on it here: https://boardgamegeek.com/boardgame/188314/topoum.
7: Khronos has a weight of 3.57/5. Find more info on it here: https://boardgamegeek.com/boardgame/25674/khronos.
8: Canterbury has a weight of 3.09/5. Find more info on it here: https://boardgamegeek.com/boardgame/136440/canterbury.
9: Infinite City has a wei