In [1]:
import os
import pickle

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [16]:
# Example case used to identify all the tags within a scrape and what to grab as features.
# The timeout stops the cell from hanging indefinitely if the API stalls, and
# raise_for_status() surfaces an HTTP error here rather than letting an error
# page be parsed as if it were game data.
response = requests.get(
    "https://www.boardgamegeek.com/xmlapi2/thing?id=30549&stats=1&comments=1",
    timeout=30,
)
response.raise_for_status()

In [17]:
# Parse the response body with the XML parser so every element can be searched by tag name.
soup = BeautifulSoup(response.text, features='xml')

In [18]:
# Display the parsed document to inspect which tags and attributes are available.
soup

<?xml version="1.0" encoding="utf-8"?>
<items termsofuse="https://boardgamegeek.com/xmlapi/termsofuse"><item id="30549" type="boardgame">
<thumbnail>https://cf.geekdo-images.com/S3ybV1LAp-8SnHIXLLjVqA__thumb/img/oqViRj6nVxK3m36NluTxU1PZkrk=/fit-in/200x150/filters:strip_icc()/pic1534148.jpg</thumbnail>
<image>https://cf.geekdo-images.com/S3ybV1LAp-8SnHIXLLjVqA__original/img/IsrvRLpUV1TEyZsO5rC-btXaPz0=/0x0/filters:format(jpeg)/pic1534148.jpg</image>
<name sortindex="1" type="primary" value="Pandemic"/>
<name sortindex="1" type="alternate" value="EPIZOotic"/>
<name sortindex="1" type="alternate" value="Pandemia"/>
<name sortindex="1" type="alternate" value="Pandemia 10 Aniversario"/>
<name sortindex="1" type="alternate" value="Pandemia: Una Nuova Sfida"/>
<name sortindex="1" type="alternate" value="Pandemic, 10 Jahre Jubiläumsedition"/>
<name sortindex="1" type="alternate" value="Pandemic, Edición 10º Aniversario"/>
<name sortindex="1" type="alternate" value="Pandemic: 10th Anniversary E

In [4]:
# Later discovered that scraping individual board games did capture the description,
# but the batch scrapes lost this information. Due to time constraints, it was not
# possible to go back and scrape each game individually.
soup.find('description').string

"In Pandemic, several virulent diseases have broken out simultaneously all over the world! The players are disease-fighting specialists whose mission is to treat disease hotspots while researching cures for each of four plagues before they get out of hand.&#10;&#10;The game board depicts several major population centers on Earth. On each turn, a player can use up to four actions to travel between cities, treat infected populaces, discover a cure, or build a research station. A deck of cards provides the players with these abilities, but sprinkled throughout this deck are Epidemic! cards that accelerate and intensify the diseases' activity. A second, separate deck of cards controls the &quot;normal&quot; spread of the infections.&#10;&#10;Taking a unique role within the team, players must plan their strategy to mesh with their specialists' strengths in order to conquer the diseases. For example, the Operations Expert can build research stations which are needed to find cures for the dis

# Creating dataframe
After the tag for each desired feature column was identified, all of the features were extracted in a single pass over the scraped text files. Try/except statements were added after some of the files raised errors during processing.

In [8]:
# Collect the path of every regular file directly inside the scrape directory
# (sub-directories are skipped).
# os.listdir() returns entries in arbitrary order, so the paths are sorted to make
# the processing order - and therefore the row order of anything built from
# scrapelist - reproducible from run to run.
directory = 'bgg'
scrapelist = sorted(
    path
    for path in (os.path.join(directory, filename) for filename in os.listdir(directory))
    if os.path.isfile(path)
)

In [16]:
def _tag_value(item, tag, cast, default=0):
    """Return ``cast`` applied to the ``value`` attribute of the first ``tag`` in ``item``.

    ``default`` is returned instead when the tag is absent, when it has no
    ``value`` attribute, or when ``cast`` raises ``ValueError`` on its text.
    """
    node = item.find(tag)
    if node is None:
        return default
    try:
        return cast(node['value'])
    except (KeyError, ValueError):
        return default


def _link_values(item, link_type):
    """Return the ``value`` of every tag in ``item`` whose ``type`` attribute equals ``link_type``."""
    return [node['value'] for node in item.find_all(type=link_type)]


def _parse_year(text):
    """Parse ``text`` with ``pd.to_datetime`` and return the resulting ``.year``.

    NOTE(review): values that pandas cannot represent as a timestamp raise
    ValueError and therefore end up as the default (0) - confirm this is the
    intended treatment for very old games.
    """
    return pd.to_datetime(text, yearfirst=True).year


def _parse_item(item):
    """Build the feature dict for a single ``<item>`` tag.

    Keys are inserted in the column order the dataframe is expected to have.
    Numeric fields fall back to 0 when missing or unparsable; list fields are
    empty when no matching tags exist.
    """
    description = item.find('description')
    return {
        'name': item.find('name')['value'],
        # A missing description tag yields None instead of raising AttributeError.
        'desc': description.string if description is not None else None,
        'yr_pub': _tag_value(item, 'yearpublished', _parse_year),
        'min_players': _tag_value(item, 'minplayers', int),
        'max_players': _tag_value(item, 'maxplayers', int),
        'avg_play_time': _tag_value(item, 'playingtime', int),
        'min_play_time': _tag_value(item, 'minplaytime', int),
        'max_play_time': _tag_value(item, 'maxplaytime', int),
        'min_age': _tag_value(item, 'minage', int),
        'categories': _link_values(item, 'boardgamecategory'),
        'mechanics': _link_values(item, 'boardgamemechanic'),
        'families': _link_values(item, 'boardgamefamily'),
        # Flag only: 1 when at least one expansion link is present.
        'has_expansion': 1 if item.find(type='boardgameexpansion') else 0,
        'designers': _link_values(item, 'boardgamedesigner'),
        'artists': _link_values(item, 'boardgameartist'),
        'publishers': _link_values(item, 'boardgamepublisher'),
        'comments': [comment['value'] for comment in item.find_all('comment')],
        # The first <rank> entry is skipped; the names of the remaining ones are kept.
        'bgtype': [rank['name'] for rank in item.find_all('rank')[1:]],
        'num_users_rated': _tag_value(item, 'usersrated', int),
        'avg_user_rating': _tag_value(item, 'average', float),
        'bgg_adj_rating': _tag_value(item, 'bayesaverage', float),
        'owned': _tag_value(item, 'owned', int),
        'trading': _tag_value(item, 'trading', int),
        'wanting': _tag_value(item, 'wanting', int),
        'wishing': _tag_value(item, 'wishing', int),
        # The id stays a string when present; 0 marks a missing id attribute.
        'game_id': item.get('id', 0),
        'stddev': _tag_value(item, 'stddev', float),
        'num_users_complexity': _tag_value(item, 'numweights', int),
        'complexity': _tag_value(item, 'averageweight', float),
    }


# Parse every scraped batch file and collect one feature dict per <item>.
bgg_list = []

for file in scrapelist:
    # NOTE(review): open() uses the platform default encoding; pass an explicit
    # encoding here if the batch files are known to be saved as UTF-8.
    with open(file) as f:
        # Name the parser explicitly so the parse does not depend on which parsers
        # happen to be installed (and bs4 stops warning about guessing one).
        # 'lxml' is what bs4 picks by default whenever lxml is installed -
        # presumably the case here, since the notebook also parses with 'xml'.
        batchsoup = BeautifulSoup(f.read(), 'lxml')
    bgg_list.extend(_parse_item(item) for item in batchsoup.find_all('item'))

In [17]:
##with open('bgg_list.pickle', 'wb') as bgg_list_pickle:
##    pickle.dump(bgg_list, bgg_list_pickle)

In [18]:
# One dataframe row per scraped game, built in a single step from the list of feature dicts.
df = pd.DataFrame(data=bgg_list)

In [2]:
##with open('bgg_df.pickle', 'rb') as f:
##    df = pickle.load(f)

In [19]:
##with open('bgg_df.pickle', 'wb') as bgg_df_pickle:
##    pickle.dump(df, bgg_df_pickle)

# Variations
This section was revisited to find a way to identify expansions so that they could be removed from the recommender's results. Variations were originally identified using only the expansion and implementation tags (hence `exp_imp`). Further testing showed that additional variations were still appearing in the results, and some of these turned out to fall under the integration and compilation tags. Those tags were then captured as well, and the dictionary was re-created.

In [9]:
# Maps each output column to the link ``type`` attribute that feeds it. The key
# order (after 'name') fixes the column order of the resulting records.
VARIATION_LINK_TYPES = {
    'expansions': 'boardgameexpansion',
    'implementations': 'boardgameimplementation',
    'integrations': 'boardgameintegration',
    'compilations': 'boardgamecompilation',
}

# One record per <item>: the game's name plus, for every variation kind, the
# list of linked game names.
exp_imp_list = []

for file in scrapelist:
    with open(file) as f:
        # Name the parser explicitly so the parse does not depend on which parsers
        # happen to be installed (and bs4 stops warning about guessing one).
        # 'lxml' is what bs4 picks by default whenever lxml is installed -
        # presumably the case here, since the notebook also parses with 'xml'.
        batchsoup = BeautifulSoup(f.read(), 'lxml')
    for item in batchsoup.find_all('item'):
        record = {'name': item.find('name')['value']}
        for column, link_type in VARIATION_LINK_TYPES.items():
            record[column] = [link['value'] for link in item.find_all(type=link_type)]
        exp_imp_list.append(record)

In [10]:
# One row per game, with a list-valued column for each kind of variation.
exp_imp_df = pd.DataFrame(data=exp_imp_list)

In [12]:
# Concatenate the four per-game lists row by row (adding the list-valued columns
# joins the lists), then key the combined list by game name.
all_variations = (
    exp_imp_df['expansions']
    + exp_imp_df['implementations']
    + exp_imp_df['integrations']
    + exp_imp_df['compilations']
)
exp_imp_dict = dict(zip(exp_imp_df['name'], all_variations))

In [13]:
# Display the name -> variations mapping to spot-check the combined lists.
exp_imp_dict

{'Troia': [],
 'Mauseschlau & Bärenstark: Wissen & Lachen – Unsere Erde': [],
 'Quintessential: The Fifth Element': [],
 'Wort für Wort': [],
 'Hannah Montana Secret Star': ['Spider-Man 3: The Ultimate Game'],
 'Strat-O-Matic Baseball': [],
 'King of the Tabletop': ['Kings & Things'],
 'Touring': ['Lindy, the New Flying Game',
  'Mille Bornes',
  'Online: Internet Card Game',
  'Stap op'],
 'Title Bout': ['Title Bout 2'],
 'Kartenschach': [],
 'The Magnificent Race': [],
 'Weekend in Vegas': [],
 'The Stock Market Game': [],
 'Afrika Korps': [],
 'Blood and Iron': [],
 'Battles of Trenton and Princeton': [],
 'Mythology: A Game of Adventure in the Age of Heros': [],
 'Crusades II': [],
 "Isaac Asimov's Super Quiz": [],
 'Ultimate Stratego': [],
 'Gobblet': ['Gobblet Gobblers'],
 'Space: The Game': ['Space: The Game Expansion Kit #1',
  'Space: The Game Expansion Kit #2'],
 'Lost Patrol': ['Warhammer Age of Sigmar: Crypt Hunters'],
 'Oil Power': [],
 'Borderlands': ['Borderlands Expansi

In [14]:
##with open('exp_imp_dict.pickle', 'wb') as exp_imp_dict_pickle:
##    pickle.dump(exp_imp_dict, exp_imp_dict_pickle)

In [None]:
##with open('exp_imp_dict.pickle', 'rb') as exp_imp_dict_pickle:
##    exp_imp_dict = pickle.load(exp_imp_dict_pickle)

In [23]:
##with open('exp_imp_df.pickle', 'wb') as exp_imp_df_pickle:
##    pickle.dump(exp_imp_df, exp_imp_df_pickle)

In [3]:
##with open('exp_imp_df.pickle', 'rb') as exp_imp_df_pickle:
##    exp_imp_df = pickle.load(exp_imp_df_pickle)

In [4]:
##with open('rec_df.pickle', 'rb') as f:
##    rec_df = pickle.load(f)

In [22]:
##with open('rec_df.pickle', 'wb') as rec_df_pickle:
##    pickle.dump(rec_df, rec_df_pickle)