# BoardGameGeek (BGG) dataset

In [1]:
#Libraries to import
from bs4 import BeautifulSoup
import requests
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import re
import time
import json

In [8]:
# Scrape the BoardGameGeek "browse" listing: one (href, short description)
# pair is collected for every game link found on each listing page, then the
# pairs are turned into the `top_games` DataFrame (columns: Text, ID, Name).
sleep_time = 5          # seconds to pause after every request
request_timeout = 30    # seconds before a stalled request is abandoned

num_pages = 30
base_url = "https://boardgamegeek.com/browse/boardgame/page/"

# Create an empty list to store the extracted data
data = []

# Iterate through the pages
for page in range(1, num_pages + 1):
    url = base_url + str(page)
    try:
        response = requests.get(url, timeout=request_timeout)
    except requests.RequestException as exc:
        # A single failed page must not abort the whole scrape
        print(f"Error: page {page} request failed - {exc}")
        response = None
    time.sleep(sleep_time)

    if response is None:
        continue

    # Report unsuccessful responses instead of skipping them silently
    if response.status_code != 200:
        print(f"Error: page {page} returned status {response.status_code}")
        continue

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all <a> elements with href attribute and class="primary"
    elements = soup.find_all('a', class_='primary', href=True)
    if not elements:
        # A 200 response without any game link yields no rows; make that visible
        print(f"Warning: page {page} contained no game links")

    for element in elements:
        href = element['href']

        # Look for the description inside the table row that holds the link.
        # find_next() alone searches the rest of the document, so a game
        # without a description would receive the next game's text.
        # NOTE(review): assumes each game sits in its own <tr> - confirm
        # against the live markup; without a <tr> the old lookup is used.
        row = element.find_parent('tr')
        if row is not None:
            p_element = row.find('p', class_="smallefont dull")
        else:
            p_element = element.find_next('p', class_="smallefont dull")

        text = p_element.get_text(strip=True) if p_element else ""

        data.append((href, text))

# Create a DataFrame from the extracted data
top_games = pd.DataFrame(data, columns=['Href', 'Text'])

# Remove the "/boardgame/" prefix, then split "<id>/<slug>" on "/"
href_parts = (
    top_games['Href']
    .str.replace('/boardgame/', '', regex=False)
    .str.split('/', n=2, expand=True)
    # Guarantee both columns exist even when nothing was scraped
    .reindex(columns=[0, 1])
)
top_games[['ID', 'Name']] = href_parts

# Drop the original 'Href' column
top_games = top_games.drop('Href', axis=1)

In [9]:
# Display the scraped listing as the cell's output
top_games

Unnamed: 0,Text,ID,Name
0,"Build networks, grow industries, and navigate ...",224517,brass-birmingham
1,Mutating diseases are spreading around the wor...,161936,pandemic-legacy-season-1
2,Vanquish monsters with strategic cardplay. Ful...,174430,gloomhaven
3,"Plan and build a modern, scientifically manage...",342942,ark-nova
4,"Build an intergalactic empire through trade, r...",233078,twilight-imperium-fourth-edition
...,...,...,...
1995,Build & fortify city together. Make sure that ...,19526,elasund-first-city
1996,Grow and mutate your colony of ants to become ...,156496,march-ants
1997,Flip tiles and die horribly (almost always) as...,472,dungeonquest
1998,Draft cards to defend a mining operation in an...,189660,xenoshyft-dreadmire


In [10]:
# Save the listing to top_games.csv in the current working directory
top_games.to_csv('top_games.csv')

In [11]:
# Download the XML API record (with statistics, via ?stats=1) for every game
# ID in `top_games` and keep the parsed documents in `all_results`.
all_results = []

# IDs whose download failed, kept so they can be retried instead of being lost
failed_ids = []

# Iterate over the game IDs from the previous DataFrame
for game_id in top_games['ID']:
    url = f"https://boardgamegeek.com/xmlapi/boardgame/{game_id}?stats=1"
    try:
        # The timeout keeps one stalled connection from hanging the whole run
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Error: request for game {game_id} failed - {exc}")
        failed_ids.append(game_id)
    else:
        if response.status_code == 200:
            # html.parser is kept on purpose: the parsing step relies on the
            # tree it builds for this XML
            soup_2 = BeautifulSoup(response.content, 'html.parser')
            all_results.append(soup_2)
        else:
            print(f"Error: {response.status_code} - {response.text}")
            failed_ids.append(game_id)

    # Pause after every request, including failed ones, so that error
    # responses are not followed by an immediate burst of further requests
    time.sleep(sleep_time)

In [12]:
# Second name for the same list object as all_results (no copy is made)
data_retrieved = all_results

In [13]:
# Flatten every downloaded document into one dict per <boardgame> element and
# collect the dicts into the `board_game_details` DataFrame. Values are kept
# as the text found in the document; no numeric conversion is done here.

# (output column, tag name) pairs copied as plain text. They are split into
# two groups only so that "description" and "image" keep their position
# between them in the resulting column order.
_FIELDS_BEFORE_DESCRIPTION = (
    ("yearpublished", "yearpublished"),
    ("minplayers", "minplayers"),
    ("maxplayers", "maxplayers"),
    ("minplaytime", "minplaytime"),
    ("maxplaytime", "maxplaytime"),
    ("age", "age"),
)
_FIELDS_AFTER_IMAGE = (
    ("designer", "boardgamedesigner"),   # first listed designer only
    ("artist", "boardgameartist"),       # first listed artist only
    ("mechanic", "boardgamemechanic"),   # first listed mechanic only
    ("category", "boardgamecategory"),   # first listed category only
    ("usersrated", "usersrated"),
    ("average", "average"),
    ("bayesaverage", "bayesaverage"),
    ("owned", "owned"),
    ("trading", "trading"),
    ("wanting", "wanting"),
    ("wishing", "wishing"),
    ("complexity", "averageweight"),
)

# Keys written with value 0 when a poll is missing from a record
_DEFAULT_NUMPLAYERS = ("1", "2", "3", "4", "4+")
_POLL_VERDICTS = ("best", "recommended", "not recommended")
_DEFAULT_PLAYER_AGES = (
    "2", "3", "4", "5", "6", "8", "10", "12", "14", "16", "18", "21 and up",
)


def _tag_text(parent, tag_name):
    """Return the text of the first `tag_name` descendant, or "" if absent."""
    element = parent.find(tag_name)
    return element.text if element is not None else ""


def _primary_name(boardgame):
    """Return the text of the name tagged primary="true", or "" if absent.

    The lookup deliberately does not filter on ``sortindex``: requiring
    sortindex="1" returned "" for every primary name carrying another value.
    """
    element = boardgame.find("name", primary="true")
    return element.text if element is not None else ""


def _clean_description(boardgame):
    """Return the long description as plain text, or "" if absent."""
    element = boardgame.find("description")
    if element is None:
        return ""
    # Turn <br> variants into newlines BEFORE stripping the markup; once the
    # tags are removed there is nothing left to replace.
    with_breaks = re.sub(r"<br\s*/?>", "\n", element.text, flags=re.IGNORECASE)
    return BeautifulSoup(with_breaks, "html.parser").get_text().strip()


def _image_url(boardgame):
    """Return the image URL, or "" if the record has no <image> element.

    The URL is read from the node that follows <image>, as the original code
    did. If that node is not text, the element's own text is used instead of
    failing.
    """
    element = boardgame.find("image")
    if element is None:
        return ""
    sibling = element.next_sibling
    if isinstance(sibling, str):
        return sibling.strip()
    return element.get_text(strip=True)


def _count_awards(boardgame):
    """Return (nominee_count, winner_count) over the <boardgamehonor> elements.

    An honor mentioning "Nominee" counts as a nomination only, even if its
    text also contains "Winner".
    """
    nominees = 0
    winners = 0
    for honor in boardgame.find_all("boardgamehonor"):
        if "Nominee" in honor.text:
            nominees += 1
        elif "Winner" in honor.text:
            winners += 1
    return nominees, winners


def _numplayers_poll(boardgame):
    """Return the poll_<numplayers>_<verdict> vote columns for one game."""
    columns = {}
    poll = boardgame.select_one("poll[name='suggested_numplayers']")
    if poll is None:
        # Set default values for poll columns
        for numplayers in _DEFAULT_NUMPLAYERS:
            for verdict in _POLL_VERDICTS:
                columns[f"poll_{numplayers}_{verdict}"] = 0
        return columns
    for results in poll.find_all("results"):
        numplayers = results["numplayers"]
        for vote in results.find_all("result"):
            columns[f"poll_{numplayers}_{vote['value'].lower()}"] = vote["numvotes"]
    return columns


def _playerage_poll(boardgame):
    """Return the poll_playerage_<age> vote columns for one game."""
    columns = {}
    poll = boardgame.select_one("poll[name='suggested_playerage']")
    if poll is None:
        # Set default values for playerage poll columns
        for age in _DEFAULT_PLAYER_AGES:
            columns[f"poll_playerage_{age}"] = 0
        return columns
    results = poll.find("results")
    if results is not None:
        for vote in results.find_all("result"):
            columns[f"poll_playerage_{vote['value']}"] = vote["numvotes"]
    return columns


def _parse_boardgame(boardgame):
    """Build the flat record (dict) for one <boardgame> element.

    Insertion order is significant: it determines the column order of the
    DataFrame built from these records.
    """
    game_data = {
        "gameid": boardgame["objectid"],
        "name": _primary_name(boardgame),
    }
    for column, tag_name in _FIELDS_BEFORE_DESCRIPTION:
        game_data[column] = _tag_text(boardgame, tag_name)

    game_data["description"] = _clean_description(boardgame)
    game_data["image"] = _image_url(boardgame)

    for column, tag_name in _FIELDS_AFTER_IMAGE:
        game_data[column] = _tag_text(boardgame, tag_name)

    nominees, winners = _count_awards(boardgame)
    game_data["nominee_awards"] = nominees
    game_data["winner_awards"] = winners

    game_data.update(_numplayers_poll(boardgame))
    game_data.update(_playerage_poll(boardgame))
    return game_data


data = []

for soup in data_retrieved:
    for boardgame in soup.find_all("boardgame"):
        # One element without an ID must not abort the parsing of all others
        if not boardgame.has_attr("objectid"):
            print("Skipping <boardgame> element without an objectid attribute")
            continue
        data.append(_parse_boardgame(boardgame))

# Create a DataFrame from the extracted game details
board_game_details = pd.DataFrame(data)

In [15]:
# Lift pandas' limit on displayed columns, then show the frame as the cell output
pd.set_option('display.max_columns', None) 
board_game_details

Unnamed: 0,gameid,name,yearpublished,minplayers,maxplayers,minplaytime,maxplaytime,age,description,image,designer,artist,mechanic,category,usersrated,average,bayesaverage,owned,trading,wanting,wishing,complexity,nominee_awards,winner_awards,poll_1_best,poll_1_recommended,poll_1_not recommended,poll_2_best,poll_2_recommended,poll_2_not recommended,poll_3_best,poll_3_recommended,poll_3_not recommended,poll_4_best,poll_4_recommended,poll_4_not recommended,poll_4+_best,poll_4+_recommended,poll_4+_not recommended,poll_playerage_2,poll_playerage_3,poll_playerage_4,poll_playerage_5,poll_playerage_6,poll_playerage_8,poll_playerage_10,poll_playerage_12,poll_playerage_14,poll_playerage_16,poll_playerage_18,poll_playerage_21 and up,poll_5_best,poll_5_recommended,poll_5_not recommended,poll_6_best,poll_6_recommended,poll_6_not recommended,poll_6+_best,poll_6+_recommended,poll_6+_not recommended,poll_5+_best,poll_5+_recommended,poll_5+_not recommended,poll_2+_best,poll_2+_recommended,poll_2+_not recommended,poll_7_best,poll_7_recommended,poll_7_not recommended,poll_7+_best,poll_7+_recommended,poll_7+_not recommended,poll_8_best,poll_8_recommended,poll_8_not recommended,poll_8+_best,poll_8+_recommended,poll_8+_not recommended,poll_9_best,poll_9_recommended,poll_9_not recommended,poll_10_best,poll_10_recommended,poll_10_not recommended,poll_11_best,poll_11_recommended,poll_11_not recommended,poll_12_best,poll_12_recommended,poll_12_not recommended,poll_13_best,poll_13_recommended,poll_13_not recommended,poll_14_best,poll_14_recommended,poll_14_not recommended,poll_15_best,poll_15_recommended,poll_15_not recommended,poll_16_best,poll_16_recommended,poll_16_not recommended,poll_17_best,poll_17_recommended,poll_17_not recommended,poll_18_best,poll_18_recommended,poll_18_not recommended,poll_19_best,poll_19_recommended,poll_19_not recommended,poll_20_best,poll_20_recommended,poll_20_not recommended,poll_21_best,poll_21_recommended,poll_21_not 
recommended,poll_22_best,poll_22_recommended,poll_22_not recommended,poll_23_best,poll_23_recommended,poll_23_not recommended,poll_24_best,poll_24_recommended,poll_24_not recommended,poll_25_best,poll_25_recommended,poll_25_not recommended,poll_26_best,poll_26_recommended,poll_26_not recommended,poll_27_best,poll_27_recommended,poll_27_not recommended,poll_28_best,poll_28_recommended,poll_28_not recommended,poll_29_best,poll_29_recommended,poll_29_not recommended,poll_30_best,poll_30_recommended,poll_30_not recommended,poll_30+_best,poll_30+_recommended,poll_30+_not recommended,poll_10+_best,poll_10+_recommended,poll_10+_not recommended,poll_3+_best,poll_3+_recommended,poll_3+_not recommended,poll_1+_best,poll_1+_recommended,poll_1+_not recommended,poll_12+_best,poll_12+_recommended,poll_12+_not recommended,poll_16+_best,poll_16+_recommended,poll_16+_not recommended,poll_20+_best,poll_20+_recommended,poll_20+_not recommended,poll_9+_best,poll_9+_recommended,poll_9+_not recommended,poll_18+_best,poll_18+_recommended,poll_18+_not recommended,poll_11+_best,poll_11+_recommended,poll_11+_not recommended,poll_0+_best,poll_0+_recommended,poll_0+_not recommended,poll_13+_best,poll_13+_recommended,poll_13+_not recommended
0,224517,Brass: Birmingham,2018,2,4,60,120,14,Brass: Birmingham is an economic strategy game...,https://cf.geekdo-images.com/x3zxjr-Vw5iU4yDPg...,Gavan Brown,Lina Cossette,Hand Management,Economic,38651,8.61232,8.4233,53544,172,1690,15450,3.8955,11,7,0,43,456,88,561,103,476,283,9,450,262,33,1,6,382,1,0,0,0,0,5,9,27,66,19,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,161936,Pandemic Legacy: Season 1,2015,2,4,60,60,13,Pandemic Legacy is a co-operative campaign gam...,https://cf.geekdo-images.com/-Qer2BBPG7qGGDu6K...,Rob Daviau,Chris Quilliams,Action Points,Environmental,50631,8.53562,8.38972,78285,453,824,12918,2.8318,15,12,19,123,408,149,421,93,162,407,40,460,197,19,8,6,427,1,1,0,0,2,19,50,75,29,7,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,174430,Gloomhaven,2017,1,4,60,120,14,Gloomhaven is a game of Euro-inspired tactica...,https://cf.geekdo-images.com/sZYp_3BTDGjh2unaZ...,Isaac Childres,Alexandr Elichev,Action Queue,Adventure,58418,8.62311,8.38865,91369,918,1253,19478,3.8959,20,10,139,623,336,359,791,85,701,485,37,437,570,158,6,50,797,5,0,0,0,0,7,42,86,142,21,3,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,342942,Ark Nova,2021,1,4,90,150,14,"In Ark Nova, you will plan and design a modern...",https://cf.geekdo-images.com/SoU8p28Sk1s8MSvoM...,Mathias Wigge,Steffen Bieker,End Game Bonuses,Animals,31052,8.53405,8.30236,46286,221,1052,10783,3.7249,15,7,126,545,141,813,333,29,390,564,82,70,351,558,2,6,616,0,0,0,0,0,5,23,61,68,12,2,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,233078,Twilight Imperium: Fourth Edition,2017,3,6,240,480,14,Twilight Imperium (Fourth Edition) is a game o...,https://cf.geekdo-images.com/_Ppn5lssO5OaildSE...,Dane Beltrami,Scott Schomburg,Action Drafting,Civilization,20922,8.61973,8.24243,25779,154,1017,10523,4.308,2,0,0,4,340,0,14,333,13,197,166,122,231,61,,,,0,0,0,0,0,0,6,14,37,25,3,2,80,284,41,347,67,20,9,50,195,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1997,19526,Elasund: The First City,2005,2,4,60,90,10,This is the second game in the 'Abenteuer Cata...,https://cf.geekdo-images.com/2-0eojuCt2yvG5mqN...,Klaus Teuber,Tanja Donner,Dice Rolling,Economic,3307,6.59293,6.17967,4397,276,66,334,2.7271,3,0,0,0,30,12,26,5,13,29,1,36,7,1,0,0,24,0,0,0,0,0,2,6,5,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1998,156496,March of the Ants,2015,1,5,60,90,13,As spring dawns the great thaw begins. Deep be...,https://cf.geekdo-images.com/OGb3iqs888qMreT84...,Tim Eisner,Tim Eisner,Area Majority / Influence,Animals,1441,7.1748,6.17964,3150,108,173,748,2.6667,0,0,1,14,9,3,20,3,14,10,2,17,8,1,,,,0,0,0,0,0,0,2,1,0,1,0,0,5,12,4,,,,,,,1,1,12,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1999,472,DungeonQuest,1985,1,4,20,60,10,Players explore the ruins of Castle Dragonfire...,https://cf.geekdo-images.com/Fj9TR_aSKK5KUi1KT...,Jakob Bonds,Dave Andrews,Dice Rolling,Adventure,2804,6.75048,6.17961,4258,139,114,421,1.9252,0,0,3,24,10,8,28,4,13,25,0,35,5,3,1,1,24,0,0,1,0,5,7,3,1,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2000,189660,XenoShyft: Dreadmire,2017,1,4,30,60,14,You survived the horrors of the Hive — now ven...,https://cf.geekdo-images.com/pdBp_xCXkHVh6p1_K...,Michael Shinall,Jonathan Gonzalez,Cooperative Game,Card Game,1338,7.22022,6.17899,3483,142,68,457,2.6429,0,0,8,8,1,15,3,0,2,10,3,2,2,15,0,0,10,0,0,0,0,0,0,2,1,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [16]:
# Save the per-game details to board_game_details.csv in the current working directory
board_game_details.to_csv('board_game_details.csv')