[website](https://boardgamegeek.com/boardgame/224517/brass-birmingham/stats)

In [54]:
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    StaleElementReferenceException,
    WebDriverException,
    InvalidSessionIdException
)
import time
from tqdm import tqdm
import logging
import json
import csv
import pandas as pd
import os
import unicodedata

In [None]:
# Send INFO-and-above records to scraper.log, one timestamped line per record.
logging.basicConfig(
    level=logging.INFO,
    filename='scraper.log',
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Usage pattern: take a timestamp, log the start, run the work, then log the
# elapsed wall-clock time.
start_time = time.time()
logging.info("Starting scraping task...")

# (the task being timed goes here)

end_time = time.time()
duration = end_time - start_time

logging.info("Scraping completed in {:.2f} seconds.".format(duration))

In [5]:
#Function to extract players and play-time in SECTION 0

def players_time(players, timing):
    """Extract the player-count and play-time ranges shown for one game.

    Args:
        players: Sequence of objects exposing a ``.text`` attribute. Only the
            first one is read; its text is expected to look like ``"2–4"``
            (a range) or ``"4"`` (a single value).
        timing: Same shape as ``players``, holding the play-time text.

    Returns:
        A tuple ``(min_players, max_players, min_time, max_time)`` of ints.
        A pair is ``(None, None)`` when its sequence is empty or its text
        contains no number, so a blank or oddly formatted element no longer
        raises ``ValueError``.
    """
    import re  # local import: the notebook's import cell does not bring in `re`

    def extract_min_max(elements):
        """Return (minimum, maximum) parsed from the first element's text."""
        if not elements:
            return None, None

        # Collect the integers in the text regardless of what separates them,
        # so "1–4", "1-4" and "30–60 Min" are all handled.
        numbers = [int(token) for token in re.findall(r'\d+', elements[0].text or '')]
        if not numbers:
            return None, None
        if len(numbers) == 1:
            # A single value means minimum and maximum coincide.
            return numbers[0], numbers[0]
        return numbers[0], numbers[1]

    min_players, max_players = extract_min_max(players)
    min_time, max_time = extract_min_max(timing)

    return min_players, max_players, min_time, max_time

In [55]:
#function to cleanly collect game credits in SECTION 1

def get_credits(credits, index, new_line='\n'):
    """Return the names listed in one credits entry as a list of strings.

    Args:
        credits: Sequence of objects exposing a ``.text`` attribute.
        index: Position of the wanted entry in ``credits``.
        new_line: Separator between names inside the entry's text.

    Returns:
        A list of non-empty, stripped names. When the text contains
        ``new_line`` it is split on both ``new_line`` and ``", "``; otherwise
        the whole text is returned as a single item. ``None`` is returned when
        the entry does not exist or has no usable text.
    """
    try:
        text = credits[index].text
        if not isinstance(text, str):
            return None

        # The text is used as-is. Round-tripping it through
        # encode('utf-8').decode('unicode_escape') reinterprets the UTF-8 bytes
        # as Latin-1 and corrupts every non-ASCII character in a name.
        if new_line in text:
            result = text.replace(new_line, ", ").split(", ")
        else:
            result = [text]

        return [name.strip() for name in result if name.strip()]
    except (IndexError, ValueError, AttributeError):
        return None

In [12]:
# collect game stats in SECTION 2

def get_stats(stats_elem, stats_index, sub_index, new_line='\n'):
    """Read one value out of a multi-line statistics block.

    Args:
        stats_elem: Sequence of objects exposing a ``.text`` attribute.
        stats_index: Which block in ``stats_elem`` to read.
        sub_index: Which line of that block (after splitting on ``new_line``)
            holds the wanted value.
        new_line: Line separator used to split the block's text.

    Returns:
        * ``float`` for ``"x / y"`` values (the part before ``" / "``);
        * ``int`` or ``float`` for any other numeric text, with or without
          thousands separators, so ``"950"`` and ``"1,234"`` both come back as
          ints rather than one string and one int;
        * the stripped text itself when it is not numeric (e.g. ``"N/A"``);
        * ``None`` when the block or line is missing, or when text containing
          ``","`` or ``" / "`` turns out not to be numeric.
    """

    def to_number(token):
        """Return token as int or float, or None when it is not a plain number."""
        token = token.replace(',', '').strip()
        try:
            return int(token)
        except ValueError:
            pass
        # Require a digit so words float() accepts ("nan", "inf") stay text.
        if any(ch.isdigit() for ch in token):
            try:
                return float(token)
            except ValueError:
                pass
        return None

    try:
        stat_category = stats_elem[stats_index].text.split(new_line)
        pre_stat = stat_category[sub_index].strip()
    except (IndexError, ValueError, AttributeError):
        return None

    if ' / ' in pre_stat:
        # "3.87 / 5" style: keep the value before the slash, always as a float.
        number = to_number(pre_stat.split(' / ')[0])
        return None if number is None else float(number)

    number = to_number(pre_stat)
    if number is not None:
        return number

    # Non-numeric text: pass it through, unless it contained a comma, which
    # has always been treated as a failed number.
    return None if ',' in pre_stat else pre_stat

In [None]:
# function to handle ratings in SECTION 3

def get_rating(ratings, rating_index):
    """Convert an abbreviated count such as ``"1.2k"`` into an integer.

    Args:
        ratings: Sequence of objects exposing a ``.text`` attribute.
        rating_index: Position of the wanted entry in ``ratings``.

    Returns:
        The count as an ``int``: ``"850"`` -> 850, ``"12k"`` -> 12000,
        ``"1.2k"`` -> 1200, ``"1.25k"`` -> 1250.

    Raises:
        IndexError: ``rating_index`` is outside ``ratings``.
        ValueError: the text is not a number (optionally followed by a
            ``k``/``m`` suffix).
    """
    raw_rating = ratings[rating_index].text.strip()

    # Scale by the suffix instead of padding zeros. Padding only works for
    # exactly one decimal digit, and the old test `'k' and '.' in raw_rating`
    # only ever checked for the dot.
    multipliers = {'k': 1000, 'm': 1000000}
    suffix = raw_rating[-1:].lower()
    if suffix in multipliers:
        scale = multipliers[suffix]
        number_text = raw_rating[:-1]
    else:
        scale = 1
        number_text = raw_rating

    value = float(number_text.replace(',', '')) * scale
    # round() absorbs float noise such as 2.3 * 1000 == 2299.9999999999995.
    return int(round(value))

Extraction Cell

In [None]:
# One Chrome session is opened here and used by the code that follows.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Bookkeeping used further down in this cell.
all_games_links = []
allboardgames = 0
row_number = 1
first_iteration = True

# Browse pages to visit: the loop below iterates range(page_one, page_end),
# so page_end itself is not fetched.
page_one, page_end = 1, 2

# Results file: create it as an empty JSON array when it does not exist yet.
destination = "boardgamegeek.json"
if not os.path.exists(destination):
    with open(destination, 'w') as f:
        f.write(json.dumps([]))

# Walk the browse pages and collect the link of every listed game.
try:
    for page in range(page_one, page_end):
        url = f"https://boardgamegeek.com/browse/boardgame/page/{page}"
        driver.get(url)
        time.sleep(3)  # fixed pause before reading the listing

        try:
            anchors = driver.find_elements(By.XPATH, '//a[@class="primary"]')
            hrefs = [anchor.get_attribute('href') for anchor in anchors]
            # get_attribute returns None when the attribute is absent. Drop
            # those so neither the file write nor driver.get(href) ever
            # receives a non-string.
            game_links_per_page = [href for href in hrefs if href]
            all_games_links.extend(game_links_per_page)
        except (NoSuchElementException, StaleElementReferenceException) as e:
            print(f"Error on page {page}: {e}")
except (TimeoutException, WebDriverException, InvalidSessionIdException) as e:
    print(f"Navigation error: {e}")

# Keep a plain-text copy of the collected links, one per line.
game_links = 'GoToGames.txt'
with open(game_links, 'w', encoding='utf-8') as file:
    for game_link in all_games_links:
        file.write(game_link + '\n')

print(f"The links to all {len(all_games_links)} board games has been collected and stored successfully")

# Visit every collected game page, build one record per game and append it to
# the JSON file after each game. The browser is closed in `finally`, so it is
# released even when the loop is interrupted or fails unexpectedly.
try:
    with tqdm(total=len(all_games_links), desc="Games Scraped") as pbar:

        for href in all_games_links:

            # Start every game from a fresh record. Previously `bg` was only
            # created inside the Section 0 try-block, so a failure there made
            # the later sections write into the previous game's dict (or raise
            # NameError on the very first game).
            bg = {
                'min_players': None,
                'max_players': None,
                'min_playtime': None,
                'max_playtime': None
            }
            # Reset too, so a failed lookup in Section 2 cannot leave the
            # previous page's elements in use.
            stats_elem = []

            try:
                driver.get(href)
                time.sleep(5)

                # Parsed copy of the page; not read anywhere else in this loop.
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, 'html.parser')

                # Section 0: players and play-time
                try:
                    players = driver.find_elements(By.XPATH, '//span[@ng-if="::geekitemctrl.geekitem.data.item.minplayers > 0 || geekitemctrl.geekitem.data.item.maxplayers > 0"]')
                    timing = driver.find_elements(By.XPATH, '//span[@min="::geekitemctrl.geekitem.data.item.minplaytime" and @max="::geekitemctrl.geekitem.data.item.maxplaytime"]')
                    min_players, max_players, min_time, max_time = players_time(players, timing)

                    bg['min_players'] = min_players
                    bg['max_players'] = max_players
                    bg['min_playtime'] = min_time
                    bg['max_playtime'] = max_time
                except Exception as e:
                    print(f"{e} in players & playtime section of {href}")

                # Section 1: open the credits view
                SeeFullCredits = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, '//a[@ui-sref="geekitem.credits"]'))
                )
                SeeFullCredits.click()

                # Wait for the game-name / release-year element
                WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located((By.XPATH, '//span[@ng-bind-html="creditsctrl.geekitem.data.item[info.keyname]|to_trusted"]'))
                )

                # Game name, release year and credits
                try:
                    spans = driver.find_elements(By.XPATH, '//span[@ng-bind-html="creditsctrl.geekitem.data.item[info.keyname]|to_trusted"]')
                    bg['game_name'] = spans[0].text.strip()
                    bg['release_year'] = int(spans[1].text.strip())

                    credits = driver.find_elements(By.XPATH, '//div[@ng-if="info.datatype == \'geekitem_linkdata\'"]')

                    # Credits are read by position with get_credits, which
                    # yields None for a position that does not exist.
                    bg['designers'] = get_credits(credits, 0)
                    bg['solo_designer'] = get_credits(credits, 1)
                    bg['artists'] = get_credits(credits, 2)
                    bg['publishers'] = get_credits(credits, 3)
                    bg['developer'] = get_credits(credits, 4)
                    bg['graphic_designer'] = get_credits(credits, 5)
                    bg['sculptor'] = get_credits(credits, 6)
                    bg['editor'] = get_credits(credits, 7)
                    bg['writer'] = get_credits(credits, 8)
                    bg['insert_designer'] = get_credits(credits, 9)
                    bg['categories'] = get_credits(credits, 10)
                    bg['mechanisms'] = get_credits(credits, 11)
                    bg['family'] = get_credits(credits, 12)
                except Exception as e:
                    print(f"{e} in credits section of {href}")

                # Section 2: open the stats view
                SeeGameStats = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, '//a[@ui-sref="geekitem.stats({})"]'))
                )
                SeeGameStats.click()

                # Wait until the stats appear
                WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located((By.XPATH, '//div[@class="row game-stats"]'))
                )

                # Game stats, read by block/line position with get_stats
                try:
                    stats_elem = driver.find_elements(By.XPATH, '//ul[@class="outline fs-responsive-sm outline-border-col-xs"]')

                    bg['average_rating'] = get_stats(stats_elem, 0, 1)
                    bg['num_of_ratings'] = get_stats(stats_elem, 0, 3)
                    bg['std_deviation'] = get_stats(stats_elem, 0, 5)
                    bg['weight'] = get_stats(stats_elem, 0, 7)
                    bg['comments'] = get_stats(stats_elem, 0, 9)
                    bg['fans'] = get_stats(stats_elem, 0, 11)
                    bg['page_views'] = get_stats(stats_elem, 0, 13)
                except Exception as e:
                    print(f"{e} in game stats of {href}")

                # Rank stats
                try:
                    bg['overall_rank'] = get_stats(stats_elem, 1, 1)
                    try:
                        rank_row = stats_elem[1]
                        rank_items = rank_row.text.strip().split('\n')

                        seen_keys = set()

                        # Lines 0-1 are the overall rank (read above). The
                        # rest is scanned as label/value pairs, producing
                        # game-specific keys.
                        i = 2
                        while i < len(rank_items) - 1:
                            label = rank_items[i].strip()
                            value = rank_items[i + 1].strip()

                            if label in seen_keys:
                                i += 1
                                continue

                            # Only accept the pair when the value looks like a rank number
                            clean_value = value.replace(",", "")
                            if clean_value.isdigit():
                                key = label.replace(" ", "_").lower()
                                bg[key] = int(clean_value)
                                seen_keys.add(label)
                                i += 2  # move to the next pair
                            else:
                                i += 1  # skip an invalid or dangling label

                    except Exception as e:
                        print(f"Failed to parse extra ranks of {href}: {e}")
                except Exception as e:
                    print(f"{e} in rank stats of {href}")

                # Play stats
                try:
                    bg['all_time_plays'] = get_stats(stats_elem, 2, 1)
                    bg['this_month_plays'] = get_stats(stats_elem, 2, 3)
                except Exception as e:
                    print(f"{e} in play stats of {href}")

                # Collection stats
                try:
                    bg['own'] = get_stats(stats_elem, 3, 1)
                    bg['previously_owned'] = get_stats(stats_elem, 3, 3)
                    bg['for_trade'] = get_stats(stats_elem, 3, 5)
                    bg['want_in_trade'] = get_stats(stats_elem, 3, 8)
                    bg['wishlist'] = get_stats(stats_elem, 3, 11)
                except Exception as e:
                    print(f"{e} in collection stats of {href}")

                # Section 3: wait until the ratings graph appears
                WebDriverWait(driver, 10).until(
                    EC.visibility_of_element_located((By.XPATH, '//ratings-stats-graph[@objecttype="thing"]'))
                )

                try:
                    # name()='text' is a workaround for namespaced markup (SVG here)
                    ratings = driver.find_elements(By.XPATH, "//*[name()='text']")

                    # Text nodes 10-19 are read as the counts for ratings 1-10
                    # (presumably the bar labels of the graph; verify against the page).
                    bg['rated_one'] = get_rating(ratings, 10)
                    bg['rated_two'] = get_rating(ratings, 11)
                    bg['rated_three'] = get_rating(ratings, 12)
                    bg['rated_four'] = get_rating(ratings, 13)
                    bg['rated_five'] = get_rating(ratings, 14)
                    bg['rated_six'] = get_rating(ratings, 15)
                    bg['rated_seven'] = get_rating(ratings, 16)
                    bg['rated_eight'] = get_rating(ratings, 17)
                    bg['rated_nine'] = get_rating(ratings, 18)
                    bg['rated_ten'] = get_rating(ratings, 19)
                except Exception as e:
                    print(f"{e} in scraping ratings of {href}")

                bg['link_to_game'] = href

                # Re-read, append and rewrite the whole file after every game,
                # so finished games are already on disk if a later one fails.
                # Read with the same encoding the file is written with.
                with open(destination, 'r', encoding='utf-8') as f:
                    content = f.read()
                    data = json.loads(content) if content.strip() else []
                data.append(bg)

                with open(destination, "w", encoding="utf-8") as f:
                    json.dump(data, f, indent=4)

            except Exception as e:
                print(f'Error scraping boardgame at {href} - {e}')

            row_number += 1
            allboardgames += 1
            pbar.update(1)
finally:
    driver.quit()

The links to all 100 board games has been collected and stored successfully


Games Scraped:   1%|          | 1/100 [00:33<54:44, 33.18s/it]

Error scraping boardgame at https://boardgamegeek.com/boardgame/224517/brass-birmingham - list index out of range


Games Scraped:   2%|▏         | 2/100 [00:50<38:53, 23.81s/it]

Error scraping boardgame at https://boardgamegeek.com/boardgame/161936/pandemic-legacy-season-1 - list index out of range


Games Scraped:   3%|▎         | 3/100 [01:07<33:27, 20.70s/it]

Error scraping boardgame at https://boardgamegeek.com/boardgame/342942/ark-nova - list index out of range


Games Scraped:   4%|▍         | 4/100 [01:21<29:09, 18.22s/it]

Error scraping boardgame at https://boardgamegeek.com/boardgame/174430/gloomhaven - list index out of range


Games Scraped:  14%|█▍        | 14/100 [03:54<21:56, 15.31s/it]

Error scraping boardgame at https://boardgamegeek.com/boardgame/12333/twilight-struggle - list index out of range


Games Scraped:  20%|██        | 20/100 [07:56<1:10:33, 52.91s/it]

Error scraping boardgame at https://boardgamegeek.com/boardgame/28720/brass-lancashire - HTTPConnectionPool(host='localhost', port=51479): Read timed out. (read timeout=120)


Games Scraped:  21%|██        | 21/100 [09:56<1:36:11, 73.05s/it]

Error scraping boardgame at https://boardgamegeek.com/boardgame/173346/7-wonders-duel - HTTPConnectionPool(host='localhost', port=51479): Read timed out. (read timeout=120)


Games Scraped:  52%|█████▏    | 52/100 [18:30<12:06, 15.14s/it]  

Error scraping boardgame at https://boardgamegeek.com/boardgame/3076/puerto-rico - list index out of range


Games Scraped:  77%|███████▋  | 77/100 [24:29<05:27, 14.22s/it]

Error scraping boardgame at https://boardgamegeek.com/boardgame/35677/le-havre - list index out of range


Games Scraped: 100%|██████████| 100/100 [29:43<00:00, 17.83s/it]


In [None]:
# Name of the CSV file the scraped records are exported to.
final_destination = "boardgame-geek-dataset.csv"

# Load the scraped records back from the JSON file.
with open(destination, encoding='utf-8') as f:
    data = json.loads(f.read())

def flatten_lists(record):
    """Return a copy of ``record`` in which every list value is one string.

    List values are rendered as their items' ``str()`` forms joined by
    ``"; "``; all other values are carried over unchanged. Key order follows
    ``record``.
    """
    return {
        key: "; ".join(map(str, value)) if isinstance(value, list) else value
        for key, value in record.items()
    }

flattened_data = [flatten_lists(entry) for entry in data]

# Build the header from every key seen in any record, in first-seen order.
# Records do not all share the same keys (the scraper adds game-specific rank
# columns), and DictWriter raises ValueError for a key missing from
# `fieldnames`. Taking the union also copes with an empty dataset, where
# indexing the first record would raise IndexError.
fieldnames = list(dict.fromkeys(key for row in flattened_data for key in row))

with open(final_destination, 'w', newline='', encoding='utf-8') as f:
    # Keys a record lacks are written as empty cells (DictWriter's default restval).
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(flattened_data)

print(f"Successfully converted {destination} to {final_destination}")

Successfully converted boardgamegeek.json to boardgame-geek-dataset.csv


Sandbox

In [None]:
# Scratch session: open a single game's stats page in a new Chrome window
# for manual inspection.
url = "https://boardgamegeek.com/boardgame/167791/terraforming-mars/stats"

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get(url)