[website](https://boardgamegeek.com/boardgame/224517/brass-birmingham/stats)

In [1]:
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    StaleElementReferenceException,
    WebDriverException,
    InvalidSessionIdException
)
import time
from tqdm import tqdm
import logging
import json
import csv
import pandas as pd
import os

In [None]:
# Root-logger setup: INFO and above go to scraper.log, one timestamped line
# per record ("<date time> - <LEVEL> - <message>").
log_settings = {
    'filename': 'scraper.log',
    'level': logging.INFO,
    'format': '%(asctime)s - %(levelname)s - %(message)s',
    'datefmt': '%Y-%m-%d %H:%M:%S',
}
logging.basicConfig(**log_settings)

In [101]:
#Function to extract players and play-time in SECTION 0

def players_time(players, timing):
    """Extract (min, max) player counts and play times from scraped elements.

    Args:
        players: list of elements exposing a ``.text`` attribute; only the
            first element is read. Its text is either a single integer
            ("2") or a range ("2–4").
        timing: same shape as ``players``, holding the play time.

    Returns:
        Tuple ``(min_players, max_players, min_time, max_time)``. A pair is
        ``(None, None)`` when its list is empty or its text cannot be parsed
        as integers, so one unreadable field no longer raises and discards
        the other.
    """
    def extract_min_max(elements):
        # Nothing found on the page for this field.
        if not elements:
            return None, None

        text = elements[0].text.strip()
        # Ranges are normally written with an en dash; also accept an em dash
        # or a plain ASCII hyphen so "2-4" is read the same as "2–4".
        pieces = text.replace('—', '–').replace('-', '–').split('–')
        low = pieces[0]
        high = pieces[1] if len(pieces) > 1 else pieces[0]
        try:
            return int(low), int(high)
        except ValueError:
            # Empty or non-numeric text: report "unknown" instead of raising.
            return None, None

    min_players, max_players = extract_min_max(players)
    min_time, max_time = extract_min_max(timing)

    return min_players, max_players, min_time, max_time

In [100]:
#function to cleanly collect game credits in SECTION 1

def get_credits(credits, index, new_line='\n'):
    """Return the cleaned credit entries held by ``credits[index]``.

    The element's ``.text`` is split on ``new_line``; each piece is stripped,
    and empty pieces or "N/A" placeholders (any letter case) are dropped.

    Returns:
        A list of the remaining strings, or ``None`` when nothing remains or
        when the lookup fails with IndexError, ValueError or AttributeError.
    """
    try:
        entries = []
        for piece in credits[index].text.split(new_line):
            piece = piece.strip()
            if piece and piece.upper() != 'N/A':
                entries.append(piece)
    except (IndexError, ValueError, AttributeError):
        return None
    return entries or None

In [105]:
# collect game stats in SECTION 2

def get_stats(stats_elem, stats_index, sub_index, new_line='\n'):
    """Read one numeric statistic out of a scraped stats list.

    ``stats_elem[stats_index].text`` is split on ``new_line`` and the line at
    ``sub_index`` is converted as follows:

    * contains a comma    -> commas removed, parsed as int ("54,321" -> 54321)
    * contains " / "      -> part before the slash parsed as float
    * all digits          -> int
    * anything else       -> float

    Returns:
        The parsed number, or ``None`` when the lookup or the conversion
        fails with IndexError, ValueError or AttributeError.
    """
    try:
        lines = stats_elem[stats_index].text.split(new_line)
        token = lines[sub_index].strip()

        if ',' in token:
            return int(token.replace(',', ''))
        if ' / ' in token:
            return float(token.split(' / ')[0])
        if token.isdigit():
            return int(token)
        return float(token)
    except (IndexError, ValueError, AttributeError):
        return None

In [104]:
# function to handle ratings in SECTION 3

def get_rating(ratings, rating_index):
    """Convert the abbreviated count shown in ``ratings[rating_index]`` to int.

    Accepted forms of the element's ``.text`` (surrounding whitespace and
    thousands separators are ignored):

    * plain integers:           "345"   -> 345
    * thousands with a suffix:  "12k"   -> 12000, "1.5k" -> 1500,
                                "1.25k" -> 1250

    The previous implementation tested ``'k' and '.' in raw_rating``, which
    Python evaluates as just ``'.' in raw_rating``, and it rebuilt the number
    by deleting the dot and appending "00" - only correct for exactly one
    decimal digit. The value is now parsed numerically and scaled.

    Raises:
        IndexError: ``rating_index`` is out of range.
        ValueError: the text is not a number in one of the forms above.
    """
    raw_rating = ratings[rating_index].text.strip().lower().replace(',', '')

    multiplier = 1
    if raw_rating.endswith('k'):
        multiplier = 1000
        raw_rating = raw_rating[:-1].strip()

    # round() guards against float artefacts such as 2299.9999999999995.
    return int(round(float(raw_rating) * multiplier))

Extraction Cell

In [None]:
# Launch Chrome; webdriver_manager supplies the chromedriver binary.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

allboardgames = 0        # game pages attempted so far (successful or not)
all_games_links = []     # game URLs harvested from the browse pages

destination = "boardgamegeek.json"   # JSON array, one object appended per game
first_iteration = True   # not referenced anywhere else in this cell
row_number = 1           # running row_id; advanced after every attempt

# Browse pages visited are range(page_one, page_end): page_end is exclusive.
page_one = 1
page_end = 2

game_links = 'GoToGames.txt'   # plain-text dump of the harvested URLs

# Everything that talks to the browser runs inside try/finally so the Chrome
# session is closed even if the cell is interrupted or an error escapes.
try:
    # Seed the output file with an empty JSON array on first use.
    if not os.path.exists(destination):
        with open(destination, 'w', encoding='utf-8') as f:
            json.dump([], f)

    # ------------------------------------------------------------------
    # Phase 1: collect the game links from the browse pages
    # ------------------------------------------------------------------
    try:
        logging.info("="*30 + " Selenium Takeover " + "="*30 )
        logging.info(f"Collecting boardgame links across {page_end - page_one} pages...")
        for page in range(page_one, page_end):
            url = f"https://boardgamegeek.com/browse/boardgame/page/{page}"
            driver.get(url)
            time.sleep(10)

            try:
                game_links_per_page = [
                    link.get_attribute('href')
                    for link in driver.find_elements(By.XPATH, '//a[@class="primary"]')
                ]
                # get_attribute() yields None for an anchor without href;
                # drop those so the file write and driver.get() below are safe.
                all_games_links.extend(link for link in game_links_per_page if link)
            except (NoSuchElementException, StaleElementReferenceException) as e:
                logging.warning(f"Error on page {page}: {e}")
    except (TimeoutException, WebDriverException, InvalidSessionIdException) as e:
        logging.error(f"Navigation error: {e}")

    with open(game_links, 'w', encoding='utf-8') as file:
        for game_link in all_games_links:
            file.write(game_link + '\n')

    logging.info(f"{len(all_games_links)} links found for this session and stored at {game_links}...")

    # ------------------------------------------------------------------
    # Phase 2: visit every game page and scrape it section by section
    # ------------------------------------------------------------------
    with tqdm(total=len(all_games_links), desc="Games Scraped") as pbar:

        for href in all_games_links:

            # Per-game state is reset up front so that a failing section can
            # neither reuse values left over from the previous game nor hit
            # an unbound name later (including in the error handler below).
            bg = {'row_id': row_number}
            stats_elem = []
            # URL slug used as a label in log lines until the title is scraped.
            game_name = href.rstrip('/').rsplit('/', 1)[-1]
            start_time = time.time()

            try:
                driver.get(href)
                time.sleep(10)

                # Parsed snapshot of the landing page; not used later in this cell.
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, 'html.parser')

                # Section 0
                # xt/players and play-time
                try:
                    players = driver.find_elements(By.XPATH, '//span[@ng-if="::geekitemctrl.geekitem.data.item.minplayers > 0 || geekitemctrl.geekitem.data.item.maxplayers > 0"]')
                    timing = driver.find_elements(By.XPATH, '//span[@min="::geekitemctrl.geekitem.data.item.minplaytime" and @max="::geekitemctrl.geekitem.data.item.maxplaytime"]')

                    description_elem = driver.find_elements(By.XPATH, '//span[@itemprop="description"]')
                    # A missing description must not discard the player/time data.
                    description = description_elem[0].text.strip() if description_elem else None
                    min_players, max_players, min_time, max_time = players_time(players, timing)

                    bg.update({
                        'description': description,
                        'player_counts': {
                            'min_players': min_players,
                            'max_players': max_players
                        },
                        'playtime': {
                            'min_playtime': min_time,
                            'max_playtime': max_time
                        }
                    })
                except Exception as e:
                    logging.warning(f"{e} during players & playtime extraction at {href}")

                # Section 1
                # click on credits
                try:
                    SeeFullCredits = WebDriverWait(driver,10).until(
                        EC.element_to_be_clickable((By.XPATH, '//a[@ui-sref="geekitem.credits"]'))
                    )
                    SeeFullCredits.click()
                except Exception as e:
                    logging.warning(f'Could not click Credits at {href}: {e}')

                # wait for game_name and release year element; a timeout here
                # propagates to the outer handler and the game is skipped
                WebDriverWait(driver,10).until(
                    EC.visibility_of_element_located((By.XPATH, '//span[@ng-bind-html="creditsctrl.geekitem.data.item[info.keyname]|to_trusted"]'))
                )

                # xt/game_name, release year
                try:
                    spans = driver.find_elements(By.XPATH, '//span[@ng-bind-html="creditsctrl.geekitem.data.item[info.keyname]|to_trusted"]')
                    credits = driver.find_elements(By.XPATH, '//div[@ng-if="info.datatype == \'geekitem_linkdata\'"]')

                    # spans[0] is stored as the title and spans[1] is parsed as the year.
                    bg['boardgame'] = spans[0].text.strip()

                    # Fixed: this used to read `[0].text`, which always raised
                    # AttributeError and skipped game_info and credits entirely.
                    game_name = bg['boardgame']
                    logging.info(f"Now Scraping: {game_name} | URL: {href}")

                    bg['game_info'] = {
                        'release_year' : int(spans[1].text.strip()),
                        "categories": get_credits(credits, 10),
                        "mechanisms": get_credits(credits, 11),
                        "family": get_credits(credits, 12)
                    }

                    # xt/game-credits with the get_credits func.
                    bg['credits'] = {
                        "designers": get_credits(credits, 0),
                        "solo_designer": get_credits(credits, 1),
                        "artists": get_credits(credits, 2),
                        "publishers": get_credits(credits, 3),
                        "developer": get_credits(credits, 4),
                        "graphic_designer": get_credits(credits, 5),
                        "sculptor": get_credits(credits, 6),
                        "editor": get_credits(credits, 7),
                        "writer": get_credits(credits, 8),
                        "insert_designer": get_credits(credits, 9)
                    }
                except Exception as e:
                    logging.warning(f"{e} in credits section of {driver.current_url}")

                # Section 2
                # click on stats section
                SeeGameStats = WebDriverWait(driver,10).until(
                    EC.element_to_be_clickable((By.XPATH, '//a[@ui-sref="geekitem.stats({})"]'))
                )
                SeeGameStats.click()

                # wait until stats appear
                WebDriverWait(driver,10).until(
                    EC.visibility_of_element_located((By.XPATH, '//div[@class="row game-stats"]'))
                )

                try:
                    # xt/relevant stats with get_stats func.
                    stats_elem = driver.find_elements(By.XPATH, '//ul[@class="outline fs-responsive-sm outline-border-col-xs"]')

                    # game stats
                    bg['game_stats'] = {
                        "average_rating": get_stats(stats_elem, 0, 1),
                        "num_of_ratings": get_stats(stats_elem, 0, 3),
                        "std_deviation": get_stats(stats_elem, 0, 5),
                        "weight": get_stats(stats_elem, 0, 7),
                        "comments": get_stats(stats_elem, 0, 9),
                        "fans": get_stats(stats_elem, 0, 11),
                        "page_views": get_stats(stats_elem, 0, 13)
                    }
                except Exception as e:
                    logging.warning(f"{e} in game stats of {driver.current_url}")

                # ranks stats: labels and values are paired positionally
                bg['ranks']={}
                try:
                    rank_labels = driver.find_elements(By.XPATH, '//span[@class="rank-title ng-binding"]')
                    rank_values = driver.find_elements(By.XPATH, '//a[@class="rank-value ng-binding ng-scope"]')

                    for label, value in zip(rank_labels, rank_values):
                        bg['ranks'][label.text.strip().lower()] = int(value.text.replace(",","").strip())

                except Exception as e:
                    logging.warning(f"{e} in rank stats of {href}")

                # play stats
                try:
                    bg['play_stats'] = {
                        "all_time_plays": get_stats(stats_elem, 2, 1),
                        "this_month_plays": get_stats(stats_elem, 2, 3)
                    }
                except Exception as e:
                    logging.warning(f"{e} in play stats of {driver.current_url}")

                # collection stats
                try:
                    bg['collection_stats'] = {
                        "own": get_stats(stats_elem, 3, 1),
                        "previously_owned": get_stats(stats_elem, 3, 3),
                        "for_trade": get_stats(stats_elem, 3, 5),
                        "want_in_trade": get_stats(stats_elem, 3, 8),
                        "wishlist": get_stats(stats_elem, 3, 11)
                    }
                except Exception as e:
                    logging.warning(f"{e} in collection stats of {driver.current_url}")

                # Section 3
                # wait until ratings appear (at least 20 SVG <text> nodes)
                WebDriverWait(driver, 10).until(
                    lambda d: len(d.find_elements(By.XPATH, "//*[name()='text']")) >= 20
                )
                try:
                    # name()='text' is a workaround xpath for namespaced markup such as SVG
                    ratings = driver.find_elements(By.XPATH, "//*[name()='text']")

                    # xt/ratings with the get_rating function: nodes 10..19 are
                    # read as the counts for ratings 1..10
                    bg['ratings'] = {
                        f"rated_{i}": get_rating(ratings, 9 + i)
                        for i in range(1, 11)
                    }
                except Exception as e:
                    logging.warning(f"{e} in scraping ratings of {driver.current_url}")

                bg['link_to_game']=href

                end_time = time.time()
                duration = end_time - start_time

                logging.info(f"Scraping completed for {game_name} in {duration:.2f} seconds.")

                # The file is re-read and re-written after every game, so the
                # records scraped so far are on disk even if a later game fails.
                with open(destination, 'r', encoding='utf-8') as f:
                    content = f.read()
                data = json.loads(content) if content.strip() else []
                data.append(bg)

                with open(destination, "w", encoding="utf-8") as f:
                    json.dump(data, f, indent=4, ensure_ascii=False)

                logging.info(f"Records dumped in json file: {destination}")

            except Exception as e:
                # game_name is always bound here (slug fallback set above).
                logging.error(f'Error scraping boardgame, {game_name} at {href} - {e}')

            row_number +=1
            allboardgames+=1
            pbar.update(1)

finally:
    driver.quit()

Games Scraped:   1%|          | 1/100 [00:38<1:04:01, 38.81s/it]

Error scraping boardgame at https://boardgamegeek.com/boardgame/224517/brass-birmingham - Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=138.0.7204.183)
Stacktrace:
	GetHandleVerifier [0x0x79ba83+63395]
	GetHandleVerifier [0x0x79bac4+63460]
	(No symbol) [0x0x5e2113]
	(No symbol) [0x0x5c0fb9]
	(No symbol) [0x0x655ace]
	(No symbol) [0x0x670099]
	(No symbol) [0x0x64f1a6]
	(No symbol) [0x0x61e7b2]
	(No symbol) [0x0x61f654]
	GetHandleVerifier [0x0xa18883+2672035]
	GetHandleVerifier [0x0xa13cba+2652634]
	GetHandleVerifier [0x0x7c2bca+223466]
	GetHandleVerifier [0x0x7b2cb8+158168]
	GetHandleVerifier [0x0x7b978d+185517]
	GetHandleVerifier [0x0x7a3b78+96408]
	GetHandleVerifier [0x0x7a3d02+96802]
	GetHandleVerifier [0x0x78e90a+9770]
	BaseThreadInitThunk [0x0x76f9fcc9+25]
	RtlGetAppContainerNamedObjectPath [0x0x77a182ae+286]
	RtlGetAppContainerNamedObjectPath [0x0x77a1827e+238]

Error scraping boardgame at https://boardgamegeek

Games Scraped: 100%|██████████| 100/100 [00:39<00:00,  2.53it/s]

Error scraping boardgame at https://boardgamegeek.com/boardgame/285774/marvel-champions-the-card-game - Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=138.0.7204.183)
Stacktrace:
	GetHandleVerifier [0x0x79ba83+63395]
	GetHandleVerifier [0x0x79bac4+63460]
	(No symbol) [0x0x5e2113]
	(No symbol) [0x0x5c0fb9]
	(No symbol) [0x0x655ace]
	(No symbol) [0x0x670099]
	(No symbol) [0x0x64f1a6]
	(No symbol) [0x0x61e7b2]
	(No symbol) [0x0x61f654]
	GetHandleVerifier [0x0xa18883+2672035]
	GetHandleVerifier [0x0xa13cba+2652634]
	GetHandleVerifier [0x0x7c2bca+223466]
	GetHandleVerifier [0x0x7b2cb8+158168]
	GetHandleVerifier [0x0x7b978d+185517]
	GetHandleVerifier [0x0x7a3b78+96408]
	GetHandleVerifier [0x0x7a3d02+96802]
	GetHandleVerifier [0x0x78e90a+9770]
	BaseThreadInitThunk [0x0x76f9fcc9+25]
	RtlGetAppContainerNamedObjectPath [0x0x77a182ae+286]
	RtlGetAppContainerNamedObjectPath [0x0x77a1827e+238]

Error scraping boardgame at https:/




In [None]:
# Path of the CSV file produced by this conversion step.
final_destination = "boardgame-geek-dataset.csv"

# Read the scraped records back from the JSON file written by the scraper.
with open(destination, 'r', encoding='utf-8') as f:
    data = json.loads(f.read())

def flatten_lists(record, sep='.'):
    """Flatten one scraped record into a single-level dict suitable for CSV.

    * list values are joined into one ``"; "``-separated string;
    * dict values are expanded recursively, each child stored under
      ``"<parent><sep><child>"`` (e.g. ``credits.designers``);
    * every other value is copied unchanged.

    Previously only top-level lists were handled, so lists sitting inside
    nested dicts were never joined and whole dicts ended up as Python reprs
    in the CSV cells.

    Args:
        record: mapping of field name to value; values may be nested dicts.
        sep: separator placed between parent and child keys.

    Returns:
        A new dict with no dict or list values. An empty nested dict
        contributes no keys.
    """
    flattened = {}
    for key, value in record.items():
        if isinstance(value, dict):
            for child_key, child_value in flatten_lists(value, sep).items():
                flattened[f"{key}{sep}{child_key}"] = child_value
        elif isinstance(value, list):
            flattened[key] = "; ".join(str(v) for v in value)
        else:
            flattened[key] = value
    return flattened

# Run every loaded record through flatten_lists before writing.
flattened_data = list(map(flatten_lists, data))

# Records may not all carry the same keys: the CSV header is the sorted
# union of every key seen in any record.
all_keys = set().union(*(record.keys() for record in flattened_data))
fieldnames = sorted(all_keys)

with open(final_destination, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for row in flattened_data:
        writer.writerow(row)

print(f"Successfully converted {destination} to {final_destination}")

Sandbox

In [None]:
# Scratch session: open a single game's stats page in a fresh Chrome window.
url = "https://boardgamegeek.com/boardgame/167791/terraforming-mars/stats"

chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)
driver.get(url)