In [19]:
import requests
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import pandas as pd
import re
import os


start_time = time.time()

# Login page
login_url = 'https://boardgamegeek.com/login?redirect_server=1'

# Credentials are read from the environment (BGG_USERNAME / BGG_PASSWORD) so
# that they are never stored in the notebook; when a variable is not set the
# value is asked for interactively (the password prompt does not echo).
from getpass import getpass
bgg_username = os.environ.get('BGG_USERNAME') or input('BGG username: ')
bgg_password = os.environ.get('BGG_PASSWORD') or getpass('BGG password: ')

# Create an instance of the Chrome WebDriver with Selenium
driver = webdriver.Chrome()

# Navigate to the login page
driver.get(login_url)

# Explicit wait (10 s timeout) used for the consent dialog
wait = WebDriverWait(driver, 10)

# First consent dialog: best effort, the script continues if it is not shown
try:
    # Wait for the consent button to become clickable
    consent_button = wait.until(
        EC.element_to_be_clickable((
            By.XPATH,
            "//button[@aria-label=\"I'm OK with that\"]"
        ))
    )
    consent_button.click()
    print("Consent-button clicked.")
except Exception as e:
    print("Failed to click consent-button:", e)

# Second cookie pop-up: also best effort
try:
    cookie_button = driver.find_element(By.XPATH, '//button[contains(text(), "I\'m OK with that")]')
    cookie_button.click()
    print("2nd consent-button clicked.")
except Exception as e:
    print("Failed to click 2nd consent-button:", e)

# Wait for the current document to be replaced. If the page was already
# replaced before the <html> element was looked up, that element never goes
# stale, so a timeout here is tolerated instead of aborting the run.
try:
    WebDriverWait(driver, 20).until(EC.staleness_of(driver.find_element(By.TAG_NAME, 'html')))
except TimeoutException:
    print("Page did not reload after the consent dialogs; continuing.")

# Locate the username and password input fields and the sign-in button using
# their HTML attributes; wait for the form to be present before using it.
username_input = WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.ID, 'inputUsername'))
)
password_input = driver.find_element(By.ID, 'inputPassword')
signin_button = driver.find_element(By.XPATH, '//button[contains(text(), "Sign In")]')

# Input the username and password
username_input.send_keys(bgg_username)
password_input.send_keys(bgg_password)

# NOTE: implicitly_wait() is not a pause; it sets the driver-wide implicit
# wait (5 s) applied to subsequent element lookups.
driver.implicitly_wait(5)

# Submit the login form by clicking the sign-in button
signin_button.click()

# Wait until a link to a user profile is present, which is taken as the sign
# that the login went through
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'a[href*="/user/"]'))
)
print("Login successful")

# Fetch content from a page after login
driver.get('https://boardgamegeek.com/browse/boardgame/page/')




# ========== LOOPING THE PAGES ==========
def _first_five_names(soup, href_marker):
    """Return exactly five names from the links whose href contains *href_marker*.

    Link texts are collected in document order, truncated to the first five
    and padded with 'N/A' when fewer than five are found.
    """
    names = [
        a.get_text(strip=True)
        for a in soup.find_all('a', href=lambda h: h and href_marker in h)
    ]
    names = names[:5]
    names.extend(['N/A'] * (5 - len(names)))
    return names


def _credit_columns(label):
    """Return the five column names '<label> 1' ... '<label> 5'."""
    return [f'{label} {n}' for n in range(1, 6)]


# Create the output folders once, before scraping starts
for out_dir in ('info', 'designers', 'artists', 'publishers'):
    os.makedirs(out_dir, exist_ok=True)

# Starting the for-loop for a desired range of pages of 100 board games each
r = range(240, 253)
for i in r:
    print('Page:', i, 'of', r[-1])
    page_start_time = time.time()
    driver.get('https://boardgamegeek.com/browse/boardgame/page/'+str(i))
    time.sleep(5)  # inserting a 5 second pause due to the web pages crawl policy
    soupx = BeautifulSoup(driver.page_source, 'html.parser')

    # Get the titles of the board games
    titles_list = [tag.get_text() for tag in soupx.select('.primary')]

    # Get the geek rating, average rating and number of voters.
    # The rating cells are read as one flat sequence and regrouped in threes:
    # (geek rating, average rating, number of voters) per game.
    td_elementsx = soupx.find_all('td', class_='collection_bggrating', align='center')
    ratings_listx = []
    for td in td_elementsx:
        cell_text = td.get_text().strip()
        ratings_listx.append(cell_text if cell_text == 'N/A' else float(cell_text))
    sublistsx = [ratings_listx[j:j + 3] for j in range(0, len(ratings_listx), 3)]
    geek_rating = [row[0] for row in sublistsx]
    avg_rating = [row[1] for row in sublistsx]
    voters = [row[2] if row[2] == 'N/A' else int(row[2]) for row in sublistsx]

    # ====== PRICE ======
    # One entry per shop cell: the amazon.com price as a float, else 'N/A'
    price_list = []
    for td in soupx.find_all('td', class_='collection_shop'):
        price = 'N/A'
        amazon_tag = td.find('a', class_='ulprice', href=True)
        if amazon_tag and 'amazon.com' in amazon_tag['href']:
            price_span = amazon_tag.find('span', class_='positive')
            if price_span:
                price_text = price_span.text.strip().replace('$', '').replace(',', '')
                try:
                    price = float(price_text)
                except ValueError:
                    pass
        price_list.append(price)

    # ---------- Individual board game pages ------------
    # Get the links of each Board Game in the current page
    link_urls = [link.get('href') for link in soupx.find_all('a', class_='primary')]

    # Initialize the lists for the board games of the current page
    year_list = []
    weight_list = []
    min_players_list = []
    max_players_list = []
    min_time_list = []
    max_time_list = []
    age_list = []
    type1_list = []
    type2_list = []
    designer_rows = []
    artist_rows = []
    publisher_rows = []

    # Looping through each of the board game links of the page.
    # Every list above receives exactly one entry per link, so all columns
    # keep the same length when the DataFrames are built.
    for link in link_urls:
        driver.get(f'https://boardgamegeek.com{link}')
        time.sleep(10)
        soup2 = BeautifulSoup(driver.page_source, 'html.parser')

        # ====== YEAR ======
        year_value = 'N/A'
        year_tag = soup2.find('span', class_='game-year ng-binding ng-scope')
        if year_tag:
            try:
                year_value = int(year_tag.get_text(strip=True).strip('()'))
            except ValueError:
                pass  # text was not a plain integer; keep 'N/A'
        year_list.append(year_value)

        # ====== COMPLEXITY (WEIGHT) ======
        weight = 'N/A'
        # Select all span elements that include ng-binding and any gameplay-weight class
        weight_spans = soup2.select('span.ng-binding[class*="gameplay-weight-"]')
        if weight_spans:
            try:
                weight = float(weight_spans[0].text.strip())
            except ValueError:
                pass
        weight_list.append(weight)

        # Find all <p> tags with the class (because there are many: Players, Age, Playtime etc.)
        p_tags = soup2.find_all('p', class_='gameplay-item-primary mb-0')

        # ====== AGE ======
        age_value = 'N/A'
        for p in p_tags:
            if 'Age:' in p.text:
                age_span = p.find('span', itemprop='suggestedMinAge')
                if age_span:
                    age_text = age_span.text.strip().replace('+', '')
                    try:
                        age_value = int(age_text)
                    except ValueError:
                        age_value = 'N/A'
                break  # No need to keep looping once we found it
        age_list.append(age_value)

        # ====== MIN and MAX NUMBER OF PLAYERS ======
        min_players = 'N/A'
        max_players = 'N/A'
        for p in p_tags:
            if 'Players' in p.text:
                # Find the meta tags inside it
                min_tag = p.find('meta', itemprop='minValue')
                max_tag = p.find('meta', itemprop='maxValue')
                if min_tag and min_tag.has_attr('content'):
                    try:
                        min_players = int(min_tag['content'])
                    except ValueError:
                        pass
                if max_tag and max_tag.has_attr('content'):
                    try:
                        max_players = int(max_tag['content'])
                    except ValueError:
                        pass
                break  # No need to continue after we found 'Players'
        # Appended outside the search loop so a page without a 'Players'
        # paragraph still contributes one ('N/A') entry per list.
        min_players_list.append(min_players)
        max_players_list.append(max_players)

        # ====== MIN and MAX PLAYTIME ======
        min_playtime = 'N/A'
        max_playtime = 'N/A'
        for p in p_tags:
            text = p.text.strip()
            if 'Min' in text and 'Age' not in text and 'Weight' not in text:
                # Extract all numbers from the string (usually 1 or 2)
                numbers = re.findall(r'\d+', text)
                if len(numbers) == 1:
                    min_playtime = max_playtime = int(numbers[0])
                elif len(numbers) >= 2:
                    min_playtime = int(numbers[0])
                    max_playtime = int(numbers[1])
                break  # Found playtime, no need to continue
        min_time_list.append(min_playtime)
        max_time_list.append(max_playtime)

        # ====== BOARD GAME TYPE/TYPES ======
        # Text of the <a> tags whose href starts with '/boardgamesubdomain/';
        # only the first two are kept, missing ones stay 'N/A'.
        game_types = ['N/A', 'N/A']
        type_names = [a.text.strip() for a in soup2.select('a[href^="/boardgamesubdomain/"]')]
        for j in range(min(2, len(type_names))):
            game_types[j] = type_names[j]
        type1_list.append(game_types[0])
        type2_list.append(game_types[1])

        # ====== DESIGNERS / ARTISTS / PUBLISHERS ======
        designer_rows.append(_first_five_names(soup2, '/boardgamedesigner/'))
        artist_rows.append(_first_five_names(soup2, '/boardgameartist/'))
        publisher_rows.append(_first_five_names(soup2, '/boardgamepublisher/'))

    # ============== DataFrames Creation ==============
    # 1) bg_info_df
    bg_info_df = pd.DataFrame({
        'Title': titles_list,
        'Geek rating': geek_rating,
        'Avg rating': avg_rating,
        'Num of voters': voters,
        'Price': price_list,
        'Year': year_list,
        'Complexity': weight_list,
        'Min players': min_players_list,
        'Max players': max_players_list,
        'Min time': min_time_list,
        'Max time': max_time_list,
        'Min age': age_list,
        'Type 1': type1_list,
        'Type 2': type2_list
    })

    # Convert titles_list to a DataFrame in order to use it in the next dataframes
    titles_df = pd.DataFrame({'Title': titles_list})

    # 2) bg_designers_df: titles concatenated with the designer columns
    bg_designers_df = pd.DataFrame(designer_rows, columns=_credit_columns('Designer'))
    bg_designers_df = pd.concat([titles_df, bg_designers_df], axis=1)

    # 3) bg_artists_df: built from artist_rows (not designer_rows), with
    #    columns named after the data they hold
    bg_artists_df = pd.DataFrame(artist_rows, columns=_credit_columns('Artist'))
    bg_artists_df = pd.concat([titles_df, bg_artists_df], axis=1)

    # 4) bg_publishers_df: built from publisher_rows (not designer_rows)
    bg_publishers_df = pd.DataFrame(publisher_rows, columns=_credit_columns('Publisher'))
    bg_publishers_df = pd.concat([titles_df, bg_publishers_df], axis=1)

    # ============== CSV creation ==============
    # Save CSVs inside their respective folders; the page index is zero-padded
    # to three digits in the file name
    bg_info_df.to_csv(f'info/bg_info_{i:03}.csv', index=False)
    bg_designers_df.to_csv(f'designers/bg_designers_{i:03}.csv', index=False)
    bg_artists_df.to_csv(f'artists/bg_artists_{i:03}.csv', index=False)
    bg_publishers_df.to_csv(f'publishers/bg_publishers_{i:03}.csv', index=False)

    page_end_time = time.time()
    print(f'Page scrape time: {int((page_end_time - page_start_time) / 60)} minutes')


Consent-button clicked.
2nd consent-button clicked.
Login successful
Page: 240 of 252
Page scrape time: 32 minutes
Page: 241 of 252
Page scrape time: 25 minutes
Page: 242 of 252
Page scrape time: 25 minutes
Page: 243 of 252
Page scrape time: 26 minutes
Page: 244 of 252
Page scrape time: 25 minutes
Page: 245 of 252
Page scrape time: 25 minutes
Page: 246 of 252
Page scrape time: 25 minutes
Page: 247 of 252
Page scrape time: 26 minutes
Page: 248 of 252
Page scrape time: 26 minutes
Page: 249 of 252
Page scrape time: 26 minutes
Page: 250 of 252
Page scrape time: 25 minutes
Page: 251 of 252
Page scrape time: 26 minutes
Page: 252 of 252
Page scrape time: 25 minutes


In [20]:
# Now we will merge the created csv files to a single csv (for each part)

def _page_sort_key(file_name):
    """Sort key ordering per-page files by the number before '.csv'.

    Files without such a number sort after the numbered ones, by name.
    """
    match = re.search(r'(\d+)\.csv$', file_name)
    if match:
        return (0, int(match.group(1)), file_name)
    return (1, 0, file_name)


def merge_page_csvs(folder, prefix, output_path):
    """Concatenate the per-page CSVs of one part into a single CSV file.

    Parameters:
        folder: directory that holds the per-page files.
        prefix: file-name prefix of the files to merge (e.g. "bg_info_");
            only names that start with it and end with ".csv" are used.
        output_path: path of the merged CSV to write.

    Returns:
        The merged DataFrame, or None when the folder does not exist or holds
        no matching file (a message is printed and nothing is written).
    """
    if not os.path.isdir(folder):
        print(f'Nothing merged into {output_path}: folder "{folder}" does not exist.')
        return None

    # list the matching files inside the subfolder
    csv_files = [f for f in os.listdir(folder) if f.startswith(prefix) and f.endswith(".csv")]
    if not csv_files:
        print(f'Nothing merged into {output_path}: no "{prefix}*.csv" files in "{folder}".')
        return None

    # keep them in page order; a numeric key is used because a plain string
    # sort would put e.g. page 1000 before page 240
    csv_files.sort(key=_page_sort_key)

    # prepend folder name so pandas can find them
    df_list = [pd.read_csv(os.path.join(folder, f)) for f in csv_files]
    merged = pd.concat(df_list, ignore_index=True)
    merged.to_csv(output_path, index=False)
    return merged


# (folder, per-page file prefix, merged file saved in the main folder)
for part_folder, part_prefix, merged_name in (
    ("info", "bg_info_", "bg_info.csv"),
    ("artists", "bg_artists_", "bg_artists.csv"),
    ("designers", "bg_designers_", "bg_designers.csv"),
    ("publishers", "bg_publishers_", "bg_publishers.csv"),
):
    merged_df = merge_page_csvs(part_folder, part_prefix, merged_name)
