In [7]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from tqdm import tqdm
import numpy as np
import time

In [6]:
# Input: the board game ranks dump published by BGG
# (https://boardgamegeek.com/data_dumps/bg_ranks), saved locally as a .csv file.

# Per-category rank columns that are not needed downstream
cols_to_drop = [
    'abstracts_rank',
    'cgs_rank',
    'childrensgames_rank',
    'familygames_rank',
    'partygames_rank',
    'strategygames_rank',
    'thematic_rank',
    'wargames_rank',
]

# Read the dump, remove the per-category ranks and keep only the first 5000 rows
df = (
    pd.read_csv('boardgames_ranks.csv')
    .drop(columns=cols_to_drop)
    .iloc[:5000]
)
df

In [8]:
# Columns of the per-game detail dataframe that is built from the API responses
col = ['name', 'weight', 'minplayers', 'maxplayers', 'minplaytime', 'maxplaytime', 'age', 'category', 'mechanic']
d = {i: [] for i in col}

REQUEST_TIMEOUT = 30    # seconds; without a timeout requests.get can wait indefinitely on a stalled connection
CHECKPOINT_EVERY = 100  # write a partial bgg.csv and pause after this many games
PAUSE_SECONDS = 5       # length of the pause taken at each checkpoint

# scraping data for the top-5000 boardgames using the BGG API
# One request per game id; the values are appended in the same order as the ids in df,
# which is what lets a later cell join the two frames by position.
for game_id in tqdm(list(df['id'][:5000])):
    url = f'https://api.geekdo.com/xmlapi/boardgame/{game_id}?stats=1'
    res = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail on an HTTP error status (4xx/5xx) instead of trying to parse an error page
    res.raise_for_status()
    soup = bs(res.text, 'lxml')

    # Single-valued fields: take the first matching element of each tag
    d['name'].append(soup.find('name', attrs={'primary': 'true'}).text)
    d['weight'].append(soup.select('averageweight')[0].text)
    d['minplayers'].append(soup.select('minplayers')[0].text)
    d['maxplayers'].append(soup.select('maxplayers')[0].text)
    d['minplaytime'].append(soup.select('minplaytime')[0].text)
    d['maxplaytime'].append(soup.select('maxplaytime')[0].text)
    d['age'].append(soup.select('age')[0].text)

    # Multi-valued fields: a game can list any number of categories/mechanics (possibly none)
    categories = [cat.text for cat in soup.select('boardgamecategory')]
    d['category'].append(categories)

    mechanics = [mech.text for mech in soup.select('boardgamemechanic')]
    d['mechanic'].append(mechanics)

    # Periodic checkpoint: persist what has been collected so far, then pause before continuing
    if len(d['name']) % CHECKPOINT_EVERY == 0:
        df2 = pd.DataFrame(data=d, columns=col)
        df2.to_csv('bgg.csv', index=False)
        time.sleep(PAUSE_SECONDS)

# Always build and save the final frame after the loop, so that df2 exists and contains
# every scraped game even when the number of games is not a multiple of CHECKPOINT_EVERY.
df2 = pd.DataFrame(data=d, columns=col)
df2.to_csv('bgg.csv', index=False)

In [14]:
# Place the scraped detail columns next to the ranks table (rows are matched on the index)
# and write the merged table to disk. The scraped 'name' column is left out of the merge.
df_combined = pd.concat(
    objs=[df, df2.drop(columns=['name'])],
    axis='columns',
)
df_combined.to_csv('bgg_top5000.csv', index=False)

In [16]:
# Final dataframe: read the saved combined file back in and display it.
# NOTE(review): the list-valued 'category' / 'mechanic' columns presumably come back
# as plain strings after the CSV round-trip — confirm before treating them as lists.

df = pd.read_csv(filepath_or_buffer="bgg_top5000.csv")
df

Unnamed: 0,id,name,yearpublished,rank,bayesaverage,average,usersrated,weight,minplayers,maxplayers,minplaytime,maxplaytime,age,category,mechanic
0,224517,Brass: Birmingham,2018,1,8.41611,8.59849,45505,3.8809,2,4,60,120,14,"['Age of Reason', 'Economic', 'Industry / Manu...","['Hand Management', 'Income', 'Loans', 'Market..."
1,161936,Pandemic Legacy: Season 1,2015,2,8.38079,8.52838,53369,2.8315,2,4,60,60,13,"['Environmental', 'Medical']","['Action Points', 'Cooperative Game', 'Hand Ma..."
2,174430,Gloomhaven,2017,3,8.35418,8.59040,62022,3.9112,1,4,60,120,14,"['Adventure', 'Exploration', 'Fantasy', 'Fight...","['Action Queue', 'Action Retrieval', 'Campaign..."
3,342942,Ark Nova,2021,4,8.33397,8.53500,43007,3.7593,1,4,90,150,14,"['Animals', 'Economic', 'Environmental']","['Action Queue', 'End Game Bonuses', 'Hand Man..."
4,233078,Twilight Imperium: Fourth Edition,2017,5,8.24169,8.60196,23644,4.3152,3,6,240,480,14,"['Civilization', 'Economic', 'Exploration', 'N...","['Action Drafting', 'Area-Impulse', 'Dice Roll..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,249582,The Dark Summer: Normandy 1944,2021,4996,5.74559,7.88088,273,2.8667,1,2,480,480,0,"['Wargame', 'World War II']","['Chit-Pull System', 'Dice Rolling']"
4996,963,The Gardens of the Alhambra,1993,4997,5.74555,6.15148,1370,1.7913,2,4,45,60,8,['Abstract Strategy'],"['Area Majority / Influence', 'Tile Placement']"
4997,72644,Perplexus,2001,4998,5.74546,6.94747,442,1.5000,1,1,30,30,6,"['Action / Dexterity', 'Maze']",[]
4998,97273,Upon a Salty Ocean,2011,4999,5.74544,6.47869,712,2.8971,2,4,120,120,13,"['Economic', 'Nautical', 'Renaissance']","['Action Points', 'Variable Phase Order']"
