In [1]:
import requests
from bs4 import BeautifulSoup
import scipy.io
import matplotlib.pyplot as plt
import matplotlib 
import pandas as pd
import numpy as np
import pickle
from time import sleep
import timeit


In [2]:
def request(msg, slp=1):
    '''Fetch a URL with ``requests.get``, retrying until the server answers 200.

    Parameters
    ----------
    msg : str
        The URL to request.
    slp : int or float, optional
        Seconds to wait before retrying after a non-200 response (default 1).

    Returns
    -------
    requests.Response
        The first response whose status code is 200.

    Notes
    -----
    There is no retry limit, so this blocks for as long as the server keeps
    failing. Only ``requests`` errors (connection problems, timeouts, ...) are
    retried; KeyboardInterrupt and programming errors propagate, so the cell
    can still be interrupted from the notebook.
    '''
    while True:
        try:
            # The timeout keeps a silently dropped connection from hanging forever;
            # a timeout surfaces as a RequestException and is retried below.
            r = requests.get(msg, timeout=60)
        except requests.exceptions.RequestException:
            print("An exception has occurred, probably a momentory loss of connection. Waiting one seconds...")
            sleep(1)
            continue
        if r.status_code == 200:
            return r
        print("Server Error! Response Code %i. Retrying..." % (r.status_code))
        sleep(slp)


In [23]:
# Scrape the "browse boardgames" listing (sorted by number of voters, descending)
# page by page and collect id / name / average rating / number of ratings per game.
GAME_COLUMNS = ["id", "name", "rating", "nrate"]
MIN_RATINGS = 100          # Stop once a page contains a game with this many ratings or fewer
npage = 39                 # Results page to start scraping from
min_nrate = MIN_RATINGS + 1  # Sentinel so the loop body runs at least once
page_frames = []           # One DataFrame per scraped page; concatenated once at the end

# Scrape successive pages of the results until we get down to games with <= MIN_RATINGS ratings
while min_nrate > MIN_RATINGS:
    # Get full HTML for a specific page in the full listing of boardgames sorted by number of voters
    r = request("https://boardgamegeek.com/browse/boardgame/page/%i?sort=numvoters&sortdir=desc" % (npage,))
    soup = BeautifulSoup(r.text, "html.parser")

    # Get list of all the rows (tags) in the list of games on this page
    table = soup.find_all("tr", attrs={"id": "row_"})
    if not table:
        # No game rows: min() would be NaN and the progress message would fail on "%i"
        print("Page %i contained no games, stopping." % (npage,))
        break

    # Loop through each row and pull out the info for that game
    records = []
    for row in table:
        # Row may or may not start with a "boardgame rank" link, if YES then strip it
        links = row.find_all("a")
        if "name" in links[0].attrs.keys():
            del links[0]
        gamelink = links[1]  # Link whose href is the relative URL for the specific game
        gameid = int(gamelink["href"].split("/")[2])  # Game ID parsed out of the relative URL
        gamename = str(gamelink.contents[0])  # Plain str so the row does not keep the parsed page alive

        ratings = row.find_all("td", attrs={"class": "collection_bggrating"})
        avg = float("".join(ratings[1].contents[0].split()))
        nratings = int("".join(ratings[2].contents[0].split()))

        records.append([gameid, gamename, avg, nratings])

    # DF holding this page's results (also left in `df` after the loop, as before)
    df = pd.DataFrame(records, columns=GAME_COLUMNS)
    min_nrate = df["nrate"].min()  # The smallest number of ratings of any game on the page
    print("Page %i scraped, minimum number of ratings was %i" % (npage, min_nrate))
    page_frames.append(df)
    npage += 1
    sleep(2) # Keep the BGG server happy.

# Concatenate the results of all pages into the master dataframe in one go
if page_frames:
    df_all = pd.concat(page_frames, axis=0, ignore_index=True)
else:
    df_all = pd.DataFrame(columns=GAME_COLUMNS)

Page 39 scraped, minimum number of ratings was 478
Page 40 scraped, minimum number of ratings was 462
Page 41 scraped, minimum number of ratings was 443
Page 42 scraped, minimum number of ratings was 426
Page 43 scraped, minimum number of ratings was 413
Page 44 scraped, minimum number of ratings was 400
Page 45 scraped, minimum number of ratings was 388
Page 46 scraped, minimum number of ratings was 375
Page 47 scraped, minimum number of ratings was 365
Page 48 scraped, minimum number of ratings was 355
Page 49 scraped, minimum number of ratings was 345
Page 50 scraped, minimum number of ratings was 335
Page 51 scraped, minimum number of ratings was 325
Page 52 scraped, minimum number of ratings was 316
Page 53 scraped, minimum number of ratings was 304
Page 54 scraped, minimum number of ratings was 297
Page 55 scraped, minimum number of ratings was 288
Page 56 scraped, minimum number of ratings was 279
Page 57 scraped, minimum number of ratings was 272
Page 58 scraped, minimum number

In [5]:
# Reload the game list from the compressed CSV and discard the "Unnamed: 0"
# column (presumably the row index that was saved together with the data).
df_all = pd.read_csv("games.csv.gz").drop(columns="Unnamed: 0")
df_all.head()

Unnamed: 0,id,name,rating,nrate
0,13,Catan,7.23,76523
1,822,Carcassonne,7.43,76119
2,30549,Pandemic,7.66,73223
3,36218,Dominion,7.67,62426
4,68448,7 Wonders,7.81,60610


In [6]:
# Add an "nfullpages" column: the number of FULL pages of ratings per game, used
# to track how many API calls are needed. Subtracting 50 and rounding to the
# nearest hundred approximates rounding DOWN to a multiple of 100.
# NOTE(review): round() resolves exact ties to the even choice, so an nrate that
# is an exact multiple of 100 can come out one page lower than nrate // 100. The
# final-page cell requests page nfullpages + 1 from this same formula, so keep
# the two in sync if either is changed.
df_all["nfullpages"] = df_all["nrate"].sub(50).apply(round, ndigits=-2).div(100)
df_all.head()

Unnamed: 0,id,name,rating,nrate,nfullpages
0,13,Catan,7.23,76523,765.0
1,822,Carcassonne,7.43,76119,761.0
2,30549,Pandemic,7.66,73223,732.0
3,36218,Dominion,7.67,62426,624.0
4,68448,7 Wonders,7.81,60610,606.0


In [7]:
import sqlite3
# Open the SQLite database file that the scraped ratings are appended to
# (sqlite3 creates the file if it does not exist yet) and grab a cursor on it.
connex = sqlite3.connect(database="ratings.db")
cur = connex.cursor()

In [8]:
# Independent copy of the games that have fewer than 5000 ratings
df_toy = df_all[df_all["nrate"] < 5000].copy()


In [None]:
from IPython.display import clear_output


#############################################################
# Gathering all ratings from all games in toy data set
#############################################################
GAMES_PER_BATCH = 10      # Number of games requested together in one API call
RESUME_FROM_BATCH = 117   # Batches below this number are skipped (set to 0 to process everything)
RATING_COLUMNS = ["gameid", "username", "rating"]

# Get ratings page-by-page for all games, but do it in chunks of GAMES_PER_BATCH games
for nm, grp in df_toy.groupby(np.arange(len(df_toy))//GAMES_PER_BATCH):
    if nm < RESUME_FROM_BATCH:
        continue
    # Work on a private copy: the page counters below are decremented in place
    # and must not touch (or warn about) df_toy.
    grp = grp.copy()

    # (gameid, username, rating) tuples collected for this chunk of games
    records = []

    # For this group of games, make calls until all FULL pages of every game have been pulled
    pagenum = 1
    while (grp["nfullpages"] > 0).any():
        clear_output()
        print("%i: %i" % (nm, pagenum))
        # Get a restricted DF with only still-active games (have ratings pages left)
        active_games = grp[grp["nfullpages"] > 0]

        # Make the request with the list of all game IDs that have ratings left
        gameids = ",".join(str(gid) for gid in active_games["id"])
        url = "https://www.boardgamegeek.com/xmlapi2/thing?id=%s&ratingcomments=1&page=%i" % (gameids, pagenum)
        r = request(url, 20)
        soup = BeautifulSoup(r.text, "xml")

        # Take the game ID from the <item> element that encloses each comment
        # rather than assuming exactly 100 comments per game in request order;
        # a short or reordered response can then never mislabel a rating or
        # leave zero-filled placeholder rows behind.
        for item in soup.find_all("item"):
            item_gameid = int(item["id"])
            for comm in item.find_all("comment"):
                records.append((item_gameid, comm["username"], float(comm["rating"])))

        grp["nfullpages"] -= 1  # Decrement the number of FULL pages of each game id
        pagenum += 1
        sleep(10)  # Keep the server happy

    # Write this batch of all FULL pages of ratings for this chunk of games to the DB
    df_ratings = pd.DataFrame(records, columns=RATING_COLUMNS)
    df_ratings.to_sql(name="data", con=connex, if_exists="append", index=False)
    print("Processed ratings for batch #%i of games." % (nm))

1021: 1
Processed ratings for batch #1021 of games.
Processed ratings for batch #1022 of games.


In [121]:
# Sum the "nrate" column over the rows of `df` labelled 500 and above and report it.
# NOTE(review): the message speaks of the 20 most-rated games while the slice
# starts at label 500 -- confirm which of the two is intended.
remaining_nrate = df.loc[500:, "nrate"].sum()
print("Dropping the 20 most-rated games would give a total number of ratings of %i" % (remaining_nrate,))

Dropping the 20 most-rated games would give a total number of ratings of 4731


In [85]:
# Spot-check: the rating stored in the row labelled 68101
df_ratings.loc[68101, "rating"]

4.0

In [None]:
#############################################################
# Request the final partial page of ratings for each game
#############################################################
# Recompute the number of FULL pages with the same formula used when the column
# was first created, so the page requested here is the one right after the full
# pages (subtract 50, round to the nearest hundred, then divide by 100).
df_toy["nfullpages"] = (df_toy["nrate"]-50).apply(round, ndigits=-2)/100

# (gameid, username, rating) tuples collected over all games
records = []

# Loop through game-by-game and request the final page of ratings for each game
for idx, row in df_toy.iterrows():
    # Get the game ID and the last page number to request
    pagenum = row["nfullpages"] + 1
    gameid = row["id"]

    # Make the request for just the last page of ratings of this game
    sleep(1)  # Keep the server happy
    r = request("https://www.boardgamegeek.com/xmlapi2/thing?id=%i&ratingcomments=1&page=%i" % (gameid, pagenum), 5)
    soup = BeautifulSoup(r.text, "xml")
    comments = soup("comment")

    # Only one game is requested per call, so every comment belongs to this gameid
    records.extend((gameid, comm["username"], float(comm["rating"])) for comm in comments)

    # idx is the row label carried over from df_all, not a running counter
    if idx%100 == 0:
        print("Finished with a chunk of 100 games.")

# Write this final batch of all partial pages of ratings to the DB
df_ratings = pd.DataFrame(records, columns=["gameid", "username", "rating"])
df_ratings.to_sql(name="data", con=connex, if_exists="append", index=False)

Finished with a chunk of 100 games.
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Finished with a chunk of 100 games.
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Finished with a chunk of 100

Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Finished with a chunk of 100 games.
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Respo

Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Finished with a chunk of 100 games.
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Respo

Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Finished with a chunk of 100 games.
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Respo

Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Finished with a chunk of 100 games.
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Respo

Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Finished with a chunk of 100 games.
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Respo

Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Finished with a chunk of 100 games.
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Finished with a chunk of 100 games.
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 429. Retrying...
Server Error! Response Code 