In [None]:
!pip3 install html5lib bs4

In [None]:
import os
from pprint import pprint
from bs4 import BeautifulSoup

from settings import downloadsFolder, platformsOrdered

In [None]:
# Keep only the downloaded files whose name, up to the first underscore,
# is one of the known platforms; then order them alphabetically.
filenames = [entry for entry in os.listdir(downloadsFolder)
             if entry.partition("_")[0] in platformsOrdered]
filenames.sort()

In [None]:
def parseMetascore(soup, urlpath, resultsDict):
    """Extract the critic metascore and its review count from a game page.

    Args:
        soup: parsed page; only ``find`` is called on it (a BeautifulSoup
            document in this notebook).
        urlpath: site-relative path of the game, used to build the
            ``<urlpath>/critic-reviews`` href that is searched for.
        resultsDict: dict updated in place with ``"metascore"`` (int) and
            ``"metascoreBased"`` (int, number of critic reviews).

    Any element that cannot be found yields 0 for the value derived from it,
    so both keys are always present afterwards.
    """
    # Defaults first: the keys must exist even when the whole block is absent.
    resultsDict["metascore"] = 0
    resultsDict["metascoreBased"] = 0

    ms = soup.find('div', attrs={'class': 'score_summary metascore_summary'})
    if ms is None:
        return

    # metascore
    metascoreFind = ms.find('span', attrs={'itemprop': 'ratingValue'})
    if metascoreFind is not None:
        resultsDict["metascore"] = int(metascoreFind.text)

    # metascore number of reviews: summary div -> critic-reviews link -> span
    summary = ms.find('div', attrs={'class': 'summary'})
    criticReviews = None
    if summary is not None:
        criticReviews = summary.find('a', attrs={'href': '%s/critic-reviews' % urlpath})
    countSpan = criticReviews.find('span') if criticReviews is not None else None
    if countSpan is not None:
        resultsDict["metascoreBased"] = int(countSpan.text.strip())

def parseUserscore(soup, urlpath, resultsDict):
    """Extract the user score and the number of user ratings from a game page.

    Args:
        soup: parsed page; ``find`` is called on it (a BeautifulSoup document
            in this notebook).
        urlpath: site-relative path of the game, used to build the
            ``<urlpath>/user-reviews`` href that is searched for.
        resultsDict: dict updated in place with ``"userscore"`` (float, or 0
            when the page shows "tbd") and ``"userscoreBased"`` (int).

    ``"userscoreBased"`` is the rating count when the user-reviews link exists.
    Otherwise the "Awaiting N more rating(s)" message is parsed and stored as
    ``-N``.

    Raises:
        ValueError: if the userscore wrapper div is missing, or if the number
            of matching score tags inside it is not exactly 1.
        AttributeError: if the summary div or the "Awaiting" message span is
            missing (``find`` returned None and is dereferenced).
    """
    # userscore
    us = soup.find('div', attrs={'class': 'userscore_wrap feature_userscore'})
    if us is None:
        raise ValueError("userscore block not found")
    # "^=" is the CSS begins-with operator; the raw string keeps the
    # backslash-escaped spaces of the selector without invalid Python escapes.
    userscoreTags = us.select(r"div[class^=metascore_w\ user\ large\ game]")
    if len(userscoreTags) != 1:
        raise ValueError("number of userscore tags not equal 1")
    userscoreText = userscoreTags[0].text.strip()
    resultsDict["userscore"] = 0 if userscoreText == "tbd" else float(userscoreText)

    # userscore number of reviews
    usersummary = us.find('div', attrs={'class': 'summary'})
    userReviews = usersummary.find('a', attrs={'href': '%s/user-reviews' % urlpath})
    if userReviews is not None:
        answer = int(userReviews.text.replace("Ratings", "").strip())
    else:
        # No ratings link yet: the page shows "Awaiting N more rating(s)".
        # Strip the words (and the plural "s") to leave N, stored negated.
        um = usersummary.find('span', attrs={'class': 'connect4_msg'}).text.strip()
        answer = -int(um.replace("Awaiting", "").replace("more rating", "").replace("s", ""))
    resultsDict["userscoreBased"] = answer

def parseOtherInfos(soup, resultsDict):
    """Extract player count, developer, release date and genres from a page.

    The HTML structure is ignored: the body text is split into stripped,
    non-empty lines and each value is read relative to its label line.

    Args:
        soup: parsed page; only ``soup.body.text`` is read.
        resultsDict: dict updated in place with the string values ``"nops"``,
            ``"developer"``, ``"released"`` and ``"genres"``.

    ``"nops"`` is optional and becomes ``""`` when the "# of players:" label
    is absent or is the last line.

    Raises:
        ValueError: if the "Developer:" or "Release Date:" label is missing.
        IndexError: if one of those two labels is the last line.
        StopIteration: if no line starts with "Genre(s):".
    """
    # forget HTML, just parse the text
    textlines = [line.strip() for line in soup.body.text.split("\n")
                 if line.strip() != ""]

    # number of players: the value is the line following the label.
    # list.index raises ValueError when the label is missing, so an index of 0
    # is a valid hit and must not be treated as "not found".
    try:
        nops = textlines[textlines.index("# of players:") + 1]
    except (ValueError, IndexError):
        nops = ""
    resultsDict["nops"] = nops

    # developer company and release date: value on the line after the label
    resultsDict["developer"] = textlines[textlines.index("Developer:") + 1]
    resultsDict["released"] = textlines[textlines.index("Release Date:") + 1]

    # genres are all in one line, but with many spaces in between
    genreLine = next(text for text in textlines if text.startswith("Genre(s):"))
    resultsDict["genres"] = genreLine.replace("Genre(s):", "").replace(" ", "")



In [None]:
# Parse every downloaded page into one result dict per file, stored in
# filename2results under the key "<game>_<platform>".
filename2results = {}
for i, name in enumerate(filenames):
    # maxsplit=1: only the first underscore separates the platform prefix from
    # the game slug, so a slug that itself contains "_" does not break the
    # two-way unpacking.
    platform, rest = name.split("_", 1)
    game = rest.replace(".html", "")
    resultsDict = {"platform": platform, "game": game}

    urlpath = "/game/%s/%s" % (platform, game)
    print(i, platform, game, end=": ")

    # NOTE(review): no encoding is passed, so the platform default is used —
    # confirm it matches how the pages were saved.
    with open(os.path.join(downloadsFolder, name), "r") as f:
        page = f.read()
    soup = BeautifulSoup(page, 'html5lib')

    parseMetascore(soup, urlpath, resultsDict)
    print("ms={metascore:d} ({metascoreBased:d} revs)".format(**resultsDict), end="")

    # The user score is best-effort: start from zeros and keep them if parsing
    # fails, but say so in the log instead of hiding the failure. Catching
    # Exception (not a bare except) lets KeyboardInterrupt stop the run.
    resultsDict["userscore"], resultsDict["userscoreBased"] = 0, 0
    try:
        parseUserscore(soup, urlpath, resultsDict)
    except Exception as err:
        print(" [userscore not parsed: %s]" % type(err).__name__, end="")
    print("; us={userscore:.1f} ({userscoreBased:d} revs)".format(**resultsDict), end="")

    parseOtherInfos(soup, resultsDict)
    print("; released={released:s}; Dev={developer:s}; Genres={genres:s}; #plyrs={nops:s}".format(**resultsDict))

    filename2results[game + "_" + platform] = resultsDict


In [None]:
# Pretty-print the complete mapping of "<game>_<platform>" keys to their
# parsed result dicts for inspection.
pprint(filename2results)