In [8]:
# Imports
from bs4 import BeautifulSoup
from bs4 import Tag
import re
import json
import urllib2
import urllib
import sys

In [9]:
#
#
# build a list of all the countries and associated country codes
#
#
countryListPage = "http://www.databaseolympics.com/country/countrylist.htm"

# get the html for the given page and soupify it
response = urllib2.urlopen(countryListPage);
html = response.read();
countrySoup = BeautifulSoup(html);
countryCodes = {};
allCodes = countrySoup.find_all("a",href=re.compile("^\/country\/countrypage.*"))
for cnt in allCodes:
    code = cnt['href'].split("=")[-1]
    countryCodes[code] = cnt.string
    
print "Imported",len(countryCodes),"country codes"

with open('CountryTable.json', 'w') as outfile:
    json.dump(countryCodes, outfile, indent=4, sort_keys=True, separators=(',', ':'))
outfile.close()

Imported 219 country codes


In [83]:
#
#
# build a list of all the athletes and associated athlete codes
# These are the functions used to parse the data
#
#

# URL prefix of an individual athlete page; the parsing functions below
# append an athlete code to it before fetching
athletePage = "http://www.databaseolympics.com/players/playerpage.htm?ilkid="


# turn one <tr ...> </tr> row of the medal table into a dict
def ParseMedalTableRow(tableRow):
    """Return a dict describing the medal entry held in ``tableRow``.

    ``tableRow`` must offer ``find_all("td")`` returning at least seven
    cells. The text of cells 0, 1, 3, 4, 5 and 6 is stored under "Year",
    "Age", "Event", "Medal", "Country" and "Result". Cells 0 and 2 must
    contain a link; the part of its href after the last "=" is stored
    under "GameCode" and "Sport" respectively.

    Raises IndexError when the row has too few cells and TypeError when
    a required link is missing (the cell's ``a`` attribute is None).
    """
    cells = tableRow.find_all("td")

    def trailingCode(cell):
        # the code is whatever follows the last "=" in the cell's link target
        return cell.a['href'].split("=")[-1]

    # entries are evaluated top to bottom, i.e. in the column order of the row
    return {
        "Year": cells[0].string,
        "GameCode": trailingCode(cells[0]),
        "Age": cells[1].string,
        "Sport": trailingCode(cells[2]),
        "Event": cells[3].string,
        "Medal": cells[4].string,
        "Country": cells[5].string,
        "Result": cells[6].string,
    }



# parse an individual athlete page
# return medal row info in return list item [0]
# and birthday in return list item [1]
def ParseSingleAthletePage(url):
    """Fetch one athlete page and extract its medal rows and birthdate.

    url -- full address of the athlete page to download.

    Returns a two-item list ``[dataRows, birthday]``:
      * dataRows -- one dict per row (as built by ParseMedalTableRow) for
        every tag that follows the first ``<tr class="statHead">`` as a
        sibling; empty when the page has no such header row.
      * birthday -- text of the first link whose href starts with
        /players/birthdays.htm, or "n/a" when there is no such link.

    Errors from urllib2 or from ParseMedalTableRow propagate to the caller.
    """
    # get the html from source; close the connection even when the read
    # fails so the socket is not left open
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()
    athleteSoup = BeautifulSoup(html)

    # get the birthdate (find returns the first match, or None)
    birthdayTag = athleteSoup.find("a",href=re.compile("^\/players\/birthdays.htm.*"))
    if birthdayTag is not None:
        birthday = birthdayTag.string
    else:
        birthday = "n/a"

    # get the medals
    # find the header of the stats table and init data structures
    statTableHeaderRow = athleteSoup.find("tr",class_='statHead')
    dataRows = []
    if statTableHeaderRow is not None:

        # loop through siblings of the stat table header,
        # i.e. the data rows of the table, and copy the data;
        # non-Tag siblings (e.g. whitespace strings) are skipped
        for row in statTableHeaderRow.next_siblings:
            if isinstance(row,Tag):
                dataRows.append(ParseMedalTableRow(row))

    # return the medal entries for this athlete, and the birthdate
    return [dataRows, birthday]
    
    
# extract an athlete code from a playerpage link
# takes a BeautifulSoup4 Tag and extracts the
# player code from the href attribute
def ExtractAthleteCodeString(urlTag, encoding):
    """Return the URL-quoted athlete code found in ``urlTag``'s href.

    urlTag   -- bs4 Tag carrying an ``href`` attribute.
    encoding -- codec used to turn the code into bytes before quoting;
                None (or an empty value) falls back to UTF-8.

    The code is the text after the last "=" in the href, cut at the
    first space. If ``urlTag`` is not a bs4 Tag the literal string
    "Not a bs4 Tag" is returned instead of raising.
    """
    if not isinstance(urlTag,Tag):
        return "Not a bs4 Tag"

    code = urlTag['href'].split("=")[-1].split(" ")[0]
    # callers pass the soup's original_encoding, which can be None;
    # encode(None) would raise TypeError, so default to UTF-8
    code = urllib2.quote(code.encode(encoding or "utf-8"))
    return code
    
    
    
# parse the page for a list of athletes, by first letter of last name
def ParseAtlhetePageList(url, athleteTable, medalTable, problemTags):
    
    # get the html from source
    response = urllib2.urlopen(url);
    html = response.read();
    athleteSoup = BeautifulSoup(html);
    
    # get all the athlete codes
    allCodes = athleteSoup.find_all("a",href=re.compile("^\/playerpage.htm.*"))
    
    # for each code, get athlete info
    for ath in allCodes:
        try:
            code = ExtractAthleteCodeString(ath,athleteSoup.original_encoding)
            athleteData = ParseSingleAthletePage(athletePage+code)
            for medal in athleteData[0]:
                medal["Athlete"] = code
            medalTable.extend(athleteData[0])
            athleteTable[code] = [ath.string, athleteData[1]]
        except:
            newProblem = {};
            newProblem['href'] = ath['href']
            print "problem!",newProblem
            problemTags.append(newProblem)
            
            

In [85]:
athleteListPage = "http://www.databaseolympics.com/players/playerlist.htm?lt="
for c in '':#'abcdefghijklmnopqrstuvwxyz':
    athleteTable = {};
    medalTable = [];
    problemTags = [];
    print "Parsing for " + c
    sys.stdout.flush()
    ParseAtlhetePageList(athleteListPage+c,athleteTable, medalTable, problemTags)
    print "Athletes so far:",len(athleteTable)
    print "Medals so far:",len(medalTable)
    print "Problems so far:",len(problemTags)
    print
    sys.stdout.flush()
    with open('AthleteTable_'+c+'.json', 'w') as outfile:
        json.dump(athleteTable, outfile, indent=4, sort_keys=True, separators=(',', ':'))
        outfile.close()
    with open('MedalTable_'+c+'.json', 'w') as outfile:
        json.dump(medalTable, outfile, indent=4, sort_keys=True, separators=(',', ':'))
        outfile.close()    
    with open('ProblemTable_'+c+'.json', 'w') as outfile:
        json.dump(problemTags, outfile, indent=4, sort_keys=True, separators=(',', ':'))
        outfile.close()


In [86]:
# spot check of the scraped data: number of medal rows, one sample row,
# and the athlete entry that row refers to.
# medalTable and athleteTable must already exist in the kernel; they are
# only assigned inside the per-letter scraping loop.
print len(medalTable)
print medalTable[10]
print athleteTable[medalTable[10]['Athlete']]

1812
{u'Country': u'CUB', u'Age': u'36', u'GameCode': u'25', u'Event': u'Baseball', u'Result': None, u'Year': u'2000', u'Sport': u'BAB', u'Medal': u'SILVER', u'Athlete': u'PACHEANT01'}
[u'Pacheco,\xa0Antonio', u'7/4/1964']


In [91]:
# new function to parse problems
# parse the page for a list of athletes, by first letter of last name
def ParseProblemAtlhetePageList(url, athleteTable, medalTable, problems):
    
    # get the html from source
    response = urllib2.urlopen(url);
    html = response.read();
    athleteSoup = BeautifulSoup(html);
    
    
    for prob in problems:
        href = prob['href'].split(" ")[0].split("=")[-1]
        print href
        # get all the athlete codes
        problemTag = athleteSoup.find_all("a",href=re.compile(".*"+href+".*"))
        ath = problemTag[0]
        # for each code, get athlete info
        try:
            code = ExtractAthleteCodeString(ath,athleteSoup.original_encoding)
            athleteData = ParseSingleAthletePage(athletePage+code)
            for medal in athleteData[0]:
                medal["Athlete"] = code
            if len(athleteData[0])>0:
                medalTable.extend(athleteData[0])
            else:
                print "Empty medals for " + code
            athleteTable[code] = [ath.string, athleteData[1]]
        except:
            newProblem = {};
            newProblem['href'] = ath['href']
            print "problem!",newProblem

In [96]:
# load the problem set for each letter
athletePageBase = "http://www.databaseolympics.com/players/playerpage.htm?ilkid="
athleteListPage = "http://www.databaseolympics.com/players/playerlist.htm?lt="

for c in 'abcdefghijklmnopqrstuvwxyz':
    with open('MedalTable_'+c+'.json') as data_file:    
        medalTable = json.load(data_file)
    with open('AthleteTable_'+c+'.json') as data_file:    
        athleteTable = json.load(data_file)
    with open('ProblemTable_'+c+'.json') as data_file:    
        problems = json.load(data_file)

    print "Fixing for " + c    
    print "Length of AthleteTable:",len(athleteTable)
    print "Number of problems:",len(problems)
    sys.stdout.flush()
    ParseProblemAtlhetePageList(athleteListPage+c,athleteTable,medalTable,problems)
    print "New Length of AthleteTable",len(athleteTable)
    sys.stdout.flush()
    print
    with open('AthleteTable_'+c+'.json', 'w') as outfile:
        json.dump(athleteTable, outfile, indent=4, sort_keys=True, separators=(',', ':'))
        outfile.close()
    with open('MedalTable_'+c+'.json', 'w') as outfile:
        json.dump(medalTable, outfile, indent=4, sort_keys=True, separators=(',', ':'))
        outfile.close()    

Fixing for a
Length of AthleteTable: 910
Number of problems: 1
ARANZ01
New Length of AthleteTable 911

Fixing for b
Length of AthleteTable: 2030
Number of problems: 0
New Length of AthleteTable 2030

Fixing for c
Length of AthleteTable: 1241
Number of problems: 0
New Length of AthleteTable 1241

Fixing for d
Length of AthleteTable: 1199
Number of problems: 2
DALIPDRA01
DOBERJEN01
New Length of AthleteTable 1201

Fixing for e
Length of AthleteTable: 418
Number of problems: 0
New Length of AthleteTable 418

Fixing for f
Length of AthleteTable: 777
Number of problems: 1
FERREAND01
New Length of AthleteTable 778

Fixing for g
Length of AthleteTable: 1245
Number of problems: 1
GROGAJAM01
New Length of AthleteTable 1246

Fixing for h
Length of AthleteTable: 1368
Number of problems: 0
New Length of AthleteTable 1368

Fixing for i
Length of AthleteTable: 251
Number of problems: 0
New Length of AthleteTable 251

Fixing for j
Length of AthleteTable: 649
Number of problems: 0
New Length of Athlet

In [94]:
# spot check: look up one athlete by code; entries are [name, birthday]
athleteTable["ROBINARN01"]

[u'Robinson,\xa0Arni', u'4/7/1948']