In [None]:
import requests
import re
import pandas as pd
import time
import os
import sys
import pickle
from functools import reduce

In [None]:
sys.path.append("/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages")

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

## Use selenium to scrape runner data

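First, define the helper functions used below: one to fill in the search form, one to scrape the runners listed on a page, one to clean the scraped records, and one to walk through several result pages.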

In [None]:
# Function to choose year, number of records per page, event, and sex
def select_year(year, num, sex, event='999999107FA309000000008D'):
    """Fill in the results search form on the current page and submit it.

    Uses the notebook-level ``driver`` (a Selenium WebDriver that has already
    loaded the results site).

    Parameters
    ----------
    year : int or str
        Typed into the ``list_event_main_group`` dropdown.
    num : int or str
        Results per page; typed into ``fe-lists-new-num-results``.
    sex : str
        Typed into the ``list_search-sex`` dropdown (e.g. ``'Women'``).
    event : str, optional
        Typed into the ``list_event`` dropdown. Defaults to the event id this
        notebook has always used, so existing three-argument calls behave as
        before.

    Side effects
    ------------
    Stores ``sex`` in the module-level global ``sex2`` and submits the form by
    sending RETURN to the year dropdown.
    """
    global sex2
    sex2 = sex

    year_dropdown = driver.find_element_by_id('list_event_main_group')
    year_dropdown.send_keys(str(year))

    event_dropdown = driver.find_element_by_id('list_event')
    event_dropdown.send_keys(event)

    number_dropdown = driver.find_element_by_id('fe-lists-new-num-results')
    number_dropdown.send_keys(str(num))

    gender_dropdown = driver.find_element_by_id('list_search-sex')
    gender_dropdown.send_keys(sex)

    # Pressing RETURN in one of the form fields submits the search.
    year_dropdown.send_keys(Keys.RETURN)

In [None]:
# Function to scrape a list of runners on one page
def scrape_runners(num, start=0):
    """Open runner detail pages linked from the current results page and collect their tables.

    Uses the notebook-level ``driver`` and appends to the notebook-level list
    ``runners``.

    Parameters
    ----------
    num : int
        Number of runners listed on the page; links up to (but excluding)
        position ``num`` are visited.
    start : int, optional
        Position of the first runner to visit (0 = first runner on the page).
        The default reproduces the original behaviour of scraping the whole
        page; a larger value resumes a partially scraped page.

    Returns
    -------
    list
        The global ``runners`` list. Each appended entry is a list of three
        lists of text lines, taken from the 1st, 3rd and 4th tables matching
        ``//table[@class="list-table names"]`` on the runner's page.
    """
    # NOTE(review): the offset of 6 presumably skips anchors that come before
    # the first runner link on the results page — confirm against the site.
    first_runner_link = 6
    table_xpath = '//table[@class="list-table names"]'

    for link_index in range(first_runner_link + start, first_runner_link + num):
        # The anchor list is fetched again after every driver.back(),
        # presumably because earlier element handles do not survive navigation.
        links = driver.find_elements_by_xpath("//a[@href]")
        links[link_index].click()
        tables = driver.find_elements_by_xpath(table_xpath)
        runners.append([tables[k].text.split('\n') for k in (0, 2, 3)])
        driver.back()
    return runners

In [None]:
# Function to clean data from one search
def _runner_features(record):
    """Return ``[country, age group, bib, finish time]`` for one scraped record.

    Each value is the last space-separated word of a specific text line: the
    finish time comes from the last line of ``record[1]``; country, age group
    and bib come from the first three lines of ``record[0]``.
    """
    finish_time = record[1][-1].split(' ')[-1]
    country = record[0][0].split(' ')[-1]
    age_group = record[0][1].split(' ')[-1]
    bib = record[0][2].split(' ')[-1]
    return [country, age_group, bib, finish_time]


def _split_seconds(split_lines):
    """Convert the lines of a splits table into a list of segment times in seconds.

    The fourth whitespace-separated token of every line is used. The literal
    token ``'Day'`` (presumably contributed by the table's header line) is
    dropped, and every remaining token must look like ``mm:ss``.
    """
    diffs = [line.split()[3] for line in split_lines]
    diffs.remove('Day')
    seconds = [minutes * 60 + secs
               for minutes, secs in (map(int, diff.split(":")) for diff in diffs)]
    # Entries 4 and 5 are folded into one value (presumably the half-marathon
    # checkpoint merged into the 25K segment — confirm against the site).
    merged = seconds[4:6]
    if not merged:
        raise ValueError("fewer than five split rows")
    seconds[4:6] = [sum(merged)]
    # The final entry is discarded.
    del seconds[-1]
    return seconds


def clean_runners(runner_list=None):
    '''
    Clean scraped runner records in place.

    1) Extract each runner's time, country, age group, and bib number and
       append them to the record as ``[country, age, bib, time]``.
    2) Convert each runner's list of splits into a list of split times,
       converted into seconds, and append it to the record.

    Parameters
    ----------
    runner_list : list, optional
        Records to clean. Defaults to the notebook-level ``runners`` list, so
        existing ``clean_runners()`` calls behave as before.

    Records whose splits cannot be parsed keep only the features from step 1;
    their first table's lines are printed so they can be inspected.
    '''
    if runner_list is None:
        runner_list = runners

    for record in runner_list:
        record.append(_runner_features(record))

    for record in runner_list:
        try:
            record.append(_split_seconds(record[2]))
        except Exception:
            # Best effort: report the runner and move on. Unlike a bare
            # ``except:``, this still lets KeyboardInterrupt stop a long run.
            print(record[0])

In [None]:
# Function to search multiple pages
def multiple_pages(pages, runners):
    """Scrape several consecutive result pages, then clean everything collected.

    Parameters
    ----------
    pages : int
        How many result pages to scrape, starting from the page currently
        shown in the browser.
    runners : int
        Number of runners per page; forwarded to ``scrape_runners``. Inside
        this function the name refers to that count, not to the notebook-level
        ``runners`` list.

    After each page is scraped (including the last one) the final
    ``pages-nav-button`` link is clicked, so the browser ends up one page past
    the last scraped page. ``clean_runners()`` runs once the requested number
    of pages has been processed.
    """
    nav_button_xpath = '//a[@class="pages-nav-button"]'
    pages_done = 0

    while pages_done < pages:
        scrape_runners(runners)
        pages_done += 1
        # Advance using the last pager button on the page.
        driver.find_elements_by_xpath(nav_button_xpath)[-1].click()

    if pages_done != pages:
        return
    clean_runners()

### Run functions to collect data

In [None]:
# Path to the local chromedriver executable (machine-specific absolute path).
chromedriver = "/Applications/chromedriver"
# Start a Chrome session under Selenium control; `driver` is the global browser
# handle that the helper functions in this notebook use.
driver = webdriver.Chrome(chromedriver)
# Open the results site that the search form and scraping code operate on.
driver.get('http://chicago-history.r.mikatiming.de/2015/')

In [None]:
# Submit the search form: year 2016, 500 results per page, women.
select_year(2016, 500, 'Women')

In [None]:
# Global accumulator that scrape_runners appends to and clean_runners modifies.
# Re-running this cell discards everything collected so far.
runners = []

In [None]:
# Scrape 2 result pages of 500 runners each, then clean the collected records.
multiple_pages(2, 500)

In [None]:
# Sanity check: number of runner records collected so far.
len(runners)

In [None]:
# Inspect the first collected record to eyeball its structure.
runners[0]

**Commands to use on an "as needed" basis**

In [None]:
### Search for a specific range within one page
# Same per-runner steps as scrape_runners, but over hand-picked anchor indices:
# edit the two bounds below to choose which links on the current page to visit.
for i in range(410, 506):
    # Fetch the anchor list afresh on every pass, then open the i-th link.
    elems = driver.find_elements_by_xpath("//a[@href]")
    elems[i].click()

    # Keep the text lines of the 1st, 3rd and 4th matching tables.
    search = driver.find_elements_by_xpath('//table[@class="list-table names"]')
    splits = [search[table_no].text.split('\n') for table_no in (0, 2, 3)]
    runners.append(splits)

    # Return to the results list before the next iteration.
    driver.back()

In [None]:
### Clean runners from a small search (normally built into 'multiple_pages' - only used here for within-page searches)
# NOTE: clean_runners walks the entire `runners` list, so records that were
# already cleaned get another features/splits pair appended. When split parsing
# succeeds, the last two items of each record are still the cleaned values.
clean_runners()

In [None]:
### Click to move to the next page
# Collect every pager link on the current page...
search = driver.find_elements_by_xpath('//a[@class="pages-nav-button"]')
# ...and click the last one.
search[-1].click()

### Clean up and save data collected so far

The scraped lists are a little cluttered: keep only the last two items of each runner's record (the cleaned features and the split times), convert the result to a DataFrame, and pickle it.

In [None]:
# Keep only the last two items of every record (after cleaning these are the
# feature list and the split-seconds list).
df_list = [record[-2:] for record in runners]
# Concatenate each pair into a single flat row.
df_list_merged = [features + split_seconds for features, split_seconds in df_list]

In [None]:
# Column labels for the merged rows. The three trailing placeholder names are
# extra columns that are needed because the data is messy.
columns = (
    ['country', 'age_group', 'bib', 'finaltime']                # runner features
    + ['5K', '10K', '15K', '20K', '25K', '30K', '35K', '40K']   # split times
    + ['foo', 'bar', 'foobar']                                  # extra columns
)

In [None]:
# Build the DataFrame from the flat rows, labelling them with `columns`.
df = pd.DataFrame(df_list_merged, columns = columns)

In [None]:
# Preview the first rows to check that values landed in the expected columns.
df.head()

In [None]:
# For each search, adjust the name to fit search criteria (year, sex, & record numbers)
file = 'marathon df_14000-15000_2016_women'
# The context manager closes (and therefore flushes) the file even if pickling
# fails; previously the handle was left open.
with open(file, 'wb') as fileobj:
    pickle.dump(df, fileobj)