# Webscraping Function

This notebook describes how I built a function to scrape information from the Greyhound Board of Great Britain (GBGB) website so that I can update my dataframe.

This is a two stage process.

First I check whether there were any races on the given dates at the desired track (Crayford, as this is currently my main focus).  If there were, the scraper collects the unique race IDs for those races.

Because of how the website operates, I use Selenium, a browser automation tool (driving Chrome here), to enter the necessary date and track information.

Once I have the race ID numbers, then I can loop through these to scrape the necessary information that I need.

In [None]:
# Import Packages

import re
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

from sqlalchemy import create_engine
engine = create_engine('postgresql://localhost:5432/danielpayne')

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep

## Stage One - Acquire Race IDs.

In [None]:
# This defines a function that gets the maximum number of pages from the initial page

def page_number(soupObj):
    """Return the number of result pages reported by the pager on a results page.

    Parameters
    ----------
    soupObj : BeautifulSoup
        Parsed HTML of a results listing page.

    Returns
    -------
    int
        The number that follows "in " in the text of the
        ``rgWrap rgInfoPart`` element, or 1 when that element is missing or
        its text does not contain such a number.
    """
    pager = soupObj.find('div', class_="rgWrap rgInfoPart")
    if pager is None:
        # No pager on the page: treat the results as a single page.
        return 1
    # Read every digit of the count rather than a fixed two-character slice,
    # so page counts of 100 or more are not truncated.
    match = re.search(r'in\s+(\d+)', pager.get_text())
    if match is None:
        return 1
    return int(match.group(1))


In [None]:
# Now I can go through the dates that I need and get the Race Ids for the races.

def _race_ids_on_page(soup):
    """Return the race IDs linked from the result rows of one parsed results page."""
    found = []
    # Result rows alternate between the rgRow and rgAltRow classes.
    for row in soup.find_all('tr', class_=re.compile('rgRow|rgAltRow')):
        for cell in row.find_all('td'):
            link = cell.a
            if link is None or 'View Race' not in link:
                continue
            # The first run of digits in the anchor's markup is taken as the race ID.
            id_match = re.search(r'\d+', str(link))
            if id_match is not None:
                found.append(int(id_match.group()))
    return found


def greyhound_scraper(start_date, end_date, driver_path="../../../chromedriver", track_index=6):
    """Collect the race IDs listed on the GBGB results page for a range of dates.

    For every date from ``start_date`` to ``end_date`` the function picks the
    track from the drop-down menu, types the date into the date box, and reads
    the race IDs from each page of results.

    Parameters
    ----------
    start_date, end_date : str or datetime-like
        Bounds of the date range, passed straight to ``pandas.date_range``.
    driver_path : str, optional
        Path to the chromedriver executable.
    track_index : int, optional
        1-based position of the track in the drop-down list.  The default of 6
        is the position used for Crayford.

    Returns
    -------
    list of int
        Race IDs in the order they were encountered.
    """
    # The URL to go to.
    url = "http://www.gbgb.org.uk/Results.aspx"

    # Empty list to which the race IDs are appended.
    race_ids = []
    row_pattern = re.compile('rgRow|rgAltRow')

    # The site's date box is filled with dd/mm/YYYY strings.  Build them before
    # opening the browser so a bad date fails without leaving a browser behind.
    dates = pd.date_range(start_date, end_date).map(lambda x: x.strftime('%d/%m/%Y'))

    # Open the Selenium driver and load the results page.
    driver = webdriver.Chrome(executable_path=driver_path)
    try:
        driver.get(url)
        for date in dates:
            # This finds the drop-down menu element.
            menu = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_mainContent_cmscontent_TrackRaces_ddlTrack"]/span/span[2]')
            menu.click()
            # This selects the race track by its position in the drop-down list.
            track = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_mainContent_cmscontent_TrackRaces_ddlTrack_DropDown"]/div/ul/li[%d]' % track_index)
            sleep(1)
            track.click()
            # This is the element where the date is entered.
            date_entry = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_mainContent_cmscontent_TrackRaces_dtpDate_dateInput"]')
            date_entry.clear()
            date_entry.send_keys(date, Keys.RETURN)
            sleep(1)                                   # Allow the results to load

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            if not soup.find_all('tr', class_=row_pattern):
                print('No records for: %s' % date)
                continue

            total_pages = page_number(soup)
            for current_page in range(1, total_pages + 1):
                print('Page %d of %d for %s' % (current_page, total_pages, date))
                # Re-read the page source: it changes after every "next page" click.
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                race_ids.extend(_race_ids_on_page(soup))
                if current_page < total_pages:
                    driver.find_element_by_class_name('rgPageNext').click()   # Turn to the next page
                    sleep(2)                                                  # Allow page to load
    finally:
        # Always release the browser session, even when a lookup above fails.
        driver.quit()
    return race_ids

In [None]:
# Run the scraper for a date range (start and end dates are both included).
# This example fetches the race IDs for 14th July 2017 only.
race_list = greyhound_scraper('2017-07-14', '2017-07-14')

## Stage Two - Loop through the race IDs and get the necessary information for my dataframe

**First I define functions that can extract each required element**

In [None]:
# Defining functions to extract each piece of information from the racecard.

def _header_field(soupObj, css_class):
    """Return the stripped text of the ``div`` with ``css_class`` in the results header.

    When the page has several ``resultsBlockHeader`` elements the last one is
    used.  Returns None when the page has no header or the header has no such
    ``div``.
    """
    headers = soupObj.find_all("div", class_="resultsBlockHeader")
    if not headers:
        return None
    field = headers[-1].find("div", class_=css_class)
    if field is None:
        return None
    return field.get_text(strip=True)


def get_track(soupObj):
    """Return the text of the header's ``track`` element, or None if absent."""
    return _header_field(soupObj, "track")


def get_date(soupObj):
    """Return the text of the header's ``date`` element, or None if absent."""
    return _header_field(soupObj, "date")


def get_datetime(soupObj):
    """Return the text of the header's ``datetime`` element, or None if absent."""
    return _header_field(soupObj, "datetime")


def get_grade(soupObj):
    """Return the text of the header's ``grade`` element, or None if absent."""
    return _header_field(soupObj, "grade")


def get_distance(soupObj):
    """Return the text of the header's ``distance`` element, or None if absent."""
    return _header_field(soupObj, "distance")


def get_prizes(soupObj):
    """Return the text of the header's ``prizes`` element, or None if absent."""
    return _header_field(soupObj, "prizes")

def get_going_allowance(soupObj):
    """Return the going-allowance text from the results footer.

    Parameters
    ----------
    soupObj : BeautifulSoup
        Parsed HTML of a racecard page.

    Returns
    -------
    str or None
        The text between the first and second colon of the first ``div``
        inside the ``resultsBlockFooter`` element.  If that text has no colon
        it is returned whole.  None when the footer or its ``div`` is missing.
    """
    footer = soupObj.find("div", class_="resultsBlockFooter")
    if footer is None:
        return None
    detail = footer.find("div")
    if detail is None:
        return None
    text = detail.get_text(strip=True)
    parts = text.split(':')
    # Without a "label:value" shape there is nothing to split off, so keep it all.
    return parts[1] if len(parts) > 1 else text

def _line1_field(soupObj, css_class):
    """Return the text of the ``li`` with ``css_class`` from every ``line1`` row.

    Rows are the ``ul.line1`` elements inside the ``resultsBlock`` that follows
    each ``resultsBlockHeader``.  A header with no such sibling is skipped.  A
    row without the requested ``li`` contributes None, so the lists returned
    for different fields keep one entry per row and stay aligned.
    """
    values = []
    for header in soupObj.find_all("div", class_="resultsBlockHeader"):
        block = header.find_next_sibling("div", class_="resultsBlock")
        if block is None:
            continue
        for row in block.find_all("ul", class_="line1"):
            cell = row.find("li", class_=css_class)
            values.append(cell.get_text(strip=True) if cell is not None else None)
    return values


def get_dog_name(soupObj):
    """Return the ``greyhound`` cell text for each result row."""
    return _line1_field(soupObj, "greyhound")


def get_position(soupObj):
    """Return the ``first essential fin`` cell text for each result row."""
    return _line1_field(soupObj, "first essential fin")


def get_trap_number(soupObj):
    """Return the ``trap`` cell text for each result row."""
    return _line1_field(soupObj, "trap")


def get_odds(soupObj):
    """Return the ``sp`` cell text for each result row."""
    return _line1_field(soupObj, "sp")


def get_time_trap(soupObj):
    """Return the ``timeSec`` cell text for each result row."""
    return _line1_field(soupObj, "timeSec")


def get_time_distance(soupObj):
    """Return the ``timeDistance`` cell text for each result row."""
    return _line1_field(soupObj, "timeDistance")

def get_weight(soupObj):
    """Return one weight per ``ul.line2`` row of a parsed racecard.

    Parameters
    ----------
    soupObj : BeautifulSoup
        Parsed HTML of a racecard page.

    Returns
    -------
    list
        For each row, the first ``NN.N`` number found in the fifth
        double-space-separated field of the row's ``first essential`` cell,
        as a float.  The string 'NA' is used when that field is missing or
        holds no such number.
    """
    weights = []
    for row in soupObj.find_all("ul", class_="line2"):
        text = row.find("li", class_="first essential").get_text(strip=True)
        fields = [entry for entry in text.split('  ') if entry != '']
        try:
            # The escaped dot requires a real decimal point, so a four-digit
            # run such as a year is not mistaken for a weight.
            weight = float(re.search(r'\d\d\.\d', fields[4]).group())
        except (IndexError, AttributeError):
            # Fewer than five fields, or no number in the fifth one.
            weight = 'NA'
        weights.append(weight)
    return weights

# This function not only retrieves the name of the Dam (mother) but also when the greyhound was born

def get_dam(soupObj):
    """Return the dam names and birth tokens for each ``ul.line2`` row.

    The fourth double-space-separated field of each row's ``first essential``
    cell is split on single spaces.  The second-to-last word is taken as the
    "born" value, and every word before it is joined back together as the
    dam's name.

    Parameters
    ----------
    soupObj : BeautifulSoup
        Parsed HTML of a racecard page.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(dams, born)``, one entry per row.  Both are 'nan' when the row has
        fewer than four fields.  ``born`` is 'NA' when the fourth field has
        fewer than two words.
    """
    dams = []
    born = []
    for row in soupObj.find_all("ul", class_="line2"):
        text = row.find("li", class_="first essential").get_text(strip=True)
        fields = [entry for entry in text.split('  ') if entry != '']
        if len(fields) > 3:
            words = fields[3].split(' ')
            born.append(words[-2] if len(words) >= 2 else 'NA')
            dams.append(' '.join(words[:-2]))
        else:
            born.append('nan')
            dams.append('nan')
    return dams, born

# This function not only gets the Sire (father) of the dog but also a random element possibly the sex

def get_sire(soupObj):
    """Return the sire names and their two-character prefixes for each ``ul.line2`` row.

    The second double-space-separated field of each row's ``first essential``
    cell is split into its first two characters (the prefix) and the rest
    (the sire's name).

    Parameters
    ----------
    soupObj : BeautifulSoup
        Parsed HTML of a racecard page.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(sires, prefixes)``, one entry per row.  Both are 'nan' when the row
        has fewer than two fields.
    """
    sires = []
    prefixes = []
    for row in soupObj.find_all("ul", class_="line2"):
        text = row.find("li", class_="first essential").get_text(strip=True)
        fields = [entry for entry in text.split('  ') if entry != '']
        if len(fields) > 1:
            sire_field = fields[1]
            sires.append(sire_field[2:])
            # NOTE(review): meaning of the two-character prefix is unconfirmed
            # (possibly colour/sex codes); verify against the site.
            prefixes.append(sire_field[:2])
        else:
            sires.append('nan')
            prefixes.append('nan')
    return sires, prefixes

def get_trainer(soupObject):
    """Return the trainer text for each ``ul.line2`` row of a parsed racecard.

    Parameters
    ----------
    soupObject : BeautifulSoup
        Parsed HTML of a racecard page.

    Returns
    -------
    list of str
        For each row, the text between the first and second colon of the
        ``essential trainer`` cell, with its final character removed.

    Raises
    ------
    IndexError
        If a trainer cell contains no colon.
    """
    trainers = []
    for row in soupObject.find_all("ul", class_="line2"):
        info = row.find("li", class_="essential trainer").get_text(strip=True)
        after_label = info.split(':')[1]
        # NOTE(review): the dropped last character is presumably a closing
        # bracket around the trainer entry; confirm against the page markup.
        trainers.append(after_label[:-1])
    return trainers

def get_comment(soupObj):
    """Return the comment text for each ``ul.line3`` row of a parsed racecard.

    Parameters
    ----------
    soupObj : BeautifulSoup
        Parsed HTML of a racecard page.

    Returns
    -------
    list of str
        For each row, the text between the first and second colon of the
        ``first essential comment`` cell.

    Raises
    ------
    IndexError
        If a comment cell contains no colon.
    """
    comments = []
    for row in soupObj.find_all("ul", class_="line3"):
        text = row.find("li", class_="first essential comment").get_text(strip=True)
        comments.append(text.split(':')[1])
    return comments

In [None]:
# Give the list the name used by the code below, so that code doesn't have to change.
# Note this is an alias of race_list, not an independent copy.
gbgb_race_ids = race_list

In [None]:
# Define the URL stem to which each race ID is appended to reach its racecard.
url_stem = 'http://www.gbgb.org.uk/resultsRace.aspx?id='

# Define a dictionary that will hold all racecard information.
# Each value collects one list per race (one entry per dog in that race).
gbgb_dict = {'Race_id' : [],
            'Name': [],
            'Position' : [],
            'Sire': [],
            'Dam' : [],
            'Born' : [],
            'Random' : [],
            'Trap_no' : [],
            'Odds' : [],
            'Time_trap' : [],
            'Time_distance' : [],
            'Weight' : [],
            'Track' : [],
            'Trainer' : [],
            'Comment' : [],
            'Date' : [],
            'Datetime' : [],
            'Grade' : [],
            'Distance' : [],
            'Prizes' : [],
            'Going_allowance': []}

total_races = len(gbgb_race_ids)
# i is the 1-based position of the race in gbgb_race_ids; it is used only for
# progress reporting and never to index the list.
for i, race in enumerate(gbgb_race_ids, 1):
    Url = url_stem + str(race)
    page = requests.get(Url)
    soup = BeautifulSoup(page.content, 'html.parser')
    gbgb_names = get_dog_name(soup)                        # Using functions defined earlier
    if not gbgb_names:                                     # No dogs found: skip this racecard
        print('Race_id to pass: %s' % race)
        continue
    if i % 50 == 0:
        print('%d out of %d race_id : %s' % (i, total_races, race))   # Progress counter

    runners = len(gbgb_names)
    # Each of these returns two parallel lists, so parse once and unpack.
    sires, randoms = get_sire(soup)
    dams, born = get_dam(soup)

    gbgb_dict['Name'].append(gbgb_names)
    gbgb_dict['Race_id'].append([race] * runners)          # Repeat Race_id for each dog entry
    gbgb_dict['Position'].append(get_position(soup))
    gbgb_dict['Sire'].append(sires)
    gbgb_dict['Dam'].append(dams)
    gbgb_dict['Born'].append(born)
    gbgb_dict['Random'].append(randoms)
    gbgb_dict['Trap_no'].append(get_trap_number(soup))
    gbgb_dict['Odds'].append(get_odds(soup))
    gbgb_dict['Time_trap'].append(get_time_trap(soup))
    gbgb_dict['Time_distance'].append(get_time_distance(soup))
    gbgb_dict['Weight'].append(get_weight(soup))
    gbgb_dict['Trainer'].append(get_trainer(soup))
    gbgb_dict['Comment'].append(get_comment(soup))
    # The remaining values appear once per racecard, so repeat them for every dog.
    gbgb_dict['Track'].append([get_track(soup)] * runners)
    gbgb_dict['Date'].append([get_date(soup)] * runners)
    gbgb_dict['Datetime'].append([get_datetime(soup)] * runners)
    gbgb_dict['Grade'].append([get_grade(soup)] * runners)
    gbgb_dict['Distance'].append([get_distance(soup)] * runners)
    gbgb_dict['Prizes'].append([get_prizes(soup)] * runners)
    gbgb_dict['Going_allowance'].append([get_going_allowance(soup)] * runners)
        

In [None]:
# Work on a shallow copy: reassigning keys of `test` below leaves gbgb_dict's
# per-race lists untouched, so the scrape does not have to be repeated.
test = gbgb_dict.copy()

#### Each element is a list of lists so I need to expand these out into a single list

In [None]:
# Each column is a list of per-race lists; join them end to end into one flat list.
for key in list(test):
    flat = []
    for per_race in test[key]:
        flat.extend(per_race)
    test[key] = flat

# Print every column's length so they can be compared; all of them should match.
for key in list(test):
    print('%s %d' % (key, len(test[key])))

In [None]:
# Build a dataframe with one column per dictionary key (the flattened lists become the rows).
race_update = pd.DataFrame(test)

In [None]:
# Save the raw update to a UTF-8 encoded CSV file (the SQL engine created above is not used here).
race_update.to_csv('crayford_race_update_raw.csv', encoding = 'utf-8')