In [45]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from lxml import html
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import pickle
import os

In [49]:
# Login page of the site being scraped.
# NOTE(review): this is plain http, so the credentials below would be sent
# unencrypted -- confirm whether the site offers an https login URL.
login_url = "http://cliptheapex.com/login"

# Credentials are read from the environment so real secrets never have to be
# saved inside the notebook. The original placeholders remain as the fallback,
# so editing them in place still works exactly as before.
USERNAME = os.environ.get("CLIPTHEAPEX_USERNAME", "<USERNAME>") #your cliptheapex account username
PASSWORD = os.environ.get("CLIPTHEAPEX_PASSWORD", "<PASSWORD>") #your cliptheapex account password

# Directory where the pickled dataframes are stored (change as needed).
directory = './formula-1-race-data/dataframes/'

In [10]:
def parse_html_table(table):
    """Convert an HTML table (or table section) into a pandas DataFrame.

    Parameters
    ----------
    table : object
        Anything exposing ``find_all('tr')`` whose rows expose
        ``find_all('td')`` / ``find_all('th')`` and whose cells expose
        ``get_text()`` (e.g. a BeautifulSoup tag).

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` that contains at least one ``<td>``.  Columns are
        named after the first row that contains ``<th>`` cells, otherwise
        ``0 .. n_columns - 1``.  Rows with fewer cells than the widest row are
        padded with NaN.  Every column whose values all parse as floats is
        converted to float; the others keep their text values.

    Raises
    ------
    Exception
        If header cells were found and their count differs from the number
        of data columns.
    """
    column_names = []
    data_rows = []

    # Single pass: collect the text of every data row, and the first header
    # row we come across.
    for row in table.find_all('tr'):
        td_tags = row.find_all('td')
        if len(td_tags) > 0:
            data_rows.append([td.get_text() for td in td_tags])

        th_tags = row.find_all('th')
        if len(th_tags) > 0 and len(column_names) == 0:
            column_names = [th.get_text() for th in th_tags]

    # Size the frame to the widest row rather than the first one, so a later
    # row carrying extra cells no longer overruns the frame.
    n_columns = max(len(cells) for cells in data_rows) if data_rows else 0

    # Safeguard on Column Titles
    if len(column_names) > 0 and len(column_names) != n_columns:
        raise Exception("Column titles do not match the number of columns")

    columns = column_names if len(column_names) > 0 else range(0, n_columns)

    # Pad short rows with NaN so every row has exactly n_columns entries.
    missing = float('nan')
    padded_rows = [cells + [missing] * (n_columns - len(cells))
                   for cells in data_rows]

    # dtype=object keeps un-convertible columns as plain Python strings.
    # ``None`` as data yields an empty frame with the requested axes.
    df = pd.DataFrame(padded_rows if padded_rows else None,
                      columns=columns,
                      index=range(0, len(padded_rows)),
                      dtype=object)

    # Convert to float if possible; columns holding any non-numeric text
    # (e.g. "Unknown") are deliberately left untouched.
    for col in df:
        try:
            df[col] = df[col].astype(float)
        except (ValueError, TypeError):
            pass

    return df

In [103]:
# NOTE(review): `firefox_profile=` and the `find_element_by_*` helpers belong
# to the Selenium 3 API and were deprecated/removed during Selenium 4.x --
# confirm the installed version before running this cell.
fp = webdriver.FirefoxProfile()
browser = webdriver.Firefox(firefox_profile=fp)

# One frame per successfully scraped race; they are concatenated once after
# the crawl instead of growing a DataFrame inside the loop.
race_frames = []

try:
    browser.get(login_url)

    # Login to your cliptheapex account
    username = browser.find_element_by_id("ctrl_pageLogin_login")
    password = browser.find_element_by_id("ctrl_pageLogin_password")
    username.send_keys(USERNAME)
    password.send_keys(PASSWORD)
    browser.find_element_by_xpath("/html/body/div[1]/div[2]/div/div/form/dl[3]/dd/input").click()

    # Navigate to the overtaking page
    browser.find_element_by_xpath("/html/body/div[1]/header/div/div[2]/div/nav/div/ul[1]/li[3]/a[1]").click()
    browser.find_element_by_xpath("/html/body/div[1]/header/div/div[2]/div/nav/div/ul[1]/li[3]/div/ul/li[1]/a").click()

    # Table rows 1-2 of the season list and rows 1-21 of each race list.
    seasons_x_paths = ['/html/body/div[1]/div[2]/div/div/div[3]/div[1]/table/tbody/tr[' + str(i) + ']/td[1]/a' for i in range(1,3)]
    races_x_paths = ['/html/body/div[1]/div[2]/div/div/div[4]/div[2]/table/tbody/tr[' + str(i) + ']/td[2]/a' for i in range(1,22)]

    # Parsing HTML Tables with BeautifulSoup
    for season in seasons_x_paths:
        for race in races_x_paths:
            try:
                browser.find_element_by_xpath(season).click()
                browser.find_element_by_xpath(race).click()
                try:
                    # Fifth entry of the select box on the race page; the page
                    # is scraped as-is when that option does not exist.
                    browser.find_element_by_xpath("/html/body/div[1]/div[2]/div/div/div[4]/div[4]/div[2]/label/select/option[5]").click()
                except NoSuchElementException:
                    # Parenthesised form prints the same text on Python 2 and 3.
                    print("link not present....continuing script")
                html_source = browser.page_source
                soup = BeautifulSoup(html_source.encode('utf-8'), 'lxml') # Parse the HTML as a string
                tables = soup.find_all('tbody')
                df_sub = parse_html_table(tables[0])
                heading = soup.select('h1')[0].text.strip()
                df_sub['name'] = heading[5:]
                # The first five characters presumably hold a four-digit year
                # plus its separator; strip so the separator is not stored.
                df_sub['year'] = heading[0:5].strip()
                race_frames.append(df_sub)
                # Breadcrumb link back to the listing before the next race.
                browser.find_element_by_xpath("/html/body/div[1]/div[2]/div/div/div[1]/nav/fieldset/span/span[3]/a").click()
            except NoSuchElementException:
                # A season/race row (or the breadcrumb) was missing: skip it.
                print("link not present....continuing script")
finally:
    # Always release the Firefox process, even if the crawl fails midway.
    browser.quit()

# Each race keeps its own 0..n row index, exactly as repeated append() did.
df = pd.concat(race_frames) if race_frames else pd.DataFrame()

# Pickle the dataframe
file_name = "overtaking.pickle"
if not os.path.isdir(directory):
    os.makedirs(directory)
file_path = os.path.join(directory, file_name)
df.to_pickle(file_path)

link not present....continuing script
link not present....continuing script
link not present....continuing script
link not present....continuing script
link not present....continuing script
link not present....continuing script
link not present....continuing script
link not present....continuing script
link not present....continuing script
link not present....continuing script


In [88]:
def read_from_pickle(directory):
    """Load pickled data from the files stored in ``directory``.

    Hidden entries (names starting with ``.``) and anything that is not a
    regular file are skipped.  The remaining files are read in sorted name
    order, and every pickled object in each file is loaded in turn.

    Parameters
    ----------
    directory : str
        Folder containing the pickle files; a trailing slash is optional.

    Returns
    -------
    object
        The last object unpickled (normally a pandas DataFrame), or an empty
        DataFrame when the directory holds no readable files.

    NOTE(review): each load overwrites the previous one, so only the last
    object of the last file is returned -- confirm that a directory with
    several pickles is not expected to be combined.
    """
    df = pd.DataFrame()

    # Filter explicitly instead of dropping whichever entry os.listdir()
    # happens to return first: the listing order is arbitrary, so the old
    # ``[1:]`` slice could silently discard the only data file.
    names = sorted(f for f in os.listdir(directory) if not f.startswith('.'))

    for name in names:
        path = os.path.join(directory, name)
        if not os.path.isfile(path):
            continue
        with open(path, 'rb') as handle:
            try:
                # SECURITY: pickle.load can execute arbitrary code; only point
                # this function at directories whose contents you trust.
                while True:
                    df = pickle.load(handle)
            except EOFError:
                # Reached the end of this file.
                pass
    return df

In [104]:
# Rebind `df` to whatever read_from_pickle loads from `directory`.
df = read_from_pickle(directory)

In [105]:
# Preview the last rows of the loaded frame.
df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,name,year
26,45.0,4.0,Sebastian Vettel,5.0,10,-12.0,Sergio Pérez,17.0,10,Abu Dhabi Grand Prix,2015
27,45.0,12.0,Max Verstappen,6.0,9,-11.0,Jenson Button,17.0,9,Abu Dhabi Grand Prix,2015
28,47.0,8.0,Felipe Massa,20.0,8,-1.0,Daniil Kvyat,21.0,8,Abu Dhabi Grand Prix,2015
29,52.0,10.0,Romain Grosjean,8.0,3,-15.0,Carlos Sainz,23.0,3,Abu Dhabi Grand Prix,2015
30,53.0,9.0,Romain Grosjean,9.0,2,-18.0,Daniil Kvyat,27.0,2,Abu Dhabi Grand Prix,2015


In [106]:
# Preview up to the first 50 rows of the loaded frame.
df.head(50)

Unnamed: 0,0,1,2,3,4,5,6,7,8,name,year
0,4.0,5.0,Lewis Hamilton,3.0,12,0.0,Felipe Massa,3.0,7,Australian Grand Prix,2016
1,5.0,8.0,Daniel Ricciardo,4.0,7,0.0,Nico Hülkenberg,4.0,11,Australian Grand Prix,2016
2,5.0,16.0,Valtteri Bottas,4.0,12,0.0,Felipe Nasr,4.0,5,Australian Grand Prix,2016
3,9.0,14.0,Valtteri Bottas,8.0,8,0.0,Marcus Ericsson,8.0,2,Australian Grand Prix,2016
4,10.0,6.0,Daniel Ricciardo,9.0,2,0.0,Felipe Massa,9.0,1,Australian Grand Prix,2016
5,10.0,13.0,Valtteri Bottas,9.0,7,0.0,Pascal Wehrlein,9.0,1,Australian Grand Prix,2016
6,10.0,16.0,Romain Grosjean,9.0,8,0.0,Felipe Nasr,9.0,0,Australian Grand Prix,2016
7,10.0,19.0,Carlos Sainz,1.0,8,-8.0,Esteban Gutiérrez,9.0,Unknown,Australian Grand Prix,2016
8,10.0,18.0,Carlos Sainz,1.0,8,-8.0,Rio Haryanto,9.0,2,Australian Grand Prix,2016
9,10.0,19.0,Esteban Gutiérrez,9.0,Unknown,0.0,Rio Haryanto,9.0,2,Australian Grand Prix,2016
