# TSL - PDF Grabber

A notebook for finding and downloading TSL timing sheet PDFs.


In [172]:
import datetime

# Use the current calendar year as the default season.
YEAR = datetime.datetime.now().year

# Working season for the exploratory cells below. This used to be the
# self-assignment `year = year`, which raises NameError on a fresh kernel.
year = YEAR

# Site root; the hrefs scraped further down are site-relative and are appended to it.
domain = 'http://www.tsl-timing.com'

# Landing page that lists every club / series.
results_url = '{}/Results'.format(domain)

# Series slug as it appears in the /Results/<series>/ links.
series = 'toca'
series_url = '{}/Results/{}/{}'.format(domain, series, year)

# Example event used by the exploratory cells.
event_id = 191403

event_url = '{}/event/{}'.format(domain, event_id)

In [157]:
# Root folder for downloads; the per-series / per-year path is built from it further down.
download_dir_base = 'tsl_results_data'

In [104]:
import os
import subprocess

import requests
from bs4 import BeautifulSoup

## Get All Results Info

Get a list of links to all clubs and series.

In [159]:
# Fetch the results landing page and parse it so the series links can be pulled out.
resultspage=requests.get(results_url)
resultssoup=BeautifulSoup(resultspage.content)

In [167]:
# Collect every <a> inside the club list container; display the first one as a sample.
resultsseries=resultssoup.find('div',{'class':'clubListContainer'}).findAll('a')
resultsseries[0]

<a href="/Results/bf3gt/" title="View British GT and Support events for 2019.">
<div class="clubListDiv">
<div class="clubListLogoDiv">
<img class="clubListLogo" src="/Images/series/2019/f3logo3.jpg"/>
</div>
<div class="clubListTitle">British GT and Support</div>
</div>
</a>

In [170]:
import pandas as pd

In [171]:
def get_TSL_series(results_url='http://www.tsl-timing.com/Results'):
    """Scrape the TSL results landing page into a table of series.

    Args:
        results_url: Address of the page listing all clubs and series.

    Returns:
        A pandas DataFrame with one row per link found inside the
        ``clubListContainer`` div, with the columns ``_series_url``,
        ``_series_logo_path``, ``_series`` and ``_series_event``.
    """
    def _as_record(anchor):
        # The series slug is the last path component of the link's href.
        href = anchor['href']
        return {'_series_url': href,
                '_series_logo_path': anchor.find('img')['src'],
                '_series': href.strip('/').split('/')[-1],
                '_series_event': anchor.find('div', {'class': 'clubListTitle'}).text}

    soup = BeautifulSoup(requests.get(results_url).content)
    container = soup.find('div', {'class': 'clubListContainer'})

    return pd.DataFrame([_as_record(anchor) for anchor in container.findAll('a')])
    
# Display the scraped series table for the default results URL.
get_TSL_series()

Unnamed: 0,_series,_series_event,_series_logo_path,_series_url
0,bf3gt,British GT and Support,/Images/series/2019/f3logo3.jpg,/Results/bf3gt/
1,bsb,British Superbikes,/Images/series/2019/bsblogo3.jpg,/Results/bsb/
2,toca,BTCC and Support,/Images/series/2019/btcclogo3.jpg,/Results/toca/
3,f1,F1 Team Testing,/Images/series/f1.jpg,/Results/f1/
4,pwc,GT World Challenge America,/Images/series/2019/wc-usa.jpg,/Results/pwc/
5,gtasia,GT World Challenge Asia,/Images/series/2019/GTAsia.jpg,/Results/gtasia/
6,BarcMclaren,McLaren GT4,/Images/series/BARCMcLaren.png,/Results/BarcMclaren/
7,BritishRX,Motorsport UK British Rally X,/Images/series/2019/BRX.jpg,/Results/BritishRX/
8,roc,Race of Champions,/Images/series/ROC.png,/Results/roc/
9,tcruk,TCR UK,/Images/series/tcruk.png,/Results/tcruk/


## Get Series Pages

Get a list of links for each event in a series.

In [179]:
def get_TSL_series_events(series='toca', year=YEAR):
    """List the events published for one series in one season.

    Args:
        series: Series slug used in the results URL (e.g. ``'toca'``).
        year: Season to look up; defaults to ``YEAR``.

    Returns:
        A pandas DataFrame with one row per link inside the ``races`` div,
        with the columns ``_event_url``, ``_event_date`` and ``_event_name``.
    """
    listing = requests.get('http://www.tsl-timing.com/Results/{}/{}'.format(series, year))
    anchors = BeautifulSoup(listing.content).find('div', {'id': 'races'}).findAll('a')

    rows = []
    for anchor in anchors:
        href = anchor['href']
        # The event text block carries the date on its first line and the
        # event name on its second.
        text_lines = anchor.find('div', {'class': 'clubEventText'}).text.strip('\n').split('\n')
        rows.append({'_event_url': href,
                     '_event_date': text_lines[0],
                     '_event_name': text_lines[1]})

    return pd.DataFrame(rows)

# Display the event table for the 'toca' series using the default year.
get_TSL_series_events('toca')

Unnamed: 0,_event_date,_event_name,_event_url
0,6th Apr - 7th Apr 2019,British Touring Car Championship - Round 1 - 6...,/event/191403
1,27th Apr - 28th Apr 2019,BTCC,/event/191703
2,18th May - 19th May 2019,BTCC,/event/192003
3,15th Jun - 16th Jun 2019,BTCC,/event/192403
4,29th Jun - 30th Jun 2019,BTCC,/event/192603
5,3rd Aug - 4th Aug 2019,BTCC,/event/193103
6,17th Aug - 18th Aug 2019,BTCC,/event/193303
7,14th Sept - 15th Sept 2019,BTCC,/event/193703
8,28th Sept - 29th Sept 2019,BTCC,/event/193903
9,12th Oct - 13th Oct 2019,BTCC,/event/194103


### Get Event PDFs

Download PDFs relating to a particular event.

In [155]:
# Fetch the page for the configured event and parse it for the cells below.
eventpage=requests.get(event_url)
eventsoup=BeautifulSoup(eventpage.content)

In [156]:
# Data counts as available unless the page carries the placeholder heading.
data_available = not eventsoup.find("h3", string="Event data available soon")

True

In [154]:
# Read the `src` of the image tagged with the eventMapImage class and display it.
event_map_url = eventsoup.find('img',{'class':'eventMapImage'})['src']
event_map_url

'/Images/Tracks/f3/OultonParkInternational.jpg'

In [137]:
# One championshipDiv per championship; each is read for its <h3> name and <a> link below.
events=eventsoup.findAll('div',{'class':'championshipDiv'})

In [116]:
# Download folder for this series and year: <download_dir_base>/<series>/<year>.
p = '{}/{}/{}'.format(download_dir_base,series,year)

In [133]:
# Make sure the target folder exists before anything is written into it.
os.makedirs(p, exist_ok=True)

# Set to False to walk the championships without fetching any files.
download = True

for event in events:
    championship_name = event.find('h3').text
    championship_url = event.find('a')['href']
    if download:
        print('Downloading: {} [{}]'.format(championship_name, championship_url))
        if championship_url.endswith('.pdf'):
            # Hand curl its arguments as a list (no shell involved): the scraped
            # names contain spaces and the URLs contain '?', neither of which
            # should be interpreted by a shell.
            subprocess.run(['curl', '-o', '{}/{}.pdf'.format(p, championship_name),
                            '{}{}'.format(domain, championship_url)])

# Report the destination once, after the loop, instead of once per championship.
if download:
    print('Files downloaded to: {}'.format(p))

Downloading F4 British Championship [/file/?f=TOCA/2019/191403bf4.pdf]
Downloading Kwik Fit British Touring Car Championship [/file/?f=TOCA/2019/191403trg.pdf]
Downloading Michelin Ginetta Junior Championship [/file/?f=TOCA/2019/191403gij.pdf]
Downloading Millers Oils Ginetta GT4 SuperCup Championship [/file/?f=TOCA/2019/191403g50.pdf]
Downloading Porsche Carrera Cup GB [/file/?f=TOCA/2019/191403por.pdf]
Downloading Renault UK Clio Cup [/file/?f=TOCA/2019/191403cli.pdf]
Files downloaded to ./toca/2019


Put all that together...

In [190]:


def get_TSL_event_data(event_id = 191403, download = False, dirpath='results'):
    """Collect the championship PDF links for one event, optionally downloading them.

    Args:
        event_id: Event identifier used in the ``/event/<id>`` URL.
        download: When True, fetch every linked ``.pdf`` with ``curl`` into
            ``dirpath``, naming each file after its championship.
        dirpath: Destination folder. It is created only when ``download`` is
            True, so a plain listing call leaves the filesystem untouched.

    Returns:
        A pandas DataFrame with the columns ``championship_name``,
        ``championship_url`` and ``championship_stub``, one row per
        ``championshipDiv`` on the event page. The frame is empty when the
        page shows the "Event data available soon" heading.
    """
    event_url = 'http://www.tsl-timing.com/event/{}'.format(event_id)
    eventsoup = BeautifulSoup(requests.get(event_url).content)

    _data = []

    # The placeholder heading means there is nothing to list or download yet.
    if eventsoup.find("h3", string="Event data available soon"):
        return pd.DataFrame(_data)

    if download:
        os.makedirs(dirpath, exist_ok=True)

    for event in eventsoup.findAll('div', {'class': 'championshipDiv'}):
        championship_name = event.find('h3').text
        championship_url = event.find('a')['href']
        # Stub = PDF file name without extension and event id,
        # e.g. '.../191403bf4.pdf' -> 'bf4' for event 191403.
        championship_stub = championship_url.split('/')[-1].split('.')[0].replace(str(event_id), '')
        _data.append({'championship_name': championship_name,
                      'championship_url': championship_url,
                      'championship_stub': championship_stub})

        if download:
            print('Downloading: {} [{}]'.format(championship_name, championship_url))
            if championship_url.endswith('.pdf'):
                # `domain` is the notebook-level site root. curl receives an
                # argument list (no shell), so spaces in the scraped name and
                # '?' in the URL are passed through literally.
                subprocess.run(['curl', '-o', '{}/{}.pdf'.format(dirpath, championship_name),
                                '{}{}'.format(domain, championship_url)])

    if download:
        print('Files downloaded to: {}'.format(dirpath))

    return pd.DataFrame(_data)

#p = '{}/{}/{}'.format(download_dir_base,series,year)
#get_TSL_event_data(dirpath = p)
# Default event, listing only (download defaults to False).
get_TSL_event_data()

#Looks like we can pull out further stubs from end of PDF filename?

Unnamed: 0,championship_name,championship_stub,championship_url
0,F4 British Championship,bf4,/file/?f=TOCA/2019/191403bf4.pdf
1,Kwik Fit British Touring Car Championship,trg,/file/?f=TOCA/2019/191403trg.pdf
2,Michelin Ginetta Junior Championship,gij,/file/?f=TOCA/2019/191403gij.pdf
3,Millers Oils Ginetta GT4 SuperCup Championship,g50,/file/?f=TOCA/2019/191403g50.pdf
4,Porsche Carrera Cup GB,por,/file/?f=TOCA/2019/191403por.pdf
5,Renault UK Clio Cup,cli,/file/?f=TOCA/2019/191403cli.pdf


In [134]:
# NOTE(review): the download loop above writes to '<download_dir_base>/<series>/<year>';
# this bare './toca/2019' path looks left over from an earlier run — confirm.
!ls ./toca/2019

F4 British Championship.pdf
Kwik Fit British Touring Car Championship.pdf
Michelin Ginetta Junior Championship.pdf
Millers Oils Ginetta GT4 SuperCup Championship.pdf
Porsche Carrera Cup GB.pdf
Renault UK Clio Cup.pdf


In [191]:
#https://www.tsl-timing.com/event/191363
# The second positional argument is `download`, so this call also fetches the PDFs
# into the default 'results' folder.
get_TSL_event_data(191363, True)

Downloading: Dunlop Endurance Championship [/file/?f=EERC/2019/191363bce.pdf]
Files downloaded to: results


Unnamed: 0,championship_name,championship_stub,championship_url
0,Dunlop Endurance Championship,bce,/file/?f=EERC/2019/191363bce.pdf


In [192]:
# List the contents of 'results', the default dirpath of get_TSL_event_data.
!ls results

Dunlop Endurance Championship.pdf


## Alternative PDF Grabber

The PDF grabber will download copies of all timing sheet booklets for the current season.

Explore an alternative variant using `requests-html`.

In [21]:
#!pip3 install requests-html

In [95]:
# Listing page for the series; the cell below scrapes the links in its `races` div.
listing_url='http://www.tsl-timing.com/Results/toca/'

#scrape races tab
#go to event page
#for each series:
## grab name
## select appropriate series folder name
## create folder if not exist
## grab "View PDF Book" URL
## download booklet to appropriate series folder with event name as part of URL

In [78]:
import requests
from bs4 import BeautifulSoup

# Fetch the series listing page and parse it with BeautifulSoup.
html = requests.get(listing_url).text

soup = BeautifulSoup(html)

# The div with id "races" holds the links to the individual event pages.
div = soup.find('div', attrs={'id' : 'races'})

# Template that turns a site-relative href into an absolute URL.
stub='http://www.tsl-timing.com{}'


from requests_html import HTMLSession
session = HTMLSession()


# Output folder is hard-coded to 2019 here and in the curl call below.
!mkdir -p 2019

for a in div.findAll('a'):
    event_url = stub.format(a['href'])
    r = session.get(event_url)

    # Stop at the first event whose page shows the placeholder heading; any
    # links after it are not visited.
    # NOTE(review): presumably the listing is chronological, so every later
    # event is also unpublished — confirm, otherwise `continue` would be safer.
    xp = r.html.xpath('//*[@id="contentContainer"]/section/h3')
    if xp and xp[0].text=='Event data available soon':
        break
    
    #the pts PDFs are championship points files
    links = [l for l in r.html.links if l.endswith('.pdf') and 'pts' not in l]
    #Grab PDFs
    for l in links:
        # stub.format(l) assumes each link is site-relative.
        pdf_url=stub.format(l)
        # Save under the last path component of the link.
        fn = l.split('/')[-1]
        #!echo $pdf_url
        !curl -o 2019/{fn} {pdf_url}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  589k  100  589k    0     0  1344k      0 --:--:-- --:--:-- --:--:-- 1343k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1564k  100 1564k    0     0  2142k      0 --:--:-- --:--:-- --:--:-- 2140k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  867k  100  867k    0     0  1903k      0 --:--:-- --:--:-- --:--:-- 1906k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  594k  100  594k    0     0  1539k      0 --:--:-- --:--:-- --:--:-- 1536k
  % Total    % Received % Xferd  Average Speed   Tim

In [77]:
# List the PDFs that the grabber cell above saved into the 2019 folder.
!ls 2019

191403.pdf    191403cli.pdf 191403gij.pdf 191403trg.pdf
191403bf4.pdf 191403g50.pdf 191403por.pdf


The filenames are coded according to a datestamp and a letter code identifying the series.

In [None]:
# See the R notebook - tabula scraper