In [1]:
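# Example stage results page (year 2016, rally 201, stage 23); the scraper below requests the ".all.html" variant of this URL:
# http://www.wrc.com/live-ticker/daten/2016/201/stage.201.23.html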

In [2]:
import pandas as pd
 
def _cleanResultsTable(table, stage):
    ''' Label, type-cast and stage-tag one scraped results table (returns a new frame) '''
    # fillna returns a new frame, so the table handed in by the caller is never mutated
    table = table.fillna(0)
    table.columns = ['pos', 'carNo', 'driverName', 'time', 'diffPrev', 'diffFirst']

    # pos may be scraped as a float-like value, so go via float before casting to int
    table['pos'] = table['pos'].astype(float).astype(int)
    for col in ['carNo', 'driverName', 'time', 'diffPrev', 'diffFirst']:
        table[col] = table[col].astype(str)

    # Add a stage identifier
    table['stage'] = stage
    return table


def getStageResultsBase(year,rallyid,stages):
    ''' Get stage results and overall results at end of stage

    year, rallyid -- values substituted into the live-ticker URL
    stages        -- one stage number, or any iterable of stage numbers
                     (list, tuple, range, ...); a string counts as a single stage

    Returns a tuple (df_stage, df_overallpart). Each frame has the columns
    pos, carNo, driverName, time, diffPrev, diffFirst, stage, holds the rows
    for every requested stage and has a fresh 0..n-1 index. pos is an int
    column; the other scraped columns are cast to str, with missing cells
    becoming the text form of 0. Two empty frames are returned when no stages
    are given.

    Makes one HTTP request per stage via pd.read_html; errors raised by the
    request or by an unexpected page layout propagate to the caller.
    '''

    # Accept one stage number or any iterable of stage numbers
    if isinstance(stages, (str, bytes)) or not hasattr(stages, '__iter__'):
        stages = [stages]
    else:
        stages = list(stages)

    # There are actually two tables on the stage results page;
    # collect them per stage and concatenate once at the end
    stage_tables = []
    overall_tables = []

    # Grab data for each stage
    for stage in stages:
        url='http://www.wrc.com/live-ticker/daten/{year}/{rallyid}/stage.{rallyid}.{stage}.all.html'.format(year=year, rallyid=rallyid, stage=stage)
        # scrape the data
        results = pd.read_html(url, encoding='utf-8')
        stage_tables.append(_cleanResultsTable(results[0], stage))
        overall_tables.append(_cleanResultsTable(results[1], stage))

    # pd.concat refuses an empty list, so handle "no stages requested" explicitly
    if not stage_tables:
        return pd.DataFrame(), pd.DataFrame()

    df_stage = pd.concat(stage_tables, ignore_index=True)
    df_overallpart = pd.concat(overall_tables, ignore_index=True)
    return df_stage, df_overallpart

In [3]:
# Try the scraper on a single stage: year 2016, rallyid 201, stage 23.
# NOTE: getStageResultsBase fetches the page with pd.read_html, so this cell needs network access.
getStageResultsBase(2016, 201, 23)

(    pos carNo     driverName    time diffPrev diffFirst  stage
 0     1     1       S. OGIER  3:36.8      0.0         0     23
 1     2     3    T. NEUVILLE  3:38.0      1.2      +1.2     23
 2     3    20       D. SORDO  3:39.5      1.5      +2.7     23
 3     4     4      H. PADDON  3:39.6      0.1      +2.8     23
 4     5     2     J. LATVALA  3:40.7      1.1      +3.9     23
 5     6     9   A. MIKKELSEN  3:41.8      1.1      +5.0     23
 6     7     5     M. OSTBERG  3:42.2      0.4      +5.4     23
 7     8    12       O. TANAK  3:43.1      0.9      +6.3     23
 8     9    37    L. BERTELLI  3:49.9      6.8     +13.1     23
 9    10    31       E. LAPPI  3:51.1      1.2     +14.3     23
 10   11    42       N. FUCHS  3:51.6      0.5     +14.8     23
 11   12    35  K. AL-SUWAIDI  4:00.3      8.7     +23.5     23
 12   13    63      B. REEVES  4:00.6      0.3     +23.8     23
 13   14    33     H. PTASZEK  4:01.3      0.7     +24.5     23
 14   15    65       H. BATES  4:06.6   

In [4]:
#If we have hh:mm:ss format we can easily cast a timedelta
def regularTimeString(strtime):
    ''' Pad a [[hh:]mm:]ss time string out to hh:mm:ss form and cast it to a timedelta '''

    # Go defensive: coerce to text first (we may be handed eg 0 as an int), then drop any '+'
    text = str(strtime).strip('+')

    # Remember a leading minus so it can be re-applied to the padded string
    negative = text.startswith('-')
    if negative:
        text = text.strip('-')

    # Left-pad the missing components with 0 and keep the last three fields
    hh, mm, ss = ([0, 0] + text.split(':'))[-3:]

    sign = '-' if negative else ''
    return pd.to_timedelta('{}{}:{}:{}'.format(sign, hh, mm, ss))

In [5]:
def getStageResults(year,rallyid,stages):
    ''' Get stage and overall results with timedelta versions of the time columns

    Calls getStageResultsBase(year, rallyid, stages) and, on both returned
    frames, adds td_time, td_diffPrev and td_diffFirst columns holding the
    result of regularTimeString applied to time, diffPrev and diffFirst.

    Returns the same (df_stage, df_overallpart) tuple with the extra columns.
    '''
    df_stage, df_overallpart = getStageResultsBase(year,rallyid,stages)

    for frame in (df_stage, df_overallpart):
        for col in ['time','diffPrev','diffFirst']:
            if frame.empty and col not in frame.columns:
                # No stages were scraped: keep the output schema consistent
                # by adding an empty timedelta column instead of failing
                frame['td_'+col] = pd.Series(dtype='timedelta64[ns]')
            else:
                # Column-wise map: only the one source column is needed,
                # so there is no reason to build a Series per row
                frame['td_'+col] = frame[col].map(regularTimeString)

    return df_stage, df_overallpart

In [6]:
# Same request as before (year 2016, rallyid 201, stage 23), this time through getStageResults,
# which adds td_time, td_diffPrev and td_diffFirst timedelta columns to both frames.
getStageResults(2016, 201, 23)

(    pos carNo     driverName    time diffPrev diffFirst  stage  \
 0     1     1       S. OGIER  3:36.8      0.0         0     23   
 1     2     3    T. NEUVILLE  3:38.0      1.2      +1.2     23   
 2     3    20       D. SORDO  3:39.5      1.5      +2.7     23   
 3     4     4      H. PADDON  3:39.6      0.1      +2.8     23   
 4     5     2     J. LATVALA  3:40.7      1.1      +3.9     23   
 5     6     9   A. MIKKELSEN  3:41.8      1.1      +5.0     23   
 6     7     5     M. OSTBERG  3:42.2      0.4      +5.4     23   
 7     8    12       O. TANAK  3:43.1      0.9      +6.3     23   
 8     9    37    L. BERTELLI  3:49.9      6.8     +13.1     23   
 9    10    31       E. LAPPI  3:51.1      1.2     +14.3     23   
 10   11    42       N. FUCHS  3:51.6      0.5     +14.8     23   
 11   12    35  K. AL-SUWAIDI  4:00.3      8.7     +23.5     23   
 12   13    63      B. REEVES  4:00.6      0.3     +23.8     23   
 13   14    33     H. PTASZEK  4:01.3      0.7     +24.5     2

In [7]:
%matplotlib inline
import matplotlib.pyplot as plt
 
# NOTE(review): df_overall is not defined in any cell visible in this notebook, and the
# recorded output of this cell is "NameError: name 'df_overall' is not defined".
# The 'groupClass' column used below is also not among the columns that
# getStageResultsBase assigns, so df_overall presumably comes from a different
# scraper that is missing here -- TODO restore the cell that builds it.

# Keep only the rows whose groupClass is 'RC1' and renumber the index from 0
rc1=df_overall[df_overall['groupClass']=='RC1'].reset_index(drop=True)
 
fig, ax = plt.subplots(figsize=(15,8))
# Blank out the y-axis tick labels
ax.get_yaxis().set_ticklabels([])
# One pos-against-stage line per driverName group, all drawn on the shared axes;
# the trailing ';' suppresses the cell's text output
rc1.groupby('driverName').plot(x='stage',y='pos',ax=ax,legend=None);

NameError: name 'df_overall' is not defined

In [8]:
# https://blog.ouseful.info/2017/01/25/a-first-attempt-at-wrangling-wrc-world-rally-championship-data-with-pandas-and-matplotlib/