# Extract data
The aim of this notebook is to extract professional cyclists' data for a season. Cleaning of the data will be done as well. Feature creation and analysis will be done on the next notebook.

We extract all the riders at the beginning of the tour. Thereafter we itterate through each rider's profile on `procyclingstats.com` and scrape the races that they have partaken in, for the season. Thereafter, we itterate through all the races and extract their profiles (diificulty, uci status). The data is saved. 

In [46]:
# imports
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np

In [47]:
RACE = 'tour-de-france'
YEAR = 2020
STAGE = 1
URL1 = 'https://www.procyclingstats.com/race/{}/{}/stage-{}/result/result'.format(RACE, YEAR, STAGE)
race_page = requests.get(URL1)
race_html = BeautifulSoup(race_page.content, 'html.parser')

In [95]:
class DataObject:
    def __init__(self, name, row_length):
        self.data = None
        self.data_added = False
        self.row_length = 0
        self.data_name = None

    def add_data(self, data):
        self.data = data
        self.data_added = True

In [310]:
def is_not_int(value):
    try:
        int(value)
        return False
    except ValueError:
        return True

def get_text(cell):
    ''' Return the text from the html cell. '''
    # some cells have a span or hyperlink element with text in it
    if cell.a != None:
        url = cell.a.get('href')
        if url.startswith('rider/') or url.startswith('race/'):
            return url, cell.a.get_text()
        return cell.a.get_text()
    elif cell.span != None:
        return cell.span.get_text()
    else:
        return cell.get_text()
    
def get_stage_data(html, data_row_length, print_row=False):
    # all the racers are in a table data cell ('td')
    row_cells = html.find_all('td')
    data = {'stage': list(), 'gc': list(), 'green': list(), 'youth': list(), 'kom': list(), 'teams': list()}
    datasets = list(data.keys())
    data_id = 0
    row_length = data_row_length[datasets[data_id]]
    old_length = row_length
    data_obj = DataObject(datasets[data_id], row_length)
    
    temp_list = list()
    error_list = list()
    error_row = False
    #print(data, datasets, data_id, datasets[data_id])
    
    row = list()
    last_ix = 1

    # itterate through all data cells and append their text values to a row
    for row_ix, cell in enumerate(row_cells):
        
        #if datasets[data_id] in ['kom', 'teams']:
        #    print('{} of {}'.format(row_ix, len(row_cells)))
        
        text = get_text(cell)
        if type(text) is type('str'):
            row.append(text)
        else:
            row.append(text[0])
            row.append(text[1])
            
        if len(row) == 1:
            not_int = is_not_int(row[0])
            if (not_int or int(row[0]) != last_ix) and len(temp_list) != 0:
                print('UPDATING 1:{} has {} participants'.format(datasets[data_id], len(temp_list)))
                ds = datasets[data_id]
                data[ds] = temp_list
                data_id += 1
                old_length = row_length
                
                temp_list = list()
                last_ix = 1
                try:
                    row_length = data_row_length[datasets[data_id]]
                except KeyError:
                    break
            
            if not_int:
                error_row = True
                
        if error_row:
            if len(row) == old_length:
                print('ERROR: {}'.format(row))
                error_list.append(row)
                error_row = False
                row = list()
        else:
            # 'row_length' data cells make an entire row
            if len(row) == row_length:
                if print_row:
                    print(row)
                # data list gets saved in data subset
                pos = int(row[0])
                temp_list.append(row)
                last_ix = pos + 1
                row = list()
                
    print('UPDATING 1:{} has {} participants'.format(datasets[data_id], len(temp_list)))
    ds = datasets[data_id]
    data[ds] = temp_list
    return data

data_row_length = {'stage': 11, 'gc': 9, 'green': 8, 'youth': 9, 'kom': 8, 'teams': 4}
data = get_stage_data(race_html, data_row_length)
#data.get('stage')

UPDATING 1:stage has 175 participants
ERROR: ['OTL', '', '+ -3:46:13', '155', 'rider/john-degenkolb', ' Degenkolb John', '31', 'Lotto Soudal', '', '', '-']
UPDATING 1:gc has 175 participants
UPDATING 1:green has 24 participants
UPDATING 1:youth has 26 participants
UPDATING 1:kom has 3 participants
UPDATING 1:teams has 22 participants


In [160]:
#def make_dfs(data):
#    
#    df = pd.DataFrame(data, columns=['stagePos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', \
#                          'age', 'team', 'uci','pnt', 'time'])

#df = pd.DataFrame(data['stage'], columns=['stagePos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'team', 'uci','pnt', 'time'])
#df = pd.DataFrame(data['gc'], columns=['gcPos', 'bib', 'url', 'name', 'age', 'team', 'uci', 'time', 'more'])
#df = pd.DataFrame(data['green'], columns=['greenPos', 'bib', 'url', 'name', 'age', 'team', 'pnts', 'pnts_chng'])
#df = pd.DataFrame(data['youth'], columns=['youthPos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'team', 'youthTime'])
#df = pd.DataFrame(data['kom'], columns=['komPos', 'bib', 'url', 'name', 'age', 'team', 'pnts',  'pnts_chng'])
#df = pd.DataFrame(data['teams'], columns=['teamPos', 'change', 'name', 'time'])
df.head()

Unnamed: 0,teamPos,change,name,time
0,1,,Trek - Segafredo,11:19:09
1,2,,UAE-Team Emirates,",,"
2,3,,"Cofidis, Solutions Crédits",",,"
3,4,,Team Sunweb,",,"
4,5,,Deceuninck - Quick Step,",,"


In [4]:
# make the dataframe
df = pd.DataFrame(data, columns=['stagePos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', \
                          'age', 'team', 'uci','pnt', 'time'])
df.name = df.name.str.upper()
df.team = df.team.str.upper()
# data erroneoulsy added to df
cutoff_ix = df[df.gcPos == ''].index[0]

first_df = df.loc[:cutoff_ix-1, :]
print('There were {} cyclists that finished stage 1'.format(first_df.shape[0]))
first_df.head()

There were 175 cyclists that finished stage 1


Unnamed: 0,stagePos,gcPos,timeAdd,bib,url,name,age,team,uci,pnt,time
0,1,1,+0:00,135,rider/alexander-kristoff,KRISTOFF ALEXANDER,33,UAE-TEAM EMIRATES,120,100,3:46:23
1,2,2,+0:04,105,rider/mads-pedersen,PEDERSEN MADS,24,TREK - SEGAFREDO,50,70,",,"
2,3,3,+0:06,203,rider/cees-bol,BOL CEES,25,TEAM SUNWEB,25,50,",,"
3,4,4,+0:10,43,rider/sam-bennett,BENNETT SAM,29,DECEUNINCK - QUICK STEP,15,40,",,"
4,5,5,+0:10,21,rider/peter-sagan,SAGAN PETER,30,BORA - HANSGROHE,5,32,",,"


## Create dataframes
We are going to separate the data frame above into 3 more succinct dataframes. 

- `riders_df` that contians the 175 riders' personal infomration.
- `stages_df` that containes that stages competed in by all the 175 riders in the past season
- `races_df` that contains the information about the races that the 175 riders competed in in the past season (a race can have multiple stages)

In [5]:
riders_df = first_df[['name', 'age', 'url', 'team']]
riders_df = riders_df.sort_values('name')
riders_df['riderID'] = range(riders_df.shape[0])
riders_df = riders_df.set_index('riderID')
riders_df.head()

Unnamed: 0_level_0,name,age,url,team
riderID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,ALAPHILIPPE JULIAN,28,rider/julian-alaphilippe,DECEUNINCK - QUICK STEP
1,AMADOR ANDREY,34,rider/andrey-amador,INEOS GRENADIERS
2,ANACONA WINNER,32,rider/winner-anacona,TEAM ARKÉA SAMSIC
3,ARNDT NIKIAS,28,rider/nikias-arndt,TEAM SUNWEB
4,ARU FABIO,30,rider/fabio-aru,UAE-TEAM EMIRATES


In [8]:
def find_races(url, rider_id, rider_name):
    ''' Extract all the races in the season for the rider on the rider's profile (url)'''
    page = requests.get(url)
    rider_html = BeautifulSoup(page.content, 'html.parser')
    results_html = rider_html.body.tbody
    # all races in data rows
    rows = results_html.find_all('tr')

    races = list()
    for row in rows:
        items = row.find_all('td')
        # extract text values from data cell
        row = list()
        for item in items:
            text = get_text(item)
            if type(text) is type('str'):
                row.append(text)
            else:
                row.append(text[0])
                row.append(text[1])
        races.append(row)
    df = pd.DataFrame(races, columns=['date', 'stagePos', 'gcPos', 'unknown', 'url', 'stage', 'distance', 'pcs', 'uci', 'more'])
    print('"{}" competed in {} stages'.format(rider_name, df.shape[0]))
    df['riderID'] = rider_id
    return df

# initialise list of dataframes
stages_list = list()
# itterate through each cyclist and add his races for the season
# to a list of dataframes
for ix, row in riders_df.iterrows():
    print('{} of {}'.format(ix, riders_df.shape[0]))
    url = row['url']
    name = row['name']
    rider_url = 'https://www.procyclingstats.com/{}'.format(url)
    rider_df = find_races(rider_url, ix, name)
    stages_list.append(rider_df)

0 of 175
" ALAPHILIPPE JULIAN" competed in 48 stages
1 of 175
" AMADOR ANDREY" competed in 30 stages
2 of 175
" ANACONA WINNER" competed in 46 stages
3 of 175
" ARNDT NIKIAS" competed in 37 stages
4 of 175
" ARU FABIO" competed in 36 stages
5 of 175
" ASGREEN KASPER" competed in 45 stages
6 of 175
" BARDET ROMAIN" competed in 53 stages
7 of 175
" BARGUIL WARREN" competed in 38 stages
8 of 175
" BARTHE CYRIL" competed in 53 stages
9 of 175
" BAUER JACK" competed in 34 stages
10 of 175
" BENNETT GEORGE" competed in 40 stages
11 of 175
" BENNETT SAM" competed in 51 stages
12 of 175
" BENOOT TIESJ" competed in 29 stages
13 of 175
" BERNAL EGAN" competed in 44 stages
14 of 175
" BETTIOL ALBERTO" competed in 33 stages
15 of 175
" BEWLEY SAM" competed in 31 stages
16 of 175
" BILBAO PELLO" competed in 51 stages
17 of 175
" BOASSON HAGEN EDVALD" competed in 36 stages
18 of 175
" BOL CEES" competed in 31 stages
19 of 175
" BONIFAZIO NICCOLÒ" competed in 38 stages
20 of 175
" BONNET WILLIAM" com

" ZAKARIN ILNUR" competed in 25 stages
165 of 175
"DE BUYST JASPER" competed in 35 stages
166 of 175
"DE GENDT THOMAS" competed in 48 stages
167 of 175
"DE LA CRUZ DAVID" competed in 41 stages
168 of 175
"DE MARCHI ALESSANDRO" competed in 26 stages
169 of 175
"KRAGH ANDERSEN SØREN" competed in 34 stages
170 of 175
"VAN AERT WOUT" competed in 24 stages
171 of 175
"VAN ASBROECK TOM" competed in 44 stages
172 of 175
"VAN AVERMAET GREG" competed in 40 stages
173 of 175
"VAN BAARLE DYLAN" competed in 41 stages
174 of 175
"VAN GARDEREN TEJAY" competed in 31 stages


In [9]:
stages_ = pd.concat(stages_list).reset_index(drop=True)
print('{} stages have been loaded'.format(stages_.shape[0]))
# drop 2 unnecessary columns
stages_ = stages_.drop(['unknown', 'more'], axis=1)
stages_.head()

6765 stages have been loaded


Unnamed: 0,date,stagePos,gcPos,url,stage,distance,pcs,uci,riderID
0,› 20.09,,,race/tour-de-france/2020/stage-1,Tour de France,,,,0
1,06.09,160.0,38.0,race/tour-de-france/2020/stage-9,Stage 9 - Pau › Laruns,153.0,,,0
2,05.09,38.0,26.0,race/tour-de-france/2020/stage-8,Stage 8 - Cazères › Loudenvielle,141.0,,,0
3,04.09,12.0,11.0,race/tour-de-france/2020/stage-7,Stage 7 - Millau › Lavaur,168.0,6.0,,0
4,03.09,5.0,16.0,race/tour-de-france/2020/stage-6,Stage 6 - Le Teil › Mont Aigoual,191.0,32.0,5.0,0


In [15]:
# TODO: Test this
# TODO: take points classifications. make columns for them . Add them to race information
#to_drop = stages_[(stages_.date == '') & (stages_.gcPos == '')]
#print(to_drop.shape)
#to_drop.head(20)
# jersey and points classification have no date or gc position
#all_ = all_[(all_.date != '') | (all_.gc_pos != '')]
#print(all_.shape)
#all_.head()

In [12]:
races_df = stages_[(stages_.distance == '') & (stages_.stagePos == '')][['stage', 'url']]
races_df.head()
# TODO: Does this still work?
races_df.columns = ['race', 'url']
#races_df['uci_rank'] = races_df.uci_rank.str.replace(')', '')
races_df['raceID'] = range(races_df.shape[0])
races_df.head()
# TODO: Access race html and see if you can get more info about race
# TODO: stage_df?

Unnamed: 0,race,url,raceID
0,Tour de France,race/tour-de-france/2020/stage-1,0
11,Critérium du Dauphiné,race/dauphine/2020/stage-1,1
22,Paris - Nice,race/paris-nice/2020/stage-1,2
35,Tour Colombia 2.1,race/colombia-21/2020/stage-1,3
44,Vuelta a San Juan Internacional,race/vuelta-ciclista-a-la-provincia-de-san-jua...,4


In [251]:
# add race id
stages_df = pd.concat([stages_, races_df['raceID']], axis=1, sort=False)
stages_df['raceID'] = stages_df['raceID'].fillna(method='ffill').astype(int)
print(stages_df.shape)
# drop the races & classifications from stages
stages_df = stages_df[(stages_df.stagePos != '') | (stages_df.distance != '')]
stages_df = stages_df[(stages_df.date != '')]
print(stages_df.shape)

stages_df.head()

(6765, 10)
(5022, 10)


Unnamed: 0,date,stagePos,gcPos,url,stage,distance,pcs,uci,riderID,raceID
1,6.09,160,38,race/tour-de-france/2020/stage-9,Stage 9 - Pau › Laruns,153,,,0,0
2,5.09,38,26,race/tour-de-france/2020/stage-8,Stage 8 - Cazères › Loudenvielle,141,,,0,0
3,4.09,12,11,race/tour-de-france/2020/stage-7,Stage 7 - Millau › Lavaur,168,6.0,,0,0
4,3.09,5,16,race/tour-de-france/2020/stage-6,Stage 6 - Le Teil › Mont Aigoual,191,32.0,5.0,0,0
5,2.09,16,16,race/tour-de-france/2020/stage-5,Stage 5 - Gap › Privas,183,,,0,0


In [311]:
stage_url = 'https://www.procyclingstats.com/race/tour-de-france/2020/stage-3'
page = requests.get(stage_url)
stage_html = BeautifulSoup(page.content, 'html.parser')
print('(ITT)' in str(stage_html.h2), str(stage_html.h2))

data_row_length = data_row_length = {'stage': 11, 'gc': 11, 'green': 10, 'youth': 11, 'kom': 10, 'teams': 6}
data = get_stage_data(stage_html, data_row_length, True)

False <h2><span class="yearmob hide">2020  </span><span class="blue">Stage 3</span>  »  <span class="red">Nice  ›  Sisteron  </span> <span class="red distance">(198k)</span></h2>
['1', '153', '+29:13', '151', 'rider/caleb-ewan', ' Ewan Caleb', '26', 'Lotto Soudal', '120', '100', '5:17:42']
['2', '142', '+29:04', '43', 'rider/sam-bennett', ' Bennett Sam', '29', 'Deceuninck - Quick Step', '50', '70', ',,']
['3', '143', '+29:06', '191', 'rider/giacomo-nizzolo', ' Nizzolo Giacomo', '31', 'NTT Pro Cycling', '25', '50', ',,']
['4', '133', '+28:30', '174', 'rider/hugo-hofstetter', ' Hofstetter Hugo', '26', 'Israel Start-Up Nation', '15', '40', ',,']
['5', '81', '+18:00', '21', 'rider/peter-sagan', ' Sagan Peter', '30', 'BORA - hansgrohe', '5', '32', ',,']
['6', '83', '+18:00', '108', 'rider/edward-theuns', ' Theuns Edward', '29', 'Trek - Segafredo', '', '26', ',,']
['7', '144', '+29:06', '203', 'rider/cees-bol', ' Bol Cees', '25', 'Team Sunweb', '', '22', ',,']
['8', '107', '+20:57', '117', '

In [292]:
stage_df = pd.DataFrame(data['gc'], gc_col = ['gcPos', 'prevGcPos', 'gcChng', 'bib', 'url', 'name', 'age', 'team', 'uciGc', 'gcTime', 'more'])
stage_df.head()

Unnamed: 0,stagePos,posChange,url,name,age,teamName,uci,pnts,stageTime
0,1,,rider/arnaud-demare,Démare Arnaud,28,Groupama - FDJ,100,15,5:38:27
1,2,,rider/bryan-coquard,Coquard Bryan,28,B&B Hotels - Vital Concept p/b KTM,75,10,",,"
2,3,,rider/julian-alaphilippe,Alaphilippe Julian,28,Deceuninck - Quick Step,60,7,",,"
3,4,,rider/clement-venturini,Venturini Clément,26,AG2R La Mondiale,50,4,0:05
4,5,,rider/anthony-turgis,Turgis Anthony,26,Team Total Direct Energie,40,2,",,"


In [305]:
ONE_DAY_RACE = 0
FIRST_STAGE_IN_TOUR = 1
OTHER_TOUR_STAGE = 2
ITT = 3

def fix_time(data, time_col):
    tdf = data[data[time_col] == ',,'][[time_col]]
    to_change_ix = list(tdf.index)
    data.loc[data.index.isin(to_change_ix), time_col] = None
    data[time_col] = data[time_col].fillna(method='ffill')
    return data

def build_df(data, result_type):
    if result_type == ONE_DAY_RACE:
        stage_col = ['stagePos', 'gcPos', 'url', 'name', 'age', 'teamName', 'uciStage', 'pnt', 'stageTime']
    elif result_type == ITT:
        stage_col = ['stagePos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'teamName', 'uciStage','pnt', 'stgAvgPace', 'stageTime']
    elif result_type == FIRST_STAGE_IN_TOUR:
        stage_col = ['stagePos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'teamName', 'uciStage','pnt', 'stageTime']
        gc_col = ['gcPos', 'bib', 'url', 'name', 'age', 'team', 'uciGc', 'gcTime', 'more']
        green_col = ['greenPos', 'bib', 'url', 'name', 'age', 'team', 'greenPnts', 'greenPntsChng']
        youth_col = ['youthPos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'team', 'youthTime']
        kom_col = ['komPos', 'bib', 'url', 'name', 'age', 'team', 'komPnts',  'komPntsChng']
        teams_col = ['teamPos', 'empty', 'teamName', 'teamTime']
    elif result_type == OTHER_TOUR_STAGE:
        stage_col = ['stagePos', 'gcPos', 'gcTime', 'bib', 'url', 'name', 'age', 'teamName', 'uciStage','pnt', 'stageTime']
        gc_col = ['gcPos', 'prevGcPos', 'gcChng', 'bib', 'url', 'name', 'age', 'team', 'uciGc', 'gcTime', 'more']
        green_col = ['greenPos', 'prevGreenPos', 'greenChng', 'bib', 'url', 'name', 'age', 'team', 'greenPnts', 'greenPntsChng']
        youth_col = ['youthPos', 'prevYouthPos', 'youthChng', 'gcPos', 'gcTime', 'bib', 'url', 'name', 'age', 'team', 'youthTime']
        kom_col = ['komPos', 'prevKomPos', 'komChng', 'bib', 'url', 'name', 'age', 'team', 'komPnts',  'komPntsChng']
        teams_col = ['teamPos', 'prevTeamPos', 'teamChng', 'empty', 'teamName', 'teamTime']
    
    stage_df = pd.DataFrame(data['stage'], columns=stage_col)
    stage_df = fix_time(stage_df, 'stageTime')
    
    if result_type in [FIRST_STAGE_IN_TOUR, OTHER_TOUR_STAGE]:
        # one day races only have 1 data frame
        stage_df = stage_df.set_index('bib')
        gc_df = pd.DataFrame(data['gc'], columns=gc_col)
        gc_df = gc_df[['bib', 'uciGc']].set_index('bib')

        green_df = pd.DataFrame(data['green'], columns=green_col)
        green_df = green_df[['bib', 'greenPos', 'greenPnts']].set_index('bib')

        youth_df = pd.DataFrame(data['youth'], columns=youth_col)
        youth_df = youth_df[['bib', 'youthPos', 'youthTime']].set_index('bib')
        youth_df = fix_time(youth_df, 'youthTime')

        kom_df= pd.DataFrame(data['kom'], columns=kom_col)
        kom_df = kom_df[['bib', 'komPos', 'komPnts']].set_index('bib')

        team_df = pd.DataFrame(data['teams'], columns=teams_col)
        team_df = team_df[['teamPos', 'teamName', 'teamTime']]

        df = pd.concat([stage_df, gc_df, green_df, youth_df, kom_df], axis=1, sort=False)
        return df
    else:
        return stage_df

df = build_df(data, ITT)
df.head()

Unnamed: 0,stagePos,gcPos,timeAdd,bib,url,name,age,teamName,uciStage,pnt,stgAvgPace,stageTime
0,1,2,+0:58,98,rider/soren-kragh-andersen,Kragh Andersen Søren,25,Team Sunweb,60.0,50,48.064,18:51
1,2,1,+0:00,37,rider/maximilian-schachmann,Schachmann Maximilian,26,BORA - hansgrohe,25.0,30,47.81,0:06
2,3,23,+2:16,42,rider/kasper-asgreen,Asgreen Kasper,25,Deceuninck - Quick Step,10.0,18,47.559,0:12
3,4,85,+14:52,64,rider/thomas-de-gendt,De Gendt Thomas,33,Lotto Soudal,,13,47.517,0:13
4,5,29,+3:41,82,rider/pello-bilbao,Bilbao Pello,30,Bahrain - McLaren,,10,47.435,0:15


In [307]:
# get the points classifications
def get_points(url):
    ''' Extract all the races in the season for the rider on the rider's profile (url)'''
    page = requests.get(url)
    stage_html = BeautifulSoup(page.content, 'html.parser')
    
    if 'One day race' in str(stage_html.h2):
        print('ONE DAY RACE')
        data_row_length = {'stage': 9}
        result_type = ONE_DAY_RACE
    elif '(ITT)' in str(stage_html.h2):
        print('(ITT)')
        data_row_length = {'stage': 12}
        result_type = ITT
    elif url.endswith('/result') or url.endswith('/stage-1'):
        result_type = FIRST_STAGE_IN_TOUR
        data_row_length = {'stage': 11, 'gc': 9, 'green': 8, 'youth': 9, 'kom': 8, 'teams': 4}
    else:
        result_type = OTHER_TOUR_STAGE
        data_row_length = {'stage': 11, 'gc': 11, 'green': 10, 'youth': 11, 'kom': 10, 'teams': 6}
    data = get_stage_data(stage_html, data_row_length)
    df = build_df(data, result_type)
    return df
    
    
url_uniques = stages_df.url.unique()
count = 0
all_races = list()
print(len(url_uniques))
for uix, url in enumerate(url_uniques):
    
    stage_url = 'https://www.procyclingstats.com/{}'.format(url)
    print('{} in {}: {}'.format(uix, len(url_uniques), stage_url))
    #stage_url = 'https://www.procyclingstats.com/race/tour-de-france/2020/stage-2'
    rdf = get_points(stage_url)
    all_races.append(rdf)
    #if count == 5: break

190
0 in 190: race/tour-de-france/2020/stage-9
UPDATING 1:stage has 166 participants
ERROR: ['DNF', '', '+ - 38:40:01', '132', 'rider/fabio-aru', ' Aru Fabio', '30', 'UAE-Team Emirates', '', '', '-']
ERROR: ['DNF', '', '+ - 38:40:01', '152', 'rider/steff-cras', ' Cras Steff', '24', 'Lotto Soudal', '', '', ',,']
UPDATING 1:gc has 166 participants
UPDATING 1:green has 102 participants
UPDATING 1:youth has 25 participants
UPDATING 1:kom has 43 participants
UPDATING 1:teams has 22 participants
1 in 190: race/tour-de-france/2020/stage-8
UPDATING 1:stage has 168 participants
ERROR: ['DNF', '', '+ - 34:44:52', '191', 'rider/giacomo-nizzolo', ' Nizzolo Giacomo', '31', 'NTT Pro Cycling', '', '', '-']
ERROR: ['DNF', '', '+ - 34:44:52', '52', 'rider/william-bonnet', ' Bonnet William', '38', 'Groupama - FDJ', '', '', ',,']
ERROR: ['DNF', '', '+ - 34:44:52', '86', 'rider/diego-rosa', ' Rosa Diego', '31', 'Team Arkéa Samsic', '', '', ',,']
ERROR: ['DNF', '', '+ - 34:44:52', '183', 'rider/lilian-calm

UPDATING 1:stage has 151 participants
ERROR: ['DNF', '', '+ - 13:14:35', '121', 'rider/tiesj-benoot', ' Benoot Tiesj', '26', 'Team Sunweb', '', '', '-']
ERROR: ['DNF', '', '+ - 13:14:35', '193', 'rider/mathieu-burgaudeau', ' Burgaudeau Mathieu', '21', 'Team Total Direct Energie', '', '', ',,']
ERROR: ['DNF', '', '+ - 13:14:35', '225', 'rider/quentin-pacher', ' Pacher Quentin', '28', 'B&B Hotels - Vital Concept p/b KTM', '', '', ',,']
ERROR: ['DNS', '', '+ - 13:14:35', '114', 'rider/juan-pedro-lopez', ' López Juan Pedro', '23', 'Trek - Segafredo', '', '', ',,']
ERROR: ['DNS', '', '+ - 13:14:35', '171', 'rider/dan-martin', ' Martin Dan', '33', 'Israel Start-Up Nation', '', '', ',,']
UPDATING 1:gc has 2 participants
ERROR: ['-', '21', 'rider/emanuel-buchmann', ' Buchmann Emanuel', '27', 'BORA - hansgrohe', '0:20', '..', '4', '5', '▲1']
UPDATING 1:green has 1 participants
UPDATING 1:youth has 1 participants
13 in 190: race/dauphine/2020/stage-2
UPDATING 1:stage has 156 participants
ERROR: 

UPDATING 1:stage has 122 participants
ERROR: ['DNF', '', '+ - 18:49:00', '32', 'rider/pascal-ackermann', ' Ackermann Pascal', '26', 'BORA - hansgrohe', '', '', '-']
ERROR: ['DNF', '', '+ - 18:49:00', '141', 'rider/julien-el-fares', 'El Fares Julien', '34', 'NIPPO DELKO One Provence', '', '', ',,']
ERROR: ['DNF', '', '+ - 18:49:00', '17', 'rider/diego-rosa', ' Rosa Diego', '30', 'Team Arkéa Samsic', '', '', ',,']
ERROR: ['DNF', '', '+ - 18:49:00', '108', 'rider/michael-woods', ' Woods Michael', '33', 'EF Pro Cycling', '', '', ',,']
ERROR: ['DNF', '', '+ - 18:49:00', '103', 'rider/lawson-craddock', ' Craddock Lawson', '28', 'EF Pro Cycling', '', '', ',,']
ERROR: ['DNF', '', '+ - 18:49:00', '56', 'rider/oliver-naesen', ' Naesen Oliver', '29', 'AG2R La Mondiale', '', '', ',,']
ERROR: ['DNF', '', '+ - 18:49:00', '162', 'rider/frederik-backaert', ' Backaert Frederik', '29', 'B&B Hotels - Vital Concept p/b KTM', '', '', ',,']
ERROR: ['DNS', '', '+ - 18:49:00', '106', 'rider/tejay-van-garderen

IndexError: list index out of range

In [107]:
# remove the race headings
print(riders_df.shape)
riders_df = riders_df[(riders_df.distance != '') & (riders_df.gc_pos != '')]
riders_df.drop(['uci'], axis = 1, inplace=True)
print(riders_df.shape)
riders_df.head(10)

(5462, 9)
(3492, 8)


Unnamed: 0,date,pos,gc_pos,stage,distance,pcs,rider,race_id
1,4.09,152,142,Stage 7 - Millau › Lavaur,168.0,,kristoff alexander,0
2,3.09,147,139,Stage 6 - Le Teil › Mont Aigoual,191.0,,kristoff alexander,0
3,2.09,14,135,Stage 5 - Gap › Privas,183.0,2.0,kristoff alexander,0
4,1.09,158,138,Stage 4 - Sisteron › Orcières-Merlette,160.5,,kristoff alexander,0
5,31.08,15,127,Stage 3 - Nice › Sisteron,198.0,1.0,kristoff alexander,0
6,30.08,137,129,Stage 2 - Nice › Nice,186.0,,kristoff alexander,0
7,29.08,1,1,Stage 1 - Nice › Nice,156.0,100.0,kristoff alexander,0
12,15.08,138,135,Stage 4 - Ugine › Megève,148.5,,kristoff alexander,1
13,14.08,151,144,Stage 3 - Corenc › Saint-Martin-de-Belleville,157.0,,kristoff alexander,1
14,13.08,135,142,Stage 2 - Vienne › Col de Porte,135.0,,kristoff alexander,1


In [105]:
#DNF=Did not finish / DNS=Did not start / OTL = Outside time limit / DF
riders_df[riders_df.pos =='DF']

Unnamed: 0,date,pos,gc_pos,stage,distance,rider,race_id


In [79]:
#result.groupby(['race', 'stage']).count()[['pos']]
#all_df['rc'] = np.where(all_df['distance']=='', all_df['race'], '')