# Extract data
The aim of this notebook is to extract professional cyclists' data for a season. Cleaning of the data will be done as well. Feature creation and analysis will be done on the next notebook.

We extract all the riders at the beginning of the tour. Thereafter we itterate through each rider's profile on `procyclingstats.com` and scrape the races that they have partaken in, for the season. Thereafter, we itterate through all the races and extract their profiles (diificulty, uci status). The data is saved. 

In [1]:
# imports
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import re

In [2]:
RACE = 'tour-de-france'
YEAR = 2020
STAGE = 1
URL1 = 'https://www.procyclingstats.com/race/{}/{}/stage-{}/result/result'.format(RACE, YEAR, STAGE)
race_page = requests.get(URL1)
race_html = BeautifulSoup(race_page.content, 'html.parser')

In [3]:
class DataObject:
    def __init__(self, name, row_length):
        self.data = None
        self.data_added = False
        self.row_length = 0
        self.data_name = None

    def add_data(self, data):
        self.data = data
        self.data_added = True

In [4]:
def is_not_int(value):
    try:
        int(value)
        return False
    except ValueError:
        return True

def get_text(cell):
    ''' Return the text from the html cell. '''
    # some cells have a span or hyperlink element with text in it
    if cell.a != None:
        url = cell.a.get('href')
        if url.startswith('rider/') or url.startswith('race/'):
            return url, cell.a.get_text()
        return cell.a.get_text()
    elif cell.span != None:
        return cell.span.get_text()
    else:
        return cell.get_text()
    
def get_stage_data(html, data_row_length, print_row=False):
    # all the racers are in a table data cell ('td')
    row_cells = html.find_all('td')
    data = {'stage': list(), 'gc': list(), 'green': list(), 'youth': list(), 'kom': list(), 'teams': list()}
    datasets = list(data.keys())
    data_id = 0
    row_length = data_row_length[datasets[data_id]]
    old_length = row_length
    data_obj = DataObject(datasets[data_id], row_length)
    last_list_length = 0
    
    temp_list = list()
    error_list = list()
    error_row = False
    
    row = list()
    last_ix = 1

    # itterate through all data cells and append their text values to a row
    for row_ix, cell in enumerate(row_cells):
        
        text = get_text(cell)
        if type(text) is type('str'):
            row.append(text)
        else:
            row.append(text[0])
            row.append(text[1])
            
        if len(row) == 1:
            not_int = is_not_int(row[0])
            
            if not_int:
                print('Error row: {}'.format(row[0]))
                error_row = True
            
            if not not_int and int(row[0]) == 1 and len(temp_list) != 0:
                print('UPDATING 1: {} has {} participants'.format(datasets[data_id], len(temp_list)))
                ds = datasets[data_id]
                data[ds] = temp_list

                old_length = row_length
                
                data_id += 1
                temp_list = list()
                last_ix = 1
                error_row = False
                try:
                    row_length = data_row_length[datasets[data_id]]
                except KeyError:
                    break
        #print(error_row)
        if error_row and len(row) == old_length:
            row.append(row[0])
            row[0] = last_ix
            print('ERROR: {}'.format(row))

            temp_list.append(row)

            # TODO: change to temp list
#                error_list.append(row)
#                error_row = False
            row = list()
        elif not error_row and len(row) == row_length:
            # 'row_length' data cells make an entire row
            if print_row:
                print(row)
            # data list gets saved in data subset
            pos = int(row[0])
            # DQ/ DNF/ OL column
            row.append(np.nan)
            temp_list.append(row)
            last_ix = pos + 1
            row = list()
                
    print('UPDATING 1:{} has {} participants'.format(datasets[data_id], len(temp_list)))
    ds = datasets[data_id]
    data[ds] = temp_list
    return data

data_row_length = {'stage': 11, 'gc': 9, 'green': 8, 'youth': 9, 'kom': 8, 'teams': 4}
data = get_stage_data(race_html, data_row_length)


Error row: OTL
ERROR: [176, '', '+ -3:46:13', '155', 'rider/john-degenkolb', ' Degenkolb John', '31', 'Lotto Soudal', '', '', '-', 'OTL']
UPDATING 1: stage has 176 participants
UPDATING 1: gc has 175 participants
UPDATING 1: green has 24 participants
UPDATING 1: youth has 26 participants
UPDATING 1: kom has 3 participants
UPDATING 1:teams has 22 participants


In [5]:
def fix_time(data, time_col):
    tdf = data[data[time_col] == ',,'][[time_col]]
    to_change_ix = list(tdf.index)
    data.loc[data.index.isin(to_change_ix), time_col] = None
    data[time_col] = data[time_col].fillna(method='ffill')
    return data

stage_df = pd.DataFrame(data['stage'], columns=['stagePos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'team', 'uciStg','stagePnt', 'stageTime', 'DNF'])
stage_df = fix_time(stage_df, 'stageTime')
stage_df = stage_df.set_index('bib')

gc_df = pd.DataFrame(data['gc'], columns=['gcPos', 'bib', 'url', 'name', 'age', 'team', 'uciGc', 'time', 'more', 'DNF'])
gc_df = gc_df[['bib', 'uciGc']].set_index('bib')

green_df = pd.DataFrame(data['green'], columns=['greenPos', 'bib', 'url', 'name', 'age', 'team', 'greenPnts', 'pntsChng', 'DNF'])
green_df = green_df[['bib', 'greenPos', 'greenPnts']].set_index('bib')

youth_df = pd.DataFrame(data['youth'], columns=['youthPos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'team', 'youthTime', 'DNF'])[['bib', 'youthPos', 'youthTime']]
youth_df = fix_time(youth_df, 'youthTime')
youth_df = youth_df.set_index('bib')

kom_df = pd.DataFrame(data['kom'], columns=['komPos', 'bib', 'url', 'name', 'age', 'team', 'komPnts',  'pntsChnge', 'DNF'])
kom_df = kom_df[['bib', 'komPos', 'komPnts']].set_index('bib')

teams_df = pd.DataFrame(data['teams'], columns=['teamPos', 'change', 'teamName', 'teamTime', 'DNF'])
teams_df = teams_df[['teamPos', 'teamName', 'teamTime']]

df = pd.concat([stage_df, gc_df, green_df, youth_df, kom_df], axis=1, sort=False)
df.head()

Unnamed: 0,stagePos,gcPos,timeAdd,url,name,age,team,uciStg,stagePnt,stageTime,DNF,uciGc,greenPos,greenPnts,youthPos,youthTime,komPos,komPnts
135,1,1,+0:00,rider/alexander-kristoff,Kristoff Alexander,33,UAE-Team Emirates,120,100,3:46:23,,25.0,1,59,,,,
105,2,2,+0:04,rider/mads-pedersen,Pedersen Mads,24,Trek - Segafredo,50,70,3:46:23,,,2,30,1.0,3:46:17,,
203,3,3,+0:06,rider/cees-bol,Bol Cees,25,Team Sunweb,25,50,3:46:23,,,6,20,2.0,0:02,,
43,4,4,+0:10,rider/sam-bennett,Bennett Sam,29,Deceuninck - Quick Step,15,40,3:46:23,,,4,28,,,,
21,5,5,+0:10,rider/peter-sagan,Sagan Peter,30,BORA - hansgrohe,5,32,3:46:23,,,3,29,,,,


In [6]:
try:
    print(df.DNF.unique())

    for finish_error in df.DNF.unique():
        if finish_error is not np.nan:
            df[finish_error] = 0

            df.loc[df.DNF == finish_error, [finish_error]] = 1
    df = df.drop(['DNF'], axis=1)
except: 
    print('')

df['youth'] = 1
df.loc[df.youthTime.isnull(), ['youth']] = 0
# make the dataframe
df.name = df.name.str.upper()
df.team = df.team.str.upper()

print('There were {} cyclists that finished stage 1'.format(df.shape[0]))
df.head()

[nan 'OTL']
There were 176 cyclists that finished stage 1


Unnamed: 0,stagePos,gcPos,timeAdd,url,name,age,team,uciStg,stagePnt,stageTime,uciGc,greenPos,greenPnts,youthPos,youthTime,komPos,komPnts,OTL,youth
135,1,1,+0:00,rider/alexander-kristoff,KRISTOFF ALEXANDER,33,UAE-TEAM EMIRATES,120,100,3:46:23,25.0,1,59,,,,,0,0
105,2,2,+0:04,rider/mads-pedersen,PEDERSEN MADS,24,TREK - SEGAFREDO,50,70,3:46:23,,2,30,1.0,3:46:17,,,0,1
203,3,3,+0:06,rider/cees-bol,BOL CEES,25,TEAM SUNWEB,25,50,3:46:23,,6,20,2.0,0:02,,,0,1
43,4,4,+0:10,rider/sam-bennett,BENNETT SAM,29,DECEUNINCK - QUICK STEP,15,40,3:46:23,,4,28,,,,,0,0
21,5,5,+0:10,rider/peter-sagan,SAGAN PETER,30,BORA - HANSGROHE,5,32,3:46:23,,3,29,,,,,0,0


## Create dataframes
We are going to separate the data frame above into 3 more succinct dataframes. 

- `riders_df` that contians the 175 riders' personal infomration.
- `stages_df` that containes that stages competed in by all the 175 riders in the past season
- `races_df` that contains the information about the races that the 175 riders competed in in the past season (a race can have multiple stages)

In [7]:
riders_df = df[['name', 'age', 'url', 'team']]
riders_df = riders_df.sort_values('name')
riders_df['riderID'] = range(riders_df.shape[0])
riders_df = riders_df.set_index('riderID')
riders_df.head()

Unnamed: 0_level_0,name,age,url,team
riderID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,ALAPHILIPPE JULIAN,28,rider/julian-alaphilippe,DECEUNINCK - QUICK STEP
1,AMADOR ANDREY,34,rider/andrey-amador,INEOS GRENADIERS
2,ANACONA WINNER,32,rider/winner-anacona,TEAM ARKÉA SAMSIC
3,ARNDT NIKIAS,28,rider/nikias-arndt,TEAM SUNWEB
4,ARU FABIO,30,rider/fabio-aru,UAE-TEAM EMIRATES


In [8]:
def find_races(url, rider_id, rider_name):
    ''' Extract all the races in the season for the rider on the rider's profile (url)'''
    page = requests.get(url)
    rider_html = BeautifulSoup(page.content, 'html.parser')
    results_html = rider_html.body.tbody
    # all races in data rows
    rows = results_html.find_all('tr')

    races = list()
    for row in rows:
        items = row.find_all('td')
        # extract text values from data cell
        row = list()
        for item in items:
            text = get_text(item)
            if type(text) is type('str'):
                row.append(text)
            else:
                row.append(text[0])
                row.append(text[1])
        races.append(row)
    df = pd.DataFrame(races, columns=['date', 'stagePos', 'gcPos', 'unknown', 'url', 'stage', 'distance', 'pcs', 'uci', 'more'])
    print('"{}" competed in {} stages'.format(rider_name, df.shape[0]))
    df['riderID'] = rider_id
    return df

# initialise list of dataframes
stages_list = list()
# itterate through each cyclist and add his races for the season
# to a list of dataframes
for ix, row in riders_df.iterrows():
    print('{} of {}'.format(ix, riders_df.shape[0]))
    url = row['url']
    name = row['name']
    rider_url = 'https://www.procyclingstats.com/{}'.format(url)
    rider_df = find_races(rider_url, ix, name)
    stages_list.append(rider_df)

0 of 176
" ALAPHILIPPE JULIAN" competed in 50 stages
1 of 176
" AMADOR ANDREY" competed in 32 stages
2 of 176
" ANACONA WINNER" competed in 48 stages
3 of 176
" ARNDT NIKIAS" competed in 39 stages
4 of 176
" ARU FABIO" competed in 36 stages
5 of 176
" ASGREEN KASPER" competed in 47 stages
6 of 176
" BARDET ROMAIN" competed in 55 stages
7 of 176
" BARGUIL WARREN" competed in 40 stages
8 of 176
" BARTHE CYRIL" competed in 55 stages
9 of 176
" BAUER JACK" competed in 36 stages
10 of 176
" BENNETT GEORGE" competed in 42 stages
11 of 176
" BENNETT SAM" competed in 53 stages
12 of 176
" BENOOT TIESJ" competed in 31 stages
13 of 176
" BERNAL EGAN" competed in 46 stages
14 of 176
" BETTIOL ALBERTO" competed in 35 stages
15 of 176
" BEWLEY SAM" competed in 32 stages
16 of 176
" BILBAO PELLO" competed in 53 stages
17 of 176
" BOASSON HAGEN EDVALD" competed in 38 stages
18 of 176
" BOL CEES" competed in 33 stages
19 of 176
" BONIFAZIO NICCOLÒ" competed in 40 stages
20 of 176
" BONNET WILLIAM" com

" YATES ADAM" competed in 27 stages
165 of 176
" ZAKARIN ILNUR" competed in 27 stages
166 of 176
"DE BUYST JASPER" competed in 37 stages
167 of 176
"DE GENDT THOMAS" competed in 50 stages
168 of 176
"DE LA CRUZ DAVID" competed in 43 stages
169 of 176
"DE MARCHI ALESSANDRO" competed in 28 stages
170 of 176
"KRAGH ANDERSEN SØREN" competed in 36 stages
171 of 176
"VAN AERT WOUT" competed in 26 stages
172 of 176
"VAN ASBROECK TOM" competed in 46 stages
173 of 176
"VAN AVERMAET GREG" competed in 42 stages
174 of 176
"VAN BAARLE DYLAN" competed in 43 stages
175 of 176
"VAN GARDEREN TEJAY" competed in 33 stages


In [9]:
stages_ = pd.concat(stages_list).reset_index(drop=True)
print('{} stages have been loaded'.format(stages_.shape[0]))
# drop 2 unnecessary columns
stages_ = stages_.drop(['unknown', 'more'], axis=1)
stages_.head()

7131 stages have been loaded


Unnamed: 0,date,stagePos,gcPos,url,stage,distance,pcs,uci,riderID
0,› 20.09,,,race/tour-de-france/2020/stage-1,Tour de France,,,,0
1,09.09,147.0,45.0,race/tour-de-france/2020/stage-11,Stage 11 - Châtelaillon-Plage › Poitiers,167.5,,,0
2,08.09,160.0,41.0,race/tour-de-france/2020/stage-10,Stage 10 - île d'Oléron (Le Château-d'Oléron) ...,168.5,,,0
3,06.09,160.0,38.0,race/tour-de-france/2020/stage-9,Stage 9 - Pau › Laruns,153.0,,,0
4,05.09,38.0,26.0,race/tour-de-france/2020/stage-8,Stage 8 - Cazères › Loudenvielle,141.0,,,0


In [10]:
def _get_unique_races(data):
    data.columns = ['race', 'url']
    num_races = len(data.race.unique())
    num_url = len(data.url.unique())
    
    if num_races == num_url:
        rdf = data.groupby(['race', 'url']).count()
        rdf = rdf.reset_index()
        rdf['cutUrl'] = rdf.apply(lambda x: x['url'][0: x['url'].rindex('/')], axis =1)
        rdf.index.name = 'raceIX'
        return rdf
    else:
        return None
        
def _add_race_id_to_data(races_df, all_data):
    if 'raceID' not in all_data.columns:
        print('Creating "raceID" collumn')
        all_data['raceID'] = np.nan
        
    for race_ix, race_row in races_df.iterrows():
        url_begin = race_row['cutUrl']
        all_data.loc[all_data.url.str.startswith(url_begin), 'raceID'] = race_ix
        
    return all_data

def create_races_df(data):
    ''' '''
    # multistage races 
    rdf = data[(data.distance == '') & (data.stagePos == '')][['stage', 'url']]
    rdf2 = _get_unique_races(rdf)
    data1 = _add_race_id_to_data(rdf2, data)
        
    # one day races
    odr = data1[data1.raceID.isnull()][['stage', 'url']]
    rdf3 = _get_unique_races(odr)
    data2 = _add_race_id_to_data(rdf3, data1)
    
    races_df = pd.concat([rdf2, rdf3], ignore_index=True)
    
    return races_df, data2
    
races_df, rhdf = create_races_df(stages_)
rhdf = rhdf.astype({'raceID': int})
print('There are {} unique races.\nThere are {} stages(s) without a raceID.'.format(races_df.shape[0], rhdf[rhdf.raceID.isnull()].shape[0]))
races_df.head()

Creating "raceID" collumn
There are 79 unique races.
There are 0 stages(s) without a raceID.


Unnamed: 0,race,url,cutUrl
0,Critérium du Dauphiné,race/dauphine/2020/stage-1,race/dauphine/2020
1,Czech Tour,race/czech-cycling-tour/2020/stage-1,race/czech-cycling-tour/2020
2,Etoile de Bessèges,race/etoile-de-besseges/2020/stage-1,race/etoile-de-besseges/2020
3,Jayco Herald Sun Tour,race/herald-sun-tour/2020/stage-1,race/herald-sun-tour/2020
4,La Route d'Occitanie - La Dépêche du Midi,race/la-route-d-occitanie/2020/stage-1,race/la-route-d-occitanie/2020


In [11]:
rhdf.head()

Unnamed: 0,date,stagePos,gcPos,url,stage,distance,pcs,uci,riderID,raceID
0,› 20.09,,,race/tour-de-france/2020/stage-1,Tour de France,,,,0,13
1,09.09,147.0,45.0,race/tour-de-france/2020/stage-11,Stage 11 - Châtelaillon-Plage › Poitiers,167.5,,,0,13
2,08.09,160.0,41.0,race/tour-de-france/2020/stage-10,Stage 10 - île d'Oléron (Le Château-d'Oléron) ...,168.5,,,0,13
3,06.09,160.0,38.0,race/tour-de-france/2020/stage-9,Stage 9 - Pau › Laruns,153.0,,,0,13
4,05.09,38.0,26.0,race/tour-de-france/2020/stage-8,Stage 8 - Cazères › Loudenvielle,141.0,,,0,13


In [12]:
print(rhdf.shape)
# drop multi stage race heagings
rider_race_history_df = rhdf[(rhdf.distance != '') | (rhdf.stagePos != '')]
# drop end of stage race classifcation
# will be added in more detail
rider_race_history_df = rider_race_history_df[(rider_race_history_df.date != '') | (rider_race_history_df.gcPos != '')]
print(rider_race_history_df.shape)
rider_race_history_df.head()

(7131, 10)
(5375, 10)


Unnamed: 0,date,stagePos,gcPos,url,stage,distance,pcs,uci,riderID,raceID
1,9.09,147,45,race/tour-de-france/2020/stage-11,Stage 11 - Châtelaillon-Plage › Poitiers,167.5,,,0,13
2,8.09,160,41,race/tour-de-france/2020/stage-10,Stage 10 - île d'Oléron (Le Château-d'Oléron) ...,168.5,,,0,13
3,6.09,160,38,race/tour-de-france/2020/stage-9,Stage 9 - Pau › Laruns,153.0,,,0,13
4,5.09,38,26,race/tour-de-france/2020/stage-8,Stage 8 - Cazères › Loudenvielle,141.0,,,0,13
5,4.09,12,11,race/tour-de-france/2020/stage-7,Stage 7 - Millau › Lavaur,168.0,6.0,,0,13


In [13]:
stages_df = rider_race_history_df[['date', 'stage', 'url', 'distance']]
print(len(stages_df.groupby(['date']).count()))
print(len(stages_df.groupby(['stage']).count()))
print(len(stages_df.groupby(['url']).count()))
stages_df = stages_df.groupby(['stage', 'url', 'date', 'distance']).count()
stages_df = stages_df.reset_index()
stages_df.index.name = 'stagesID'
print(stages_df.shape)
stages_df.head()

92
192
192
(192, 4)


Unnamed: 0_level_0,stage,url,date,distance
stagesID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Bretagne Classic - Ouest-France,race/bretagne-classic/2020/result,25.08,247.75
1,Cadel Evans Great Ocean Road Race,race/great-ocean-race/2020/result,2.02,171.1
2,Circuito de Getxo-Memorial Hermanos Otxoa,race/circuito-de-getxo/2020/result,2.08,177.0
3,Clasica de Almeria,race/clasica-de-almeria/2020/result,16.02,187.6
4,European Continental Championships - ITT,race/uec-road-european-championships-itt/2020/...,24.08,25.6


In [14]:
rider_race_history_df['stageID'] = np.nan

for stage_ix, stage_row in stages_df.iterrows():
    stage_url = stage_row['url']
    rider_race_history_df.loc[rider_race_history_df.url == stage_url, 'stageID'] = stage_ix
    
print('{} races are unlabelled'.format(rider_race_history_df[rider_race_history_df.stageID.isnull()].shape[0]))
race_history_df = rider_race_history_df.drop(['url', 'stage', 'distance', 'date'], axis=1)
race_history_df = race_history_df.astype({'stageID': int})
race_history_df.head()

0 races are unlabelled


Unnamed: 0,stagePos,gcPos,pcs,uci,riderID,raceID,stageID
1,147,45,,,0,13,76
2,160,41,,,0,13,75
3,160,38,,,0,13,186
4,38,26,,,0,13,184
5,12,11,6.0,,0,13,180


In [15]:
stages_df.head()

Unnamed: 0_level_0,stage,url,date,distance
stagesID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Bretagne Classic - Ouest-France,race/bretagne-classic/2020/result,25.08,247.75
1,Cadel Evans Great Ocean Road Race,race/great-ocean-race/2020/result,2.02,171.1
2,Circuito de Getxo-Memorial Hermanos Otxoa,race/circuito-de-getxo/2020/result,2.08,177.0
3,Clasica de Almeria,race/clasica-de-almeria/2020/result,16.02,187.6
4,European Continental Championships - ITT,race/uec-road-european-championships-itt/2020/...,24.08,25.6


In [74]:
all_stages = list()
for stage_ix, stage  in stages_df.iterrows():
    url = stage['url']
    stage_url = 'https://www.procyclingstats.com/{}'.format(url)
    print('{} of {} for \'{}\''.format(stage_ix, stages_df.shape[0], url))
    page = requests.get(stage_url)
    stage_html = BeautifulSoup(page.content, 'html.parser')
    
    res_ = stage_html.find_all("div", class_="res-right")
    res_text = res_[0].find_all(text=True)
    stage_info = list()
    mountains = list()
    found_race_rank = False
    pnts_quality_found = False
    for tix, text in enumerate(res_text):
        web_regex = re.search('(www.(.)+\.(.)+)+', text) \
                        or re.search('((.)+\.com(.)+)+', text) \
                        or re.search('((.)+\.(\w)*(\d)*/)', text) \
                        or 'googletag.cmd.push(' in text
        #if web_regex: print(text)
        if text not in ['Race information', 'Date: ', 'Avg. speed winner:', 'rd', \
                        'Race category: ', 'Parcours type:', 'PCS point scale:', \
                        ' ', 'Start/finish:', ' › ', 'Climbs: ', ', ', 'Race profile', \
                        'Finish photo', 'Finish photo', 'LiveStats', 'Websites:', \
                        'Race ranking position', 'ranking', 'th', 'nd', 'st', '\n', \
                        'breakdown', 'Position and points as on startdate of race.'] \
        and not web_regex:
            if len(stage_info) <= 6 or found_race_rank:
                stage_info.append(text)
                if re.search('(\d)* pnt', text):
                    break
            else:
            #elif not found_race_rank:
                if is_not_int(text):
                    mountains.append(text)
                    #print(mountains)
                else:
                    stage_info.append(mountains)
                    stage_info.append(len(mountains))
                    stage_info.append(text)
                    found_race_rank = True
        
        if text is 'ranking':
            break
    #print(stage_info)
    stage_info = list(stage) + stage_info
    all_stages.append(stage_info)
    

0 of 192 for 'race/bretagne-classic/2020/result'
1 of 192 for 'race/great-ocean-race/2020/result'
2 of 192 for 'race/circuito-de-getxo/2020/result'
3 of 192 for 'race/clasica-de-almeria/2020/result'
4 of 192 for 'race/uec-road-european-championships-itt/2020/result'
5 of 192 for 'race/uec-road-european-championships/2020/result'
6 of 192 for 'race/les-boucles-du-dus-ardeche/2020/result'
7 of 192 for 'race/giro-dell-emilia/2020/result'
8 of 192 for 'race/gran-piemonte/2020/result'
9 of 192 for 'race/gran-trittico-lombardo/2020/result'
10 of 192 for 'race/gp-d-ouverture/2020/result'
11 of 192 for 'race/gp-de-la-ville-de-lillers/2020/result'
12 of 192 for 'race/grote-prijs-jean-pierre-monsere/2020/result'
13 of 192 for 'race/il-lombardia/2020/result'
14 of 192 for 'race/kuurne-brussel-kuurne/2020/result'
15 of 192 for 'race/gp-samyn/2020/result'
16 of 192 for 'race/malaysian-international-classic-race/2020/result'
17 of 192 for 'race/milano-sanremo/2020/result'
18 of 192 for 'race/milano-

155 of 192 for 'race/etoile-de-besseges/2020/stage-5'
156 of 192 for 'race/volta-ao-algarve/2020/stage-5'
157 of 192 for 'race/ruta-del-sol/2020/stage-5'
158 of 192 for 'race/uae-tour/2020/stage-5'
159 of 192 for 'race/vuelta-a-burgos/2020/stage-5'
160 of 192 for 'race/paris-nice/2020/stage-5'
161 of 192 for 'race/tour-de-france/2020/stage-5'
162 of 192 for 'race/tour-down-under/2020/stage-5'
163 of 192 for 'race/tour-de-langkawi/2020/stage-5'
164 of 192 for 'race/la-tropicale-amissa-bongo/2020/stage-5'
165 of 192 for 'race/dauphine/2020/stage-5'
166 of 192 for 'race/herald-sun-tour/2020/stage-5'
167 of 192 for 'race/colombia-21/2020/stage-5'
168 of 192 for 'race/vuelta-a-la-comunidad-valenciana/2020/stage-5'
169 of 192 for 'race/tour-de-saudi-arabia/2020/stage-5'
170 of 192 for 'race/vuelta-ciclista-a-la-provincia-de-san-juan/2020/stage-5'
171 of 192 for 'race/tour-de-pologne/2020/stage-5'
172 of 192 for 'race/vuelta-ciclista-a-la-provincia-de-san-juan/2020/stage-6'
173 of 192 for 'ra

In [75]:
stgs_df = pd.DataFrame(all_stages, columns=['stageName', 'url', 'date', 'distance', \
                                     'dateFull', 'averageSpeed', 'raceCtgr', \
                                     'parcoursType', 'PCSPointScale', 'start',\
                                     'end', 'mountains', 'numMount', 'raceRank', \
                                     'racePoints'])
stgs_df['racePoints'] = stgs_df.racePoints.str.replace('pnt', '')
stgs_df = stgs_df.drop('raceRank',  axis=1)
stgs_df.head()

Unnamed: 0,stageName,url,date,distance,dateFull,averageSpeed,raceCtgr,parcoursType,PCSPointScale,start,end,mountains,numMount,racePoints
0,Bretagne Classic - Ouest-France,race/bretagne-classic/2020/result,25.08,247.75,25th August 2020,41.15 km/h,Men Elite,45,1.WT.B,Plouay,Plouay,"[Restergal, Côte du Lezot, Restergal, Côte de ...",5.0,390
1,Cadel Evans Great Ocean Road Race,race/great-ocean-race/2020/result,2.02,171.1,2nd February 2020,41.76 km/h,Men Elite,53,1.WT.C,Geelong,Geelong,[],0.0,395
2,Circuito de Getxo-Memorial Hermanos Otxoa,race/circuito-de-getxo/2020/result,2.08,177.0,2nd August 2020,42.29 km/h,Men Elite,0*,1.1,Getxo,Getxo,[],0.0,238
3,Clasica de Almeria,race/clasica-de-almeria/2020/result,16.02,187.6,16th February 2020,42.63 km/h,Men Elite,0*,1.HC,Roquetas de Mar,Roquetas de Mar,[],0.0,397
4,European Continental Championships - ITT,race/uec-road-european-championships-itt/2020/...,24.08,25.6,24th August 2020,50.69 km/h,Men Elite,0*,UCI.Cont.Ch.TT,Plouay,Plouay,[],0.0,86


In [76]:
errors = stgs_df[stgs_df.racePoints.isnull()]
errors

Unnamed: 0,stageName,url,date,distance,dateFull,averageSpeed,raceCtgr,parcoursType,PCSPointScale,start,end,mountains,numMount,racePoints
9,Gran Trittico Lombardo,race/gran-trittico-lombardo/2020/result,3.08,199.7,3rd August 2020,42.64 km/h,Men Elite,77,1.HC,Legnano,Varese,,,
17,Milano-Sanremo,race/milano-sanremo/2020/result,8.08,305.0,8th August 2020,41.96 km/h,Men Elite,48,1.WT.A,Milano,Sanremo,,,
21,National Championships Belgium - ITT,race/nc-belgium-itt/2020/result,20.08,42.1,20th August 2020,50.35 km/h,Men Elite,0*,Nationals.C3.TT,Koksijde › Koksijde,112,,,
30,National Championships Italy - Road Race,race/nc-italy/2020/result,23.08,253.8,23rd August 2020,43.68 km/h,Men Elite,0*,Nationals.C3,Bassano del Grappa › Cittadella,40,,,
68,Stage 1 - Saint-Affrique › Cazouls-lès-Béziers,race/la-route-d-occitanie/2020/stage-1,1.08,189.5,1st August 2020,41.35 km/h,Men Elite,0*,2.1.Stage,Saint-Affrique,Cazouls-lès-Béziers,,,
81,Stage 2 - Carcassonne › Cap Découverte,race/la-route-d-occitanie/2020/stage-2,2.08,182.5,2nd August 2020,41.73 km/h,Men Elite,32,2.1.Stage,Carcassonne,Cap Découverte,,,
121,Stage 3 - Saint-Gaudens › Col de Beyrède,race/la-route-d-occitanie/2020/stage-3,3.08,163.5,3rd August 2020,35.45 km/h,Men Elite,299,2.1.Stage,Saint-Gaudens,Col de Beyrède,,,
142,Stage 4 - Lectoure › Rocamadour,race/la-route-d-occitanie/2020/stage-4,4.08,195.0,4th August 2020,44.41 km/h,Men Elite,67,2.1.Stage,Lectoure,Rocamadour,,,
187,Strade Bianche,race/strade-bianche/2020/result,1.08,184.0,1st August 2020,36.93 km/h,Men Elite,104,1.WT.C,Siena,Siena,,,


In [77]:
#print(stgs_df[stgs_df.index.isin(errors.index)])
e_ix = list(errors.index)
stgs_df.loc[e_ix, 'mountains'] = 0
stgs_df.loc[e_ix, 'numMount'] = 0
stgs_df.loc[e_ix, 'racePoints'] = 0

starts_and_ends = errors[errors.start.str.contains('›')]
for er_ix, error_row in starts_and_ends.iterrows():
    start_end = error_row['start']
    start_ix = start_end.find('›')
    
    start = start_end[:start_ix]
    end = start_end[start_ix + 1:]
    racePnts = error_row['end']
    
    stgs_df.loc[er_ix, 'start'] = start
    stgs_df.loc[er_ix, 'end'] = end
    stgs_df.loc[er_ix, 'racePoints'] = racePnts

stgs_df[stgs_df.racePoints.isnull()]

Unnamed: 0,stageName,url,date,distance,dateFull,averageSpeed,raceCtgr,parcoursType,PCSPointScale,start,end,mountains,numMount,racePoints


In [38]:
#stage_url = 'https://www.procyclingstats.com/race/tour-de-france/2020/stage-3'
#stage_url = 'https://www.procyclingstats.com/race/nc-switzerland-itt/2020/result'
stage_url = 'https://www.procyclingstats.com/race/gran-trittico-lombardo/2020/result'
page = requests.get(stage_url)
stage_html = BeautifulSoup(page.content, 'html.parser')
print('(ITT)' in str(stage_html.h2), str(stage_html.h2))

res_ = stage_html.find_all("div", class_="res-right")
#res_[0]
#data_row_length = data_row_length = {'stage': 11, 'gc': 11, 'green': 10, 'youth': 11, 'kom': 10, 'teams': 6}
#data = get_stage_data(stage_html, data_row_length, True)

False <h2><span class="yearmob hide">2020  </span><span class="blue">One day race</span>  »  <span class="red">Legnano  ›  Varese  </span> <span class="red distance">(199.7k)</span></h2>


In [39]:
res_[0].find_all(text=True) 

['Race information',
 'Date: ',
 ' 3rd August 2020',
 'Avg. speed winner:',
 ' 42.64 km/h',
 'Race category: ',
 ' Men Elite',
 'Parcours type:',
 ' ',
 ' ',
 '77',
 'PCS point scale:',
 ' ',
 '1.HC',
 ' ',
 'Start/finish:',
 ' ',
 'Legnano',
 ' › ',
 'Varese',
 'Race profile',
 'Websites:',
 ' ',
 'www.tritticolombardo.it/',
 '\n',
 "\n        googletag.cmd.push(function() { googletag.display('div-gpt-ad-1502360547318-0'); });\n        ",
 '\n',
 '\n']

In [292]:
stage_df = pd.DataFrame(data['gc'], gc_col = ['gcPos', 'prevGcPos', 'gcChng', 'bib', 'url', 'name', 'age', 'team', 'uciGc', 'gcTime', 'more'])
stage_df.head()

Unnamed: 0,stagePos,posChange,url,name,age,teamName,uci,pnts,stageTime
0,1,,rider/arnaud-demare,Démare Arnaud,28,Groupama - FDJ,100,15,5:38:27
1,2,,rider/bryan-coquard,Coquard Bryan,28,B&B Hotels - Vital Concept p/b KTM,75,10,",,"
2,3,,rider/julian-alaphilippe,Alaphilippe Julian,28,Deceuninck - Quick Step,60,7,",,"
3,4,,rider/clement-venturini,Venturini Clément,26,AG2R La Mondiale,50,4,0:05
4,5,,rider/anthony-turgis,Turgis Anthony,26,Team Total Direct Energie,40,2,",,"


In [305]:
ONE_DAY_RACE = 0
FIRST_STAGE_IN_TOUR = 1
OTHER_TOUR_STAGE = 2
ITT = 3

def build_df(data, result_type):
    if result_type == ONE_DAY_RACE:
        stage_col = ['stagePos', 'gcPos', 'url', 'name', 'age', 'teamName', 'uciStage', 'pnt', 'stageTime']
    elif result_type == ITT:
        stage_col = ['stagePos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'teamName', 'uciStage','pnt', 'stgAvgPace', 'stageTime']
    elif result_type == FIRST_STAGE_IN_TOUR:
        stage_col = ['stagePos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'teamName', 'uciStage','pnt', 'stageTime']
        gc_col = ['gcPos', 'bib', 'url', 'name', 'age', 'team', 'uciGc', 'gcTime', 'more']
        green_col = ['greenPos', 'bib', 'url', 'name', 'age', 'team', 'greenPnts', 'greenPntsChng']
        youth_col = ['youthPos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'team', 'youthTime']
        kom_col = ['komPos', 'bib', 'url', 'name', 'age', 'team', 'komPnts',  'komPntsChng']
        teams_col = ['teamPos', 'empty', 'teamName', 'teamTime']
    elif result_type == OTHER_TOUR_STAGE:
        stage_col = ['stagePos', 'gcPos', 'gcTime', 'bib', 'url', 'name', 'age', 'teamName', 'uciStage','pnt', 'stageTime']
        gc_col = ['gcPos', 'prevGcPos', 'gcChng', 'bib', 'url', 'name', 'age', 'team', 'uciGc', 'gcTime', 'more']
        green_col = ['greenPos', 'prevGreenPos', 'greenChng', 'bib', 'url', 'name', 'age', 'team', 'greenPnts', 'greenPntsChng']
        youth_col = ['youthPos', 'prevYouthPos', 'youthChng', 'gcPos', 'gcTime', 'bib', 'url', 'name', 'age', 'team', 'youthTime']
        kom_col = ['komPos', 'prevKomPos', 'komChng', 'bib', 'url', 'name', 'age', 'team', 'komPnts',  'komPntsChng']
        teams_col = ['teamPos', 'prevTeamPos', 'teamChng', 'empty', 'teamName', 'teamTime']
    
    stage_df = pd.DataFrame(data['stage'], columns=stage_col)
    stage_df = fix_time(stage_df, 'stageTime')
    
    if result_type in [FIRST_STAGE_IN_TOUR, OTHER_TOUR_STAGE]:
        # one day races only have 1 data frame
        stage_df = stage_df.set_index('bib')
        gc_df = pd.DataFrame(data['gc'], columns=gc_col)
        gc_df = gc_df[['bib', 'uciGc']].set_index('bib')

        green_df = pd.DataFrame(data['green'], columns=green_col)
        green_df = green_df[['bib', 'greenPos', 'greenPnts']].set_index('bib')

        youth_df = pd.DataFrame(data['youth'], columns=youth_col)
        youth_df = youth_df[['bib', 'youthPos', 'youthTime']].set_index('bib')
        youth_df = fix_time(youth_df, 'youthTime')

        kom_df= pd.DataFrame(data['kom'], columns=kom_col)
        kom_df = kom_df[['bib', 'komPos', 'komPnts']].set_index('bib')

        team_df = pd.DataFrame(data['teams'], columns=teams_col)
        team_df = team_df[['teamPos', 'teamName', 'teamTime']]

        df = pd.concat([stage_df, gc_df, green_df, youth_df, kom_df], axis=1, sort=False)
        return df
    else:
        return stage_df

df = build_df(data, ITT)
df.head()

Unnamed: 0,stagePos,gcPos,timeAdd,bib,url,name,age,teamName,uciStage,pnt,stgAvgPace,stageTime
0,1,2,+0:58,98,rider/soren-kragh-andersen,Kragh Andersen Søren,25,Team Sunweb,60.0,50,48.064,18:51
1,2,1,+0:00,37,rider/maximilian-schachmann,Schachmann Maximilian,26,BORA - hansgrohe,25.0,30,47.81,0:06
2,3,23,+2:16,42,rider/kasper-asgreen,Asgreen Kasper,25,Deceuninck - Quick Step,10.0,18,47.559,0:12
3,4,85,+14:52,64,rider/thomas-de-gendt,De Gendt Thomas,33,Lotto Soudal,,13,47.517,0:13
4,5,29,+3:41,82,rider/pello-bilbao,Bilbao Pello,30,Bahrain - McLaren,,10,47.435,0:15


In [307]:
# get the points classifications
def get_points(url):
    ''' Extract all the races in the season for the rider on the rider's profile (url)'''
    page = requests.get(url)
    stage_html = BeautifulSoup(page.content, 'html.parser')
    
    if 'One day race' in str(stage_html.h2):
        print('ONE DAY RACE')
        data_row_length = {'stage': 9}
        result_type = ONE_DAY_RACE
    elif '(ITT)' in str(stage_html.h2):
        print('(ITT)')
        data_row_length = {'stage': 12}
        result_type = ITT
    elif url.endswith('/result') or url.endswith('/stage-1'):
        result_type = FIRST_STAGE_IN_TOUR
        data_row_length = {'stage': 11, 'gc': 9, 'green': 8, 'youth': 9, 'kom': 8, 'teams': 4}
    else:
        result_type = OTHER_TOUR_STAGE
        data_row_length = {'stage': 11, 'gc': 11, 'green': 10, 'youth': 11, 'kom': 10, 'teams': 6}
    data = get_stage_data(stage_html, data_row_length)
    df = build_df(data, result_type)
    return df
    
    
url_uniques = stages_df.url.unique()
count = 0
all_races = list()
print(len(url_uniques))
for uix, url in enumerate(url_uniques):
    
    stage_url = 'https://www.procyclingstats.com/{}'.format(url)
    print('{} in {}: {}'.format(uix, len(url_uniques), stage_url))
    #stage_url = 'https://www.procyclingstats.com/race/tour-de-france/2020/stage-2'
    rdf = get_points(stage_url)
    all_races.append(rdf)
    #if count == 5: break

190
0 in 190: race/tour-de-france/2020/stage-9
UPDATING 1:stage has 166 participants
ERROR: ['DNF', '', '+ - 38:40:01', '132', 'rider/fabio-aru', ' Aru Fabio', '30', 'UAE-Team Emirates', '', '', '-']
ERROR: ['DNF', '', '+ - 38:40:01', '152', 'rider/steff-cras', ' Cras Steff', '24', 'Lotto Soudal', '', '', ',,']
UPDATING 1:gc has 166 participants
UPDATING 1:green has 102 participants
UPDATING 1:youth has 25 participants
UPDATING 1:kom has 43 participants
UPDATING 1:teams has 22 participants
1 in 190: race/tour-de-france/2020/stage-8
UPDATING 1:stage has 168 participants
ERROR: ['DNF', '', '+ - 34:44:52', '191', 'rider/giacomo-nizzolo', ' Nizzolo Giacomo', '31', 'NTT Pro Cycling', '', '', '-']
ERROR: ['DNF', '', '+ - 34:44:52', '52', 'rider/william-bonnet', ' Bonnet William', '38', 'Groupama - FDJ', '', '', ',,']
ERROR: ['DNF', '', '+ - 34:44:52', '86', 'rider/diego-rosa', ' Rosa Diego', '31', 'Team Arkéa Samsic', '', '', ',,']
ERROR: ['DNF', '', '+ - 34:44:52', '183', 'rider/lilian-calm

UPDATING 1:stage has 151 participants
ERROR: ['DNF', '', '+ - 13:14:35', '121', 'rider/tiesj-benoot', ' Benoot Tiesj', '26', 'Team Sunweb', '', '', '-']
ERROR: ['DNF', '', '+ - 13:14:35', '193', 'rider/mathieu-burgaudeau', ' Burgaudeau Mathieu', '21', 'Team Total Direct Energie', '', '', ',,']
ERROR: ['DNF', '', '+ - 13:14:35', '225', 'rider/quentin-pacher', ' Pacher Quentin', '28', 'B&B Hotels - Vital Concept p/b KTM', '', '', ',,']
ERROR: ['DNS', '', '+ - 13:14:35', '114', 'rider/juan-pedro-lopez', ' López Juan Pedro', '23', 'Trek - Segafredo', '', '', ',,']
ERROR: ['DNS', '', '+ - 13:14:35', '171', 'rider/dan-martin', ' Martin Dan', '33', 'Israel Start-Up Nation', '', '', ',,']
UPDATING 1:gc has 2 participants
ERROR: ['-', '21', 'rider/emanuel-buchmann', ' Buchmann Emanuel', '27', 'BORA - hansgrohe', '0:20', '..', '4', '5', '▲1']
UPDATING 1:green has 1 participants
UPDATING 1:youth has 1 participants
13 in 190: race/dauphine/2020/stage-2
UPDATING 1:stage has 156 participants
ERROR: 

UPDATING 1:stage has 122 participants
ERROR: ['DNF', '', '+ - 18:49:00', '32', 'rider/pascal-ackermann', ' Ackermann Pascal', '26', 'BORA - hansgrohe', '', '', '-']
ERROR: ['DNF', '', '+ - 18:49:00', '141', 'rider/julien-el-fares', 'El Fares Julien', '34', 'NIPPO DELKO One Provence', '', '', ',,']
ERROR: ['DNF', '', '+ - 18:49:00', '17', 'rider/diego-rosa', ' Rosa Diego', '30', 'Team Arkéa Samsic', '', '', ',,']
ERROR: ['DNF', '', '+ - 18:49:00', '108', 'rider/michael-woods', ' Woods Michael', '33', 'EF Pro Cycling', '', '', ',,']
ERROR: ['DNF', '', '+ - 18:49:00', '103', 'rider/lawson-craddock', ' Craddock Lawson', '28', 'EF Pro Cycling', '', '', ',,']
ERROR: ['DNF', '', '+ - 18:49:00', '56', 'rider/oliver-naesen', ' Naesen Oliver', '29', 'AG2R La Mondiale', '', '', ',,']
ERROR: ['DNF', '', '+ - 18:49:00', '162', 'rider/frederik-backaert', ' Backaert Frederik', '29', 'B&B Hotels - Vital Concept p/b KTM', '', '', ',,']
ERROR: ['DNS', '', '+ - 18:49:00', '106', 'rider/tejay-van-garderen

IndexError: list index out of range

In [107]:
# remove the race headings
print(riders_df.shape)
riders_df = riders_df[(riders_df.distance != '') & (riders_df.gc_pos != '')]
riders_df.drop(['uci'], axis = 1, inplace=True)
print(riders_df.shape)
riders_df.head(10)

(5462, 9)
(3492, 8)


Unnamed: 0,date,pos,gc_pos,stage,distance,pcs,rider,race_id
1,4.09,152,142,Stage 7 - Millau › Lavaur,168.0,,kristoff alexander,0
2,3.09,147,139,Stage 6 - Le Teil › Mont Aigoual,191.0,,kristoff alexander,0
3,2.09,14,135,Stage 5 - Gap › Privas,183.0,2.0,kristoff alexander,0
4,1.09,158,138,Stage 4 - Sisteron › Orcières-Merlette,160.5,,kristoff alexander,0
5,31.08,15,127,Stage 3 - Nice › Sisteron,198.0,1.0,kristoff alexander,0
6,30.08,137,129,Stage 2 - Nice › Nice,186.0,,kristoff alexander,0
7,29.08,1,1,Stage 1 - Nice › Nice,156.0,100.0,kristoff alexander,0
12,15.08,138,135,Stage 4 - Ugine › Megève,148.5,,kristoff alexander,1
13,14.08,151,144,Stage 3 - Corenc › Saint-Martin-de-Belleville,157.0,,kristoff alexander,1
14,13.08,135,142,Stage 2 - Vienne › Col de Porte,135.0,,kristoff alexander,1


In [105]:
#DNF=Did not finish / DNS=Did not start / OTL = Outside time limit / DF
riders_df[riders_df.pos =='DF']

Unnamed: 0,date,pos,gc_pos,stage,distance,rider,race_id


In [79]:
#result.groupby(['race', 'stage']).count()[['pos']]
#all_df['rc'] = np.where(all_df['distance']=='', all_df['race'], '')