# Extract data
## Introduction
This notebook extracts professional cyclists' data for a season. Cleaning of the data will be done as well. Feature creation and analysis will be done in the next notebook.

We extract all the riders at the beginning of the tour. Thereafter we itterate through each rider's profile on `procyclingstats.com` and scrape the races that they have partaken in, for the season. Thereafter, we itterate through all the races and extract their profiles (diificulty, uci status). The data is saved. 

In [1]:
# imports
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import re
import os

In [2]:
CREATE_DATA = False
# the first stage of the race
RACE = 'tour-de-france'
YEAR = 2020
STAGE = 1
URL1 = 'https://www.procyclingstats.com/race/{}/{}/stage-{}/result/result'.format(RACE, YEAR, STAGE)

folder_loc = './temp_data/{}'.format(RACE)
if not os.path.exists(folder_loc):
    os.mkdir(folder_loc)
folder_loc = '{}/{}'.format(folder_loc, YEAR)
if not os.path.exists(folder_loc):
    os.mkdir(folder_loc)
    
FIRST_STAGE_LOC = '{}/first.csv'.format(folder_loc)
RACES_LOC = '{}/races_df.csv'.format(folder_loc)
STAGES_LOC = '{}/stages_df.csv'.format(folder_loc)
NEW_STAGES_LOC = '{}/new_stages_df.csv'.format(folder_loc)
RIDERS_LOC = '{}/riders_df.csv'.format(folder_loc)

race_page = requests.get(URL1)
race_html = BeautifulSoup(race_page.content, 'html.parser')

In [290]:
def is_not_int(value):
    ''' Assesses whether value is an int or not'''
    try:
        int(value)
        return False
    except ValueError:
        return True

def get_text(cell):
    ''' Return the text from the html cell. '''
    # some cells have a span or hyperlink element with text in it
    if cell.a != None:
        url = cell.a.get('href')
        if url.startswith('rider/') or url.startswith('race/'):
            return url, cell.a.get_text()
        return cell.a.get_text()
    elif cell.span != None:
        return cell.span.get_text()
    else:
        return cell.get_text()
    
def get_stage_data(html, data_row_length, print_row=False):
    # all the racers are in a table data cell ('td')
    # intialise variables
    tdcs = html.find_all('td')
    # there can be up to 6 data tables on an html page
    #data = {'stage': list(), 'gc': list(), 'points': list(), 'youth': list(), 'kom': list(), 'teams': list()}
    data = {}
    print('DRL@@:', data_row_length)
    datasets = list(data_row_length.keys())
    data_id = 0
    row_length = data_row_length[datasets[data_id]]
    
    old_length = row_length
    last_list_length = 0
    data_list = list()
    error_list = list()
    error_row = False
    
    row = list()
    last_ix = 1

    # itterate through all data cells and append their text values to a row
    for td_ix, cell in enumerate(tdcs):
        
        text = get_text(cell)
        if type(text) is type('str'):
            #print(row)
            row.append(text)
        else:
            row.append(text[0])
            row.append(text[1])
            
        if len(row) == 1:
            # the first element in the row is the position
            # if the rider did not finish, the position will not be an int
            # it will be: DNF, DNS, OTL
            #print(row[0])
            not_int = is_not_int(row[0])
            
            if not_int:
                error_row = True
            
            if not not_int and int(row[0]) == 1 and len(data_list) != 0:
                # a new table begins with a rider being places 1st
                # save the complete previous table to the data map
                print('UPDATING 5: {} has {} participants'.format(datasets[data_id], len(data_list)))
                if datasets[data_id] == 'teams':
                    print('............................TEAMS')
                print('di', data_id)
                print('drl', datasets)
                ds_key = datasets[data_id]
                data[ds_key] = data_list

                old_length = row_length
                
                #reinitialise variables
                data_id += 1
                data_list = list()
                last_ix = 1
                error_row = False
                
                row_length = data_row_length[datasets[data_id]]
                print('OLD ROW: {}, NEW ROW: {}'.format(old_length, row_length))

        
        if error_row and len(row) == old_length:
            # a row with a DNS, DNF, OTL rider
            # put them in last position
            # append the disqualificationto the end of the row
            row.append(row[0])
            row[0] = last_ix
            print('ERROR: {}'.format(row))
            data_list.append(row)
            row = list()
            
        elif not error_row and len(row) == row_length:
            # 'row_length' data cells make an entire row
            if print_row:
                print(row)
            # data list gets saved in data subset
            pos = int(row[0])
            # DQ/ DNF/ OL column
            row.append(np.nan)
            data_list.append(row)
            last_ix = pos + 1
            row = list()
                
    print('UPDATING 1: {} has {} participants'.format(datasets[data_id], len(data_list)))
    ds_key = datasets[data_id]
    data[ds_key] = data_list
    return data

if CREATE_DATA:
    data_row_length = {'stage': 11, 'gc': 9, 'points': 8, 'youth': 9, 'kom': 8, 'teams': 4}
    data = get_stage_data(race_html, data_row_length)

In [226]:
def fix_time(data, time_col):
    tdf = data[data[time_col] == ',,'][[time_col]]
    to_change_ix = list(tdf.index)
    data.loc[data.index.isin(to_change_ix), time_col] = None
    data[time_col] = data[time_col].fillna(method='ffill')
    return data

try:
    stage_df = pd.DataFrame(data['stage'], columns=['stagePos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'team', 'uciStg','stagePnt', 'stageTime', 'DNF'])
    print('Reading data.')
    stage_df = fix_time(stage_df, 'stageTime')
    stage_df = stage_df.set_index('bib')

    gc_df = pd.DataFrame(data['gc'], columns=['gcPos', 'bib', 'url', 'name', 'age', 'team', 'uciGc', 'time', 'more', 'DNF'])
    gc_df = gc_df[['bib', 'uciGc']].set_index('bib')

    
    green_df = pd.DataFrame(data['points'], columns=['greenPos', 'bib', 'url', 'name', 'age', 'team', 'greenPnts', 'pntsChng', 'DNF'])
    green_df = green_df[['bib', 'greenPos', 'greenPnts']].set_index('bib')

    youth_df = pd.DataFrame(data['youth'], columns=['youthPos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'team', 'youthTime', 'DNF'])[['bib', 'youthPos', 'youthTime']]
    youth_df = fix_time(youth_df, 'youthTime')
    youth_df = youth_df.set_index('bib')

    kom_df = pd.DataFrame(data['kom'], columns=['komPos', 'bib', 'url', 'name', 'age', 'team', 'komPnts',  'pntsChnge', 'DNF'])
    kom_df = kom_df[['bib', 'komPos', 'komPnts']].set_index('bib')

    teams_df = pd.DataFrame(data['teams'], columns=['teamPos', 'change', 'teamName', 'teamTime', 'DNF'])
    teams_df = teams_df[['teamPos', 'teamName', 'teamTime']]

    df = pd.concat([stage_df, gc_df, green_df, youth_df, kom_df], axis=1, sort=False)
    df = df.reset_index().rename(columns={'index': 'bibNum'})
    df.to_csv(FIRST_STAGE_LOC)
except NameError:
    print('Reading data.')
    df = pd.read_csv(FIRST_STAGE_LOC, index_col='Unnamed: 0')

df.head()

Reading data.


Unnamed: 0,bibNum,stagePos,gcPos,timeAdd,url,name,age,team,uciStg,stagePnt,stageTime,DNF,uciGc,greenPos,greenPnts,youthPos,youthTime,komPos,komPnts
0,135,1,1.0,+0:00,rider/alexander-kristoff,Kristoff Alexander,33,UAE-Team Emirates,120.0,100.0,3:46:23,,25.0,1.0,59.0,,,,
1,105,2,2.0,+0:04,rider/mads-pedersen,Pedersen Mads,24,Trek - Segafredo,50.0,70.0,3:46:23,,,2.0,30.0,1.0,3:46:17,,
2,203,3,3.0,+0:06,rider/cees-bol,Bol Cees,25,Team Sunweb,25.0,50.0,3:46:23,,,6.0,20.0,2.0,0:02,,
3,43,4,4.0,+0:10,rider/sam-bennett,Bennett Sam,29,Deceuninck - Quick Step,15.0,40.0,3:46:23,,,4.0,28.0,,,,
4,21,5,5.0,+0:10,rider/peter-sagan,Sagan Peter,30,BORA - hansgrohe,5.0,32.0,3:46:23,,,3.0,29.0,,,,


In [227]:
print(df.DNF.unique())

for finish_error in df.DNF.unique():
    if finish_error is not np.nan:
        df[finish_error] = 0

        df.loc[df.DNF == finish_error, [finish_error]] = 1
#df = df.drop(['DNF'], axis=1)

df['youth'] = 1
df.loc[df.youthTime.isnull(), ['youth']] = 0
# make the dataframe
df.name = df.name.str.upper()
df.team = df.team.str.upper()

print('There were {} cyclists that partook stage 1.\nThere are/ is {} that did not finish.'.format(df.shape[0], df[df.DNF != ''].shape[0]))
df.head()

[nan 'OTL']
There were 176 cyclists that partook stage 1.
There are/ is 176 that did not finish.


Unnamed: 0,bibNum,stagePos,gcPos,timeAdd,url,name,age,team,uciStg,stagePnt,...,DNF,uciGc,greenPos,greenPnts,youthPos,youthTime,komPos,komPnts,OTL,youth
0,135,1,1.0,+0:00,rider/alexander-kristoff,KRISTOFF ALEXANDER,33,UAE-TEAM EMIRATES,120.0,100.0,...,,25.0,1.0,59.0,,,,,0,0
1,105,2,2.0,+0:04,rider/mads-pedersen,PEDERSEN MADS,24,TREK - SEGAFREDO,50.0,70.0,...,,,2.0,30.0,1.0,3:46:17,,,0,1
2,203,3,3.0,+0:06,rider/cees-bol,BOL CEES,25,TEAM SUNWEB,25.0,50.0,...,,,6.0,20.0,2.0,0:02,,,0,1
3,43,4,4.0,+0:10,rider/sam-bennett,BENNETT SAM,29,DECEUNINCK - QUICK STEP,15.0,40.0,...,,,4.0,28.0,,,,,0,0
4,21,5,5.0,+0:10,rider/peter-sagan,SAGAN PETER,30,BORA - HANSGROHE,5.0,32.0,...,,,3.0,29.0,,,,,0,0


## Create dataframes
We are going to separate the data frame above into 3 more succinct dataframes. 

- `riders_df` that contians the 175 riders' personal infomration.
- `stages_df` that containes that stages competed in by all the 175 riders in the past season
- `races_df` that contains the information about the races that the 175 riders competed in in the past season (a race can have multiple stages)

In [228]:
if CREATE_DATA:
    riders_df = df[['name', 'age', 'url', 'team']]
    riders_df = riders_df.sort_values('name')
    riders_df['riderID'] = range(riders_df.shape[0])
    riders_df = riders_df.set_index('riderID')
    riders_df.to_csv(RIDERS_LOC)
else:
    riders_df = pd.read_csv(RIDERS_LOC, index_col='riderID')
riders_df.head()

Unnamed: 0_level_0,name,age,url,team
riderID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,ALAPHILIPPE JULIAN,28,rider/julian-alaphilippe,DECEUNINCK - QUICK STEP
1,AMADOR ANDREY,34,rider/andrey-amador,INEOS GRENADIERS
2,ANACONA WINNER,32,rider/winner-anacona,TEAM ARKÉA SAMSIC
3,ARNDT NIKIAS,28,rider/nikias-arndt,TEAM SUNWEB
4,ARU FABIO,30,rider/fabio-aru,UAE-TEAM EMIRATES


In [229]:
def find_races(url, rider_id, rider_name):
    ''' Extract all the races in the season for each rider in the rider dfn the rider's profile (url)'''
    page = requests.get(url)
    rider_html = BeautifulSoup(page.content, 'html.parser')
    results_html = rider_html.body.tbody
    # all races in data rows (tr)
    rows = results_html.find_all('tr')

    races = list()
    for row in rows:
        items = row.find_all('td')
        # extract text values from data cell
        row = list()
        for item in items:
            text = get_text(item)
            if type(text) is type('str'):
                row.append(text)
            else:
                row.append(text[0])
                row.append(text[1])
        races.append(row)
    df = pd.DataFrame(races, columns=['date', 'stagePos', 'gcPos', 'unknown', 'url', 'stage', 'distance', 'pcs', 'uci', 'more'])
    print('"{}" competed in {} stages'.format(rider_name, df.shape[0]))
    df['riderID'] = rider_id
    return df

if CREATE_DATA:
    # initialise list of dataframes
    stages_list = list()
    # itterate through each cyclist and add their races for the season
    # to a list of dataframes
    for ix, row in riders_df.iterrows():
        print('{} of {}'.format(ix, riders_df.shape[0]))
        url = row['url']
        name = row['name']
        rider_url = 'https://www.procyclingstats.com/{}'.format(url)
        print(rider_url)
        riders_stages_df = find_races(rider_url, ix, name)
        stages_list.append(riders_stages_df)

In [230]:
if CREATE_DATA:
    stages_ = pd.concat(stages_list).reset_index(drop=True)
    print('{} stages have been loaded'.format(stages_.shape[0]))
    # drop 2 unnecessary columns
    stages_ = stages_.drop(['unknown', 'more'], axis=1)
    stages_.to_csv(STAGES_LOC)
else:
    stages_ = pd.read_csv(STAGES_LOC, index_col='Unnamed: 0')
    stages_.fillna('', inplace=True)
stages_.head()

Unnamed: 0,date,stagePos,gcPos,url,stage,distance,pcs,uci,riderID
0,› 20.09,,,race/tour-de-france/2020/stage-1,Tour de France,,,,0
1,10.09,11.0,38.0,race/tour-de-france/2020/stage-12,Stage 12 - Chauvigny › Sarran,218.0,8.0,,0
2,09.09,147.0,45.0,race/tour-de-france/2020/stage-11,Stage 11 - Châtelaillon-Plage › Poitiers,167.5,,,0
3,08.09,160.0,41.0,race/tour-de-france/2020/stage-10,Stage 10 - île d'Oléron (Le Château-d'Oléron) ...,168.5,,,0
4,06.09,160.0,38.0,race/tour-de-france/2020/stage-9,Stage 9 - Pau › Laruns,153.0,,,0


In [231]:
def _get_unique_races(data):
    '''
        Return a dataframe with the unique races with their information.
    '''
    data.columns = ['race', 'url']
    num_races = len(data.race.unique())
    num_url = len(data.url.unique())
    
    if num_races == num_url:
        rdf = data.groupby(['race', 'url']).count()
        rdf = rdf.reset_index()
        rdf['cutUrl'] = rdf.apply(lambda x: x['url'][0: x['url'].rindex('/')], axis =1)
        rdf.index.name = 'raceIX'
        return rdf
    else:
        return None
        
def _add_race_id_to_data(races_df, all_data):
    '''
        Append the race ID for each stage in the main df associated with the race. 
    '''
    if 'raceID' not in all_data.columns:
        print('Creating "raceID" collumn')
        all_data['raceID'] = np.nan
        
    for race_ix, race_row in races_df.iterrows():
        # for each race, put their id in the original. df
        url_begin = race_row['cutUrl']
        all_data.loc[all_data.url.str.startswith(url_begin), 'raceID'] = race_ix
        
    return all_data

def create_races_df(data):
    ''' 
    '''
    # multistage races 
    rdf = data[(data.distance == '') & (data.stagePos == '')][['stage', 'url']]
    rdf2 = _get_unique_races(rdf)
    data1 = _add_race_id_to_data(rdf2, data)
        
    # one day races
    odr = data1[data1.raceID.isnull()][['stage', 'url']]
    rdf3 = _get_unique_races(odr)
    data2 = _add_race_id_to_data(rdf3, data1)
    
    races_df = pd.concat([rdf2, rdf3], ignore_index=True)
    races_df['year'] = races_df.apply(lambda x: x['cutUrl'][x['cutUrl'].rindex('/') + 1:], axis =1)
    races_df.index.name = 'raceID'
    return races_df, data2
    
races_df, rhdf = create_races_df(stages_)
rhdf = rhdf.astype({'raceID': int})
print('There are {} unique races.\nThere are {} stages(s) without a raceID.'.format(races_df.shape[0], rhdf[rhdf.raceID.isnull()].shape[0]))
races_df.head()

Creating "raceID" collumn
There are 79 unique races.
There are 0 stages(s) without a raceID.


Unnamed: 0_level_0,race,url,cutUrl,year
raceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Critérium du Dauphiné,race/dauphine/2020/stage-1,race/dauphine/2020,2020
1,Czech Tour,race/czech-cycling-tour/2020/stage-1,race/czech-cycling-tour/2020,2020
2,Etoile de Bessèges,race/etoile-de-besseges/2020/stage-1,race/etoile-de-besseges/2020,2020
3,Jayco Herald Sun Tour,race/herald-sun-tour/2020/stage-1,race/herald-sun-tour/2020,2020
4,La Route d'Occitanie - La Dépêche du Midi,race/la-route-d-occitanie/2020/stage-1,race/la-route-d-occitanie/2020,2020


In [232]:
print(rhdf.shape)
rhdf.head()

(7292, 10)


Unnamed: 0,date,stagePos,gcPos,url,stage,distance,pcs,uci,riderID,raceID
0,› 20.09,,,race/tour-de-france/2020/stage-1,Tour de France,,,,0,13
1,10.09,11.0,38.0,race/tour-de-france/2020/stage-12,Stage 12 - Chauvigny › Sarran,218.0,8.0,,0,13
2,09.09,147.0,45.0,race/tour-de-france/2020/stage-11,Stage 11 - Châtelaillon-Plage › Poitiers,167.5,,,0,13
3,08.09,160.0,41.0,race/tour-de-france/2020/stage-10,Stage 10 - île d'Oléron (Le Château-d'Oléron) ...,168.5,,,0,13
4,06.09,160.0,38.0,race/tour-de-france/2020/stage-9,Stage 9 - Pau › Laruns,153.0,,,0,13


In [233]:
# drop multi stage race heagings
rider_race_history_df = rhdf[(rhdf.distance != '') | (rhdf.stagePos != '')]
print(rider_race_history_df.shape)
# drop end of stage race classifcation
# will be added in more detail
rider_race_history_df = rider_race_history_df[(rider_race_history_df.date != '') | (rider_race_history_df.gcPos != '')]
print(rider_race_history_df.shape)
rider_race_history_df.head()

(6508, 10)
(5536, 10)


Unnamed: 0,date,stagePos,gcPos,url,stage,distance,pcs,uci,riderID,raceID
1,10.09,11,38,race/tour-de-france/2020/stage-12,Stage 12 - Chauvigny › Sarran,218.0,8.0,,0,13
2,9.09,147,45,race/tour-de-france/2020/stage-11,Stage 11 - Châtelaillon-Plage › Poitiers,167.5,,,0,13
3,8.09,160,41,race/tour-de-france/2020/stage-10,Stage 10 - île d'Oléron (Le Château-d'Oléron) ...,168.5,,,0,13
4,6.09,160,38,race/tour-de-france/2020/stage-9,Stage 9 - Pau › Laruns,153.0,,,0,13
5,5.09,38,26,race/tour-de-france/2020/stage-8,Stage 8 - Cazères › Loudenvielle,141.0,,,0,13


In [234]:
stgs_df = rider_race_history_df[['date', 'stage', 'url', 'distance']]
print(len(stgs_df.groupby(['date']).count()))
print(len(stgs_df.groupby(['stage']).count()))
print(len(stgs_df.groupby(['url']).count()))
stgs_df = stgs_df.groupby(['stage', 'url', 'date', 'distance']).count()
stgs_df = stgs_df.reset_index()
stgs_df.index.name = 'stagesID'

print(stgs_df.shape)
stgs_df.head()

93
193
193
(193, 4)


Unnamed: 0_level_0,stage,url,date,distance
stagesID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Bretagne Classic - Ouest-France,race/bretagne-classic/2020/result,25.08,247.75
1,Cadel Evans Great Ocean Road Race,race/great-ocean-race/2020/result,2.02,171.1
2,Circuito de Getxo-Memorial Hermanos Otxoa,race/circuito-de-getxo/2020/result,2.08,177.0
3,Clasica de Almeria,race/clasica-de-almeria/2020/result,16.02,187.6
4,European Continental Championships - ITT,race/uec-road-european-championships-itt/2020/...,24.08,25.6


In [235]:
stgs_df['cutUrl'] = stgs_df.apply(lambda x: x['url'][0: x['url'].rindex('/')], axis=1)
stgs_df['raceID'] = -1
for rix, race_row in races_df.iterrows():
    stgs_df.loc[stgs_df.cutUrl == race_row['cutUrl'], 'raceID'] = rix
    #races_df.head()
print('{} stages have no raceID'.format(stgs_df[stgs_df.cutUrl == -1].shape[0]))
stgs_df.head()

0 stages have no raceID


Unnamed: 0_level_0,stage,url,date,distance,cutUrl,raceID
stagesID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Bretagne Classic - Ouest-France,race/bretagne-classic/2020/result,25.08,247.75,race/bretagne-classic/2020,27
1,Cadel Evans Great Ocean Road Race,race/great-ocean-race/2020/result,2.02,171.1,race/great-ocean-race/2020,28
2,Circuito de Getxo-Memorial Hermanos Otxoa,race/circuito-de-getxo/2020/result,2.08,177.0,race/circuito-de-getxo/2020,29
3,Clasica de Almeria,race/clasica-de-almeria/2020/result,16.02,187.6,race/clasica-de-almeria/2020,30
4,European Continental Championships - ITT,race/uec-road-european-championships-itt/2020/...,24.08,25.6,race/uec-road-european-championships-itt/2020,31


In [236]:
rider_race_history_df['stageID'] = np.nan

for stage_ix, stage_row in stgs_df.iterrows():
    stage_url = stage_row['url']
    rider_race_history_df.loc[rider_race_history_df.url == stage_url, 'stageID'] = stage_ix
    
print('{} races are unlabelled'.format(rider_race_history_df[rider_race_history_df.stageID.isnull()].shape[0]))
race_history_df = rider_race_history_df.drop(['url', 'stage', 'distance', 'date', 'raceID'], axis=1)
race_history_df = race_history_df.astype({'stageID': int})
race_history_df.head()

0 races are unlabelled


Unnamed: 0,stagePos,gcPos,pcs,uci,riderID,stageID
1,11,38,8.0,,0,77
2,147,45,,,0,76
3,160,41,,,0,75
4,160,38,,,0,187
5,38,26,,,0,185


In [237]:
races_df.head()

Unnamed: 0_level_0,race,url,cutUrl,year
raceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Critérium du Dauphiné,race/dauphine/2020/stage-1,race/dauphine/2020,2020
1,Czech Tour,race/czech-cycling-tour/2020/stage-1,race/czech-cycling-tour/2020,2020
2,Etoile de Bessèges,race/etoile-de-besseges/2020/stage-1,race/etoile-de-besseges/2020,2020
3,Jayco Herald Sun Tour,race/herald-sun-tour/2020/stage-1,race/herald-sun-tour/2020,2020
4,La Route d'Occitanie - La Dépêche du Midi,race/la-route-d-occitanie/2020/stage-1,race/la-route-d-occitanie/2020,2020


In [238]:
riders_df.head()

Unnamed: 0_level_0,name,age,url,team
riderID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,ALAPHILIPPE JULIAN,28,rider/julian-alaphilippe,DECEUNINCK - QUICK STEP
1,AMADOR ANDREY,34,rider/andrey-amador,INEOS GRENADIERS
2,ANACONA WINNER,32,rider/winner-anacona,TEAM ARKÉA SAMSIC
3,ARNDT NIKIAS,28,rider/nikias-arndt,TEAM SUNWEB
4,ARU FABIO,30,rider/fabio-aru,UAE-TEAM EMIRATES


In [239]:
stgs_df.head()

Unnamed: 0_level_0,stage,url,date,distance,cutUrl,raceID
stagesID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Bretagne Classic - Ouest-France,race/bretagne-classic/2020/result,25.08,247.75,race/bretagne-classic/2020,27
1,Cadel Evans Great Ocean Road Race,race/great-ocean-race/2020/result,2.02,171.1,race/great-ocean-race/2020,28
2,Circuito de Getxo-Memorial Hermanos Otxoa,race/circuito-de-getxo/2020/result,2.08,177.0,race/circuito-de-getxo/2020,29
3,Clasica de Almeria,race/clasica-de-almeria/2020/result,16.02,187.6,race/clasica-de-almeria/2020,30
4,European Continental Championships - ITT,race/uec-road-european-championships-itt/2020/...,24.08,25.6,race/uec-road-european-championships-itt/2020,31


In [333]:
ONE_DAY_RACE = 0
FIRST_STAGE_IN_TOUR = 1
OTHER_TOUR_STAGE = 2
ITT = 3
PROLOGUE = 4
def build_df(data, result_type):
    print(result_type)
    if result_type == ONE_DAY_RACE:
        stage_col = ['stagePos', 'gcPos', 'url', 'name', 'age', 'teamName', 'uciStage', 'pnt', 'stageTime', 'DNF']
    elif result_type == ITT:
        stage_col = ['stagePos', 'gcPos', 'url', 'name', 'age', 'countryTeam', 'uciStage', 'pnt', 'stgAvgPace', 'stageTime', 'DNF']
    elif result_type == FIRST_STAGE_IN_TOUR:
        stage_col = ['stagePos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'teamName', 'uciStage','pnt', 'stageTime', 'DNF']
        gc_col = ['gcPos', 'bib', 'url', 'name', 'age', 'team', 'uciGc', 'gcTime', 'more', 'DNF']
        green_col = ['greenPos', 'bib', 'url', 'name', 'age', 'team', 'greenPnts', 'greenPntsChng', 'DNF']
        youth_col = ['youthPos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'team', 'youthTime', 'DNF']
        kom_col = ['komPos', 'bib', 'url', 'name', 'age', 'team', 'komPnts',  'komPntsChng', 'DNF']
        teams_col = ['teamPos', 'empty', 'teamName', 'teamTime', 'DNF']
    elif result_type == OTHER_TOUR_STAGE:
        stage_col = ['stagePos', 'gcPos', 'gcTime', 'bib', 'url', 'name', 'age', 'teamName', 'uciStage','pnt', 'stageTime', 'DNF']
        gc_col = ['gcPos', 'prevGcPos', 'gcChng', 'bib', 'url', 'name', 'age', 'team', 'uciGc', 'gcTime', 'more', 'DNF']
        green_col = ['greenPos', 'prevGreenPos', 'greenChng', 'bib', 'url', 'name', 'age', 'team', 'greenPnts', 'greenPntsChng', 'DNF']
        youth_col = ['youthPos', 'prevYouthPos', 'youthChng', 'gcPos', 'gcTime', 'bib', 'url', 'name', 'age', 'team', 'youthTime', 'DNF']
        kom_col = ['komPos', 'prevKomPos', 'komChng', 'bib', 'url', 'name', 'age', 'team', 'komPnts',  'komPntsChng', 'DNF']
        teams_col = ['teamPos', 'prevTeamPos', 'teamChng', 'empty', 'teamName', 'teamTime', 'DNF']
    elif result_type == PROLOGUE:
        stage_col = ['stagePos', 'gcPos', 'gcTime', 'bib', 'url', 'name', 'age', 'teamName', 'uciStage','pnt', 'avgTime', 'stageTime', 'DNF']
        gc_col = ['gcPos', 'bib', 'url', 'name', 'age', 'teamName', 'uciGc', 'stageTime', 'more', 'DNF']
        green_col = ['greenPos', 'bib', 'url', 'name', 'age', 'team', 'greenPnts', 'greenPntsChng', 'DNF']
        youth_col = ['youthPos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'team', 'youthTime', 'DNF']
        kom_col = ['komPos', 'bib', 'url', 'name', 'age', 'team', 'komPnts',  'komPntsChng', 'DNF']
        teams_col = ['teamPos', 'empty', 'teamName', 'teamTime', 'DNF']
        
    
    print('Stage')
    print('Stage col: ', stage_col)
    print(data.keys())
    print('DATA STAGE list, 0:', data['stage'][0])
    stage_df = pd.DataFrame(data['stage'], columns=stage_col)
    stage_df = fix_time(stage_df, 'stageTime')
    
    df_set_names = data.keys()

    if 'gc' in df_set_names:
        # one day races only have 1 data frame
        stage_df = stage_df.set_index('bib')
        
        print('GC: ')
        gc_df = pd.DataFrame(data['gc'], columns=gc_col)
        gc_df = gc_df[['bib', 'uciGc']].set_index('bib')

    
        print('points')
        green_df = pd.DataFrame(data['points'], columns=green_col)
        green_df = green_df[['bib', 'greenPos', 'greenPnts']].set_index('bib')

        print('youth\n{}\n{}'.format(data['youth'][0], youth_col))
        youth_df = pd.DataFrame(data['youth'], columns=youth_col)
        youth_df = youth_df[['bib', 'youthPos', 'youthTime']].set_index('bib')
        youth_df = fix_time(youth_df, 'youthTime')

        if 'kom' in df_set_names:
            kom_df= pd.DataFrame(data['kom'], columns=kom_col)
            kom_df = kom_df[['bib', 'komPos', 'komPnts']].set_index('bib')
        
        print('TEAMS\n{}\n{}'.format(data['teams'][0], teams_col))
        team_df = pd.DataFrame(data['teams'], columns=teams_col)
        team_df = team_df[['teamPos', 'teamName', 'teamTime']]

        df = pd.concat([stage_df, gc_df, green_df, youth_df, kom_df], axis=1, sort=False)
        return df
    else:
        return stage_df

In [329]:
def select_map_subset(stage_html, data_row_length):
    '''
        Select the subset of keys and values for datasets that are in 
        the html.
    '''
    div_res_left = stage_html.find_all('div', class_="res-left")
    li_list = div_res_left[0].find_all('li')
    key_list = [get_text(li)[1].lower() for li in li_list]
    print(key_list)
    if len(key_list) > 0:
        print('KEY list: ', key_list)
        drl = {}
        for key in key_list:
            if key == 'prol.':
                key = 'stage'
            if key == '':
                key = 'stage'
            drl[key] = data_row_length[key]
        print('DRL: ', drl)
        return drl
    else:
        print('data_row_length: ', data_row_length)
        return data_row_length

# get the points classifications
def get_points(stage_html, print_=False):
    ''' 
        Extract all the races in the season for the rider on the rider's profile (url)
    '''

    if 'One day race' in str(stage_html.h2):
        print('ONE DAY RACE')
        data_row_lengths = {'stage': 9}
        result_type = ONE_DAY_RACE
    elif '(ITT)' in str(stage_html.h2) or 'Time trial' in str(stage_html.h2):
        print('(ITT)')
        data_row_lengths = {'stage': 10}
        result_type = ITT
    elif 'Prev' in str(stage_html):
        print('OTHER TOUR STAGE')
        result_type = OTHER_TOUR_STAGE
        data_row_lengths = {'stage': 11, 'gc': 11, 'points': 10, 'youth': 11, 'kom': 10, 'teams': 6}
    elif 'Prologue' in str(stage_html.h2):
        print('PROLOGUE')
        result_type = PROLOGUE
        data_row_lengths = {'stage': 12, 'gc': 9, 'points': 8, 'youth': 9, 'kom': 10, 'teams': 6}
    else:
        print('__FIRST_STAGE_IN_TOUR')
        result_type = FIRST_STAGE_IN_TOUR
        data_row_lengths = {'stage': 11, 'gc': 9, 'points': 8, 'youth': 9, 'kom': 8, 'teams': 4}

    data_cols = select_map_subset(stage_html, data_row_lengths)
    print('DATA COLUMNS: ', data_cols)
    data = get_stage_data(stage_html, data_cols, print_row=print_)
    df = build_df(data, result_type)
    return df
    
#url_uniques = stgs_df.url.unique()
#all_races = list()
#print(len(url_uniques))
#for uix, url in enumerate(url_uniques):
#    stage_url = 'https://www.procyclingstats.com/{}'.format(url)
#    print('{} in {}: {}'.format(uix, len(url_uniques), url))
#    #stage_url = 'https://www.procyclingstats.com/race/tour-de-france/2020/stage-2'
#    rdf = get_points(stage_url)
#    all_races.append(rdf)
#    #if count == 5: break

In [319]:
def not_empty_text(text):
    t = text.replace(' ', '')
    if len(t) == 0:
        return False
    else:
        return True
    
def find_more_stage_info(stage_html):
    '''
        Each stage has more ifnormation about its profile. Itterate 
        through each stage and extract the profile information. 
    '''
    res_ = stage_html.find_all("div", class_="res-right")
    
    res_text = res_[0].find_all(text=True)
    
    stage_info = list()
    mountains = list()
    found_race_rank = False
    
    for tix, text in enumerate(res_text):
        web_regex = re.search('(www.(.)+\.(.)+)+', text) \
                        or re.search('((.)+\.com(.)+)+', text) \
                        or re.search('((.)+\.(\w)*(\d)*/)', text) \
                        or 'googletag.cmd.push(' in text
        #if web_regex: print(text)
        if text not in ['Race information', 'Date: ', 'Avg. speed winner:', 'rd', \
                        'Race category: ', 'Parcours type:', 'PCS point scale:', \
                        ' ', 'Start/finish:', ' › ', 'Climbs: ', ', ', 'Race profile', \
                        'Finish photo', 'Finish photo', 'LiveStats', 'Websites:', \
                        'Race ranking position', 'ranking', 'th', 'nd', 'st', '\n', \
                        'breakdown', 'Position and points as on startdate of race.'] \
        and not web_regex:
            if len(stage_info) <= 6 or found_race_rank:
                # the first 6 cells of interest
                # or if the race rank has been found
                if '›' in text:
                    start_ix = text.find('›')
                    start = text[:start_ix]
                    if not_empty_text(start):
                        stage_info.append(start)
                    text = text[start_ix + 1:]
                if not_empty_text(text):
                    stage_info.append(text)
                if re.search('(\d)* pnt', text):
                    # after this string regex there is only adds and redundant information
                    break
            else:
                # there is a variable number of mountains 
                if is_not_int(text):
                    mountains.append(text)
                else:
                    # race rank (int value) comes right after mountains 
                    # have been listed
                    stage_info.append(mountains)
                    stage_info.append(len(mountains))
                    stage_info.append(text)
                    found_race_rank = True
        
        if text is 'ranking':
            break
    return stage_info

if CREATE_DATA or True:
    all_stages = list()
    all_racers = list()
    for stage_ix, stage  in stgs_df.iterrows():
        url = stage['url']
        stage_url = 'https://www.procyclingstats.com/{}'.format(url)
        print('{} of {} for \'{}\''.format(stage_ix, stgs_df.shape[0], url))
        page = requests.get(stage_url)
        stage_html = BeautifulSoup(page.content, 'html.parser')

        stage_row = find_more_stage_info(stage_html)
        if len(stage_row) > 11:
            print(len(stage_row))

        #stage_info = list(stage) + stage_row
        all_stages.append(stage_row)
        
        rdf = get_points(stage_html)
        all_racers.append(rdf)
    
    new_stg_info = pd.DataFrame(all_stages, columns=['dateFull', 'averageSpeed', 'raceCtgr', \
                                     'parcoursType', 'PCSPointScale', 'start',\
                                     'end', 'mountains', 'numMount', 'raceRank', \
                                     'racePoints'])
    new_stg_info.to_csv(NEW_STAGES_LOC)
else:
    new_stg_info = pd.read_csv(NEW_STAGES_LOC, index_col='Unnamed: 0')

0 of 193 for 'race/bretagne-classic/2020/result'
ONE DAY RACE
[]
data_row_length:  {'stage': 9}
DATA COLUMNS:  {'stage': 9}
DRL@@: {'stage': 9}
ERROR: [105, '206', 'rider/jasha-sutterlin', ' Sütterlin Jasha', '27', 'Team Sunweb', '', '', '-', 'DNF']
ERROR: [105, '51', 'rider/patrick-bevin', ' Bevin Patrick', '29', 'CCC Team', '', '', ',,', 'DNF']
ERROR: [105, '153', 'rider/adrien-garel', ' Garel Adrien', '24', 'B&B Hotels - Vital Concept p/b KTM', '', '', ',,', 'DNF']
ERROR: [105, '106', 'rider/sergio-samitier', ' Samitier Sergio', '24', 'Movistar Team', '', '', ',,', 'DNF']
ERROR: [105, '224', 'rider/loic-vliegen', ' Vliegen Loïc', '26', 'Circus - Wanty Gobert', '', '', ',,', 'DNF']
ERROR: [105, '62', 'rider/enrico-battaglin', ' Battaglin Enrico', '30', 'Bahrain - McLaren', '', '', ',,', 'DNF']
ERROR: [105, '191', 'rider/piet-allegaert', ' Allegaert Piet', '25', 'Cofidis, Solutions Crédits', '', '', ',,', 'DNF']
ERROR: [105, '12', 'rider/zhandos-bizhigitov', ' Bizhigitov Zhandos', '29

KeyboardInterrupt: 

In [None]:

print(len(all_racers))
all_racers_df = pd.concat(all_racers)
all_racers_df.head()

In [None]:
print(new_stg_info.shape)
new_stg_info.head()

In [20]:
print(stgs_df.shape)
stages_df  = pd.concat([stgs_df, new_stg_info], sort=False, axis=1)
stages_df['racePoints'] = stages_df.racePoints.str.replace('pnt', '')
#'raceRank' is relative to when the rank is taken
# not constant or reliable
stages_df = stages_df.drop('raceRank',  axis=1)
stages_df.index.name = 'stageID'
stages_df.head()

(193, 6)


Unnamed: 0_level_0,stage,url,date,distance,cutUrl,raceID,dateFull,averageSpeed,raceCtgr,parcoursType,PCSPointScale,start,end,mountains,numMount,racePoints
stageID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,Bretagne Classic - Ouest-France,race/bretagne-classic/2020/result,25.08,247.75,race/bretagne-classic/2020,27,25th August 2020,41.15 km/h,Men Elite,45,1.WT.B,Plouay,Plouay,"['Restergal', 'Côte du Lezot', 'Restergal', 'C...",5.0,390
1,Cadel Evans Great Ocean Road Race,race/great-ocean-race/2020/result,2.02,171.1,race/great-ocean-race/2020,28,2nd February 2020,41.76 km/h,Men Elite,53,1.WT.C,Geelong,Geelong,[],0.0,395
2,Circuito de Getxo-Memorial Hermanos Otxoa,race/circuito-de-getxo/2020/result,2.08,177.0,race/circuito-de-getxo/2020,29,2nd August 2020,42.29 km/h,Men Elite,0*,1.1,Getxo,Getxo,[],0.0,238
3,Clasica de Almeria,race/clasica-de-almeria/2020/result,16.02,187.6,race/clasica-de-almeria/2020,30,16th February 2020,42.63 km/h,Men Elite,0*,1.HC,Roquetas de Mar,Roquetas de Mar,[],0.0,397
4,European Continental Championships - ITT,race/uec-road-european-championships-itt/2020/...,24.08,25.6,race/uec-road-european-championships-itt/2020,31,24th August 2020,50.69 km/h,Men Elite,0*,UCI.Cont.Ch.TT,Plouay,Plouay,[],0.0,86


In [21]:
errors = stages_df[stages_df.racePoints.isnull()]
print(errors.shape)
errors

(8, 16)


Unnamed: 0_level_0,stage,url,date,distance,cutUrl,raceID,dateFull,averageSpeed,raceCtgr,parcoursType,PCSPointScale,start,end,mountains,numMount,racePoints
stageID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9,Gran Trittico Lombardo,race/gran-trittico-lombardo/2020/result,3.08,199.7,race/gran-trittico-lombardo/2020,36,3rd August 2020,42.64 km/h,Men Elite,77,1.HC,Legnano,Varese,,,
17,Milano-Sanremo,race/milano-sanremo/2020/result,8.08,305.0,race/milano-sanremo/2020,44,8th August 2020,41.96 km/h,Men Elite,48,1.WT.A,Milano,Sanremo,,,
68,Stage 1 - Saint-Affrique › Cazouls-lès-Béziers,race/la-route-d-occitanie/2020/stage-1,1.08,189.5,race/la-route-d-occitanie/2020,4,1st August 2020,41.35 km/h,Men Elite,0*,2.1.Stage,Saint-Affrique,Cazouls-lès-Béziers,,,
75,Stage 10 - île d'Oléron (Le Château-d'Oléron) ...,race/tour-de-france/2020/stage-10,8.09,168.5,race/tour-de-france/2020,13,8th September 2020,46.94 km/h,Men Elite,1,île d'Oléron (Le Château-d'Oléron),Île de Ré (Saint-Martin-de-Ré),2,,,
82,Stage 2 - Carcassonne › Cap Découverte,race/la-route-d-occitanie/2020/stage-2,2.08,182.5,race/la-route-d-occitanie/2020,4,2nd August 2020,41.73 km/h,Men Elite,32,2.1.Stage,Carcassonne,Cap Découverte,,,
122,Stage 3 - Saint-Gaudens › Col de Beyrède,race/la-route-d-occitanie/2020/stage-3,3.08,163.5,race/la-route-d-occitanie/2020,4,3rd August 2020,35.45 km/h,Men Elite,299,2.1.Stage,Saint-Gaudens,Col de Beyrède,,,
143,Stage 4 - Lectoure › Rocamadour,race/la-route-d-occitanie/2020/stage-4,4.08,195.0,race/la-route-d-occitanie/2020,4,4th August 2020,44.41 km/h,Men Elite,67,2.1.Stage,Lectoure,Rocamadour,,,
188,Strade Bianche,race/strade-bianche/2020/result,1.08,184.0,race/strade-bianche/2020,74,1st August 2020,36.93 km/h,Men Elite,104,1.WT.C,Siena,Siena,,,


### Errors
From the code above we see that there are stages that are erroneously made. This include 2 types:
 - Stages that have not list of mountains, number of mountains and racePoints (`9`, `17`, `68`)
 - Stages that have location names as PCSPointScale values (`75`)

In [22]:
# fix error type 1
stages_df.numMount.fillna(0, inplace=True)
stages_df.mountains.fillna('', inplace=True)
stages_df.racePoints.fillna(0, inplace=True)
stages_df[stages_df.racePoints.isnull()]

Unnamed: 0_level_0,stage,url,date,distance,cutUrl,raceID,dateFull,averageSpeed,raceCtgr,parcoursType,PCSPointScale,start,end,mountains,numMount,racePoints
stageID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1


In [23]:
# error type 2
e2_df = stages_df[stages_df.PCSPointScale.str.match(r'[\w\d\.]*\.{1}[\w\d\.]+')==False]
e2_df

Unnamed: 0_level_0,stage,url,date,distance,cutUrl,raceID,dateFull,averageSpeed,raceCtgr,parcoursType,PCSPointScale,start,end,mountains,numMount,racePoints
stageID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5,European Continental Championships - Road Race,race/uec-road-european-championships/2020/result,26.08,177.45,race/uec-road-european-championships/2020,32,26th August 2020,42.19 km/h,Men Elite,45,E-CC,Plouay,Plouay,[],0.0,569
75,Stage 10 - île d'Oléron (Le Château-d'Oléron) ...,race/tour-de-france/2020/stage-10,8.09,168.5,race/tour-de-france/2020,13,8th September 2020,46.94 km/h,Men Elite,1,île d'Oléron (Le Château-d'Oléron),Île de Ré (Saint-Martin-de-Ré),2,,0.0,0
185,Stage 8 - Cazères › Loudenvielle,race/tour-de-france/2020/stage-8,5.09,141.0,race/tour-de-france/2020,13,5th September 2020,34.93 km/h,Men Elite,263,Cazères,Loudenvielle,Col de Peyresourde,"['Port de Balès', 'Col de Menté']",2.0,1508
187,Stage 9 - Pau › Laruns,race/tour-de-france/2020/stage-9,6.09,153.0,race/tour-de-france/2020,13,6th September 2020,39.02 km/h,Men Elite,212,Pau,Laruns,Col de Marie-Blanque,"[""Col d'Ichère"", 'Col de Soudet', 'Col de la H...",4.0,1508


In [24]:
for eix, err_row in e2_df.iterrows():
    if err_row['PCSPointScale'] == 'E-CC':
        continue
    start = err_row['PCSPointScale']
    end = err_row['start']
    mount = err_row['end']
    
    stages_df.loc[eix, 'PCSPointScale'] = ''
    stages_df.loc[eix, 'start'] = start
    stages_df.loc[eix, 'end'] = end
    mountains = e2_df.loc[eix, 'mountains']
    if is_not_int(mount):
        mountains = '{}, {}]'.format(mountains[:-1], mount)
        stages_df.loc[eix, 'mountains'] = mountains
    else:
        stages_df.loc[eix, 'mountains'] = ''
        stages_df.loc[eix, 'racePoints'] = mount

stages_df[stages_df.PCSPointScale.str.match(r'[\w\d\.]*\.{1}[\w\d\.]+')==False]

Unnamed: 0_level_0,stage,url,date,distance,cutUrl,raceID,dateFull,averageSpeed,raceCtgr,parcoursType,PCSPointScale,start,end,mountains,numMount,racePoints
stageID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
5,European Continental Championships - Road Race,race/uec-road-european-championships/2020/result,26.08,177.45,race/uec-road-european-championships/2020,32,26th August 2020,42.19 km/h,Men Elite,45,E-CC,Plouay,Plouay,[],0.0,569
75,Stage 10 - île d'Oléron (Le Château-d'Oléron) ...,race/tour-de-france/2020/stage-10,8.09,168.5,race/tour-de-france/2020,13,8th September 2020,46.94 km/h,Men Elite,1,,île d'Oléron (Le Château-d'Oléron),Île de Ré (Saint-Martin-de-Ré),,0.0,2
185,Stage 8 - Cazères › Loudenvielle,race/tour-de-france/2020/stage-8,5.09,141.0,race/tour-de-france/2020,13,5th September 2020,34.93 km/h,Men Elite,263,,Cazères,Loudenvielle,"['Port de Balès', 'Col de Menté', Col de Peyre...",2.0,1508
187,Stage 9 - Pau › Laruns,race/tour-de-france/2020/stage-9,6.09,153.0,race/tour-de-france/2020,13,6th September 2020,39.02 km/h,Men Elite,212,,Pau,Laruns,"[""Col d'Ichère"", 'Col de Soudet', 'Col de la H...",4.0,1508


In [25]:
stages_df.head()

Unnamed: 0_level_0,stage,url,date,distance,cutUrl,raceID,dateFull,averageSpeed,raceCtgr,parcoursType,PCSPointScale,start,end,mountains,numMount,racePoints
stageID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,Bretagne Classic - Ouest-France,race/bretagne-classic/2020/result,25.08,247.75,race/bretagne-classic/2020,27,25th August 2020,41.15 km/h,Men Elite,45,1.WT.B,Plouay,Plouay,"['Restergal', 'Côte du Lezot', 'Restergal', 'C...",5.0,390
1,Cadel Evans Great Ocean Road Race,race/great-ocean-race/2020/result,2.02,171.1,race/great-ocean-race/2020,28,2nd February 2020,41.76 km/h,Men Elite,53,1.WT.C,Geelong,Geelong,[],0.0,395
2,Circuito de Getxo-Memorial Hermanos Otxoa,race/circuito-de-getxo/2020/result,2.08,177.0,race/circuito-de-getxo/2020,29,2nd August 2020,42.29 km/h,Men Elite,0*,1.1,Getxo,Getxo,[],0.0,238
3,Clasica de Almeria,race/clasica-de-almeria/2020/result,16.02,187.6,race/clasica-de-almeria/2020,30,16th February 2020,42.63 km/h,Men Elite,0*,1.HC,Roquetas de Mar,Roquetas de Mar,[],0.0,397
4,European Continental Championships - ITT,race/uec-road-european-championships-itt/2020/...,24.08,25.6,race/uec-road-european-championships-itt/2020,31,24th August 2020,50.69 km/h,Men Elite,0*,UCI.Cont.Ch.TT,Plouay,Plouay,[],0.0,86


In [28]:
races_df.head()

Unnamed: 0_level_0,race,url,cutUrl,year
raceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Critérium du Dauphiné,race/dauphine/2020/stage-1,race/dauphine/2020,2020
1,Czech Tour,race/czech-cycling-tour/2020/stage-1,race/czech-cycling-tour/2020,2020
2,Etoile de Bessèges,race/etoile-de-besseges/2020/stage-1,race/etoile-de-besseges/2020,2020
3,Jayco Herald Sun Tour,race/herald-sun-tour/2020/stage-1,race/herald-sun-tour/2020,2020
4,La Route d'Occitanie - La Dépêche du Midi,race/la-route-d-occitanie/2020/stage-1,race/la-route-d-occitanie/2020,2020


In [320]:
url = 'race/sibiu-cycling-tour/2020/prologue'
stage_url = 'https://www.procyclingstats.com/{}'.format(url)
print(stage_url)
page = requests.get(stage_url)
stage_html = BeautifulSoup(page.content, 'html.parser')

https://www.procyclingstats.com/race/sibiu-cycling-tour/2020/prologue


In [334]:
stage_row = find_more_stage_info(stage_html)
if len(stage_row) > 11:
    print(len(stage_row))

rdf = get_points(stage_html, True)
rdf
stage_col = ['gcPos', 'bib', 'url', 'name', 'age', 'teamName', 'uciStage', 'stageTime', 'more', 'DNF']

PROLOGUE
['prol.', 'gc', 'points', 'youth', 'teams']
KEY list:  ['prol.', 'gc', 'points', 'youth', 'teams']
DRL:  {'stage': 12, 'gc': 9, 'points': 8, 'youth': 9, 'teams': 6}
DATA COLUMNS:  {'stage': 12, 'gc': 9, 'points': 8, 'youth': 9, 'teams': 6}
DRL@@: {'stage': 12, 'gc': 9, 'points': 8, 'youth': 9, 'teams': 6}
['1', '1', '+0:00', '166', 'rider/nikodemus-holler', ' Holler Nikodemus', '29', 'Bike Aid', '14', '20', '42.056', '3:34']
['2', '2', '+0:02', '146', 'rider/kacper-walkowiak', ' Walkowiak Kacper', '20', 'CCC Development Team', '5', '12', '41.667', '0:02']
['3', '3', '+0:03', '156', 'rider/wojciech-sykala', ' Sykala Wojciech', '26', 'Voster ATS Team', '3', '7', '41.475', '0:03']
['4', '4', '+0:05', '96', 'rider/adam-toupalik', ' Ťoupalík Adam', '24', 'Elkov - Kasper', '', '5', '41.096', '0:05']
['5', '5', '+0:09', '186', 'rider/matteo-rotondi', ' Rotondi Matteo', '23', 'Work Service Dynatek Vega', '', '4', '40.359', '0:09']
['6', '6', '+0:10', '196', 'rider/jon-knolle', ' Knoll

['teamPos', 'empty', 'teamName', 'teamTime', 'DNF']


ValueError: 5 columns passed, passed data had 10 columns

In [92]:
rdf[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1,10,▲9,1,rider/patrick-konrad,Konrad Patrick,28,BORA - hansgrohe,3,,,
1,2,..,2,25,▲23,2,rider/gregor-muhlberger,Mühlberger Gregor,26,BORA - hansgrohe,,4:55:54
2,2,..,3,77,▲74,13,rider/matteo-badilatti,Badilatti Matteo,27,Israel Start-Up Nation,,0:03
3,2,..,4,32,▲28,25,rider/remy-rochas,Rochas Rémy,24,NIPPO DELKO One Provence,,0:53
4,2,..,5,108,▲103,34,rider/luca-wackermann,Wackermann Luca,28,Vini Zabù - KTM,,1:20
5,2,..,6,35,▲29,152,rider/piotr-brozyna,Brożyna Piotr,25,Voster ATS Team,,2:40
6,2,..,7,37,▲30,163,rider/erik-bergstrom,Bergstrom Frisk Erik,20,Bike Aid,,3:38
7,2,..,8,44,▲36,171,rider/davide-rebellin,Rebellin Davide,48,Meridiana Kamen Team,,4:40
8,2,..,9,29,▲20,92,rider/karel-hnik,Hník Karel,28,Elkov - Kasper,,",,"
9,2,..,10,87,▲77,183,rider/raul-colombo,Colombo Raul,23,Work Service Dynatek Vega,,4:45


In [97]:
df = rdf[['stagePos', 'gcPos', 'timeAdd', 'url', 'name', 'age', 'teamName', 'uciStage', 'pnt', 'stageTime', 'DNF']]
df = df[~df.stagePos.isnull()]
left = df[df.gcPos == '']
for lix, left_row in left.iterrows():
    df.loc[lix, 'gcPos'] = left_row['stagePos']
df = df.astype({'gcPos': int})
df.sort_values(by=['gcPos'])




Unnamed: 0,stagePos,gcPos,timeAdd,url,name,age,teamName,uciStage,pnt,stageTime,DNF
1,2,1,+0:00,rider/patrick-konrad,Konrad Patrick,28,BORA - hansgrohe,5,12,4:52:11,
2,1,2,+0:03,rider/gregor-muhlberger,Mühlberger Gregor,26,BORA - hansgrohe,14,20,4:52:11,
13,3,3,+0:53,rider/matteo-badilatti,Badilatti Matteo,27,Israel Start-Up Nation,3,7,0:16,
25,4,4,+1:20,rider/remy-rochas,Rochas Rémy,24,NIPPO DELKO One Provence,,5,1:04,
34,5,5,+2:40,rider/luca-wackermann,Wackermann Luca,28,Vini Zabù - KTM,,4,1:41,
152,6,6,+3:38,rider/piotr-brozyna,Brożyna Piotr,25,Voster ATS Team,,3,3:20,
163,8,7,+4:40,rider/erik-bergstrom,Bergstrom Frisk Erik,20,Bike Aid,,1,4:22,
171,7,8,+4:40,rider/davide-rebellin,Rebellin Davide,48,Meridiana Kamen Team,,2,4:18,
92,10,9,+4:45,rider/karel-hnik,Hník Karel,28,Elkov - Kasper,,,4:30,
183,9,10,+5:10,rider/raul-colombo,Colombo Raul,23,Work Service Dynatek Vega,,,4:26,
