# Extract data
## Introduction
This notebook extracts professional cyclists' data for a season. Cleaning of the data will be done as well. Feature creation and analysis will be done in the next notebook.

We extract all the riders at the beginning of the tour. Thereafter we itterate through each rider's profile on `procyclingstats.com` and scrape the races that they have partaken in, for the season. Thereafter, we itterate through all the races and extract their profiles (diificulty, uci status). The data is saved. 

In [1]:
# imports
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import re
import os

In [2]:
CREATE_DATA = False
# the first stage of the race
RACE = 'tour-de-france'
YEAR = 2020
STAGE = 1
URL1 = 'https://www.procyclingstats.com/race/{}/{}/stage-{}/result/result'.format(RACE, YEAR, STAGE)

folder_loc = './temp_data/{}'.format(RACE)
if not os.path.exists(folder_loc):
    os.mkdir(folder_loc)
folder_loc = '{}/{}'.format(folder_loc, YEAR)
if not os.path.exists(folder_loc):
    os.mkdir(folder_loc)
    
FIRST_STAGE_LOC = '{}/first.csv'.format(folder_loc)
RACES_LOC = '{}/races_df.csv'.format(folder_loc)
STAGES_LOC = '{}/stages_df.csv'.format(folder_loc)
NEW_STAGES_LOC = '{}/new_stages_df.csv'.format(folder_loc)
RIDERS_LOC = '{}/riders_df.csv'.format(folder_loc)
ALL_RACE_DATA = '{}/ALL_RACE_DATA.csv'.format(folder_loc)

race_page = requests.get(URL1)
race_html = BeautifulSoup(race_page.content, 'html.parser')

In [107]:
def is_not_int(value):
    ''' Assesses whether value is an int or not'''
    try:
        int(value)
        return False
    except ValueError:
        return True

def get_text(cell):
    ''' Return the text from the html cell. '''
    # some cells have a span or hyperlink element with text in it
    if cell.a != None:
        url = cell.a.get('href')
        if url.startswith('rider/') or url.startswith('race/'):
            return url, cell.a.get_text()
        return cell.a.get_text()
    elif cell.span != None:
        return cell.span.get_text()
    else:
        return cell.get_text()
    
def get_stage_data(html, data_row_length, print_row=False):
    # all the racers are in a table data cell ('td')
    # intialise variables
    tdcs = html.find_all('td')
    # there can be up to 6 data tables on an html page
    #data = {'stage': list(), 'gc': list(), 'points': list(), 'youth': list(), 'kom': list(), 'teams': list()}
    data = {}
    print('Data row lengths:', data_row_length)
    datasets = list(data_row_length.keys())
    data_id = 0
    row_length = data_row_length[datasets[data_id]]
    
    old_length = row_length
    last_list_length = 0
    data_list = list()
    error_list = list()
    error_row = False
    
    row = list()
    last_ix = 1

    # itterate through all data cells and append their text values to a row
    for td_ix, cell in enumerate(tdcs):
        
        text = get_text(cell)
        if type(text) is type('str'):
            #print(row)
            row.append(text)
        else:
            row.append(text[0])
            row.append(text[1])
            
        if len(row) == 1:
            # the first element in the row is the position
            # if the rider did not finish, the position will not be an int
            # it will be: DNF, DNS, OTL
            #print(row[0])
            not_int = is_not_int(row[0])
            
            if not_int:
                error_row = True
            
            if not not_int and int(row[0]) == 1 and len(data_list) != 0:
                # a new table begins with a rider being places 1st
                # save the complete previous table to the data map
                print('UPDATING 5: {} has {} participants'.format(datasets[data_id], len(data_list)))
                if datasets[data_id] == 'teams':
                    print('............................TEAMS')
                print('di', data_id)
                print('drl', datasets)
                ds_key = datasets[data_id]
                data[ds_key] = data_list

                old_length = row_length
                
                #reinitialise variables
                data_id += 1
                data_list = list()
                last_ix = 1
                error_row = False
                
                row_length = data_row_length[datasets[data_id]]
                if print_row:
                    print('OLD ROW: {}, NEW ROW: {}'.format(old_length, row_length))

        
        if error_row and len(row) == old_length:
            # a row with a DNS, DNF, OTL rider
            # put them in last position
            # append the disqualificationto the end of the row
            row.append(row[0])
            row[0] = last_ix
            print('ERROR: {}'.format(row))
            data_list.append(row)
            row = list()
            
        elif not error_row and len(row) == row_length:
            # 'row_length' data cells make an entire row
            if print_row:
                print(row)
            # data list gets saved in data subset
            pos = int(row[0])
            # DQ/ DNF/ OL column
            row.append(np.nan)
            data_list.append(row)
            last_ix = pos + 1
            row = list()
                
    print('UPDATING 1: {} has {} participants'.format(datasets[data_id], len(data_list)))
    ds_key = datasets[data_id]
    data[ds_key] = data_list
    return data

if CREATE_DATA:
    data_row_length = {'stage': 11, 'gc': 9, 'points': 8, 'youth': 9, 'kom': 8, 'teams': 4}
    data = get_stage_data(race_html, data_row_length)

In [4]:
def fix_time(data, time_col):
    tdf = data[data[time_col] == ',,'][[time_col]]
    to_change_ix = list(tdf.index)
    data.loc[data.index.isin(to_change_ix), time_col] = None
    data[time_col] = data[time_col].fillna(method='ffill')
    return data

try:
    stage_df = pd.DataFrame(data['stage'], columns=['stagePos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'team', 'uciStg','stagePnt', 'stageTime', 'DNF'])
    print('Reading data.')
    stage_df = fix_time(stage_df, 'stageTime')
    stage_df = stage_df.set_index('bib')

    gc_df = pd.DataFrame(data['gc'], columns=['gcPos', 'bib', 'url', 'name', 'age', 'team', 'uciGc', 'time', 'more', 'DNF'])
    gc_df = gc_df[['bib', 'uciGc']].set_index('bib')

    
    green_df = pd.DataFrame(data['points'], columns=['greenPos', 'bib', 'url', 'name', 'age', 'team', 'greenPnts', 'pntsChng', 'DNF'])
    green_df = green_df[['bib', 'greenPos', 'greenPnts']].set_index('bib')

    youth_df = pd.DataFrame(data['youth'], columns=['youthPos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'team', 'youthTime', 'DNF'])[['bib', 'youthPos', 'youthTime']]
    youth_df = fix_time(youth_df, 'youthTime')
    youth_df = youth_df.set_index('bib')

    kom_df = pd.DataFrame(data['kom'], columns=['komPos', 'bib', 'url', 'name', 'age', 'team', 'komPnts',  'pntsChnge', 'DNF'])
    kom_df = kom_df[['bib', 'komPos', 'komPnts']].set_index('bib')

    teams_df = pd.DataFrame(data['teams'], columns=['teamPos', 'change', 'teamName', 'teamTime', 'DNF'])
    teams_df = teams_df[['teamPos', 'teamName', 'teamTime']]

    df = pd.concat([stage_df, gc_df, green_df, youth_df, kom_df], axis=1, sort=False)
    df = df.reset_index().rename(columns={'index': 'bibNum'})
    df.to_csv(FIRST_STAGE_LOC)
except NameError:
    print('Reading data.')
    df = pd.read_csv(FIRST_STAGE_LOC, index_col='Unnamed: 0')

df.head()

Reading data.


Unnamed: 0,bibNum,stagePos,gcPos,timeAdd,url,name,age,team,uciStg,stagePnt,stageTime,DNF,uciGc,greenPos,greenPnts,youthPos,youthTime,komPos,komPnts
0,135,1,1.0,+0:00,rider/alexander-kristoff,Kristoff Alexander,33,UAE-Team Emirates,120.0,100.0,3:46:23,,25.0,1.0,59.0,,,,
1,105,2,2.0,+0:04,rider/mads-pedersen,Pedersen Mads,24,Trek - Segafredo,50.0,70.0,3:46:23,,,2.0,30.0,1.0,3:46:17,,
2,203,3,3.0,+0:06,rider/cees-bol,Bol Cees,25,Team Sunweb,25.0,50.0,3:46:23,,,6.0,20.0,2.0,0:02,,
3,43,4,4.0,+0:10,rider/sam-bennett,Bennett Sam,29,Deceuninck - Quick Step,15.0,40.0,3:46:23,,,4.0,28.0,,,,
4,21,5,5.0,+0:10,rider/peter-sagan,Sagan Peter,30,BORA - hansgrohe,5.0,32.0,3:46:23,,,3.0,29.0,,,,


In [5]:
print(df.DNF.unique())

for finish_error in df.DNF.unique():
    if finish_error is not np.nan:
        df[finish_error] = 0

        df.loc[df.DNF == finish_error, [finish_error]] = 1
#df = df.drop(['DNF'], axis=1)

df['youth'] = 1
df.loc[df.youthTime.isnull(), ['youth']] = 0
# make the dataframe
df.name = df.name.str.upper()
df.team = df.team.str.upper()

print('There were {} cyclists that partook stage 1.\nThere are/ is {} that did not finish.'.format(df.shape[0], df[df.DNF != ''].shape[0]))
df.head()

[nan 'OTL']
There were 176 cyclists that partook stage 1.
There are/ is 176 that did not finish.


Unnamed: 0,bibNum,stagePos,gcPos,timeAdd,url,name,age,team,uciStg,stagePnt,...,DNF,uciGc,greenPos,greenPnts,youthPos,youthTime,komPos,komPnts,OTL,youth
0,135,1,1.0,+0:00,rider/alexander-kristoff,KRISTOFF ALEXANDER,33,UAE-TEAM EMIRATES,120.0,100.0,...,,25.0,1.0,59.0,,,,,0,0
1,105,2,2.0,+0:04,rider/mads-pedersen,PEDERSEN MADS,24,TREK - SEGAFREDO,50.0,70.0,...,,,2.0,30.0,1.0,3:46:17,,,0,1
2,203,3,3.0,+0:06,rider/cees-bol,BOL CEES,25,TEAM SUNWEB,25.0,50.0,...,,,6.0,20.0,2.0,0:02,,,0,1
3,43,4,4.0,+0:10,rider/sam-bennett,BENNETT SAM,29,DECEUNINCK - QUICK STEP,15.0,40.0,...,,,4.0,28.0,,,,,0,0
4,21,5,5.0,+0:10,rider/peter-sagan,SAGAN PETER,30,BORA - HANSGROHE,5.0,32.0,...,,,3.0,29.0,,,,,0,0


## Create dataframes
We are going to separate the data frame above into 3 more succinct dataframes. 

- `riders_df` that contians the 175 riders' personal infomration.
- `stages_df` that containes that stages competed in by all the 175 riders in the past season
- `races_df` that contains the information about the races that the 175 riders competed in in the past season (a race can have multiple stages)

In [6]:
if CREATE_DATA:
    riders_df = df[['name', 'age', 'url', 'team']]
    riders_df = riders_df.sort_values('name')
    riders_df['riderID'] = range(riders_df.shape[0])
    riders_df = riders_df.set_index('riderID')
    riders_df.to_csv(RIDERS_LOC)
else:
    riders_df = pd.read_csv(RIDERS_LOC, index_col='riderID')
riders_df.head()

Unnamed: 0_level_0,name,age,url,team
riderID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,ALAPHILIPPE JULIAN,28,rider/julian-alaphilippe,DECEUNINCK - QUICK STEP
1,AMADOR ANDREY,34,rider/andrey-amador,INEOS GRENADIERS
2,ANACONA WINNER,32,rider/winner-anacona,TEAM ARKÉA SAMSIC
3,ARNDT NIKIAS,28,rider/nikias-arndt,TEAM SUNWEB
4,ARU FABIO,30,rider/fabio-aru,UAE-TEAM EMIRATES


In [7]:
def find_races(url, rider_id, rider_name):
    ''' Extract all the races in the season for each rider in the rider dfn the rider's profile (url)'''
    page = requests.get(url)
    rider_html = BeautifulSoup(page.content, 'html.parser')
    results_html = rider_html.body.tbody
    # all races in data rows (tr)
    rows = results_html.find_all('tr')

    races = list()
    for row in rows:
        items = row.find_all('td')
        # extract text values from data cell
        row = list()
        for item in items:
            text = get_text(item)
            if type(text) is type('str'):
                row.append(text)
            else:
                row.append(text[0])
                row.append(text[1])
        races.append(row)
    df = pd.DataFrame(races, columns=['date', 'stagePos', 'gcPos', 'unknown', 'url', 'stage', 'distance', 'pcs', 'uci', 'more'])
    print('"{}" competed in {} stages'.format(rider_name, df.shape[0]))
    df['riderID'] = rider_id
    return df

if CREATE_DATA:
    # initialise list of dataframes
    stages_list = list()
    # itterate through each cyclist and add their races for the season
    # to a list of dataframes
    for ix, row in riders_df.iterrows():
        print('{} of {}'.format(ix, riders_df.shape[0]))
        url = row['url']
        name = row['name']
        rider_url = 'https://www.procyclingstats.com/{}'.format(url)
        print(rider_url)
        riders_stages_df = find_races(rider_url, ix, name)
        stages_list.append(riders_stages_df)

In [8]:
if CREATE_DATA:
    stages_ = pd.concat(stages_list).reset_index(drop=True)
    print('{} stages have been loaded'.format(stages_.shape[0]))
    # drop 2 unnecessary columns
    stages_ = stages_.drop(['unknown', 'more'], axis=1)
    stages_.to_csv(STAGES_LOC)
else:
    stages_ = pd.read_csv(STAGES_LOC, index_col='Unnamed: 0')
    stages_.fillna('', inplace=True)
stages_.head()

Unnamed: 0,date,stagePos,gcPos,url,stage,distance,pcs,uci,riderID
0,› 20.09,,,race/tour-de-france/2020/stage-1,Tour de France,,,,0
1,10.09,11.0,38.0,race/tour-de-france/2020/stage-12,Stage 12 - Chauvigny › Sarran,218.0,8.0,,0
2,09.09,147.0,45.0,race/tour-de-france/2020/stage-11,Stage 11 - Châtelaillon-Plage › Poitiers,167.5,,,0
3,08.09,160.0,41.0,race/tour-de-france/2020/stage-10,Stage 10 - île d'Oléron (Le Château-d'Oléron) ...,168.5,,,0
4,06.09,160.0,38.0,race/tour-de-france/2020/stage-9,Stage 9 - Pau › Laruns,153.0,,,0


In [9]:
def _get_unique_races(data):
    '''
        Return a dataframe with the unique races with their information.
    '''
    data.columns = ['race', 'url']
    num_races = len(data.race.unique())
    num_url = len(data.url.unique())
    
    if num_races == num_url:
        rdf = data.groupby(['race', 'url']).count()
        rdf = rdf.reset_index()
        rdf['cutUrl'] = rdf.apply(lambda x: x['url'][0: x['url'].rindex('/')], axis =1)
        rdf.index.name = 'raceIX'
        return rdf
    else:
        return None
        
def _add_race_id_to_data(races_df, all_data):
    '''
        Append the race ID for each stage in the main df associated with the race. 
    '''
    if 'raceID' not in all_data.columns:
        print('Creating "raceID" collumn')
        all_data['raceID'] = np.nan
        
    for race_ix, race_row in races_df.iterrows():
        # for each race, put their id in the original. df
        url_begin = race_row['cutUrl']
        all_data.loc[all_data.url.str.startswith(url_begin), 'raceID'] = race_ix
        
    return all_data

def create_races_df(data):
    ''' 
    '''
    # multistage races 
    rdf = data[(data.distance == '') & (data.stagePos == '')][['stage', 'url']]
    rdf2 = _get_unique_races(rdf)
    data1 = _add_race_id_to_data(rdf2, data)
        
    # one day races
    odr = data1[data1.raceID.isnull()][['stage', 'url']]
    rdf3 = _get_unique_races(odr)
    data2 = _add_race_id_to_data(rdf3, data1)
    
    races_df = pd.concat([rdf2, rdf3], ignore_index=True)
    races_df['year'] = races_df.apply(lambda x: x['cutUrl'][x['cutUrl'].rindex('/') + 1:], axis =1)
    races_df.index.name = 'raceID'
    return races_df, data2
    
races_df, rhdf = create_races_df(stages_)
rhdf = rhdf.astype({'raceID': int})
print('There are {} unique races.\nThere are {} stages(s) without a raceID.'.format(races_df.shape[0], rhdf[rhdf.raceID.isnull()].shape[0]))
races_df.head()

Creating "raceID" collumn
There are 79 unique races.
There are 0 stages(s) without a raceID.


Unnamed: 0_level_0,race,url,cutUrl,year
raceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Critérium du Dauphiné,race/dauphine/2020/stage-1,race/dauphine/2020,2020
1,Czech Tour,race/czech-cycling-tour/2020/stage-1,race/czech-cycling-tour/2020,2020
2,Etoile de Bessèges,race/etoile-de-besseges/2020/stage-1,race/etoile-de-besseges/2020,2020
3,Jayco Herald Sun Tour,race/herald-sun-tour/2020/stage-1,race/herald-sun-tour/2020,2020
4,La Route d'Occitanie - La Dépêche du Midi,race/la-route-d-occitanie/2020/stage-1,race/la-route-d-occitanie/2020,2020


In [10]:
print(rhdf.shape)
rhdf.head()

(7292, 10)


Unnamed: 0,date,stagePos,gcPos,url,stage,distance,pcs,uci,riderID,raceID
0,› 20.09,,,race/tour-de-france/2020/stage-1,Tour de France,,,,0,13
1,10.09,11.0,38.0,race/tour-de-france/2020/stage-12,Stage 12 - Chauvigny › Sarran,218.0,8.0,,0,13
2,09.09,147.0,45.0,race/tour-de-france/2020/stage-11,Stage 11 - Châtelaillon-Plage › Poitiers,167.5,,,0,13
3,08.09,160.0,41.0,race/tour-de-france/2020/stage-10,Stage 10 - île d'Oléron (Le Château-d'Oléron) ...,168.5,,,0,13
4,06.09,160.0,38.0,race/tour-de-france/2020/stage-9,Stage 9 - Pau › Laruns,153.0,,,0,13


In [11]:
# drop multi stage race heagings
rider_race_history_df = rhdf[(rhdf.distance != '') | (rhdf.stagePos != '')]
print(rider_race_history_df.shape)
# drop end of stage race classifcation
# will be added in more detail
rider_race_history_df = rider_race_history_df[(rider_race_history_df.date != '') | (rider_race_history_df.gcPos != '')]
print(rider_race_history_df.shape)
rider_race_history_df.head()

(6508, 10)
(5536, 10)


Unnamed: 0,date,stagePos,gcPos,url,stage,distance,pcs,uci,riderID,raceID
1,10.09,11,38,race/tour-de-france/2020/stage-12,Stage 12 - Chauvigny › Sarran,218.0,8.0,,0,13
2,9.09,147,45,race/tour-de-france/2020/stage-11,Stage 11 - Châtelaillon-Plage › Poitiers,167.5,,,0,13
3,8.09,160,41,race/tour-de-france/2020/stage-10,Stage 10 - île d'Oléron (Le Château-d'Oléron) ...,168.5,,,0,13
4,6.09,160,38,race/tour-de-france/2020/stage-9,Stage 9 - Pau › Laruns,153.0,,,0,13
5,5.09,38,26,race/tour-de-france/2020/stage-8,Stage 8 - Cazères › Loudenvielle,141.0,,,0,13


In [12]:
stgs_df = rider_race_history_df[['date', 'stage', 'url', 'distance']]
print(len(stgs_df.groupby(['date']).count()))
print(len(stgs_df.groupby(['stage']).count()))
print(len(stgs_df.groupby(['url']).count()))
stgs_df = stgs_df.groupby(['stage', 'url', 'date', 'distance']).count()
stgs_df = stgs_df.reset_index()
stgs_df.index.name = 'stagesID'

print(stgs_df.shape)
stgs_df.head()

93
193
193
(193, 4)


Unnamed: 0_level_0,stage,url,date,distance
stagesID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Bretagne Classic - Ouest-France,race/bretagne-classic/2020/result,25.08,247.75
1,Cadel Evans Great Ocean Road Race,race/great-ocean-race/2020/result,2.02,171.1
2,Circuito de Getxo-Memorial Hermanos Otxoa,race/circuito-de-getxo/2020/result,2.08,177.0
3,Clasica de Almeria,race/clasica-de-almeria/2020/result,16.02,187.6
4,European Continental Championships - ITT,race/uec-road-european-championships-itt/2020/...,24.08,25.6


In [13]:
stgs_df['cutUrl'] = stgs_df.apply(lambda x: x['url'][0: x['url'].rindex('/')], axis=1)
stgs_df['raceID'] = -1
for rix, race_row in races_df.iterrows():
    stgs_df.loc[stgs_df.cutUrl == race_row['cutUrl'], 'raceID'] = rix
    #races_df.head()
print('{} stages have no raceID'.format(stgs_df[stgs_df.cutUrl == -1].shape[0]))
stgs_df.head()

0 stages have no raceID


Unnamed: 0_level_0,stage,url,date,distance,cutUrl,raceID
stagesID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Bretagne Classic - Ouest-France,race/bretagne-classic/2020/result,25.08,247.75,race/bretagne-classic/2020,27
1,Cadel Evans Great Ocean Road Race,race/great-ocean-race/2020/result,2.02,171.1,race/great-ocean-race/2020,28
2,Circuito de Getxo-Memorial Hermanos Otxoa,race/circuito-de-getxo/2020/result,2.08,177.0,race/circuito-de-getxo/2020,29
3,Clasica de Almeria,race/clasica-de-almeria/2020/result,16.02,187.6,race/clasica-de-almeria/2020,30
4,European Continental Championships - ITT,race/uec-road-european-championships-itt/2020/...,24.08,25.6,race/uec-road-european-championships-itt/2020,31


In [14]:
rider_race_history_df['stageID'] = np.nan

for stage_ix, stage_row in stgs_df.iterrows():
    stage_url = stage_row['url']
    rider_race_history_df.loc[rider_race_history_df.url == stage_url, 'stageID'] = stage_ix
    
print('{} races are unlabelled'.format(rider_race_history_df[rider_race_history_df.stageID.isnull()].shape[0]))
race_history_df = rider_race_history_df.drop(['url', 'stage', 'distance', 'date', 'raceID'], axis=1)
race_history_df = race_history_df.astype({'stageID': int})
race_history_df.head()

0 races are unlabelled


Unnamed: 0,stagePos,gcPos,pcs,uci,riderID,stageID
1,11,38,8.0,,0,77
2,147,45,,,0,76
3,160,41,,,0,75
4,160,38,,,0,187
5,38,26,,,0,185


In [15]:
races_df.head()

Unnamed: 0_level_0,race,url,cutUrl,year
raceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Critérium du Dauphiné,race/dauphine/2020/stage-1,race/dauphine/2020,2020
1,Czech Tour,race/czech-cycling-tour/2020/stage-1,race/czech-cycling-tour/2020,2020
2,Etoile de Bessèges,race/etoile-de-besseges/2020/stage-1,race/etoile-de-besseges/2020,2020
3,Jayco Herald Sun Tour,race/herald-sun-tour/2020/stage-1,race/herald-sun-tour/2020,2020
4,La Route d'Occitanie - La Dépêche du Midi,race/la-route-d-occitanie/2020/stage-1,race/la-route-d-occitanie/2020,2020


In [16]:
riders_df.head()

Unnamed: 0_level_0,name,age,url,team
riderID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,ALAPHILIPPE JULIAN,28,rider/julian-alaphilippe,DECEUNINCK - QUICK STEP
1,AMADOR ANDREY,34,rider/andrey-amador,INEOS GRENADIERS
2,ANACONA WINNER,32,rider/winner-anacona,TEAM ARKÉA SAMSIC
3,ARNDT NIKIAS,28,rider/nikias-arndt,TEAM SUNWEB
4,ARU FABIO,30,rider/fabio-aru,UAE-TEAM EMIRATES


In [17]:
stgs_df.head()

Unnamed: 0_level_0,stage,url,date,distance,cutUrl,raceID
stagesID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Bretagne Classic - Ouest-France,race/bretagne-classic/2020/result,25.08,247.75,race/bretagne-classic/2020,27
1,Cadel Evans Great Ocean Road Race,race/great-ocean-race/2020/result,2.02,171.1,race/great-ocean-race/2020,28
2,Circuito de Getxo-Memorial Hermanos Otxoa,race/circuito-de-getxo/2020/result,2.08,177.0,race/circuito-de-getxo/2020,29
3,Clasica de Almeria,race/clasica-de-almeria/2020/result,16.02,187.6,race/clasica-de-almeria/2020,30
4,European Continental Championships - ITT,race/uec-road-european-championships-itt/2020/...,24.08,25.6,race/uec-road-european-championships-itt/2020,31


In [102]:
# constants for stage scraping
ONE_DAY_RACE = 0
FIRST_STAGE_IN_TOUR = 1
OTHER_TOUR_STAGE = 2
ITT = 3
PROLOGUE = 4
TTT = 5

stage_columns = [None] * 6
stage_columns[ONE_DAY_RACE] = {'stage': ['stageID', 'stagePos', 'bib', 'createBib', 'url', 'name', 'age', 'teamName', 'uciStage', 'pnt', 'stageTime', 'DNF']}
stage_columns[ITT] = {'stage': ['stageID', 'stagePos', 'bib', 'createBib', 'url', 'name', 'age', 'countryTeam', 'uciStage', 'pnt', 'stgAvgPace', 'stageTime', 'DNF']}
stage_columns[FIRST_STAGE_IN_TOUR] = {'stage': ['stageID', 'stagePos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'teamName', 'uciStage','pnt', 'stageTime', 'DNF'],
                                          'gc': ['stageID', 'gcPos', 'bib', 'url', 'name', 'age', 'team', 'uciGc', 'gcTime', 'more', 'DNF'], \
                                          'points': ['stageID', 'greenPos', 'bib', 'url', 'name', 'age', 'team', 'greenPnts', 'greenPntsChng', 'DNF'], \
                                          'youth': ['stageID', 'youthPos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'team', 'youthTime', 'DNF'], \
                                          'kom': ['stageID', 'komPos', 'bib', 'url', 'name', 'age', 'team', 'komPnts',  'komPntsChng', 'DNF'], \
                                          'teams': ['stageID', 'teamPos', 'empty', 'teamName', 'teamTime', 'DNF'] \
                                        }
stage_columns[OTHER_TOUR_STAGE] = {'stage': ['stageID', 'stagePos', 'gcPos', 'gcTime', 'bib', 'url', 'name', 'age', 'teamName', 'uciStage','pnt', 'stageTime', 'DNF'], \
                                       'gc': ['stageID', 'gcPos', 'prevGcPos', 'gcChng', 'bib', 'url', 'name', 'age', 'team', 'uciGc', 'gcTime', 'more', 'DNF'], \
                                       'points': ['stageID', 'greenPos', 'prevGreenPos', 'greenChng', 'bib', 'url', 'name', 'age', 'team', 'greenPnts', 'greenPntsChng', 'DNF'], \
                                       'youth': ['stageID', 'youthPos', 'prevYouthPos', 'youthChng', 'gcPos', 'gcTime', 'bib', 'url', 'name', 'age', 'team', 'youthTime', 'DNF'], \
                                       'kom': ['stageID', 'komPos', 'prevKomPos', 'komChng', 'bib', 'url', 'name', 'age', 'team', 'komPnts',  'komPntsChng', 'DNF'], \
                                       'teams': ['stageID', 'teamPos', 'prevTeamPos', 'teamChng', 'empty', 'teamName', 'teamTime', 'DNF']\
                                      }
stage_columns[PROLOGUE] = {'stage': ['stageID', 'stagePos', 'gcPos', 'gcTime', 'bib', 'url', 'name', 'age', 'teamName', 'uciStage','pnt', 'avgTime', 'stageTime', 'DNF'], \
                                       'gc': ['stageID', 'gcPos', 'bib', 'url', 'name', 'age', 'teamName', 'uciGc', 'stageTime', 'more', 'DNF'], \
                                       'points': ['stageID', 'greenPos', 'bib', 'url', 'name', 'age', 'team', 'greenPnts', 'greenPntsChng', 'DNF'], \
                                       'youth': ['stageID', 'youthPos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'team', 'youthTime', 'DNF'], \
                                       'kom': ['stageID', 'komPos', 'bib', 'url', 'name', 'age', 'team', 'komPnts',  'komPntsChng', 'DNF'], \
                                       'teams': ['stageID', 'teamPos', 'empty', 'teamName', 'teamTime', 'DNF']
                                     }
stage_columns[TTT] = {'stage': ['stageID', 'stagePos', 'gcPos', 'gcTime', 'bib', 'url', 'name', 'age', 'teamName', 'uciStage','pnt', 'avgTime', 'stageTime', 'DNF'], \
                                       'gc': ['stageID', 'gcPos', 'bib', 'url', 'name', 'age', 'teamName', 'stageTime', 'more', 'DNF'], \
                                       'points': ['stageID', 'greenPos', 'bib', 'url', 'name', 'age', 'team', 'greenPnts', 'greenPntsChng', 'DNF'], \
                                       'youth': ['stageID', 'youthPos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'team', 'youthTime', 'DNF'], \
                                       'kom': ['stageID', 'komPos', 'bib', 'url', 'name', 'age', 'team', 'komPnts',  'komPntsChng', 'DNF'], \
                                       'teams': ['stageID', 'teamPos', 'empty', 'teamName', 'teamTime', 'DNF']
                                     }
al = []
for stage_dss in stage_columns:
    stage_keys = stage_dss.keys()
    for key in stage_keys:
        if key != 'teams':
            al = al + stage_dss[key]


al = list(set(al))
for el in al:
    cb = re.match('[\w]*Chng', el)
    lb = el in ['empty', 'more', 'bib']
    pb = re.match('prev[\w]*', el)
    if cb or lb or pb:
        print(el)
        al.remove(el)
al.remove('prevGreenPos')
al.remove('greenPntsChng')
all_labels = sorted(al)
all_labels

prevKomPos
gcChng
prevYouthPos
greenChng
more
bib
prevGcPos
komChng
youthChng
komPntsChng


['DNF',
 'age',
 'avgTime',
 'countryTeam',
 'createBib',
 'gcPos',
 'gcTime',
 'greenPnts',
 'greenPos',
 'komPnts',
 'komPos',
 'name',
 'pnt',
 'stageID',
 'stagePos',
 'stageTime',
 'stgAvgPace',
 'team',
 'teamName',
 'timeAdd',
 'uciGc',
 'uciStage',
 'url',
 'youthPos',
 'youthTime']

In [250]:
class Stage():
    def __init__(self, stage_ID, stage_url):
        self.ID = stage_ID
        self.stage_url = stage_url
        self.__read_html__()

        self.stage_data = None
        self.team_df = None
        self.all_df = None
        self.__init_datasets__()
        self.column_subsets = {
            'gc': ['bib', 'uciGc'],
            'points': ['bib', 'greenPos', 'greenPnts'], \
            'youth': ['bib', 'youthPos', 'youthTime'], \
            'kom': ['bib', 'komPos', 'komPnts'], \
            'teams': ['teamPos', 'empty', 'teamName', 'teamTime', 'DNF']
        }
        
    def __read_html__(self):
        page = requests.get(self.stage_url)
        self.stage_html = BeautifulSoup(page.content, 'html.parser')
        if 'One day race' in str(self.stage_html.h2):
            print('ONE DAY RACE')
            self.stage_type = ONE_DAY_RACE
        elif '(ITT)' in str(self.stage_html.h2) or 'Time trial' in str(self.stage_html.h2):
            print('(ITT)')
            self.stage_type = ITT
        elif 'Prev' in str(self.stage_html):
            print('OTHER TOUR STAGE')
            self.stage_type = OTHER_TOUR_STAGE
        elif 'Prologue' in str(self.stage_html.h2):
            print('PROLOGUE')
            self.stage_type = PROLOGUE
        elif 'TTT' in str(self.stage_html.h2):
            print('TTT')
            self.stage_type = TTT
        else:
            print('FIRST_STAGE_IN_TOUR')
            self.stage_type = FIRST_STAGE_IN_TOUR
        
    def __init_datasets__(self):
        self.__update_datasets__(stage_columns[self.stage_type])
            
        
    def __update_datasets__(self, new_ds):
        self.stage_datasets = new_ds
        
    def get_row_lengths(self):
        row_lengths = list()
        for dataset_name in self.stage_datasets.keys():
            row_lengths.append(len(self.stage_datasets[dataset_name]) - 1)
        return row_lengths
    
    def verify_datasets(self):
        div_res_left = self.stage_html.find_all('div', class_="res-left")
        li_list = div_res_left[0].find_all('li')
        html_datasets_list = [get_text(li)[1].lower() for li in li_list]
        print('HTML datasets list: {}'.format(html_datasets_list))
        if len(html_datasets_list) > 0:
            new_stage_datasets = {}
            for key in html_datasets_list:
                if key in ['prol.']:
                    key = 'stage'
                if key not in self.stage_datasets.keys() and key != '':
                    print('KEY PROBLEM!!! Want ', key)
                    raise ValueError
                if key != '':
                    # time trial positions
                    new_stage_datasets[key] = self.stage_datasets[key]
            print('Old stage datasets: ', self.stage_datasets.keys())
            print('New stage datasets: ', new_stage_datasets.keys())
            self.__update_datasets__(new_stage_datasets)
        
    def scrape_stage_data(self, print_row=False):
        html = self.stage_html
        data_row_length = self.get_row_lengths()
        # all the racers are in a table data cell ('td')
        # intialise variables
        td_cells = html.find_all('td')
        # there can be up to 6 data tables on an html page
        self.stage_data = {}
        print('DRL@@:', data_row_length)
        
        data_id = 0
        row_length = data_row_length[data_id]

        old_length = row_length
        last_list_length = 0
        data_list = list()
        error_list = list()
        error_row = False
        ds_names = list(self.stage_datasets.keys())

        row = [self.ID]
        last_ix = 1
        error_ix = 0

        # itterate through all data cells and append their text values to a row
        print('TD: number of cells {}'.format(len(td_cells)))
        for td_ix, cell in enumerate(td_cells):
            text = get_text(cell)
            if type(text) is type('str'):
                #if self.stage_type == ITT and self.ID == 22:
                #print(row)
                row.append(text)
            else:
                row.append(text[0])
                row.append(text[1])

            if len(row) == 2:
                # the second element in the row is the position the rider finished
                # if the rider did not finish, the position will not be an int
                # it will be: DNF, DNS, OTL
                not_int = is_not_int(row[1])

                if not_int:
                    error_row = True

                if not not_int and int(row[1]) == 1 and len(data_list) != 0:
                    # a new table begins with a rider being places 1st
                    # save the complete previous table to the data map
                    self.__add_data_list__(data_id, data_list)
                    old_length = row_length

                    #reinitialise variables
                    data_id += 1
                    data_list = list()
                    last_ix = 1
                    error_row = False

                    row_length = data_row_length[data_id]
                    print('OLD ROW: {}, NEW ROW: {}'.format(old_length, row_length))
            if self.stage_type in [ITT, ONE_DAY_RACE] and len(row) in [3, 4]:
                # some ITTs have no bib numbers for the riders
                # we make it the postition number
                row = self.__check_bib__(row, last_ix + error_ix)
                
            if ds_names[data_id] == 'gc' and len(row) == 8:
                row = self.__check_uciGc__(row)
            
            if error_row and len(row) == old_length:
                # a row with a DNS, DNF, OTL rider
                # put the DNF reason in last position or the row list
                # put the rider in the last position
                row.append(row[1])
                row[1] = last_ix
                print('ERROR: {}'.format(row))
                data_list.append(row)
                row = [self.ID]

            elif not error_row and len(row) == row_length:
                # 'row_length' data cells make an entire row
                if print_row:
                    print(row)
                # data list gets saved in data subset
                pos = int(row[1])
                # DQ/ DNF/ OL column
                row.append(np.nan)
                data_list.append(row)
                last_ix = pos + 1
                error_ix += 1
                row = [self.ID]
                

        self.__add_data_list__(data_id, data_list)
        
    def __check_bib__(self, row, row_id):
        #print('Checking bib row:', row, row_id)
        r3t = row[2]
        #print('WHAT is this (Should be bib): ', r3t)
        not_int = is_not_int(r3t)
        if not_int:
            rank = row[1]
            if is_not_int(rank):
                rank = 1000 + row_id
            url_or_space = r3t
            before = row[:2]
        #    print('BEFORE:', before)

        #    print('URL OR SPACE (IF SPACE False) ', url_or_space != '')
            if url_or_space != '':
                after = row[2:]
        #        print('AFTER:', after)
                row = before + [rank, True] + after
            else:
                row[2] = rank
                row.append(True)
        else:
            row.append(False)
        #print('AFTER ROW', row)
        return row
    
    def __check_uciGc__(self, row):
        # there is no uciGC points
        uciGc = row[-1]
        if uciGc != '' and is_not_int(uciGc):
            time = uciGc
            row[-1] = ''
            row.append(time)
        return row
        
    def __add_data_list__(self, key_id, data_list):
        dataset_keys = list(self.stage_datasets.keys())
        ds_key = dataset_keys[key_id]
        print('UPDATING: {} has {} participants'.format(ds_key, len(data_list)))
        if len(data_list) <= 1:
            print('ERROR WITH NUMBER OF PARTICIPANTS')
            #print(self.stage_datasets[ds_key], data_list[0])
            raise ValueError
        self.stage_data[ds_key] = data_list

    def build_df(self):
        if self.stage_data is None:
            self.scrape_stage_data()

        dfs = list()
        dfs_to_decrease = self.column_subsets.keys()
        for dataset_name in self.stage_datasets.keys():
            print('{} in {}'.format(dataset_name, self.stage_data.keys()))
            ds = self.stage_data[dataset_name]
            df_cols = self.stage_datasets[dataset_name]
            print('"{}" with columns: {}'.format(dataset_name, df_cols))
            print(ds[0])
            df = pd.DataFrame(ds, columns=df_cols)
            
            # fix times
            if dataset_name == 'stage':
                df = fix_time(df, 'stageTime')
            if dataset_name == 'youth':
                df = fix_time(df, 'youthTime')
            
            # select necessary 
            if dataset_name in dfs_to_decrease:
                if 'stage' in self.stage_datasets.keys() or dataset_name != 'gc':
                    df = df[self.column_subsets[dataset_name]]
                
            if dataset_name != 'teams':
                print("BIBS UNIQUE LENGTH: {} of full length {} and {}".format(len(df.index), df.shape[0], df.index))
                print('CHANGING index from {}'.format(df.index.name))
                df = df.set_index('bib')
                print('to {}'.format(df.index.name))
                lid = list(df.index)
                for l in lid:
                    if is_not_int(l):
                        print(lid, l)
                        raise ValueError
                if len(df.index) != df.shape[0]:
                    print("NON UNIQUE INDICES!!!!!!")
                    raise ValueError
                dfs.append(df)
            else:
                self.team_df = df

        for df in dfs:
            print('INDEX: ', df.index.name, len(df.index.unique()) == df.shape[0])
        self.all_df = pd.concat(dfs, axis=1, sort=False)
        print('ALL TOGETHER INDEX: {}'.format(self.all_df.index.name))
        self.__create_all_columns__()
        return dfs
        
    def __create_all_columns__(self):
        # TODO: include this in the file
        current_columns = self.all_df.columns
        for label in all_labels:
            if label not in current_columns:
                self.all_df[label] = ''
        self.all_df = self.all_df[all_labels]
        self.all_df.index.name = 'bib'
        
    def get_all_df(self):
        return self.all_df

In [254]:
# get the points classifications
def get_points(stage_ID, stage_url, print_=False):
    ''' 
        Extract all the races in the season for the rider on the rider's profile (url)
    '''
    stage = Stage(stage_ID, stage_url)
    stage.verify_datasets()
    stage.scrape_stage_data(print_row=print_)
    print('DONE SCRAPING')
    stage.build_df()
    df = stage.get_all_df()
    df = df.reset_index()
    stgs_df.loc[stage_ID, 'stageType'] = stage.stage_type
    return df

def get_all_dfs(stage_ID, stage_url, print_=False):
    ''' 
        Extract all the races in the season for the rider on the rider's profile (url)
    '''
    stage = Stage(stage_ID, stage_url)
    stage.verify_datasets()
    stage.scrape_stage_data(print_row=print_)
    print('DONE SCRAPING')
    dfs = stage.build_df()
    return dfs

url_uniques = stgs_df.url.unique()
all_races = list()
written_1 = False
races_already = None
already_ids = None
most_columns = 0
try:
    races_already = pd.read_csv(ALL_RACE_DATA, index_col='Unnamed: 0')
    already_ids = races_already.stageID.unique()
    most_columns = len(races_already.columns)
except:
    stgs_df['stageType'] = 0
    print('No df')

for srix, srow in stgs_df.iterrows():
    url = srow['url']
    stage_id = srix
    if races_already is None or stage_id not in already_ids:
        print('\n\n')
        stage_url = 'https://www.procyclingstats.com/{}'.format(url)
        print('{} in {}: {}'.format(srix, len(url_uniques), url))
        #stage_url = 'https://www.procyclingstats.com/race/tour-de-france/2020/stage-2'
        rdf = get_points(srix, stage_url, False)
        all_races.append(rdf)

        if srix % 5 == 0 and srix > 0:
            all_df = pd.concat(all_races, axis=0, sort=False, ignore_index=True)
            if written_1 == False and races_already is None:
                all_df.to_csv(ALL_RACE_DATA)
                most_columns = len(all_df.columns)
                print('....................COLUMNS LENGTH: {}...'.format(len(all_df.columns)))
            else:
                all_df.to_csv(ALL_RACE_DATA, mode='a', header=False)
                    
            written_1 = True
            all_races = list()
    else:
        print('{} with id {} already in csv'.format(url, stage_id))

race/bretagne-classic/2020/result with id 0 already in csv
race/great-ocean-race/2020/result with id 1 already in csv
race/circuito-de-getxo/2020/result with id 2 already in csv
race/clasica-de-almeria/2020/result with id 3 already in csv
race/uec-road-european-championships-itt/2020/result with id 4 already in csv
race/uec-road-european-championships/2020/result with id 5 already in csv
race/les-boucles-du-dus-ardeche/2020/result with id 6 already in csv
race/giro-dell-emilia/2020/result with id 7 already in csv
race/gran-piemonte/2020/result with id 8 already in csv
race/gran-trittico-lombardo/2020/result with id 9 already in csv
race/gp-d-ouverture/2020/result with id 10 already in csv
race/gp-de-la-ville-de-lillers/2020/result with id 11 already in csv
race/grote-prijs-jean-pierre-monsere/2020/result with id 12 already in csv
race/il-lombardia/2020/result with id 13 already in csv
race/kuurne-brussel-kuurne/2020/result with id 14 already in csv
race/gp-samyn/2020/result with id 15 

FIRST_STAGE_IN_TOUR
HTML datasets list: ['stage', 'gc', 'youth', 'kom', 'teams']
Old stage datasets:  dict_keys(['stage', 'gc', 'points', 'youth', 'kom', 'teams'])
New stage datasets:  dict_keys(['stage', 'gc', 'youth', 'kom', 'teams'])
DRL@@: [12, 10, 10, 9, 5]
TD: number of cells 3027
ERROR: [58, 137, '', '+ -4:30:19', '86', 'rider/paul-ourselin', ' Ourselin Paul', '26', 'Team Total Direct Energie', '', '', '-', 'DNF']
ERROR: [58, 137, '', '+ -4:30:19', '186', 'rider/tom-dernies', ' Dernies Tom', '29', 'Natura4Ever - Roubaix Lille Métropole', '', '', ',,', 'DNF']
ERROR: [58, 137, '', '+ -4:30:19', '191', 'rider/bryan-alaphilippe', ' Alaphilippe Bryan', '25', 'St Michel - Auber93', '', '', ',,', 'DNF']
UPDATING: stage has 139 participants
OLD ROW: 12, NEW ROW: 10
UPDATING: gc has 136 participants
OLD ROW: 10, NEW ROW: 10
UPDATING: youth has 49 participants
OLD ROW: 10, NEW ROW: 9
UPDATING: kom has 11 participants
OLD ROW: 9, NEW ROW: 5
UPDATING: teams has 20 participants
DONE SCRAPING

FIRST_STAGE_IN_TOUR
HTML datasets list: ['stage', 'gc', 'points', 'kom', 'teams']
Old stage datasets:  dict_keys(['stage', 'gc', 'points', 'youth', 'kom', 'teams'])
New stage datasets:  dict_keys(['stage', 'gc', 'points', 'kom', 'teams'])
DRL@@: [12, 10, 9, 9, 5]
TD: number of cells 2443
ERROR: [61, 121, '', '+ -4:24:00', '53', 'rider/enric-mas', ' Mas Enric', '25', 'Movistar Team', '', '', '-', 'DNF']
ERROR: [61, 121, '', '+ -4:24:00', '34', 'rider/edward-planckaert', ' Planckaert Edward', '25', 'Sport Vlaanderen - Baloise', '', '', ',,', 'DNF']
ERROR: [61, 121, '', '+ -4:24:00', '142', 'rider/sergey-karmazhakov', ' Karmazhakov Sergey', '18', 'Lokosphinx', '', '', ',,', 'DNF']
ERROR: [61, 121, '', '+ -4:24:00', '124', 'rider/daniel-silva2', ' Silva Daniel', '21', 'Aviludo-Louletano', '', '', ',,', 'DNF']
ERROR: [61, 121, '', '+ -4:24:00', '122', 'rider/andre-evangelista', ' Evangelista Andre', '24', 'Aviludo-Louletano', '', '', ',,', 'DNF']
UPDATING: stage has 125 participants
OLD ROW

FIRST_STAGE_IN_TOUR
HTML datasets list: ['stage', 'gc', 'points', 'youth', 'kom', 'teams']
Old stage datasets:  dict_keys(['stage', 'gc', 'points', 'youth', 'kom', 'teams'])
New stage datasets:  dict_keys(['stage', 'gc', 'points', 'youth', 'kom', 'teams'])
DRL@@: [12, 10, 9, 10, 9, 5]
TD: number of cells 3645
ERROR: [64, 176, '', '+ -3:46:13', '155', 'rider/john-degenkolb', ' Degenkolb John', '31', 'Lotto Soudal', '', '', '-', 'OTL']
UPDATING: stage has 176 participants
OLD ROW: 12, NEW ROW: 10
UPDATING: gc has 175 participants
OLD ROW: 10, NEW ROW: 9
UPDATING: points has 24 participants
OLD ROW: 9, NEW ROW: 10
UPDATING: youth has 26 participants
OLD ROW: 10, NEW ROW: 9
UPDATING: kom has 3 participants
OLD ROW: 9, NEW ROW: 5
UPDATING: teams has 22 participants
DONE SCRAPING
stage in dict_keys(['stage', 'gc', 'points', 'youth', 'kom', 'teams'])
"stage" with columns: ['stageID', 'stagePos', 'gcPos', 'timeAdd', 'bib', 'url', 'name', 'age', 'teamName', 'uciStage', 'pnt', 'stageTime', 'DNF'

FIRST_STAGE_IN_TOUR
HTML datasets list: ['stage', 'gc', 'points', 'youth', 'teams']
Old stage datasets:  dict_keys(['stage', 'gc', 'points', 'youth', 'kom', 'teams'])
New stage datasets:  dict_keys(['stage', 'gc', 'points', 'youth', 'teams'])
DRL@@: [12, 10, 9, 10, 5]
TD: number of cells 2646
ERROR: [67, 121, '', '+ -3:52:02', '87', 'rider/tom-wirtgen', ' Wirtgen Tom', '23', 'Bingoal - Wallonie Bruxelles', '', '', '-', 'DNF']
ERROR: [67, 121, '', '+ -3:52:02', '97', 'rider/nicolas-sessler', ' Sessler Nícolas', '25', 'Burgos-BH', '', '', ',,', 'DNF']
ERROR: [67, 121, '', '+ -3:52:02', '126', 'rider/ty-magner', ' Magner Ty', '28', 'Rally Cycling', '', '', ',,', 'DNF']
ERROR: [67, 121, '', '+ -3:52:02', '133', 'rider/piotr-havik', ' Havik Piotr', '25', 'Riwal Readynez Cycling Team', '', '', ',,', 'DNS']
UPDATING: stage has 124 participants
OLD ROW: 12, NEW ROW: 10
UPDATING: gc has 120 participants
OLD ROW: 10, NEW ROW: 9
UPDATING: points has 10 participants
OLD ROW: 9, NEW ROW: 10
UPDATIN

OTHER TOUR STAGE
HTML datasets list: ['stage', 'gc', 'points', 'youth', 'kom', 'teams']
Old stage datasets:  dict_keys(['stage', 'gc', 'points', 'youth', 'kom', 'teams'])
New stage datasets:  dict_keys(['stage', 'gc', 'points', 'youth', 'kom', 'teams'])
DRL@@: [12, 12, 11, 12, 11, 7]
TD: number of cells 3361
ERROR: [70, 127, '', '+ -4:55:54', '53', 'rider/carol-eduard-novak', ' Novak Carol Eduard', '43', 'Team Novak', '', '', '-', 'DNF']
ERROR: [70, 127, '', '+ -4:55:54', '131', 'rider/adrian-kurek', ' Kurek Adrian', '32', 'Mazowsze Serce Polski', '', '', ',,', 'DNS']
UPDATING: stage has 128 participants
OLD ROW: 12, NEW ROW: 12
UPDATING: gc has 126 participants
OLD ROW: 12, NEW ROW: 11
UPDATING: points has 29 participants
OLD ROW: 11, NEW ROW: 12
UPDATING: youth has 33 participants
OLD ROW: 12, NEW ROW: 11


ValueError: invalid literal for int() with base 10: 'rider/patrick-konrad'

In [255]:
stgs_df.stageType.unique()
#stgs_df[stgs_df.stageType == 1]

array([0, 3, 4, 5, 1])

In [238]:
stgs_df.loc[54, 'url']

'race/vuelta-a-la-comunidad-valenciana/2020/stage-1'

In [256]:
# ['stageID', 'komPos', 'prevKomPos', 'komChng', 'bib', 'url', 'name', 'age', 'team', 'komPnts',  'komPntsChng', 'DNF']
#for srix in [41, 42, 43, 44, 45]:
#    srow = stgs_df.loc[srix, ]
url = 'race/sibiu-cycling-tour/2020/stage-1'
#    stage_id = srix

stage_url = 'https://www.procyclingstats.com/{}'.format(url)
rdf = get_points(srix, stage_url, True)
# the problem is that after the prologue there were no KOM points so because of that there are no previous columns. 
# Hence no change either. 

OTHER TOUR STAGE
HTML datasets list: ['stage', 'gc', 'points', 'youth', 'kom', 'teams']
Old stage datasets:  dict_keys(['stage', 'gc', 'points', 'youth', 'kom', 'teams'])
New stage datasets:  dict_keys(['stage', 'gc', 'points', 'youth', 'kom', 'teams'])
DRL@@: [12, 12, 11, 12, 11, 7]
TD: number of cells 3361
[70, '1', '2', '+0:03', '2', 'rider/gregor-muhlberger', ' Mühlberger Gregor', '26', 'BORA - hansgrohe', '14', '20', '4:52:11']
[70, '2', '1', '+0:00', '1', 'rider/patrick-konrad', ' Konrad Patrick', '28', 'BORA - hansgrohe', '5', '12', ',,']
[70, '3', '3', '+0:53', '13', 'rider/matteo-badilatti', ' Badilatti Matteo', '27', 'Israel Start-Up Nation', '3', '7', '0:16']
[70, '4', '4', '+1:20', '25', 'rider/remy-rochas', ' Rochas Rémy', '24', 'NIPPO DELKO One Provence', '', '5', '1:04']
[70, '5', '5', '+2:40', '34', 'rider/luca-wackermann', ' Wackermann Luca', '28', 'Vini Zabù - KTM', '', '4', '1:41']
[70, '6', '6', '+3:38', '152', 'rider/piotr-brozyna', ' Brożyna Piotr', '25', 'Voster 

ValueError: invalid literal for int() with base 10: 'rider/patrick-konrad'

In [252]:
rdf.head()

Unnamed: 0,bib,DNF,age,avgTime,countryTeam,createBib,gcPos,gcTime,greenPnts,greenPos,...,stageTime,stgAvgPace,team,teamName,timeAdd,uciGc,uciStage,url,youthPos,youthTime
0,45,,26,,,,1,,25,1,...,4:07:40,,,Team Jumbo-Visma,+0:00,,20.0,rider/dylan-groenewegen,,
1,25,,23,,,,2,,20,2,...,4:07:40,,,Deceuninck - Quick Step,+0:00,,10.0,rider/fabio-jakobsen,1.0,4:07:40
2,53,,32,,,,3,,16,3,...,4:07:40,,,UAE-Team Emirates,+0:00,,5.0,rider/alexander-kristoff,,
3,37,,32,,,,4,,14,4,...,4:07:40,,,Team INEOS,+0:00,,,rider/ben-swift,,
4,152,,31,,,,5,,12,5,...,4:07:40,,,Fundación - Orbea,+0:00,,,rider/juan-jose-lobato,,


In [226]:
all_df[all_df.stageID == 45].loc[518,]

bib                               166
DNF                               NaN
age                                29
avgTime                        42.056
countryTeam                          
createBib                            
gcPos                               1
gcTime                          +0:00
greenPnts                          25
greenPos                            1
komPnts                              
komPos                               
name                 Holler Nikodemus
pnt                                20
stageID                            45
stagePos                            1
stageTime                        3:34
stgAvgPace                           
team                                 
teamName                     Bike Aid
timeAdd                              
uciGc                               3
uciStage                           14
url            rider/nikodemus-holler
youthPos                          NaN
youthTime                         NaN
Name: 518, d

In [221]:
all_df.columns

Index(['bib', 'DNF', 'age', 'avgTime', 'countryTeam', 'createBib', 'gcPos',
       'gcTime', 'greenPnts', 'greenPos', 'komPnts', 'komPos', 'name', 'pnt',
       'stageID', 'stagePos', 'stageTime', 'stgAvgPace', 'team', 'teamName',
       'timeAdd', 'uciGc', 'uciStage', 'url', 'youthPos', 'youthTime'],
      dtype='object')

In [213]:
len(all_df.columns.unique()), all_df.shape

(25, (129, 25))

In [161]:
print(rdfs[0].shape[0])
len(set(rdfs[0].index.unique()))


175


175

In [165]:
rdfs[].head()

IndexError: list index out of range

In [163]:
t = pd.concat([rdfs[0], rdfs[1]], axis=1, sort=False)
t

IndexError: list index out of range

In [105]:
for col in rdf.columns:
    if col not in all_labels:
        print(col)
print('bib' in rdf.columns)
rdf.head()

index
False


Unnamed: 0,index,DNF,age,avgTime,countryTeam,createBib,gcPos,gcTime,greenPnts,greenPos,...,stageTime,stgAvgPace,team,teamName,timeAdd,uciGc,uciStage,url,youthPos,youthTime
0,166,,29,42.056,,,1,+0:00,25,1,...,3:34,,,Bike Aid,,3.0,14.0,rider/nikodemus-holler,,
1,146,,20,41.667,,,2,+0:02,20,2,...,0:02,,,CCC Development Team,,,5.0,rider/kacper-walkowiak,1.0,3:36
2,156,,26,41.475,,,3,+0:03,16,3,...,0:03,,,Voster ATS Team,,,3.0,rider/wojciech-sykala,,
3,96,,24,41.096,,,4,+0:05,14,4,...,0:05,,,Elkov - Kasper,,,,rider/adam-toupalik,,
4,186,,23,40.359,,,5,+0:09,12,5,...,0:09,,,Work Service Dynatek Vega,,,,rider/matteo-rotondi,,


In [234]:
races_already = pd.read_csv(ALL_RACE_DATA, index_col='Unnamed: 0')
already_ids = races_already.stageID.unique()
races_already.stageID.unique()

stgs_df.stageType.unique()

array([0, 3, 4, 5, 1])

In [58]:
races_already = pd.read_csv(ALL_RACE_DATA, index_col='Unnamed: 0')
already_ids = races_already.stageID.unique()

ParserError: Error tokenizing data. C error: Expected 15 fields in line 4209, saw 24


In [None]:
all_races[3].columns

In [None]:
all_races[4].columns

In [None]:
all_races[4].head()

In [None]:
df1 = pd.concat([all_races[0], all_races[1], all_races[2], all_races[3]], axis=0, sort=False).reset_index(drop=True)
#df2 = pd.append(all_races[2], sort=False, ignore_index=True)
#df3 = pd.append(all_races[3], sort=False, ignore_index=True)
df4 = pd.concat([df1, all_races[4]], axis=0, sort=False)
#df4 = pd.concat([df1, all_races[4]], axis=0).reset_index(drop=True)
#df = pd.append(all_races, sort=False, axis=1)
print(all_races[4].columns, '\n\n', all_races[4].index)
all_races[4].head()

In [None]:
id_ = 4
#axis=1, sort=False)
print(len(all_races[id_].columns))
list(all_races[id_].columns)

In [None]:
stage_ID = 45
stage_url = 'https://www.procyclingstats.com/race/nc-switzerland-itt/2020/result'
stage = Stage(stage_ID, stage_url)
stage.stage_html.h2

In [None]:
temp = stgs_df[~stgs_df.stageType.isnull()]
temp.stageType.unique()

In [None]:
temp[temp.stageType == 3]

In [None]:
races_already = pd.read_csv(ALL_RACE_DATA, index_col='index')
ra = races_already[races_already.stageID == 4]
print(ra.shape)
ra.head()

In [None]:
def not_empty_text(text):
    t = text.replace(' ', '')
    if len(t) == 0:
        return False
    else:
        return True
    
def find_more_stage_info(stage_html):
    '''
        Each stage has more ifnormation about its profile. Itterate 
        through each stage and extract the profile information. 
    '''
    res_ = stage_html.find_all("div", class_="res-right")
    
    res_text = res_[0].find_all(text=True)
    
    stage_info = list()
    mountains = list()
    found_race_rank = False
    
    for tix, text in enumerate(res_text):
        web_regex = re.search('(www.(.)+\.(.)+)+', text) \
                        or re.search('((.)+\.com(.)+)+', text) \
                        or re.search('((.)+\.(\w)*(\d)*/)', text) \
                        or 'googletag.cmd.push(' in text
        #if web_regex: print(text)
        if text not in ['Race information', 'Date: ', 'Avg. speed winner:', 'rd', \
                        'Race category: ', 'Parcours type:', 'PCS point scale:', \
                        ' ', 'Start/finish:', ' › ', 'Climbs: ', ', ', 'Race profile', \
                        'Finish photo', 'Finish photo', 'LiveStats', 'Websites:', \
                        'Race ranking position', 'ranking', 'th', 'nd', 'st', '\n', \
                        'breakdown', 'Position and points as on startdate of race.'] \
        and not web_regex:
            if len(stage_info) <= 6 or found_race_rank:
                # the first 6 cells of interest
                # or if the race rank has been found
                if '›' in text:
                    start_ix = text.find('›')
                    start = text[:start_ix]
                    if not_empty_text(start):
                        stage_info.append(start)
                    text = text[start_ix + 1:]
                if not_empty_text(text):
                    stage_info.append(text)
                if re.search('(\d)* pnt', text):
                    # after this string regex there is only adds and redundant information
                    break
            else:
                # there is a variable number of mountains 
                if is_not_int(text):
                    mountains.append(text)
                else:
                    # race rank (int value) comes right after mountains 
                    # have been listed
                    stage_info.append(mountains)
                    stage_info.append(len(mountains))
                    stage_info.append(text)
                    found_race_rank = True
        
        if text is 'ranking':
            break
    return stage_info

if CREATE_DATA or True:
    all_stages = list()
    all_racers = list()
    for stage_ix, stage  in stgs_df.iterrows():
        url = stage['url']
        stage_url = 'https://www.procyclingstats.com/{}'.format(url)
        print('{} of {} for \'{}\''.format(stage_ix, stgs_df.shape[0], url))
        page = requests.get(stage_url)
        stage_html = BeautifulSoup(page.content, 'html.parser')

        stage_row = find_more_stage_info(stage_html)
        if len(stage_row) > 11:
            print(len(stage_row))

        #stage_info = list(stage) + stage_row
        all_stages.append(stage_row)
        
        rdf = get_points(stage_html)
        all_racers.append(rdf)
    
    new_stg_info = pd.DataFrame(all_stages, columns=['dateFull', 'averageSpeed', 'raceCtgr', \
                                     'parcoursType', 'PCSPointScale', 'start',\
                                     'end', 'mountains', 'numMount', 'raceRank', \
                                     'racePoints'])
    new_stg_info.to_csv(NEW_STAGES_LOC)
else:
    new_stg_info = pd.read_csv(NEW_STAGES_LOC, index_col='Unnamed: 0')

In [None]:

print(len(all_racers))
all_racers_df = pd.concat(all_racers)
all_racers_df.head()

In [None]:
print(new_stg_info.shape)
new_stg_info.head()

In [None]:
print(stgs_df.shape)
stages_df  = pd.concat([stgs_df, new_stg_info], sort=False, axis=1)
stages_df['racePoints'] = stages_df.racePoints.str.replace('pnt', '')
#'raceRank' is relative to when the rank is taken
# not constant or reliable
stages_df = stages_df.drop('raceRank',  axis=1)
stages_df.index.name = 'stageID'
stages_df.head()

In [None]:
errors = stages_df[stages_df.racePoints.isnull()]
print(errors.shape)
errors

### Errors
From the code above we see that there are stages that are erroneously made. This include 2 types:
 - Stages that have not list of mountains, number of mountains and racePoints (`9`, `17`, `68`)
 - Stages that have location names as PCSPointScale values (`75`)

In [None]:
# fix error type 1
stages_df.numMount.fillna(0, inplace=True)
stages_df.mountains.fillna('', inplace=True)
stages_df.racePoints.fillna(0, inplace=True)
stages_df[stages_df.racePoints.isnull()]

In [None]:
# error type 2
e2_df = stages_df[stages_df.PCSPointScale.str.match(r'[\w\d\.]*\.{1}[\w\d\.]+')==False]
e2_df

In [None]:
for eix, err_row in e2_df.iterrows():
    if err_row['PCSPointScale'] == 'E-CC':
        continue
    start = err_row['PCSPointScale']
    end = err_row['start']
    mount = err_row['end']
    
    stages_df.loc[eix, 'PCSPointScale'] = ''
    stages_df.loc[eix, 'start'] = start
    stages_df.loc[eix, 'end'] = end
    mountains = e2_df.loc[eix, 'mountains']
    if is_not_int(mount):
        mountains = '{}, {}]'.format(mountains[:-1], mount)
        stages_df.loc[eix, 'mountains'] = mountains
    else:
        stages_df.loc[eix, 'mountains'] = ''
        stages_df.loc[eix, 'racePoints'] = mount

stages_df[stages_df.PCSPointScale.str.match(r'[\w\d\.]*\.{1}[\w\d\.]+')==False]

In [None]:
stages_df.head()

In [None]:
races_df.head()

In [None]:
url = 'race/sibiu-cycling-tour/2020/prologue'
stage_url = 'https://www.procyclingstats.com/{}'.format(url)
print(stage_url)
page = requests.get(stage_url)
stage_html = BeautifulSoup(page.content, 'html.parser')

In [None]:
stage_row = find_more_stage_info(stage_html)
if len(stage_row) > 11:
    print(len(stage_row))

rdf = get_points(stage_html, True)
rdf
stage_col = ['gcPos', 'bib', 'url', 'name', 'age', 'teamName', 'uciStage', 'stageTime', 'more', 'DNF']

In [None]:
rdf[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]]

In [None]:
df = rdf[['stagePos', 'gcPos', 'timeAdd', 'url', 'name', 'age', 'teamName', 'uciStage', 'pnt', 'stageTime', 'DNF']]
df = df[~df.stagePos.isnull()]
left = df[df.gcPos == '']
for lix, left_row in left.iterrows():
    df.loc[lix, 'gcPos'] = left_row['stagePos']
df = df.astype({'gcPos': int})
df.sort_values(by=['gcPos'])


